From 3c0979c64481f07a6bf9c1775601845ac5fa57f3 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 23 Jul 2025 13:27:45 +0100 Subject: [PATCH 01/93] arm64/sme: Drop inaccurate documentation of streaming mode switches The SME ABI documentation contains an inaccurate description of the architectural streaming mode entry/exit behaviour, just remove it since this is better documented by the architecture or with the rest of the documentation for the specific software interfaces concerned. Signed-off-by: Mark Brown Signed-off-by: Will Deacon --- Documentation/arch/arm64/sme.rst | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/Documentation/arch/arm64/sme.rst b/Documentation/arch/arm64/sme.rst index 4cb38330e704..583f2ee9cb97 100644 --- a/Documentation/arch/arm64/sme.rst +++ b/Documentation/arch/arm64/sme.rst @@ -81,17 +81,7 @@ The ZA matrix is square with each side having as many bytes as a streaming mode SVE vector. -3. Sharing of streaming and non-streaming mode SVE state ---------------------------------------------------------- - -It is implementation defined which if any parts of the SVE state are shared -between streaming and non-streaming modes. When switching between modes -via software interfaces such as ptrace if no register content is provided as -part of switching no state will be assumed to be shared and everything will -be zeroed. - - -4. System call behaviour +3. System call behaviour ------------------------- * On syscall PSTATE.ZA is preserved, if PSTATE.ZA==1 then the contents of the @@ -112,7 +102,7 @@ be zeroed. exceptions for execve() described in section 6. -5. Signal handling +4. Signal handling ------------------- * Signal handlers are invoked with PSTATE.SM=0, PSTATE.ZA=0, and TPIDR2_EL0=0. From a679e5683d3eef22ca12514ff8784b2b914ebedc Mon Sep 17 00:00:00 2001 From: Bala-Vignesh-Reddy Date: Fri, 8 Aug 2025 13:38:30 +0530 Subject: [PATCH 02/93] selftests: arm64: Check fread return value in exec_target Fix -Wunused-result warning generated when compiled with gcc 13.3.0, by checking fread's return value and handling errors, preventing potential failures when reading from stdin. 
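For reference, fread(3) returns the number of complete items read, so any value other than 1 here indicates a short read or an error; a minimal standalone sketch of the pattern the patch applies (illustrative only, not part of the patch):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	size_t val;

	/* fread() returns the number of items read, not a byte count */
	if (fread(&val, sizeof(val), 1, stdin) != 1) {
		fprintf(stderr, "Could not read input from stdin\n");
		return EXIT_FAILURE;
	}

	printf("read value: %zu\n", val);
	return EXIT_SUCCESS;
}
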
Fixes compiler warning: warning: ignoring return value of 'fread' declared with attribute 'warn_unused_result' [-Wunused-result] Fixes: 806a15b2545e ("kselftests/arm64: add PAuth test for whether exec() changes keys") Signed-off-by: Bala-Vignesh-Reddy Reviewed-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/pauth/exec_target.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/pauth/exec_target.c b/tools/testing/selftests/arm64/pauth/exec_target.c index 4435600ca400..e597861b26d6 100644 --- a/tools/testing/selftests/arm64/pauth/exec_target.c +++ b/tools/testing/selftests/arm64/pauth/exec_target.c @@ -13,7 +13,12 @@ int main(void) unsigned long hwcaps; size_t val; - fread(&val, sizeof(size_t), 1, stdin); + size_t size = fread(&val, sizeof(size_t), 1, stdin); + + if (size != 1) { + fprintf(stderr, "Could not read input from stdin\n"); + return EXIT_FAILURE; + } /* don't try to execute illegal (unimplemented) instructions) caller * should have checked this and keep worker simple From 791d703baddd141dc7b80a69bcd237b6b79150ea Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 12 Aug 2025 15:37:00 +0100 Subject: [PATCH 03/93] kselftest/arm64: Log error codes in sve-ptrace Use ksft_perror() to report error codes from failing ptrace operations to make it easier to interpret logs when things go wrong. Signed-off-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/sve-ptrace.c | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index b22303778fb0..4cba3bcff660 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -95,19 +95,27 @@ static int do_child(void) static int get_fpsimd(pid_t pid, struct user_fpsimd_state *fpsimd) { struct iovec iov; + int ret; iov.iov_base = fpsimd; iov.iov_len = sizeof(*fpsimd); - return ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov); + ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov); + if (ret == -1) + ksft_perror("ptrace(PTRACE_GETREGSET)"); + return ret; } static int set_fpsimd(pid_t pid, struct user_fpsimd_state *fpsimd) { struct iovec iov; + int ret; iov.iov_base = fpsimd; iov.iov_len = sizeof(*fpsimd); - return ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov); + ret = ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov); + if (ret == -1) + ksft_perror("ptrace(PTRACE_SETREGSET)"); + return ret; } static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type, @@ -117,6 +125,7 @@ static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type, void *p; size_t sz = sizeof *sve; struct iovec iov; + int ret; while (1) { if (*size < sz) { @@ -132,8 +141,11 @@ static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type, iov.iov_base = *buf; iov.iov_len = sz; - if (ptrace(PTRACE_GETREGSET, pid, type->regset, &iov)) + ret = ptrace(PTRACE_GETREGSET, pid, type->regset, &iov); + if (ret) { + ksft_perror("ptrace(PTRACE_GETREGSET)"); goto error; + } sve = *buf; if (sve->size <= sz) @@ -152,10 +164,14 @@ static int set_sve(pid_t pid, const struct vec_type *type, const struct user_sve_header *sve) { struct iovec iov; + int ret; iov.iov_base = (void *)sve; iov.iov_len = sve->size; - return ptrace(PTRACE_SETREGSET, pid, type->regset, &iov); + ret = ptrace(PTRACE_SETREGSET, pid, type->regset, &iov); + if (ret == -1) + 
ksft_perror("ptrace(PTRACE_SETREGSET)"); + return ret; } /* Validate setting and getting the inherit flag */ From 50af02425afc72b1b47c4a0a0b9c9bdaa1a1b347 Mon Sep 17 00:00:00 2001 From: Bala-Vignesh-Reddy Date: Thu, 7 Aug 2025 17:12:29 +0530 Subject: [PATCH 04/93] selftests: arm64: Fix -Waddress warning in tpidr2 test Thanks to -Waddress, the compiler warns that the ksft_test_result() invocations in the arm64 tpidr2 selftest are always true. Oops. Fix the test by, err, actually running the test functions. Fixes: 6d80cb73131d ("kselftest/arm64: Convert tpidr2 test to use kselftest.h") Signed-off-by: Bala-Vignesh-Reddy Reviewed-by: Anshuman Khandual Reviewed-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/tpidr2.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/arm64/abi/tpidr2.c b/tools/testing/selftests/arm64/abi/tpidr2.c index f58a9f89b952..4c89ab0f1010 100644 --- a/tools/testing/selftests/arm64/abi/tpidr2.c +++ b/tools/testing/selftests/arm64/abi/tpidr2.c @@ -227,10 +227,10 @@ int main(int argc, char **argv) ret = open("/proc/sys/abi/sme_default_vector_length", O_RDONLY, 0); if (ret >= 0) { ksft_test_result(default_value(), "default_value\n"); - ksft_test_result(write_read, "write_read\n"); - ksft_test_result(write_sleep_read, "write_sleep_read\n"); - ksft_test_result(write_fork_read, "write_fork_read\n"); - ksft_test_result(write_clone_read, "write_clone_read\n"); + ksft_test_result(write_read(), "write_read\n"); + ksft_test_result(write_sleep_read(), "write_sleep_read\n"); + ksft_test_result(write_fork_read(), "write_fork_read\n"); + ksft_test_result(write_clone_read(), "write_clone_read\n"); } else { ksft_print_msg("SME support not present\n"); From 740cdafd0d998903c1faeee921028a8a78698be5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 17:13:02 +0200 Subject: [PATCH 05/93] kselftest/arm64/gcs: Correctly check return value when disabling GCS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The return value was not assigned to 'ret', so the check afterwards does not do anything. Fixes: 3d37d4307e0f ("kselftest/arm64: Add very basic GCS test program") Signed-off-by: Thomas Weißschuh Reviewed-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/gcs/basic-gcs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/gcs/basic-gcs.c b/tools/testing/selftests/arm64/gcs/basic-gcs.c index 54f9c888249d..100d2a983155 100644 --- a/tools/testing/selftests/arm64/gcs/basic-gcs.c +++ b/tools/testing/selftests/arm64/gcs/basic-gcs.c @@ -410,7 +410,7 @@ int main(void) } /* One last test: disable GCS, we can do this one time */ - my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0); + ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0); if (ret != 0) ksft_print_msg("Failed to disable GCS: %d\n", ret); From a985fe638344492727528e52416211dda1c391d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 17:13:03 +0200 Subject: [PATCH 06/93] kselftest/arm64/gcs: Use nolibc's getauxval() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nolibc now does have getauxval(), use it. 
Signed-off-by: Thomas Weißschuh Reviewed-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/gcs/basic-gcs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/arm64/gcs/basic-gcs.c b/tools/testing/selftests/arm64/gcs/basic-gcs.c index 100d2a983155..250977abc398 100644 --- a/tools/testing/selftests/arm64/gcs/basic-gcs.c +++ b/tools/testing/selftests/arm64/gcs/basic-gcs.c @@ -10,6 +10,7 @@ #include #include +#include #include #include "kselftest.h" @@ -386,14 +387,13 @@ int main(void) ksft_print_header(); - /* - * We don't have getauxval() with nolibc so treat a failure to - * read GCS state as a lack of support and skip. - */ + if (!(getauxval(AT_HWCAP) & HWCAP_GCS)) + ksft_exit_skip("SKIP GCS not supported\n"); + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &gcs_mode, 0, 0, 0); if (ret != 0) - ksft_exit_skip("Failed to read GCS state: %d\n", ret); + ksft_exit_fail_msg("Failed to read GCS state: %d\n", ret); if (!(gcs_mode & PR_SHADOW_STACK_ENABLE)) { gcs_mode = PR_SHADOW_STACK_ENABLE; From 3198780eaf37c071052edac109d99bff77e6ce5c Mon Sep 17 00:00:00 2001 From: Vivek Yadav Date: Sat, 23 Aug 2025 23:14:00 -0700 Subject: [PATCH 07/93] kselftest/arm64: Remove extra blank line Remove an unnecessary blank line to improve code style consistency. ``` [command] ./scripts/checkpatch.pl --strict -f [output] CHECK: Please don't use multiple blank lines CHECK: Blank lines aren't necessary before a close brace '}' ``` Signed-off-by: Vivek Yadav Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/hwcap.c | 1 - tools/testing/selftests/arm64/bti/assembler.h | 1 - tools/testing/selftests/arm64/fp/fp-ptrace.c | 1 - tools/testing/selftests/arm64/fp/vec-syscfg.c | 1 - tools/testing/selftests/arm64/fp/zt-ptrace.c | 1 - tools/testing/selftests/arm64/gcs/gcs-locking.c | 1 - 6 files changed, 6 deletions(-) diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index 002ec38a8bbb..27d4790c2f0c 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -55,7 +55,6 @@ static void cmpbr_sigill(void) /* Not implemented, too complicated and unreliable anyway */ } - static void crc32_sigill(void) { /* CRC32W W0, W0, W1 */ diff --git a/tools/testing/selftests/arm64/bti/assembler.h b/tools/testing/selftests/arm64/bti/assembler.h index 04e7b72880ef..141cdcbf0b8f 100644 --- a/tools/testing/selftests/arm64/bti/assembler.h +++ b/tools/testing/selftests/arm64/bti/assembler.h @@ -14,7 +14,6 @@ #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1) - .macro startfn name:req .globl \name \name: diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c index 124bc883365e..3dc195f977ba 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.c +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c @@ -1569,7 +1569,6 @@ static void run_sve_tests(void) &test_config); } } - } static void run_sme_tests(void) diff --git a/tools/testing/selftests/arm64/fp/vec-syscfg.c b/tools/testing/selftests/arm64/fp/vec-syscfg.c index ea9c7d47790f..2d75d342eeb9 100644 --- a/tools/testing/selftests/arm64/fp/vec-syscfg.c +++ b/tools/testing/selftests/arm64/fp/vec-syscfg.c @@ -690,7 +690,6 @@ static inline void smstop(void) asm volatile("msr S0_3_C4_C6_3, xzr"); } - /* * Verify we can change the SVE vector length while SME is active and * continue to use SME afterwards. 
diff --git a/tools/testing/selftests/arm64/fp/zt-ptrace.c b/tools/testing/selftests/arm64/fp/zt-ptrace.c index 584b8d59b7ea..a7f34040fbf1 100644 --- a/tools/testing/selftests/arm64/fp/zt-ptrace.c +++ b/tools/testing/selftests/arm64/fp/zt-ptrace.c @@ -108,7 +108,6 @@ static int get_zt(pid_t pid, char zt[ZT_SIG_REG_BYTES]) return ptrace(PTRACE_GETREGSET, pid, NT_ARM_ZT, &iov); } - static int set_zt(pid_t pid, const char zt[ZT_SIG_REG_BYTES]) { struct iovec iov; diff --git a/tools/testing/selftests/arm64/gcs/gcs-locking.c b/tools/testing/selftests/arm64/gcs/gcs-locking.c index 989f75a491b7..1e6abb136ffd 100644 --- a/tools/testing/selftests/arm64/gcs/gcs-locking.c +++ b/tools/testing/selftests/arm64/gcs/gcs-locking.c @@ -165,7 +165,6 @@ TEST_F(valid_modes, lock_enable_disable_others) ASSERT_EQ(ret, 0); ASSERT_EQ(mode, PR_SHADOW_STACK_ALL_MODES); - ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, variant->mode); ASSERT_EQ(ret, 0); From a940568ccde433a0d06aadd4735f7974fd2c59e1 Mon Sep 17 00:00:00 2001 From: Vivek Yadav Date: Sat, 23 Aug 2025 23:14:01 -0700 Subject: [PATCH 08/93] kselftest/arm64: Supress warning and improve readability The comment was correct, but `checkpatch` script flagged it with a warning as shown in the output section. The comment is slightly modified to improve readability, which also suppresses the warning. ``` [command] ./script/checkpatch.pl --strict -f tools/testing/selftests/arm64/fp/fp-stress.c [output] WARNING: Possible repeated word: 'on' ``` Signed-off-by: Vivek Yadav Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/fp-stress.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index 74e23208b94c..3a0ae96cf909 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -105,8 +105,8 @@ static void child_start(struct child_data *child, const char *program) /* * Read from the startup pipe, there should be no data - * and we should block until it is closed. We just - * carry on on error since this isn't super critical. + * and we should block until it is closed. We just + * carry-on on error since this isn't super critical. */ ret = read(3, &i, sizeof(i)); if (ret < 0) From 62e8a9fbaad147b65bda9362c2d8a52a86a0bac3 Mon Sep 17 00:00:00 2001 From: Vivek Yadav Date: Sat, 23 Aug 2025 23:14:02 -0700 Subject: [PATCH 09/93] kselftest/arm64: Add parentheses around sizeof for clarity Added parentheses around sizeof to make the expression clearer and improve readability. This change has no functional impact. ``` [command] ./scripts/checkpatch.pl tools/testing/selftests/arm64/fp/sve-ptrace.c [output] WARNING: sizeof *sve should be sizeof(*sve) ``` Signed-off-by: Vivek Yadav Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/sve-ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index 4cba3bcff660..79bcc2369cdb 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -123,7 +123,7 @@ static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type, { struct user_sve_header *sve; void *p; - size_t sz = sizeof *sve; + size_t sz = sizeof(*sve); struct iovec iov; int ret; From 14a41628c470f4aa069075cdcf6ec0138b6cf1da Mon Sep 17 00:00:00 2001 From: "Nikola Z. 
Ivanov" Date: Wed, 27 Aug 2025 00:49:13 +0300 Subject: [PATCH 10/93] selftests/arm64: Fix grammatical error in string literals Fix grammatical error in + construct related to memory allocation checks. In essence change "Failed to allocated" to "Failed to allocate". Signed-off-by: Nikola Z. Ivanov Reviewed-by: Mark Brown Reviewed-by: Bagas Sanjaya Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/fp-stress.c | 2 +- tools/testing/selftests/arm64/fp/kernel-test.c | 4 ++-- tools/testing/selftests/arm64/gcs/gcs-stress.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index 3a0ae96cf909..9349aa630c84 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -549,7 +549,7 @@ int main(int argc, char **argv) evs = calloc(tests, sizeof(*evs)); if (!evs) - ksft_exit_fail_msg("Failed to allocated %d epoll events\n", + ksft_exit_fail_msg("Failed to allocate %d epoll events\n", tests); for (i = 0; i < cpus; i++) { diff --git a/tools/testing/selftests/arm64/fp/kernel-test.c b/tools/testing/selftests/arm64/fp/kernel-test.c index e3cec3723ffa..0c40007d1282 100644 --- a/tools/testing/selftests/arm64/fp/kernel-test.c +++ b/tools/testing/selftests/arm64/fp/kernel-test.c @@ -188,13 +188,13 @@ static bool create_socket(void) ref = malloc(digest_len); if (!ref) { - printf("Failed to allocated %d byte reference\n", digest_len); + printf("Failed to allocate %d byte reference\n", digest_len); return false; } digest = malloc(digest_len); if (!digest) { - printf("Failed to allocated %d byte digest\n", digest_len); + printf("Failed to allocate %d byte digest\n", digest_len); return false; } diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress.c b/tools/testing/selftests/arm64/gcs/gcs-stress.c index bbc7f4950c13..cf316d78ea97 100644 --- a/tools/testing/selftests/arm64/gcs/gcs-stress.c +++ b/tools/testing/selftests/arm64/gcs/gcs-stress.c @@ -433,7 +433,7 @@ int main(int argc, char **argv) evs = calloc(tests, sizeof(*evs)); if (!evs) - ksft_exit_fail_msg("Failed to allocated %d epoll events\n", + ksft_exit_fail_msg("Failed to allocate %d epoll events\n", tests); for (i = 0; i < gcs_threads; i++) From 80c4e1948908f725564997c78d10110d9b21a015 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 09:56:45 +0200 Subject: [PATCH 11/93] arm64: vdso32: Stop suppressing warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These warnings don't seem to trigger anymore. Probably due to the introduction of the vdso/ header namespace. Nowadays these suppression only hide real problems. Re-enable the warnings. 
Signed-off-by: Thomas Weißschuh Signed-off-by: Will Deacon --- arch/arm64/kernel/vdso32/Makefile | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index f2dfdc7dc818..fd80123bc8e6 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -21,8 +21,6 @@ endif cc32-option = $(call try-run,\ $(CC_COMPAT) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) -cc32-disable-warning = $(call try-run,\ - $(CC_COMPAT) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) # We cannot use the global flags to compile the vDSO files, the main reason # being that the 32-bit compiler may be older than the main (64-bit) compiler @@ -74,16 +72,6 @@ VDSO_CFLAGS += $(call cc32-option,-Werror=strict-prototypes) VDSO_CFLAGS += -Werror=date-time VDSO_CFLAGS += $(call cc32-option,-Werror=incompatible-pointer-types) -# The 32-bit compiler does not provide 128-bit integers, which are used in -# some headers that are indirectly included from the vDSO code. -# This hack makes the compiler happy and should trigger a warning/error if -# variables of such type are referenced. -VDSO_CFLAGS += -D__uint128_t='void*' -# Silence some warnings coming from headers that operate on long's -# (on GCC 4.8 or older, there is unfortunately no way to silence this warning) -VDSO_CFLAGS += $(call cc32-disable-warning,shift-count-overflow) -VDSO_CFLAGS += -Wno-int-to-pointer-cast - # Compile as THUMB2 or ARM. Unwinding via frame-pointers in THUMB2 is # unreliable. ifeq ($(CONFIG_THUMB2_COMPAT_VDSO), y) From 281817dffe2855fc0dc07c205b03df630dc6ba38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 21 Aug 2025 09:56:46 +0200 Subject: [PATCH 12/93] arm64: vdso32: Respect -Werror from kbuild MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The compiler flags for the compat vDSO are built manually as they are not compatible with the ones from kbuild. CONFIG_WERROR is not respected. Explicitly inherit -Werror from kbuild. Signed-off-by: Thomas Weißschuh Signed-off-by: Will Deacon --- arch/arm64/kernel/vdso32/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index fd80123bc8e6..5de4deaf4299 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -61,6 +61,7 @@ VDSO_CFLAGS += -DENABLE_COMPAT_VDSO=1 # KBUILD_CFLAGS from top-level Makefile VDSO_CFLAGS += -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -fno-common \ + $(filter -Werror,$(KBUILD_CPPFLAGS)) \ -Werror-implicit-function-declaration \ -Wno-format-security \ -std=gnu11 From 2c2529e470627a8de22ec366d2f4c3146fb3fe96 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 29 Aug 2025 10:51:41 +0100 Subject: [PATCH 13/93] arm64: sysreg: Fix and tidy up sysreg field definitions Fix the value of ID_PFR1_EL1.Security NSACR_RFR to be 0b0010, as per DDI0601/2025-06, which wasn't correctly set when introduced in commit 1224308075f1 ("arm64/sysreg: Convert ID_PFR1_EL1 to automatic generation"). While at it, remove redundant definitions of CPACR_EL12 and RCWSMASK_EL1 and fix some typos. 
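For context, each Enum item in this file becomes a C constant in the generated header (reg_field_item naming, per gen-sysreg.awk), which callers compare against the field extracted from the sanitised ID register. A rough sketch of that kind of consumer, assuming <asm/cpufeature.h> and <asm/sysreg.h>; the helper name here is hypothetical and not part of this series:

static bool host_has_nsacr_rfr(void)
{
	u64 pfr1 = read_sanitised_ftr_reg(SYS_ID_PFR1_EL1);
	u64 sec = cpuid_feature_extract_unsigned_field(pfr1,
					ID_PFR1_EL1_Security_SHIFT);

	/* NSACR_RFR is encoded as 0b0010 after this fix */
	return sec == ID_PFR1_EL1_Security_NSACR_RFR;
}
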
Reviewed-by: Anshuman Khandual Acked-by: Marc Zyngier Signed-off-by: Fuad Tabba Acked-by: Mark Rutland Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 696ab1f32a67..f1a012ee0db6 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -31,7 +31,7 @@ # Mapping # EndSysreg -# Where multiple system regsiters are not VHE aliases but share a +# Where multiple system registers are not VHE aliases but share a # common layout, a SysregFields block can be used to describe the # shared layout: @@ -54,7 +54,7 @@ # # In general it is recommended that new enumeration items be named for the # feature that introduces them (eg, FEAT_LS64_ACCDATA introduces enumeration -# item ACCDATA) though it may be more taseful to do something else. +# item ACCDATA) though it may be more tasteful to do something else. Sysreg OSDTRRX_EL1 2 0 0 0 2 Res0 63:32 @@ -474,7 +474,7 @@ EndEnum Enum 7:4 Security 0b0000 NI 0b0001 EL3 - 0b0001 NSACR_RFR + 0b0010 NSACR_RFR EndEnum UnsignedEnum 3:0 ProgMod 0b0000 NI @@ -2528,10 +2528,6 @@ Field 17:16 ZEN Res0 15:0 EndSysreg -Sysreg CPACR_EL12 3 5 1 0 2 -Mapping CPACR_EL1 -EndSysreg - Sysreg CPACRALIAS_EL1 3 0 1 4 4 Mapping CPACR_EL1 EndSysreg @@ -2576,10 +2572,6 @@ Sysreg PFAR_EL12 3 5 6 0 5 Mapping PFAR_EL1 EndSysreg -Sysreg RCWSMASK_EL1 3 0 13 0 3 -Field 63:0 RCWSMASK -EndSysreg - Sysreg SCTLR2_EL1 3 0 1 0 3 Res0 63:13 Field 12 CPTM0 From f4d4ebc84995178273740f3e601e97fdefc561d2 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 29 Aug 2025 10:51:42 +0100 Subject: [PATCH 14/93] arm64: sysreg: Correct sign definitions for EIESB and DoubleLock The `ID_AA64MMFR4_EL1.EIESB` field, is an unsigned enumeration, but was incorrectly defined as a `SignedEnum` when introduced in commit cfc680bb04c5 ("arm64: sysreg: Add layout for ID_AA64MMFR4_EL1"). This is corrected to `UnsignedEnum`. Conversely, the `ID_AA64DFR0_EL1.DoubleLock` field, is a signed enumeration, but was incorrectly defined as an `UnsignedEnum`. This is corrected to `SignedEnum`, which wasn't correctly set when annotated as such in commit ad16d4cf0b4f ("arm64/sysreg: Initial unsigned annotations for ID registers"). Signed-off-by: Fuad Tabba Acked-by: Mark Rutland Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index f1a012ee0db6..d396fa587ec1 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1693,7 +1693,7 @@ UnsignedEnum 43:40 TraceFilt 0b0000 NI 0b0001 IMP EndEnum -UnsignedEnum 39:36 DoubleLock +SignedEnum 39:36 DoubleLock 0b0000 IMP 0b1111 NI EndEnum @@ -2409,7 +2409,7 @@ UnsignedEnum 11:8 ASID2 0b0000 NI 0b0001 IMP EndEnum -SignedEnum 7:4 EIESB +UnsignedEnum 7:4 EIESB 0b0000 NI 0b0001 ToEL3 0b0010 ToELx From 382cbbe7fb2ae841a7e7b4c40a02f12afc803c69 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 29 Aug 2025 10:51:43 +0100 Subject: [PATCH 15/93] arm64: sysreg: Add validation checks to sysreg header generation script The gen_sysreg.awk script processes the system register specification in the sysreg text file to generate C macro definitions. The current script will silently accept certain errors in the specification file, leading to incorrect header generation. For example, a Sysreg or SysregFields can be accidentally duplicated, causing its macros to be emitted twice. 
An Enum can contain duplicate values for different items, which is architecturally incorrect. Add checks to catch these errors at build time. The script now tracks all seen Sysreg and SysregFields definitions and checks for duplicates. It also tracks values within each Enum block to ensure entries are unique. Acked-by: Marc Zyngier Signed-off-by: Fuad Tabba Acked-by: Mark Rutland Signed-off-by: Will Deacon --- arch/arm64/tools/gen-sysreg.awk | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/arm64/tools/gen-sysreg.awk b/arch/arm64/tools/gen-sysreg.awk index f2a1732cb1f6..bbbb812603e8 100755 --- a/arch/arm64/tools/gen-sysreg.awk +++ b/arch/arm64/tools/gen-sysreg.awk @@ -122,6 +122,10 @@ $1 == "SysregFields" && block_current() == "Root" { res1 = "UL(0)" unkn = "UL(0)" + if (reg in defined_fields) + fatal("Duplicate SysregFields definition for " reg) + defined_fields[reg] = 1 + next_bit = 63 next @@ -162,6 +166,10 @@ $1 == "Sysreg" && block_current() == "Root" { res1 = "UL(0)" unkn = "UL(0)" + if (reg in defined_regs) + fatal("Duplicate Sysreg definition for " reg) + defined_regs[reg] = 1 + define("REG_" reg, "S" op0 "_" op1 "_C" crn "_C" crm "_" op2) define("SYS_" reg, "sys_reg(" op0 ", " op1 ", " crn ", " crm ", " op2 ")") @@ -284,6 +292,8 @@ $1 == "SignedEnum" && (block_current() == "Sysreg" || block_current() == "Sysreg define_field(reg, field, msb, lsb) define_field_sign(reg, field, "true") + delete seen_enum_vals + next } @@ -297,6 +307,8 @@ $1 == "UnsignedEnum" && (block_current() == "Sysreg" || block_current() == "Sysr define_field(reg, field, msb, lsb) define_field_sign(reg, field, "false") + delete seen_enum_vals + next } @@ -309,6 +321,8 @@ $1 == "Enum" && (block_current() == "Sysreg" || block_current() == "SysregFields define_field(reg, field, msb, lsb) + delete seen_enum_vals + next } @@ -320,6 +334,8 @@ $1 == "EndEnum" && block_current() == "Enum" { lsb = null print "" + delete seen_enum_vals + block_pop() next } @@ -329,6 +345,10 @@ $1 == "EndEnum" && block_current() == "Enum" { val = $1 name = $2 + if (val in seen_enum_vals) + fatal("Duplicate Enum value " val " for " name) + seen_enum_vals[val] = 1 + define(reg "_" field "_" name, "UL(" val ")") next } From 788b8f6af60b22f78739fc758456b51b2b916dbd Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 15 Aug 2025 11:06:26 +0800 Subject: [PATCH 16/93] arm64: ptrace: Replace interrupts_enabled() with regs_irqs_disabled() The generic entry code expects architecture code to provide regs_irqs_disabled(regs) function, but arm64 does not have this and provides interrupts_enabled(regs), which has the opposite polarity. In preparation for moving arm64 over to the generic entry code, relace arm64's interrupts_enabled() with regs_irqs_disabled() and update its callers under arch/arm64. For the moment, a definition of interrupts_enabled() is provided for the GICv3 driver. Once arch/arm implement regs_irqs_disabled(), this can be removed. Delete the fast_interrupts_enabled() macro as it is unused and we don't want any new users to show up. No functional changes. 
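For context, the generic entry code keys its kernel-mode exit path off this helper, roughly along the following lines (a simplified sketch of irqentry_exit(), not a verbatim copy), which is why arm64 needs a helper with this name and polarity before it can share that code:

	if (user_mode(regs)) {
		irqentry_exit_to_user_mode(regs);
	} else if (!regs_irqs_disabled(regs)) {
		/* exception taken with IRQs enabled: trace, maybe reschedule */
	} else {
		/* return with IRQs still disabled */
	}
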
Reviewed-by: Ada Couprie Diaz Acked-by: Mark Rutland Suggested-by: Mark Rutland Signed-off-by: Jinjie Ruan Acked-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/include/asm/daifflags.h | 2 +- arch/arm64/include/asm/ptrace.h | 9 +++++---- arch/arm64/include/asm/xen/events.h | 2 +- arch/arm64/kernel/acpi.c | 2 +- arch/arm64/kernel/debug-monitors.c | 2 +- arch/arm64/kernel/entry-common.c | 4 ++-- arch/arm64/kernel/sdei.c | 2 +- 7 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/daifflags.h b/arch/arm64/include/asm/daifflags.h index fbb5c99eb2f9..5fca48009043 100644 --- a/arch/arm64/include/asm/daifflags.h +++ b/arch/arm64/include/asm/daifflags.h @@ -128,7 +128,7 @@ static inline void local_daif_inherit(struct pt_regs *regs) { unsigned long flags = regs->pstate & DAIF_MASK; - if (interrupts_enabled(regs)) + if (!regs_irqs_disabled(regs)) trace_hardirqs_on(); if (system_uses_irq_prio_masking()) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 47ff8654c5ec..8b915d4a9d4b 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -214,11 +214,12 @@ static inline void forget_syscall(struct pt_regs *regs) (regs)->pmr == GIC_PRIO_IRQON : \ true) -#define interrupts_enabled(regs) \ - (!((regs)->pstate & PSR_I_BIT) && irqs_priority_unmasked(regs)) +static __always_inline bool regs_irqs_disabled(const struct pt_regs *regs) +{ + return (regs->pstate & PSR_I_BIT) || !irqs_priority_unmasked(regs); +} -#define fast_interrupts_enabled(regs) \ - (!((regs)->pstate & PSR_F_BIT)) +#define interrupts_enabled(regs) (!regs_irqs_disabled(regs)) static inline unsigned long user_stack_pointer(struct pt_regs *regs) { diff --git a/arch/arm64/include/asm/xen/events.h b/arch/arm64/include/asm/xen/events.h index 2788e95d0ff0..2977b5fe068d 100644 --- a/arch/arm64/include/asm/xen/events.h +++ b/arch/arm64/include/asm/xen/events.h @@ -14,7 +14,7 @@ enum ipi_vector { static inline int xen_irqs_disabled(struct pt_regs *regs) { - return !interrupts_enabled(regs); + return regs_irqs_disabled(regs); } #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 4d529ff7ba51..3fbce0a9a0fe 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -407,7 +407,7 @@ int apei_claim_sea(struct pt_regs *regs) return_to_irqs_enabled = !irqs_disabled_flags(arch_local_save_flags()); if (regs) - return_to_irqs_enabled = interrupts_enabled(regs); + return_to_irqs_enabled = !regs_irqs_disabled(regs); /* * SEA can interrupt SError, mask it and describe this as an NMI so diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 110d9ff54174..85fc162a6f9b 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -167,7 +167,7 @@ static void send_user_sigtrap(int si_code) if (WARN_ON(!user_mode(regs))) return; - if (interrupts_enabled(regs)) + if (!regs_irqs_disabled(regs)) local_irq_enable(); arm64_force_sig_fault(SIGTRAP, si_code, instruction_pointer(regs), diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 2b0c5925502e..8e798f46ad28 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -74,7 +74,7 @@ static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs) { lockdep_assert_irqs_disabled(); - if (interrupts_enabled(regs)) { + if (!regs_irqs_disabled(regs)) { if (regs->exit_rcu) { trace_hardirqs_on_prepare(); 
lockdep_hardirqs_on_prepare(); @@ -662,7 +662,7 @@ static void noinstr el1_interrupt(struct pt_regs *regs, { write_sysreg(DAIF_PROCCTX_NOIRQ, daif); - if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs)) + if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && regs_irqs_disabled(regs)) __el1_pnmi(regs, handler); else __el1_irq(regs, handler); diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index 6f24a0251e18..95169f7b6531 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -243,7 +243,7 @@ unsigned long __kprobes do_sdei_event(struct pt_regs *regs, * If we interrupted the kernel with interrupts masked, we always go * back to wherever we came from. */ - if (mode == kernel_mode && !interrupts_enabled(regs)) + if (mode == kernel_mode && regs_irqs_disabled(regs)) return SDEI_EV_HANDLED; /* From ee776d68ba47cc8e2022f8c2218f1891a1244197 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 15 Aug 2025 11:06:27 +0800 Subject: [PATCH 17/93] arm64: entry: Refactor the entry and exit for exceptions from EL1 The generic entry code uses irqentry_state_t to track lockdep and RCU state across exception entry and return. For historical reasons, arm64 embeds similar fields within its pt_regs structure. In preparation for moving arm64 over to the generic entry code, pull these fields out of arm64's pt_regs, and use a separate structure, matching the style of the generic entry code. No functional changes. Acked-by: Mark Rutland Reviewed-by: Ada Couprie Diaz Suggested-by: Mark Rutland Signed-off-by: Jinjie Ruan Acked-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/include/asm/ptrace.h | 4 - arch/arm64/kernel/entry-common.c | 163 ++++++++++++++++++++----------- 2 files changed, 106 insertions(+), 61 deletions(-) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 8b915d4a9d4b..65b053a24d82 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -169,10 +169,6 @@ struct pt_regs { u64 sdei_ttbr1; struct frame_record_meta stackframe; - - /* Only valid for some EL1 exceptions. */ - u64 lockdep_hardirqs; - u64 exit_rcu; }; /* For correct stack alignment, pt_regs has to be a multiple of 16 bytes. */ diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 8e798f46ad28..93c95fc51cc0 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -29,6 +29,13 @@ #include #include +typedef struct irqentry_state { + union { + bool exit_rcu; + bool lockdep; + }; +} arm64_irqentry_state_t; + /* * Handle IRQ/context state management when entering from kernel mode. * Before this function is called it is not safe to call regular kernel code, @@ -37,29 +44,37 @@ * This is intended to match the logic in irqentry_enter(), handling the kernel * mode transitions only. 
*/ -static __always_inline void __enter_from_kernel_mode(struct pt_regs *regs) +static __always_inline arm64_irqentry_state_t __enter_from_kernel_mode(struct pt_regs *regs) { - regs->exit_rcu = false; + arm64_irqentry_state_t state = { + .exit_rcu = false, + }; if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { lockdep_hardirqs_off(CALLER_ADDR0); ct_irq_enter(); trace_hardirqs_off_finish(); - regs->exit_rcu = true; - return; + state.exit_rcu = true; + return state; } lockdep_hardirqs_off(CALLER_ADDR0); rcu_irq_enter_check_tick(); trace_hardirqs_off_finish(); + + return state; } -static void noinstr enter_from_kernel_mode(struct pt_regs *regs) +static noinstr arm64_irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs) { - __enter_from_kernel_mode(regs); + arm64_irqentry_state_t state; + + state = __enter_from_kernel_mode(regs); mte_check_tfsr_entry(); mte_disable_tco_entry(current); + + return state; } /* @@ -70,12 +85,13 @@ static void noinstr enter_from_kernel_mode(struct pt_regs *regs) * This is intended to match the logic in irqentry_exit(), handling the kernel * mode transitions only, and with preemption handled elsewhere. */ -static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs) +static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs, + arm64_irqentry_state_t state) { lockdep_assert_irqs_disabled(); if (!regs_irqs_disabled(regs)) { - if (regs->exit_rcu) { + if (state.exit_rcu) { trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); ct_irq_exit(); @@ -85,15 +101,16 @@ static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs) trace_hardirqs_on(); } else { - if (regs->exit_rcu) + if (state.exit_rcu) ct_irq_exit(); } } -static void noinstr exit_to_kernel_mode(struct pt_regs *regs) +static void noinstr exit_to_kernel_mode(struct pt_regs *regs, + arm64_irqentry_state_t state) { mte_check_tfsr_exit(); - __exit_to_kernel_mode(regs); + __exit_to_kernel_mode(regs, state); } /* @@ -194,9 +211,11 @@ asmlinkage void noinstr asm_exit_to_user_mode(struct pt_regs *regs) * mode. Before this function is called it is not safe to call regular kernel * code, instrumentable code, or any code which may trigger an exception. */ -static void noinstr arm64_enter_nmi(struct pt_regs *regs) +static noinstr arm64_irqentry_state_t arm64_enter_nmi(struct pt_regs *regs) { - regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); + arm64_irqentry_state_t state; + + state.lockdep = lockdep_hardirqs_enabled(); __nmi_enter(); lockdep_hardirqs_off(CALLER_ADDR0); @@ -205,6 +224,8 @@ static void noinstr arm64_enter_nmi(struct pt_regs *regs) trace_hardirqs_off_finish(); ftrace_nmi_enter(); + + return state; } /* @@ -212,19 +233,18 @@ static void noinstr arm64_enter_nmi(struct pt_regs *regs) * mode. After this function returns it is not safe to call regular kernel * code, instrumentable code, or any code which may trigger an exception. */ -static void noinstr arm64_exit_nmi(struct pt_regs *regs) +static void noinstr arm64_exit_nmi(struct pt_regs *regs, + arm64_irqentry_state_t state) { - bool restore = regs->lockdep_hardirqs; - ftrace_nmi_exit(); - if (restore) { + if (state.lockdep) { trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); } ct_nmi_exit(); lockdep_hardirq_exit(); - if (restore) + if (state.lockdep) lockdep_hardirqs_on(CALLER_ADDR0); __nmi_exit(); } @@ -234,14 +254,18 @@ static void noinstr arm64_exit_nmi(struct pt_regs *regs) * kernel mode. 
Before this function is called it is not safe to call regular * kernel code, instrumentable code, or any code which may trigger an exception. */ -static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs) +static noinstr arm64_irqentry_state_t arm64_enter_el1_dbg(struct pt_regs *regs) { - regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); + arm64_irqentry_state_t state; + + state.lockdep = lockdep_hardirqs_enabled(); lockdep_hardirqs_off(CALLER_ADDR0); ct_nmi_enter(); trace_hardirqs_off_finish(); + + return state; } /* @@ -249,17 +273,16 @@ static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs) * kernel mode. After this function returns it is not safe to call regular * kernel code, instrumentable code, or any code which may trigger an exception. */ -static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs) +static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs, + arm64_irqentry_state_t state) { - bool restore = regs->lockdep_hardirqs; - - if (restore) { + if (state.lockdep) { trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); } ct_nmi_exit(); - if (restore) + if (state.lockdep) lockdep_hardirqs_on(CALLER_ADDR0); } @@ -475,73 +498,87 @@ UNHANDLED(el1t, 64, error) static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); + arm64_irqentry_state_t state; - enter_from_kernel_mode(regs); + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_mem_abort(far, esr, regs); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); + arm64_irqentry_state_t state; - enter_from_kernel_mode(regs); + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_sp_pc_abort(far, esr, regs); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + arm64_irqentry_state_t state; + + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_undef(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + arm64_irqentry_state_t state; + + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_bti(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + arm64_irqentry_state_t state; + + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_gcs(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + arm64_irqentry_state_t state; + + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_mops(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_breakpt(struct pt_regs *regs, unsigned long esr) { - arm64_enter_el1_dbg(regs); + arm64_irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); debug_exception_enter(regs); do_breakpoint(esr, regs); debug_exception_exit(regs); - arm64_exit_el1_dbg(regs); + arm64_exit_el1_dbg(regs, state); } static void noinstr el1_softstp(struct 
pt_regs *regs, unsigned long esr) { - arm64_enter_el1_dbg(regs); + arm64_irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); if (!cortex_a76_erratum_1463225_debug_handler(regs)) { debug_exception_enter(regs); /* @@ -554,37 +591,42 @@ static void noinstr el1_softstp(struct pt_regs *regs, unsigned long esr) do_el1_softstep(esr, regs); debug_exception_exit(regs); } - arm64_exit_el1_dbg(regs); + arm64_exit_el1_dbg(regs, state); } static void noinstr el1_watchpt(struct pt_regs *regs, unsigned long esr) { /* Watchpoints are the only debug exception to write FAR_EL1 */ unsigned long far = read_sysreg(far_el1); + arm64_irqentry_state_t state; - arm64_enter_el1_dbg(regs); + state = arm64_enter_el1_dbg(regs); debug_exception_enter(regs); do_watchpoint(far, esr, regs); debug_exception_exit(regs); - arm64_exit_el1_dbg(regs); + arm64_exit_el1_dbg(regs, state); } static void noinstr el1_brk64(struct pt_regs *regs, unsigned long esr) { - arm64_enter_el1_dbg(regs); + arm64_irqentry_state_t state; + + state = arm64_enter_el1_dbg(regs); debug_exception_enter(regs); do_el1_brk64(esr, regs); debug_exception_exit(regs); - arm64_exit_el1_dbg(regs); + arm64_exit_el1_dbg(regs, state); } static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr) { - enter_from_kernel_mode(regs); + arm64_irqentry_state_t state; + + state = enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_fpac(regs, esr); local_daif_mask(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) @@ -639,15 +681,19 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) static __always_inline void __el1_pnmi(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { - arm64_enter_nmi(regs); + arm64_irqentry_state_t state; + + state = arm64_enter_nmi(regs); do_interrupt_handler(regs, handler); - arm64_exit_nmi(regs); + arm64_exit_nmi(regs, state); } static __always_inline void __el1_irq(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { - enter_from_kernel_mode(regs); + arm64_irqentry_state_t state; + + state = enter_from_kernel_mode(regs); irq_enter_rcu(); do_interrupt_handler(regs, handler); @@ -655,7 +701,7 @@ static __always_inline void __el1_irq(struct pt_regs *regs, arm64_preempt_schedule_irq(); - exit_to_kernel_mode(regs); + exit_to_kernel_mode(regs, state); } static void noinstr el1_interrupt(struct pt_regs *regs, void (*handler)(struct pt_regs *)) @@ -681,11 +727,12 @@ asmlinkage void noinstr el1h_64_fiq_handler(struct pt_regs *regs) asmlinkage void noinstr el1h_64_error_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); + arm64_irqentry_state_t state; local_daif_restore(DAIF_ERRCTX); - arm64_enter_nmi(regs); + state = arm64_enter_nmi(regs); do_serror(regs, esr); - arm64_exit_nmi(regs); + arm64_exit_nmi(regs, state); } static void noinstr el0_da(struct pt_regs *regs, unsigned long esr) @@ -997,12 +1044,13 @@ asmlinkage void noinstr el0t_64_fiq_handler(struct pt_regs *regs) static void noinstr __el0_error_handler_common(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); + arm64_irqentry_state_t state; enter_from_user_mode(regs); local_daif_restore(DAIF_ERRCTX); - arm64_enter_nmi(regs); + state = arm64_enter_nmi(regs); do_serror(regs, esr); - arm64_exit_nmi(regs); + arm64_exit_nmi(regs, state); local_daif_restore(DAIF_PROCCTX); exit_to_user_mode(regs); } @@ -1122,6 +1170,7 @@ asmlinkage void noinstr __noreturn handle_bad_stack(struct pt_regs *regs) asmlinkage noinstr 
unsigned long __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) { + arm64_irqentry_state_t state; unsigned long ret; /* @@ -1146,9 +1195,9 @@ __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) else if (cpu_has_pan()) set_pstate_pan(0); - arm64_enter_nmi(regs); + state = arm64_enter_nmi(regs); ret = do_sdei_event(regs, arg); - arm64_exit_nmi(regs); + arm64_exit_nmi(regs, state); return ret; } From 77c1953946391e38c1e5120230f8df14f85219a7 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 15 Aug 2025 11:06:28 +0800 Subject: [PATCH 18/93] arm64: entry: Rework arm64_preempt_schedule_irq() The generic entry code has the form: | raw_irqentry_exit_cond_resched() | { | if (!preempt_count()) { | ... | if (need_resched()) | preempt_schedule_irq(); | } | } In preparation for moving arm64 over to the generic entry code, align the structure of the arm64 code with raw_irqentry_exit_cond_resched() from the generic entry code. Reviewed-by: Ada Couprie Diaz Signed-off-by: Jinjie Ruan Acked-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/kernel/entry-common.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 93c95fc51cc0..dd7903f371ad 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -294,10 +294,10 @@ DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); #define need_irq_preemption() (IS_ENABLED(CONFIG_PREEMPTION)) #endif -static void __sched arm64_preempt_schedule_irq(void) +static inline bool arm64_preempt_schedule_irq(void) { if (!need_irq_preemption()) - return; + return false; /* * Note: thread_info::preempt_count includes both thread_info::count @@ -305,7 +305,7 @@ static void __sched arm64_preempt_schedule_irq(void) * preempt_count(). */ if (READ_ONCE(current_thread_info()->preempt_count) != 0) - return; + return false; /* * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC @@ -314,7 +314,7 @@ static void __sched arm64_preempt_schedule_irq(void) * DAIF we must have handled an NMI, so skip preemption. */ if (system_uses_irq_prio_masking() && read_sysreg(daif)) - return; + return false; /* * Preempting a task from an IRQ means we leave copies of PSTATE @@ -324,8 +324,10 @@ static void __sched arm64_preempt_schedule_irq(void) * Only allow a task to be preempted once cpufeatures have been * enabled. */ - if (system_capabilities_finalized()) - preempt_schedule_irq(); + if (!system_capabilities_finalized()) + return false; + + return true; } static void do_interrupt_handler(struct pt_regs *regs, @@ -699,7 +701,8 @@ static __always_inline void __el1_irq(struct pt_regs *regs, do_interrupt_handler(regs, handler); irq_exit_rcu(); - arm64_preempt_schedule_irq(); + if (arm64_preempt_schedule_irq()) + preempt_schedule_irq(); exit_to_kernel_mode(regs, state); } From c74c44c6ae207e196c4c31c4a243abb0811a5974 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 15 Aug 2025 11:06:29 +0800 Subject: [PATCH 19/93] arm64: entry: Use preempt_count() and need_resched() helper The generic entry code uses preempt_count() and need_resched() helpers to check if it should do preempt_schedule_irq(). Currently, arm64 use its own check logic, that is "READ_ONCE(current_thread_info()->preempt_count == 0", which is equivalent to "preempt_count() == 0 && need_resched()". In preparation for moving arm64 over to the generic entry code, use these helpers to replace arm64's own code and move it ahead. 
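Spelled out, the two conditions look as follows (a sketch of the equivalence only; on arm64, thread_info::preempt_count folds the inverted need-resched flag into the same word as the count, so the combined word is zero exactly when both helpers agree):

	/* Old arm64 form: the combined preempt_count word must be zero */
	if (READ_ONCE(current_thread_info()->preempt_count) == 0)
		preempt_schedule_irq();

	/* Equivalent generic-entry form used from here on */
	if (preempt_count() == 0 && need_resched())
		preempt_schedule_irq();
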
No functional changes. Reviewed-by: Ada Couprie Diaz Signed-off-by: Jinjie Ruan Acked-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/kernel/entry-common.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index dd7903f371ad..1ba1d40fa6a7 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -299,14 +299,6 @@ static inline bool arm64_preempt_schedule_irq(void) if (!need_irq_preemption()) return false; - /* - * Note: thread_info::preempt_count includes both thread_info::count - * and thread_info::need_resched, and is not equivalent to - * preempt_count(). - */ - if (READ_ONCE(current_thread_info()->preempt_count) != 0) - return false; - /* * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC * priority masking is used the GIC irqchip driver will clear DAIF.IF @@ -701,8 +693,10 @@ static __always_inline void __el1_irq(struct pt_regs *regs, do_interrupt_handler(regs, handler); irq_exit_rcu(); - if (arm64_preempt_schedule_irq()) - preempt_schedule_irq(); + if (!preempt_count() && need_resched()) { + if (arm64_preempt_schedule_irq()) + preempt_schedule_irq(); + } exit_to_kernel_mode(regs, state); } From 3c973c51bfbaf356367afa46b94f9100a7d672f2 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 15 Aug 2025 11:06:30 +0800 Subject: [PATCH 20/93] entry: Add arch_irqentry_exit_need_resched() for arm64 Compared to the generic entry code, ARM64 does additional checks when deciding to reschedule on return from interrupt. So introduce arch_irqentry_exit_need_resched() in the need_resched() condition of the generic raw_irqentry_exit_cond_resched(), with a NOP default. This will allow ARM64 to implement the architecture specific version for switching over to the generic entry code. Suggested-by: Ada Couprie Diaz Suggested-by: Mark Rutland Suggested-by: Kevin Brodsky Suggested-by: Thomas Gleixner Signed-off-by: Jinjie Ruan Acked-by: Catalin Marinas Signed-off-by: Will Deacon --- kernel/entry/common.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 408d28b5179d..f62e1d1b2063 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -143,6 +143,20 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) return ret; } +/** + * arch_irqentry_exit_need_resched - Architecture specific need resched function + * + * Invoked from raw_irqentry_exit_cond_resched() to check if resched is needed. + * Defaults return true. + * + * The main purpose is to permit arch to avoid preemption of a task from an IRQ. 
+ */ +static inline bool arch_irqentry_exit_need_resched(void); + +#ifndef arch_irqentry_exit_need_resched +static inline bool arch_irqentry_exit_need_resched(void) { return true; } +#endif + void raw_irqentry_exit_cond_resched(void) { if (!preempt_count()) { @@ -150,7 +164,7 @@ void raw_irqentry_exit_cond_resched(void) rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); - if (need_resched()) + if (need_resched() && arch_irqentry_exit_need_resched()) preempt_schedule_irq(); } } From 64f4b8b15f1c3c9a4e416fc5b5b4dc354b78e75e Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 15 Aug 2025 11:06:31 +0800 Subject: [PATCH 21/93] arm64: entry: Refactor preempt_schedule_irq() check code To align the structure of the code with irqentry_exit_cond_resched() from the generic entry code, hoist the need_irq_preemption() and IS_ENABLED() check earlier. And different preemption check functions are defined based on whether dynamic preemption is enabled. Signed-off-by: Jinjie Ruan Reviewed-by: Ada Couprie Diaz Acked-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/include/asm/preempt.h | 6 ++++++ arch/arm64/kernel/entry-common.c | 37 +++++++++++++++++++------------- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h index 0159b625cc7f..c2437ea0790f 100644 --- a/arch/arm64/include/asm/preempt.h +++ b/arch/arm64/include/asm/preempt.h @@ -85,6 +85,7 @@ static inline bool should_resched(int preempt_offset) void preempt_schedule(void); void preempt_schedule_notrace(void); +void raw_irqentry_exit_cond_resched(void); #ifdef CONFIG_PREEMPT_DYNAMIC DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); @@ -92,13 +93,18 @@ void dynamic_preempt_schedule(void); #define __preempt_schedule() dynamic_preempt_schedule() void dynamic_preempt_schedule_notrace(void); #define __preempt_schedule_notrace() dynamic_preempt_schedule_notrace() +void dynamic_irqentry_exit_cond_resched(void); +#define irqentry_exit_cond_resched() dynamic_irqentry_exit_cond_resched() #else /* CONFIG_PREEMPT_DYNAMIC */ #define __preempt_schedule() preempt_schedule() #define __preempt_schedule_notrace() preempt_schedule_notrace() +#define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched() #endif /* CONFIG_PREEMPT_DYNAMIC */ +#else /* CONFIG_PREEMPTION */ +#define irqentry_exit_cond_resched() {} #endif /* CONFIG_PREEMPTION */ #endif /* __ASM_PREEMPT_H */ diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 1ba1d40fa6a7..64066c643f97 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -286,19 +286,8 @@ static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs, lockdep_hardirqs_on(CALLER_ADDR0); } -#ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); -#define need_irq_preemption() \ - (static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) -#else -#define need_irq_preemption() (IS_ENABLED(CONFIG_PREEMPTION)) -#endif - static inline bool arm64_preempt_schedule_irq(void) { - if (!need_irq_preemption()) - return false; - /* * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC * priority masking is used the GIC irqchip driver will clear DAIF.IF @@ -682,6 +671,26 @@ static __always_inline void __el1_pnmi(struct pt_regs *regs, arm64_exit_nmi(regs, state); } +#ifdef CONFIG_PREEMPTION +void raw_irqentry_exit_cond_resched(void) +{ + if (!preempt_count()) { + if 
(need_resched() && arm64_preempt_schedule_irq()) + preempt_schedule_irq(); + } +} +#endif + +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); +void dynamic_irqentry_exit_cond_resched(void) +{ + if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) + return; + raw_irqentry_exit_cond_resched(); +} +#endif + static __always_inline void __el1_irq(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { @@ -693,10 +702,8 @@ static __always_inline void __el1_irq(struct pt_regs *regs, do_interrupt_handler(regs, handler); irq_exit_rcu(); - if (!preempt_count() && need_resched()) { - if (arm64_preempt_schedule_irq()) - preempt_schedule_irq(); - } + if (IS_ENABLED(CONFIG_PREEMPTION)) + irqentry_exit_cond_resched(); exit_to_kernel_mode(regs, state); } From 99eb057ccd675b2f0fc71a362553164c65c349a2 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 15 Aug 2025 11:06:32 +0800 Subject: [PATCH 22/93] arm64: entry: Move arm64_preempt_schedule_irq() into __exit_to_kernel_mode() The arm64 entry code only preempts a kernel context upon a return from a regular IRQ exception. The generic entry code may preempt a kernel context for any exception return where irqentry_exit() is used, and so may preempt other exceptions such as faults. In preparation for moving arm64 over to the generic entry code, align arm64 with the generic behaviour by calling arm64_preempt_schedule_irq() from exit_to_kernel_mode(). To make this possible, arm64_preempt_schedule_irq() and dynamic/raw_irqentry_exit_cond_resched() are moved earlier in the file, with no changes. As Mark pointed out, this change will have the following 2 key impact: - " We'll preempt even without taking a "real" interrupt. That shouldn't result in preemption that wasn't possible before, but it does change the probability of preempting at certain points, and might have a performance impact, so probably warrants a benchmark." - " We will not preempt when taking interrupts from a region of kernel code where IRQs are enabled but RCU is not watching, matching the behaviour of the generic entry code. This has the potential to introduce livelock if we can ever have a screaming interrupt in such a region, so we'll need to go figure out whether that's actually a problem. Having this as a separate patch will make it easier to test/bisect for that specifically." Reviewed-by: Ada Couprie Diaz Suggested-by: Mark Rutland Signed-off-by: Jinjie Ruan Acked-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/kernel/entry-common.c | 96 ++++++++++++++++---------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 64066c643f97..f52067d17baf 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -77,6 +77,51 @@ static noinstr arm64_irqentry_state_t enter_from_kernel_mode(struct pt_regs *reg return state; } +static inline bool arm64_preempt_schedule_irq(void) +{ + /* + * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC + * priority masking is used the GIC irqchip driver will clear DAIF.IF + * using gic_arch_enable_irqs() for normal IRQs. If anything is set in + * DAIF we must have handled an NMI, so skip preemption. + */ + if (system_uses_irq_prio_masking() && read_sysreg(daif)) + return false; + + /* + * Preempting a task from an IRQ means we leave copies of PSTATE + * on the stack. 
cpufeature's enable calls may modify PSTATE, but + * resuming one of these preempted tasks would undo those changes. + * + * Only allow a task to be preempted once cpufeatures have been + * enabled. + */ + if (!system_capabilities_finalized()) + return false; + + return true; +} + +#ifdef CONFIG_PREEMPTION +void raw_irqentry_exit_cond_resched(void) +{ + if (!preempt_count()) { + if (need_resched() && arm64_preempt_schedule_irq()) + preempt_schedule_irq(); + } +} +#endif + +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); +void dynamic_irqentry_exit_cond_resched(void) +{ + if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) + return; + raw_irqentry_exit_cond_resched(); +} +#endif + /* * Handle IRQ/context state management when exiting to kernel mode. * After this function returns it is not safe to call regular kernel code, @@ -99,6 +144,9 @@ static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs, return; } + if (IS_ENABLED(CONFIG_PREEMPTION)) + irqentry_exit_cond_resched(); + trace_hardirqs_on(); } else { if (state.exit_rcu) @@ -286,31 +334,6 @@ static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs, lockdep_hardirqs_on(CALLER_ADDR0); } -static inline bool arm64_preempt_schedule_irq(void) -{ - /* - * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC - * priority masking is used the GIC irqchip driver will clear DAIF.IF - * using gic_arch_enable_irqs() for normal IRQs. If anything is set in - * DAIF we must have handled an NMI, so skip preemption. - */ - if (system_uses_irq_prio_masking() && read_sysreg(daif)) - return false; - - /* - * Preempting a task from an IRQ means we leave copies of PSTATE - * on the stack. cpufeature's enable calls may modify PSTATE, but - * resuming one of these preempted tasks would undo those changes. - * - * Only allow a task to be preempted once cpufeatures have been - * enabled. - */ - if (!system_capabilities_finalized()) - return false; - - return true; -} - static void do_interrupt_handler(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { @@ -671,26 +694,6 @@ static __always_inline void __el1_pnmi(struct pt_regs *regs, arm64_exit_nmi(regs, state); } -#ifdef CONFIG_PREEMPTION -void raw_irqentry_exit_cond_resched(void) -{ - if (!preempt_count()) { - if (need_resched() && arm64_preempt_schedule_irq()) - preempt_schedule_irq(); - } -} -#endif - -#ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); -void dynamic_irqentry_exit_cond_resched(void) -{ - if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) - return; - raw_irqentry_exit_cond_resched(); -} -#endif - static __always_inline void __el1_irq(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { @@ -702,9 +705,6 @@ static __always_inline void __el1_irq(struct pt_regs *regs, do_interrupt_handler(regs, handler); irq_exit_rcu(); - if (IS_ENABLED(CONFIG_PREEMPTION)) - irqentry_exit_cond_resched(); - exit_to_kernel_mode(regs, state); } static void noinstr el1_interrupt(struct pt_regs *regs, From b3cf07851b6c4aa8683557905cd898da9ae8c634 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 15 Aug 2025 11:06:33 +0800 Subject: [PATCH 23/93] arm64: entry: Switch to generic IRQ entry Currently, x86, Riscv and Loongarch use the generic entry code, which makes maintainer's work easier and code more elegant. 
Start converting arm64 to use the generic entry infrastructure from kernel/entry/* by switching it to generic IRQ entry, which removes 100+ lines of duplicate code. arm64 will completely switch to generic entry in a later series. The changes are below: - Remove *enter_from/exit_to_kernel_mode(), and wrap with generic irqentry_enter/exit() as their code and functionality are almost identical. - Define ARCH_EXIT_TO_USER_MODE_WORK and implement arch_exit_to_user_mode_work() to check arm64-specific thread flags "_TIF_MTE_ASYNC_FAULT" and "_TIF_FOREIGN_FPSTATE". So also remove *enter_from/exit_to_user_mode(), and wrap with generic enter_from/exit_to_user_mode() because they are exactly the same. - Remove arm64_enter/exit_nmi() and use generic irqentry_nmi_enter/exit() because they're exactly the same, so the temporary arm64 version irqentry_state can also be removed. - Remove PREEMPT_DYNAMIC code, as generic irqentry_exit_cond_resched() has the same functionality. - Implement arch_irqentry_exit_need_resched() with arm64_preempt_schedule_irq() for arm64 which will allow arm64 to do its architecture specific checks. Tested-by: Ada Couprie Diaz Suggested-by: Ada Couprie Diaz Suggested-by: Mark Rutland Signed-off-by: Jinjie Ruan Acked-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/entry-common.h | 57 ++++ arch/arm64/include/asm/exception.h | 1 - arch/arm64/include/asm/preempt.h | 8 - arch/arm64/kernel/entry-common.c | 378 +++++++------------------- arch/arm64/kernel/signal.c | 3 +- 6 files changed, 156 insertions(+), 292 deletions(-) create mode 100644 arch/arm64/include/asm/entry-common.h diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a6..6bb60a0620ec 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -151,6 +151,7 @@ config ARM64 select GENERIC_EARLY_IOREMAP select GENERIC_IDLE_POLL_SETUP select GENERIC_IOREMAP + select GENERIC_IRQ_ENTRY select GENERIC_IRQ_IPI select GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD select GENERIC_IRQ_PROBE diff --git a/arch/arm64/include/asm/entry-common.h b/arch/arm64/include/asm/entry-common.h new file mode 100644 index 000000000000..cab8cd78f693 --- /dev/null +++ b/arch/arm64/include/asm/entry-common.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_ARM64_ENTRY_COMMON_H +#define _ASM_ARM64_ENTRY_COMMON_H + +#include + +#include +#include +#include +#include +#include + +#define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_MTE_ASYNC_FAULT | _TIF_FOREIGN_FPSTATE) + +static __always_inline void arch_exit_to_user_mode_work(struct pt_regs *regs, + unsigned long ti_work) +{ + if (ti_work & _TIF_MTE_ASYNC_FAULT) { + clear_thread_flag(TIF_MTE_ASYNC_FAULT); + send_sig_fault(SIGSEGV, SEGV_MTEAERR, (void __user *)NULL, current); + } + + if (ti_work & _TIF_FOREIGN_FPSTATE) + fpsimd_restore_current_state(); +} + +#define arch_exit_to_user_mode_work arch_exit_to_user_mode_work + +static inline bool arch_irqentry_exit_need_resched(void) +{ + /* + * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC + * priority masking is used the GIC irqchip driver will clear DAIF.IF + * using gic_arch_enable_irqs() for normal IRQs. If anything is set in + * DAIF we must have handled an NMI, so skip preemption. + */ + if (system_uses_irq_prio_masking() && read_sysreg(daif)) + return false; + + /* + * Preempting a task from an IRQ means we leave copies of PSTATE + * on the stack. cpufeature's enable calls may modify PSTATE, but + * resuming one of these preempted tasks would undo those changes. 
+ * + * Only allow a task to be preempted once cpufeatures have been + * enabled. + */ + if (!system_capabilities_finalized()) + return false; + + return true; +} + +#define arch_irqentry_exit_need_resched arch_irqentry_exit_need_resched + +#endif /* _ASM_ARM64_ENTRY_COMMON_H */ diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index e3874c4fc399..a2da3cb21c24 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -89,7 +89,6 @@ void do_el1_fpac(struct pt_regs *regs, unsigned long esr); void do_el0_mops(struct pt_regs *regs, unsigned long esr); void do_el1_mops(struct pt_regs *regs, unsigned long esr); void do_serror(struct pt_regs *regs, unsigned long esr); -void do_signal(struct pt_regs *regs); void __noreturn panic_bad_stack(struct pt_regs *regs, unsigned long esr, unsigned long far); #endif /* __ASM_EXCEPTION_H */ diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h index c2437ea0790f..932ea4b62042 100644 --- a/arch/arm64/include/asm/preempt.h +++ b/arch/arm64/include/asm/preempt.h @@ -2,7 +2,6 @@ #ifndef __ASM_PREEMPT_H #define __ASM_PREEMPT_H -#include #include #define PREEMPT_NEED_RESCHED BIT(32) @@ -85,26 +84,19 @@ static inline bool should_resched(int preempt_offset) void preempt_schedule(void); void preempt_schedule_notrace(void); -void raw_irqentry_exit_cond_resched(void); #ifdef CONFIG_PREEMPT_DYNAMIC -DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); void dynamic_preempt_schedule(void); #define __preempt_schedule() dynamic_preempt_schedule() void dynamic_preempt_schedule_notrace(void); #define __preempt_schedule_notrace() dynamic_preempt_schedule_notrace() -void dynamic_irqentry_exit_cond_resched(void); -#define irqentry_exit_cond_resched() dynamic_irqentry_exit_cond_resched() #else /* CONFIG_PREEMPT_DYNAMIC */ #define __preempt_schedule() preempt_schedule() #define __preempt_schedule_notrace() preempt_schedule_notrace() -#define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched() #endif /* CONFIG_PREEMPT_DYNAMIC */ -#else /* CONFIG_PREEMPTION */ -#define irqentry_exit_cond_resched() {} #endif /* CONFIG_PREEMPTION */ #endif /* __ASM_PREEMPT_H */ diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index f52067d17baf..f546a914f041 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -29,13 +30,6 @@ #include #include -typedef struct irqentry_state { - union { - bool exit_rcu; - bool lockdep; - }; -} arm64_irqentry_state_t; - /* * Handle IRQ/context state management when entering from kernel mode. * Before this function is called it is not safe to call regular kernel code, @@ -44,31 +38,14 @@ typedef struct irqentry_state { * This is intended to match the logic in irqentry_enter(), handling the kernel * mode transitions only. 
*/ -static __always_inline arm64_irqentry_state_t __enter_from_kernel_mode(struct pt_regs *regs) +static __always_inline irqentry_state_t __enter_from_kernel_mode(struct pt_regs *regs) { - arm64_irqentry_state_t state = { - .exit_rcu = false, - }; - - if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { - lockdep_hardirqs_off(CALLER_ADDR0); - ct_irq_enter(); - trace_hardirqs_off_finish(); - - state.exit_rcu = true; - return state; - } - - lockdep_hardirqs_off(CALLER_ADDR0); - rcu_irq_enter_check_tick(); - trace_hardirqs_off_finish(); - - return state; + return irqentry_enter(regs); } -static noinstr arm64_irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs) +static noinstr irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = __enter_from_kernel_mode(regs); mte_check_tfsr_entry(); @@ -77,51 +54,6 @@ static noinstr arm64_irqentry_state_t enter_from_kernel_mode(struct pt_regs *reg return state; } -static inline bool arm64_preempt_schedule_irq(void) -{ - /* - * DAIF.DA are cleared at the start of IRQ/FIQ handling, and when GIC - * priority masking is used the GIC irqchip driver will clear DAIF.IF - * using gic_arch_enable_irqs() for normal IRQs. If anything is set in - * DAIF we must have handled an NMI, so skip preemption. - */ - if (system_uses_irq_prio_masking() && read_sysreg(daif)) - return false; - - /* - * Preempting a task from an IRQ means we leave copies of PSTATE - * on the stack. cpufeature's enable calls may modify PSTATE, but - * resuming one of these preempted tasks would undo those changes. - * - * Only allow a task to be preempted once cpufeatures have been - * enabled. - */ - if (!system_capabilities_finalized()) - return false; - - return true; -} - -#ifdef CONFIG_PREEMPTION -void raw_irqentry_exit_cond_resched(void) -{ - if (!preempt_count()) { - if (need_resched() && arm64_preempt_schedule_irq()) - preempt_schedule_irq(); - } -} -#endif - -#ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); -void dynamic_irqentry_exit_cond_resched(void) -{ - if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) - return; - raw_irqentry_exit_cond_resched(); -} -#endif - /* * Handle IRQ/context state management when exiting to kernel mode. * After this function returns it is not safe to call regular kernel code, @@ -131,31 +63,13 @@ void dynamic_irqentry_exit_cond_resched(void) * mode transitions only, and with preemption handled elsewhere. */ static __always_inline void __exit_to_kernel_mode(struct pt_regs *regs, - arm64_irqentry_state_t state) + irqentry_state_t state) { - lockdep_assert_irqs_disabled(); - - if (!regs_irqs_disabled(regs)) { - if (state.exit_rcu) { - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - ct_irq_exit(); - lockdep_hardirqs_on(CALLER_ADDR0); - return; - } - - if (IS_ENABLED(CONFIG_PREEMPTION)) - irqentry_exit_cond_resched(); - - trace_hardirqs_on(); - } else { - if (state.exit_rcu) - ct_irq_exit(); - } + irqentry_exit(regs, state); } static void noinstr exit_to_kernel_mode(struct pt_regs *regs, - arm64_irqentry_state_t state) + irqentry_state_t state) { mte_check_tfsr_exit(); __exit_to_kernel_mode(regs, state); @@ -166,18 +80,15 @@ static void noinstr exit_to_kernel_mode(struct pt_regs *regs, * Before this function is called it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. 
*/ -static __always_inline void __enter_from_user_mode(void) +static __always_inline void __enter_from_user_mode(struct pt_regs *regs) { - lockdep_hardirqs_off(CALLER_ADDR0); - CT_WARN_ON(ct_state() != CT_STATE_USER); - user_exit_irqoff(); - trace_hardirqs_off_finish(); + enter_from_user_mode(regs); mte_disable_tco_entry(current); } -static __always_inline void enter_from_user_mode(struct pt_regs *regs) +static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs) { - __enter_from_user_mode(); + __enter_from_user_mode(regs); } /* @@ -185,116 +96,19 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs) * After this function returns it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. */ -static __always_inline void __exit_to_user_mode(void) + +static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) { - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - user_enter_irqoff(); - lockdep_hardirqs_on(CALLER_ADDR0); -} - -static void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) -{ - do { - local_irq_enable(); - - if (thread_flags & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) - schedule(); - - if (thread_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); - - if (thread_flags & _TIF_MTE_ASYNC_FAULT) { - clear_thread_flag(TIF_MTE_ASYNC_FAULT); - send_sig_fault(SIGSEGV, SEGV_MTEAERR, - (void __user *)NULL, current); - } - - if (thread_flags & _TIF_PATCH_PENDING) - klp_update_patch_state(current); - - if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) - do_signal(regs); - - if (thread_flags & _TIF_NOTIFY_RESUME) - resume_user_mode_work(regs); - - if (thread_flags & _TIF_FOREIGN_FPSTATE) - fpsimd_restore_current_state(); - - local_irq_disable(); - thread_flags = read_thread_flags(); - } while (thread_flags & _TIF_WORK_MASK); -} - -static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) -{ - unsigned long flags; - local_irq_disable(); - - flags = read_thread_flags(); - if (unlikely(flags & _TIF_WORK_MASK)) - do_notify_resume(regs, flags); - - local_daif_mask(); - - lockdep_sys_exit(); -} - -static __always_inline void exit_to_user_mode(struct pt_regs *regs) -{ exit_to_user_mode_prepare(regs); + local_daif_mask(); mte_check_tfsr_exit(); - __exit_to_user_mode(); + exit_to_user_mode(); } asmlinkage void noinstr asm_exit_to_user_mode(struct pt_regs *regs) { - exit_to_user_mode(regs); -} - -/* - * Handle IRQ/context state management when entering an NMI from user/kernel - * mode. Before this function is called it is not safe to call regular kernel - * code, instrumentable code, or any code which may trigger an exception. - */ -static noinstr arm64_irqentry_state_t arm64_enter_nmi(struct pt_regs *regs) -{ - arm64_irqentry_state_t state; - - state.lockdep = lockdep_hardirqs_enabled(); - - __nmi_enter(); - lockdep_hardirqs_off(CALLER_ADDR0); - lockdep_hardirq_enter(); - ct_nmi_enter(); - - trace_hardirqs_off_finish(); - ftrace_nmi_enter(); - - return state; -} - -/* - * Handle IRQ/context state management when exiting an NMI from user/kernel - * mode. After this function returns it is not safe to call regular kernel - * code, instrumentable code, or any code which may trigger an exception. 
- */ -static void noinstr arm64_exit_nmi(struct pt_regs *regs, - arm64_irqentry_state_t state) -{ - ftrace_nmi_exit(); - if (state.lockdep) { - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - } - - ct_nmi_exit(); - lockdep_hardirq_exit(); - if (state.lockdep) - lockdep_hardirqs_on(CALLER_ADDR0); - __nmi_exit(); + arm64_exit_to_user_mode(regs); } /* @@ -302,9 +116,9 @@ static void noinstr arm64_exit_nmi(struct pt_regs *regs, * kernel mode. Before this function is called it is not safe to call regular * kernel code, instrumentable code, or any code which may trigger an exception. */ -static noinstr arm64_irqentry_state_t arm64_enter_el1_dbg(struct pt_regs *regs) +static noinstr irqentry_state_t arm64_enter_el1_dbg(struct pt_regs *regs) { - arm64_irqentry_state_t state; + irqentry_state_t state; state.lockdep = lockdep_hardirqs_enabled(); @@ -322,7 +136,7 @@ static noinstr arm64_irqentry_state_t arm64_enter_el1_dbg(struct pt_regs *regs) * kernel code, instrumentable code, or any code which may trigger an exception. */ static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs, - arm64_irqentry_state_t state) + irqentry_state_t state) { if (state.lockdep) { trace_hardirqs_on_prepare(); @@ -353,7 +167,7 @@ extern void (*handle_arch_fiq)(struct pt_regs *); static void noinstr __panic_unhandled(struct pt_regs *regs, const char *vector, unsigned long esr) { - arm64_enter_nmi(regs); + irqentry_nmi_enter(regs); console_verbose(); @@ -504,7 +318,7 @@ UNHANDLED(el1t, 64, error) static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); - arm64_irqentry_state_t state; + irqentry_state_t state; state = enter_from_kernel_mode(regs); local_daif_inherit(regs); @@ -516,7 +330,7 @@ static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); - arm64_irqentry_state_t state; + irqentry_state_t state; state = enter_from_kernel_mode(regs); local_daif_inherit(regs); @@ -527,7 +341,7 @@ static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = enter_from_kernel_mode(regs); local_daif_inherit(regs); @@ -538,7 +352,7 @@ static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr) static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = enter_from_kernel_mode(regs); local_daif_inherit(regs); @@ -549,7 +363,7 @@ static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = enter_from_kernel_mode(regs); local_daif_inherit(regs); @@ -560,7 +374,7 @@ static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr) static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = enter_from_kernel_mode(regs); local_daif_inherit(regs); @@ -571,7 +385,7 @@ static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr) static void noinstr el1_breakpt(struct pt_regs *regs, unsigned long esr) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = arm64_enter_el1_dbg(regs); debug_exception_enter(regs); @@ -582,7 +396,7 @@ static void noinstr el1_breakpt(struct 
pt_regs *regs, unsigned long esr) static void noinstr el1_softstp(struct pt_regs *regs, unsigned long esr) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = arm64_enter_el1_dbg(regs); if (!cortex_a76_erratum_1463225_debug_handler(regs)) { @@ -604,7 +418,7 @@ static void noinstr el1_watchpt(struct pt_regs *regs, unsigned long esr) { /* Watchpoints are the only debug exception to write FAR_EL1 */ unsigned long far = read_sysreg(far_el1); - arm64_irqentry_state_t state; + irqentry_state_t state; state = arm64_enter_el1_dbg(regs); debug_exception_enter(regs); @@ -615,7 +429,7 @@ static void noinstr el1_watchpt(struct pt_regs *regs, unsigned long esr) static void noinstr el1_brk64(struct pt_regs *regs, unsigned long esr) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = arm64_enter_el1_dbg(regs); debug_exception_enter(regs); @@ -626,7 +440,7 @@ static void noinstr el1_brk64(struct pt_regs *regs, unsigned long esr) static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = enter_from_kernel_mode(regs); local_daif_inherit(regs); @@ -687,17 +501,17 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) static __always_inline void __el1_pnmi(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { - arm64_irqentry_state_t state; + irqentry_state_t state; - state = arm64_enter_nmi(regs); + state = irqentry_nmi_enter(regs); do_interrupt_handler(regs, handler); - arm64_exit_nmi(regs, state); + irqentry_nmi_exit(regs, state); } static __always_inline void __el1_irq(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { - arm64_irqentry_state_t state; + irqentry_state_t state; state = enter_from_kernel_mode(regs); @@ -731,22 +545,22 @@ asmlinkage void noinstr el1h_64_fiq_handler(struct pt_regs *regs) asmlinkage void noinstr el1h_64_error_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); - arm64_irqentry_state_t state; + irqentry_state_t state; local_daif_restore(DAIF_ERRCTX); - state = arm64_enter_nmi(regs); + state = irqentry_nmi_enter(regs); do_serror(regs, esr); - arm64_exit_nmi(regs, state); + irqentry_nmi_exit(regs, state); } static void noinstr el0_da(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_mem_abort(far, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_ia(struct pt_regs *regs, unsigned long esr) @@ -761,50 +575,50 @@ static void noinstr el0_ia(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(far)) arm64_apply_bp_hardening(); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_mem_abort(far, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_fpsimd_acc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_fpsimd_acc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sve_acc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sve_acc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sme_acc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + 
arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sme_acc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_fpsimd_exc(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_fpsimd_exc(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sys(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_sys(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_pc(struct pt_regs *regs, unsigned long esr) @@ -814,58 +628,58 @@ static void noinstr el0_pc(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(instruction_pointer(regs))) arm64_apply_bp_hardening(); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sp_pc_abort(far, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_sp(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_sp_pc_abort(regs->sp, esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_undef(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_undef(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_bti(struct pt_regs *regs) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_bti(regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_mops(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_mops(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_gcs(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_gcs(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_inv(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); bad_el0_sync(regs, 0, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_breakpt(struct pt_regs *regs, unsigned long esr) @@ -873,12 +687,12 @@ static void noinstr el0_breakpt(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(regs->pc)) arm64_apply_bp_hardening(); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); debug_exception_enter(regs); do_breakpoint(esr, regs); debug_exception_exit(regs); local_daif_restore(DAIF_PROCCTX); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_softstp(struct pt_regs *regs, unsigned long esr) @@ -886,7 +700,7 @@ static void noinstr el0_softstp(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(regs->pc)) arm64_apply_bp_hardening(); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); /* * After handling a breakpoint, we suspend the breakpoint * and use single-step to move to the next instruction. 
@@ -897,7 +711,7 @@ static void noinstr el0_softstp(struct pt_regs *regs, unsigned long esr) local_daif_restore(DAIF_PROCCTX); do_el0_softstep(esr, regs); } - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_watchpt(struct pt_regs *regs, unsigned long esr) @@ -905,39 +719,39 @@ static void noinstr el0_watchpt(struct pt_regs *regs, unsigned long esr) /* Watchpoints are the only debug exception to write FAR_EL1 */ unsigned long far = read_sysreg(far_el1); - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); debug_exception_enter(regs); do_watchpoint(far, esr, regs); debug_exception_exit(regs); local_daif_restore(DAIF_PROCCTX); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_brk64(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_brk64(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_svc(struct pt_regs *regs) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); fpsimd_syscall_enter(); local_daif_restore(DAIF_PROCCTX); do_el0_svc(regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); fpsimd_syscall_exit(); } static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_fpac(regs, esr); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) @@ -1011,7 +825,7 @@ asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) static void noinstr el0_interrupt(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); write_sysreg(DAIF_PROCCTX_NOIRQ, daif); @@ -1022,7 +836,7 @@ static void noinstr el0_interrupt(struct pt_regs *regs, do_interrupt_handler(regs, handler); irq_exit_rcu(); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr __el0_irq_handler_common(struct pt_regs *regs) @@ -1048,15 +862,15 @@ asmlinkage void noinstr el0t_64_fiq_handler(struct pt_regs *regs) static void noinstr __el0_error_handler_common(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); - arm64_irqentry_state_t state; + irqentry_state_t state; - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_ERRCTX); - state = arm64_enter_nmi(regs); + state = irqentry_nmi_enter(regs); do_serror(regs, esr); - arm64_exit_nmi(regs, state); + irqentry_nmi_exit(regs, state); local_daif_restore(DAIF_PROCCTX); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } asmlinkage void noinstr el0t_64_error_handler(struct pt_regs *regs) @@ -1067,27 +881,27 @@ asmlinkage void noinstr el0t_64_error_handler(struct pt_regs *regs) #ifdef CONFIG_COMPAT static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_el0_cp15(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr el0_svc_compat(struct pt_regs *regs) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); local_daif_restore(DAIF_PROCCTX); do_el0_svc_compat(regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } static void noinstr 
el0_bkpt32(struct pt_regs *regs, unsigned long esr) { - enter_from_user_mode(regs); + arm64_enter_from_user_mode(regs); local_daif_restore(DAIF_PROCCTX); do_bkpt32(esr, regs); - exit_to_user_mode(regs); + arm64_exit_to_user_mode(regs); } asmlinkage void noinstr el0t_32_sync_handler(struct pt_regs *regs) @@ -1166,7 +980,7 @@ asmlinkage void noinstr __noreturn handle_bad_stack(struct pt_regs *regs) unsigned long esr = read_sysreg(esr_el1); unsigned long far = read_sysreg(far_el1); - arm64_enter_nmi(regs); + irqentry_nmi_enter(regs); panic_bad_stack(regs, esr, far); } @@ -1174,7 +988,7 @@ asmlinkage void noinstr __noreturn handle_bad_stack(struct pt_regs *regs) asmlinkage noinstr unsigned long __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) { - arm64_irqentry_state_t state; + irqentry_state_t state; unsigned long ret; /* @@ -1199,9 +1013,9 @@ __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) else if (cpu_has_pan()) set_pstate_pan(0); - state = arm64_enter_nmi(regs); + state = irqentry_nmi_enter(regs); ret = do_sdei_event(regs, arg); - arm64_exit_nmi(regs, state); + irqentry_nmi_exit(regs, state); return ret; } diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index db3f972f8cd9..1110eeb21f57 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1576,7 +1577,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) * the kernel can handle, and then we build all the user-level signal handling * stack-frames in one go after that. */ -void do_signal(struct pt_regs *regs) +void arch_do_signal_or_restart(struct pt_regs *regs) { unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; From 220928e52cb03d223b3acad3888baf0687486d21 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 18 Aug 2025 20:21:18 +0100 Subject: [PATCH 24/93] arm64/hwcap: Add hwcap for FEAT_LSFE FEAT_LSFE (Large System Float Extension), providing atomic floating point memory operations, is optional from v9.5. This feature adds no new architectural stare and we have no immediate use for it in the kernel so simply provide a hwcap for it to support discovery by userspace. Signed-off-by: Mark Brown Signed-off-by: Will Deacon --- Documentation/arch/arm64/elf_hwcaps.rst | 4 ++++ arch/arm64/include/asm/hwcap.h | 1 + arch/arm64/include/uapi/asm/hwcap.h | 1 + arch/arm64/kernel/cpufeature.c | 2 ++ arch/arm64/kernel/cpuinfo.c | 1 + 5 files changed, 9 insertions(+) diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst index f58ada4d6cb2..a15df4956849 100644 --- a/Documentation/arch/arm64/elf_hwcaps.rst +++ b/Documentation/arch/arm64/elf_hwcaps.rst @@ -441,6 +441,10 @@ HWCAP3_MTE_FAR HWCAP3_MTE_STORE_ONLY Functionality implied by ID_AA64PFR2_EL1.MTESTOREONLY == 0b0001. +HWCAP3_LSFE + Functionality implied by ID_AA64ISAR3_EL1.LSFE == 0b0001 + + 4. 
Unused AT_HWCAP bits ----------------------- diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 13f94c8ddfc0..6d567265467c 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -178,6 +178,7 @@ #define __khwcap3_feature(x) (const_ilog2(HWCAP3_ ## x) + 128) #define KERNEL_HWCAP_MTE_FAR __khwcap3_feature(MTE_FAR) #define KERNEL_HWCAP_MTE_STORE_ONLY __khwcap3_feature(MTE_STORE_ONLY) +#define KERNEL_HWCAP_LSFE __khwcap3_feature(LSFE) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 72c78468b806..575564ecdb0b 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -145,5 +145,6 @@ */ #define HWCAP3_MTE_FAR (1UL << 0) #define HWCAP3_MTE_STORE_ONLY (1UL << 1) +#define HWCAP3_LSFE (1UL << 2) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9ad065f15f1d..b1219f14459f 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -278,6 +278,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { static const struct arm64_ftr_bits ftr_id_aa64isar3[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FPRCVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_LSFE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FAMINMAX_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -3252,6 +3253,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR1_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_I8MM), HWCAP_CAP(ID_AA64ISAR2_EL1, LUT, IMP, CAP_HWCAP, KERNEL_HWCAP_LUT), HWCAP_CAP(ID_AA64ISAR3_EL1, FAMINMAX, IMP, CAP_HWCAP, KERNEL_HWCAP_FAMINMAX), + HWCAP_CAP(ID_AA64ISAR3_EL1, LSFE, IMP, CAP_HWCAP, KERNEL_HWCAP_LSFE), HWCAP_CAP(ID_AA64MMFR2_EL1, AT, IMP, CAP_HWCAP, KERNEL_HWCAP_USCAT), #ifdef CONFIG_ARM64_SVE HWCAP_CAP(ID_AA64PFR0_EL1, SVE, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE), diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index ba834909a28b..c44e6d94f5de 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -162,6 +162,7 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SME_SMOP4] = "smesmop4", [KERNEL_HWCAP_MTE_FAR] = "mtefar", [KERNEL_HWCAP_MTE_STORE_ONLY] = "mtestoreonly", + [KERNEL_HWCAP_LSFE] = "lsfe", }; #ifdef CONFIG_COMPAT From 47687aa4d9c91a9d9b1dbae40c242cd291030bd9 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Sun, 24 Aug 2025 22:34:15 -0500 Subject: [PATCH 25/93] arm64: probes: Break ret out from bl/blr Prepare for GCS by breaking RET out into its own function, where it makes more sense to encapsulate the new behavior independent from the branch instructions. 
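For illustration only (not part of the patch): after the split, simulate_ret() only needs the Rn field of the opcode and never touches the link register, while BLR simulation keeps updating X30. A minimal standalone sketch of the "(opcode >> 5) & 0x1f" decode both handlers rely on, assuming the usual A64 layout with Rn in bits [9:5]; the opcode values are just examples:

    /* Hypothetical host-side illustration of the Rn decode used by the handlers. */
    #include <stdio.h>

    int main(void)
    {
        unsigned int ret_x30 = 0xd65f03c0;  /* RET, Rn defaults to X30 */
        unsigned int ret_x1  = 0xd65f0020;  /* RET x1 */

        printf("rn=%u\n", (ret_x30 >> 5) & 0x1f);  /* prints 30 */
        printf("rn=%u\n", (ret_x1 >> 5) & 0x1f);   /* prints 1 */
        return 0;
    }
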
Signed-off-by: Jeremy Linton Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/kernel/probes/decode-insn.c | 7 ++++--- arch/arm64/kernel/probes/simulate-insn.c | 10 +++++++++- arch/arm64/kernel/probes/simulate-insn.h | 3 ++- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 6438bf62e753..4137cc5ef031 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -108,9 +108,10 @@ arm_probe_decode_insn(u32 insn, struct arch_probe_insn *api) aarch64_insn_is_bl(insn)) { api->handler = simulate_b_bl; } else if (aarch64_insn_is_br(insn) || - aarch64_insn_is_blr(insn) || - aarch64_insn_is_ret(insn)) { - api->handler = simulate_br_blr_ret; + aarch64_insn_is_blr(insn)) { + api->handler = simulate_br_blr; + } else if (aarch64_insn_is_ret(insn)) { + api->handler = simulate_ret; } else { /* * Instruction cannot be stepped out-of-line and we don't diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index 4c6d2d712fbd..09a0b36122d0 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -126,7 +126,7 @@ simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs) } void __kprobes -simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs) +simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs) { int xn = (opcode >> 5) & 0x1f; @@ -138,6 +138,14 @@ simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs) set_x_reg(regs, 30, addr + 4); } +void __kprobes +simulate_ret(u32 opcode, long addr, struct pt_regs *regs) +{ + int xn = (opcode >> 5) & 0x1f; + + instruction_pointer_set(regs, get_x_reg(regs, xn)); +} + void __kprobes simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs) { diff --git a/arch/arm64/kernel/probes/simulate-insn.h b/arch/arm64/kernel/probes/simulate-insn.h index efb2803ec943..9e772a292d56 100644 --- a/arch/arm64/kernel/probes/simulate-insn.h +++ b/arch/arm64/kernel/probes/simulate-insn.h @@ -11,7 +11,8 @@ void simulate_adr_adrp(u32 opcode, long addr, struct pt_regs *regs); void simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs); void simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs); -void simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs); +void simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs); +void simulate_ret(u32 opcode, long addr, struct pt_regs *regs); void simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs); From ea920b50ac9ff13ef0282428bd80395ea134a26c Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Sun, 24 Aug 2025 22:34:16 -0500 Subject: [PATCH 26/93] arm64: uaccess: Move existing GCS accessors definitions to gcs.h We are going to add some additional GCS access helpers to gcs.h in order to avoid some forward reference problems with uaccess. In preparation for that, lets move the existing gcssttr() and put_user_gcs() routines into gcs.h where it makes sense to keep all the accessors together. Further, the code which uses them already includes gcs.h and there is an existing CONFIG_ARM64_GCS check we can reuse. The GCSSTTR instruction description comment is corrected during the move. 
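As a side note (not part of the patch), a minimal caller sketch for the helper in its new home, assuming a kernel context that already includes asm/gcs.h; write_gcs_entry() is a hypothetical name used only for illustration:

    static int write_gcs_entry(unsigned long __user *slot, unsigned long val)
    {
        int err = 0;

        /* GCSSTTR store to the user shadow stack via the moved helper */
        put_user_gcs(val, slot, &err);
        return err;     /* 0 on success, a negative error code otherwise */
    }
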
Signed-off-by: Jeremy Linton Reviewed-by: Catalin Marinas Reviewed-by: Mark Brown Signed-off-by: Will Deacon --- arch/arm64/include/asm/gcs.h | 37 ++++++++++++++++++++++++++++- arch/arm64/include/asm/uaccess.h | 40 -------------------------------- 2 files changed, 36 insertions(+), 41 deletions(-) diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index 5bc432234d3a..10c68d3e6e30 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -21,7 +21,7 @@ static inline void gcsstr(u64 *addr, u64 val) register u64 *_addr __asm__ ("x0") = addr; register long _val __asm__ ("x1") = val; - /* GCSSTTR x1, x0 */ + /* GCSSTTR x1, [x0] */ asm volatile( ".inst 0xd91f1c01\n" : @@ -81,6 +81,41 @@ static inline int gcs_check_locked(struct task_struct *task, return 0; } +static inline int gcssttr(unsigned long __user *addr, unsigned long val) +{ + register unsigned long __user *_addr __asm__ ("x0") = addr; + register unsigned long _val __asm__ ("x1") = val; + int err = 0; + + /* GCSSTTR x1, [x0] */ + asm volatile( + "1: .inst 0xd91f1c01\n" + "2: \n" + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0) + : "+r" (err) + : "rZ" (_val), "r" (_addr) + : "memory"); + + return err; +} + +static inline void put_user_gcs(unsigned long val, unsigned long __user *addr, + int *err) +{ + int ret; + + if (!access_ok((char __user *)addr, sizeof(u64))) { + *err = -EFAULT; + return; + } + + uaccess_ttbr0_enable(); + ret = gcssttr(addr, val); + if (ret != 0) + *err = ret; + uaccess_ttbr0_disable(); +} + #else static inline bool task_gcs_el0_enabled(struct task_struct *task) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 5b91803201ef..1aa4ecb73429 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -502,44 +502,4 @@ static inline size_t probe_subpage_writeable(const char __user *uaddr, #endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */ -#ifdef CONFIG_ARM64_GCS - -static inline int gcssttr(unsigned long __user *addr, unsigned long val) -{ - register unsigned long __user *_addr __asm__ ("x0") = addr; - register unsigned long _val __asm__ ("x1") = val; - int err = 0; - - /* GCSSTTR x1, x0 */ - asm volatile( - "1: .inst 0xd91f1c01\n" - "2: \n" - _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0) - : "+r" (err) - : "rZ" (_val), "r" (_addr) - : "memory"); - - return err; -} - -static inline void put_user_gcs(unsigned long val, unsigned long __user *addr, - int *err) -{ - int ret; - - if (!access_ok((char __user *)addr, sizeof(u64))) { - *err = -EFAULT; - return; - } - - uaccess_ttbr0_enable(); - ret = gcssttr(addr, val); - if (ret != 0) - *err = ret; - uaccess_ttbr0_disable(); -} - - -#endif /* CONFIG_ARM64_GCS */ - #endif /* __ASM_UACCESS_H */ From 030b3ffbdac75005ef73af752a42cd48c7bba155 Mon Sep 17 00:00:00 2001 From: Sam Edwards Date: Wed, 3 Sep 2025 17:52:07 -0700 Subject: [PATCH 27/93] arm64: mm: Cast start/end markers to char *, not u64 There are a few memset() calls in map_kernel.c that cast marker-symbol addresses to u64 in order to perform pointer subtraction (range size computation). Cast them with (char *) instead, aligning with idiomatic C pointer arithmetic. This patch provably has no effect at runtime: I have verified that .text of vmlinux is identical after this change. 
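Purely as an illustration of the idiom (the hunks below are the actual change): both forms compute the same byte count, but subtracting char * pointers avoids laundering addresses through u64. The marker symbols here are hypothetical:

    #include <linux/string.h>

    extern long tbl_start[], tbl_end[];    /* hypothetical linker-defined markers */

    static void wipe_tbl(void)
    {
        /* byte count via char * subtraction rather than (u64) casts */
        memset(tbl_start, 0, (char *)tbl_end - (char *)tbl_start);
    }
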
Signed-off-by: Sam Edwards Signed-off-by: Will Deacon --- arch/arm64/kernel/pi/map_kernel.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index 0f4bd7771859..2b3047860230 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -179,7 +179,7 @@ static void __init remap_idmap_for_lpa2(void) * Don't bother with the FDT, we no longer need it after this. */ memset(init_idmap_pg_dir, 0, - (u64)init_idmap_pg_end - (u64)init_idmap_pg_dir); + (char *)init_idmap_pg_end - (char *)init_idmap_pg_dir); create_init_idmap(init_idmap_pg_dir, mask); dsb(ishst); @@ -188,7 +188,7 @@ static void __init remap_idmap_for_lpa2(void) set_ttbr0_for_lpa2((u64)init_idmap_pg_dir); /* wipe the temporary ID map from memory */ - memset(init_pg_dir, 0, (u64)init_pg_end - (u64)init_pg_dir); + memset(init_pg_dir, 0, (char *)init_pg_end - (char *)init_pg_dir); } static void __init map_fdt(u64 fdt) @@ -242,7 +242,7 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) map_fdt((u64)fdt); /* Clear BSS and the initial page tables */ - memset(__bss_start, 0, (u64)init_pg_end - (u64)__bss_start); + memset(__bss_start, 0, (char *)init_pg_end - (char *)__bss_start); /* Parse the command line for CPU feature overrides */ chosen = fdt_path_offset(fdt, chosen_str); From c56aa9a67a0853ffcf64ebe7f1dbe5a5a7c315cc Mon Sep 17 00:00:00 2001 From: Sam Edwards Date: Wed, 3 Sep 2025 17:52:08 -0700 Subject: [PATCH 28/93] arm64: mm: Make map_fdt() return mapped pointer Currently map_fdt() accepts a physical address and relies on the caller to keep using the same value after mapping, since the implementation happens to install an identity mapping. This obscures the fact that the usable pointer is defined by the mapping, not by the input value. Since the mapping determines pointer validity, it is more natural to produce the pointer at mapping time. Change map_fdt() to return a void * pointing to the mapped FDT. This clarifies the data flow, removes the implicit identity assumption, and prepares for making map_fdt() accept a phys_addr_t in a follow-up change. 
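Sketch of the resulting caller pattern (illustrative only; the real change is in the hunk below):

    /* before: the caller keeps using the value it passed in */
    map_fdt((u64)fdt);
    chosen = fdt_path_offset(fdt, chosen_str);

    /* after: the usable pointer is whatever the mapping step returned */
    void *fdt_mapped = map_fdt((u64)fdt);
    chosen = fdt_path_offset(fdt_mapped, chosen_str);
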
Signed-off-by: Sam Edwards Signed-off-by: Will Deacon --- arch/arm64/kernel/pi/map_kernel.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index 2b3047860230..5dc4107b5a7f 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -191,7 +191,7 @@ static void __init remap_idmap_for_lpa2(void) memset(init_pg_dir, 0, (char *)init_pg_end - (char *)init_pg_dir); } -static void __init map_fdt(u64 fdt) +static void *__init map_fdt(u64 fdt) { static u8 ptes[INIT_IDMAP_FDT_SIZE] __initdata __aligned(PAGE_SIZE); u64 efdt = fdt + MAX_FDT_SIZE; @@ -205,6 +205,8 @@ static void __init map_fdt(u64 fdt) fdt, PAGE_KERNEL, IDMAP_ROOT_LEVEL, (pte_t *)init_idmap_pg_dir, false, 0); dsb(ishst); + + return (void *)fdt; } /* @@ -238,15 +240,14 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) int root_level = 4 - CONFIG_PGTABLE_LEVELS; int va_bits = VA_BITS; int chosen; - - map_fdt((u64)fdt); + void *fdt_mapped = map_fdt((u64)fdt); /* Clear BSS and the initial page tables */ memset(__bss_start, 0, (char *)init_pg_end - (char *)__bss_start); /* Parse the command line for CPU feature overrides */ - chosen = fdt_path_offset(fdt, chosen_str); - init_feature_override(boot_status, fdt, chosen); + chosen = fdt_path_offset(fdt_mapped, chosen_str); + init_feature_override(boot_status, fdt_mapped, chosen); if (IS_ENABLED(CONFIG_ARM64_64K_PAGES) && !cpu_has_lva()) { va_bits = VA_BITS_MIN; @@ -266,7 +267,7 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) * fill in the high bits from the seed. */ if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { - u64 kaslr_seed = kaslr_early_init(fdt, chosen); + u64 kaslr_seed = kaslr_early_init(fdt_mapped, chosen); if (kaslr_seed && kaslr_requires_kpti()) arm64_use_ng_mappings = ng_mappings_allowed(); From b868fff5b10b6d09506e93e489ee19166bf6c5d2 Mon Sep 17 00:00:00 2001 From: Sam Edwards Date: Wed, 3 Sep 2025 17:52:09 -0700 Subject: [PATCH 29/93] arm64: mm: Represent physical memory with phys_addr_t and resource_size_t This is a type-correctness cleanup to MMU/boot code that replaces several instances of void * and u64 with phys_addr_t (to represent addresses) and resource_size_t (to represent sizes) to emphasize that the code in question concerns physical memory specifically. The rationale for this change is to improve clarity and readability in a few modules that handle both types (physical and virtual) of address and differentiation is essential. I have left u64 in cases where the address may be either physical or virtual, where the address is exclusively virtual but used in heavy pointer arithmetic, and in cases I may have overlooked. I do not necessarily consider u64 the ideal type in those situations, but it avoids breaking existing semantics in this cleanup. This patch provably has no effect at runtime: I have verified that .text of vmlinux is identical after this change. 
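For illustration (not part of the patch), the convention this cleanup applies, with hypothetical names:

    #include <linux/types.h>

    phys_addr_t     region_base;    /* a physical address */
    resource_size_t region_size;    /* a size of physical memory */
    u64             addr;           /* may be physical or virtual, so left as u64 */
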
Signed-off-by: Sam Edwards Signed-off-by: Will Deacon --- arch/arm64/kernel/pi/map_kernel.c | 26 +++++++++++++------------- arch/arm64/kernel/pi/map_range.c | 20 ++++++++++++-------- arch/arm64/kernel/pi/pi.h | 9 +++++---- arch/arm64/mm/init.c | 6 +++--- arch/arm64/mm/mmu.c | 17 +++++++++-------- 5 files changed, 42 insertions(+), 36 deletions(-) diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index 5dc4107b5a7f..e6d35eff1486 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -18,9 +18,9 @@ extern const u8 __eh_frame_start[], __eh_frame_end[]; -extern void idmap_cpu_replace_ttbr1(void *pgdir); +extern void idmap_cpu_replace_ttbr1(phys_addr_t pgdir); -static void __init map_segment(pgd_t *pg_dir, u64 *pgd, u64 va_offset, +static void __init map_segment(pgd_t *pg_dir, phys_addr_t *pgd, u64 va_offset, void *start, void *end, pgprot_t prot, bool may_use_cont, int root_level) { @@ -40,7 +40,7 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) { bool enable_scs = IS_ENABLED(CONFIG_UNWIND_PATCH_PAC_INTO_SCS); bool twopass = IS_ENABLED(CONFIG_RELOCATABLE); - u64 pgdp = (u64)init_pg_dir + PAGE_SIZE; + phys_addr_t pgdp = (phys_addr_t)init_pg_dir + PAGE_SIZE; pgprot_t text_prot = PAGE_KERNEL_ROX; pgprot_t data_prot = PAGE_KERNEL; pgprot_t prot; @@ -90,7 +90,7 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) true, root_level); dsb(ishst); - idmap_cpu_replace_ttbr1(init_pg_dir); + idmap_cpu_replace_ttbr1((phys_addr_t)init_pg_dir); if (twopass) { if (IS_ENABLED(CONFIG_RELOCATABLE)) @@ -129,10 +129,10 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) /* Copy the root page table to its final location */ memcpy((void *)swapper_pg_dir + va_offset, init_pg_dir, PAGE_SIZE); dsb(ishst); - idmap_cpu_replace_ttbr1(swapper_pg_dir); + idmap_cpu_replace_ttbr1((phys_addr_t)swapper_pg_dir); } -static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(u64 ttbr) +static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(phys_addr_t ttbr) { u64 sctlr = read_sysreg(sctlr_el1); u64 tcr = read_sysreg(tcr_el1) | TCR_DS; @@ -172,7 +172,7 @@ static void __init remap_idmap_for_lpa2(void) */ create_init_idmap(init_pg_dir, mask); dsb(ishst); - set_ttbr0_for_lpa2((u64)init_pg_dir); + set_ttbr0_for_lpa2((phys_addr_t)init_pg_dir); /* * Recreate the initial ID map with the same granularity as before. 
@@ -185,17 +185,17 @@ static void __init remap_idmap_for_lpa2(void) dsb(ishst); /* switch back to the updated initial ID map */ - set_ttbr0_for_lpa2((u64)init_idmap_pg_dir); + set_ttbr0_for_lpa2((phys_addr_t)init_idmap_pg_dir); /* wipe the temporary ID map from memory */ memset(init_pg_dir, 0, (char *)init_pg_end - (char *)init_pg_dir); } -static void *__init map_fdt(u64 fdt) +static void *__init map_fdt(phys_addr_t fdt) { static u8 ptes[INIT_IDMAP_FDT_SIZE] __initdata __aligned(PAGE_SIZE); - u64 efdt = fdt + MAX_FDT_SIZE; - u64 ptep = (u64)ptes; + phys_addr_t efdt = fdt + MAX_FDT_SIZE; + phys_addr_t ptep = (phys_addr_t)ptes; /* We're idmapped when called */ /* * Map up to MAX_FDT_SIZE bytes, but avoid overlap with @@ -232,7 +232,7 @@ static bool __init ng_mappings_allowed(void) return true; } -asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) +asmlinkage void __init early_map_kernel(u64 boot_status, phys_addr_t fdt) { static char const chosen_str[] __initconst = "/chosen"; u64 va_base, pa_base = (u64)&_text; @@ -240,7 +240,7 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) int root_level = 4 - CONFIG_PGTABLE_LEVELS; int va_bits = VA_BITS; int chosen; - void *fdt_mapped = map_fdt((u64)fdt); + void *fdt_mapped = map_fdt(fdt); /* Clear BSS and the initial page tables */ memset(__bss_start, 0, (char *)init_pg_end - (char *)__bss_start); diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c index 7982788e7b9a..de52cd85c691 100644 --- a/arch/arm64/kernel/pi/map_range.c +++ b/arch/arm64/kernel/pi/map_range.c @@ -26,8 +26,9 @@ * @va_offset: Offset between a physical page and its current mapping * in the VA space */ -void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, - int level, pte_t *tbl, bool may_use_cont, u64 va_offset) +void __init map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, + u64 va_offset) { u64 cmask = (level == 3) ? 
CONT_PTE_SIZE - 1 : U64_MAX; ptdesc_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK; @@ -87,19 +88,22 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, } } -asmlinkage u64 __init create_init_idmap(pgd_t *pg_dir, ptdesc_t clrmask) +asmlinkage phys_addr_t __init create_init_idmap(pgd_t *pg_dir, ptdesc_t clrmask) { - u64 ptep = (u64)pg_dir + PAGE_SIZE; + phys_addr_t ptep = (phys_addr_t)pg_dir + PAGE_SIZE; /* MMU is off */ pgprot_t text_prot = PAGE_KERNEL_ROX; pgprot_t data_prot = PAGE_KERNEL; pgprot_val(text_prot) &= ~clrmask; pgprot_val(data_prot) &= ~clrmask; - map_range(&ptep, (u64)_stext, (u64)__initdata_begin, (u64)_stext, - text_prot, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); - map_range(&ptep, (u64)__initdata_begin, (u64)_end, (u64)__initdata_begin, - data_prot, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); + /* MMU is off; pointer casts to phys_addr_t are safe */ + map_range(&ptep, (u64)_stext, (u64)__initdata_begin, + (phys_addr_t)_stext, text_prot, IDMAP_ROOT_LEVEL, + (pte_t *)pg_dir, false, 0); + map_range(&ptep, (u64)__initdata_begin, (u64)_end, + (phys_addr_t)__initdata_begin, data_prot, IDMAP_ROOT_LEVEL, + (pte_t *)pg_dir, false, 0); return ptep; } diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h index 46cafee7829f..08ef9f80456b 100644 --- a/arch/arm64/kernel/pi/pi.h +++ b/arch/arm64/kernel/pi/pi.h @@ -29,9 +29,10 @@ u64 kaslr_early_init(void *fdt, int chosen); void relocate_kernel(u64 offset); int scs_patch(const u8 eh_frame[], int size); -void map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, - int level, pte_t *tbl, bool may_use_cont, u64 va_offset); +void map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, + u64 va_offset); -asmlinkage void early_map_kernel(u64 boot_status, void *fdt); +asmlinkage void early_map_kernel(u64 boot_status, phys_addr_t fdt); -asmlinkage u64 create_init_idmap(pgd_t *pgd, ptdesc_t clrmask); +asmlinkage phys_addr_t create_init_idmap(pgd_t *pgd, ptdesc_t clrmask); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index ea84a61ed508..70c2ca813c18 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -243,7 +243,7 @@ void __init arm64_memblock_init(void) */ if (memory_limit != PHYS_ADDR_MAX) { memblock_mem_limit_remove_map(memory_limit); - memblock_add(__pa_symbol(_text), (u64)(_end - _text)); + memblock_add(__pa_symbol(_text), (resource_size_t)(_end - _text)); } if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) { @@ -252,8 +252,8 @@ void __init arm64_memblock_init(void) * initrd to become inaccessible via the linear mapping. 
* Otherwise, this is a no-op */ - u64 base = phys_initrd_start & PAGE_MASK; - u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base; + phys_addr_t base = phys_initrd_start & PAGE_MASK; + resource_size_t size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base; /* * We can only add back the initrd memory if we don't end up diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 34e5d78af076..de463040582c 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -794,17 +794,18 @@ static void __init declare_kernel_vmas(void) declare_vma(&vmlinux_seg[4], _data, _end, 0); } -void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, - int level, pte_t *tbl, bool may_use_cont, u64 va_offset); +void __pi_map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, + u64 va_offset); static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init, kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init; static void __init create_idmap(void) { - u64 start = __pa_symbol(__idmap_text_start); - u64 end = __pa_symbol(__idmap_text_end); - u64 ptep = __pa_symbol(idmap_ptes); + phys_addr_t start = __pa_symbol(__idmap_text_start); + phys_addr_t end = __pa_symbol(__idmap_text_end); + phys_addr_t ptep = __pa_symbol(idmap_ptes); __pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX, IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, @@ -812,7 +813,7 @@ static void __init create_idmap(void) if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings) { extern u32 __idmap_kpti_flag; - u64 pa = __pa_symbol(&__idmap_kpti_flag); + phys_addr_t pa = __pa_symbol(&__idmap_kpti_flag); /* * The KPTI G-to-nG conversion code needs a read-write mapping @@ -1331,8 +1332,8 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size) struct range arch_get_mappable_range(void) { struct range mhp_range; - u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual)); - u64 end_linear_pa = __pa(PAGE_END - 1); + phys_addr_t start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual)); + phys_addr_t end_linear_pa = __pa(PAGE_END - 1); if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { /* From c0f303d7d4723b01c686e949e6f26a93e5cda910 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Tue, 9 Sep 2025 11:32:35 +0800 Subject: [PATCH 30/93] arm64: mm: Rework the 'rodata=' options As per admin guide documentation, "rodata=on" should be the default on platforms. Documentation/admin-guide/kernel-parameters.txt describes these options as rodata= [KNL,EARLY] on Mark read-only kernel memory as read-only (default). off Leave read-only kernel memory writable for debugging. full Mark read-only kernel memory and aliases as read-only [arm64] But on arm64 platform, RODATA_FULL_DEFAULT_ENABLED is enabled by default, so "rodata=full" is the default instead. For parity with other architectures, namely x86, rework 'rodata=on' to match the current "full" behaviour and replace 'rodata=full' with a new 'rodata=noalias' option which retains writable aliases in the direct map for memory regions outside of the kernel image. 
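For reference, the intended meaning of each option after this change (illustrative summary; the authoritative wording is in the documentation hunk below):

    rodata=on        kernel image and its direct-map aliases read-only (default)
    rodata=noalias   kernel image read-only, direct-map aliases left writable
    rodata=off       read-only kernel memory left writable, for debugging only
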
Signed-off-by: Huang Shijie Reviewed-by: Anshuman Khandual Signed-off-by: Will Deacon --- Documentation/admin-guide/kernel-parameters.txt | 5 +++-- arch/arm64/include/asm/setup.h | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 747a55abf494..fe99652c584e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6405,8 +6405,9 @@ rodata= [KNL,EARLY] on Mark read-only kernel memory as read-only (default). off Leave read-only kernel memory writable for debugging. - full Mark read-only kernel memory and aliases as read-only - [arm64] + noalias Mark read-only kernel memory as read-only but retain + writable aliases in the direct map for regions outside + of the kernel image. [arm64] rockchip.usb_uart [EARLY] diff --git a/arch/arm64/include/asm/setup.h b/arch/arm64/include/asm/setup.h index ba269a7a3201..3d96dde4d214 100644 --- a/arch/arm64/include/asm/setup.h +++ b/arch/arm64/include/asm/setup.h @@ -21,7 +21,7 @@ static inline bool arch_parse_debug_rodata(char *arg) if (!arg) return false; - if (!strcmp(arg, "full")) { + if (!strcmp(arg, "on")) { rodata_enabled = rodata_full = true; return true; } @@ -31,7 +31,7 @@ static inline bool arch_parse_debug_rodata(char *arg) return true; } - if (!strcmp(arg, "on")) { + if (!strcmp(arg, "noalias")) { rodata_enabled = true; rodata_full = false; return true; From bfbbb0d3215f0f6ef622cc8066e5f6afda6960a2 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Tue, 9 Sep 2025 11:32:36 +0800 Subject: [PATCH 31/93] arm64/Kconfig: Remove CONFIG_RODATA_FULL_DEFAULT_ENABLED Now that 'rodata=full' has been removed in favour of parity with x86, CONFIG_RODATA_FULL_DEFAULT_ENABLED no longer serves a useful purpose. Remove it. Reviewed-by: Anshuman Khandual Signed-off-by: Huang Shijie Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 14 -------------- arch/arm64/mm/pageattr.c | 2 +- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a6..8b3c23ee7a64 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1698,20 +1698,6 @@ config MITIGATE_SPECTRE_BRANCH_HISTORY When taking an exception from user-space, a sequence of branches or a firmware call overwrites the branch history. -config RODATA_FULL_DEFAULT_ENABLED - bool "Apply r/o permissions of VM areas also to their linear aliases" - default y - help - Apply read-only attributes of VM areas to the linear alias of - the backing pages as well. This prevents code or read-only data - from being modified (inadvertently or intentionally) via another - mapping of the same memory page. This additional enhancement can - be turned off at runtime by passing rodata=[off|on] (and turned on - with rodata=full if this option is set to 'n') - - This requires the linear region to be mapped down to pages, - which may adversely affect performance in some cases. 
- config ARM64_SW_TTBR0_PAN bool "Emulate Privileged Access Never using TTBR0_EL1 switching" depends on !KCSAN diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 04d4a8f676db..667aff1efe49 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -20,7 +20,7 @@ struct page_change_data { pgprot_t clear_mask; }; -bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED); +bool rodata_full __ro_after_init = true; bool can_set_direct_map(void) { From 19dd484cd19c308e2ea763f8d31e43a7f1ab6141 Mon Sep 17 00:00:00 2001 From: "Yury Norov (NVIDIA)" Date: Fri, 12 Sep 2025 20:09:05 -0400 Subject: [PATCH 32/93] arm64/fpsimd: simplify sme_setup() The function checks info->vq_map for emptiness right before calling find_last_bit(). We can use the find_last_bit() output and save on bitmap_empty() call, which is O(N). Signed-off-by: Yury Norov (NVIDIA) Signed-off-by: Will Deacon --- arch/arm64/kernel/fpsimd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index c37f02d7194e..e3f8f51748bc 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1265,6 +1265,8 @@ void __init sme_setup(void) if (!system_supports_sme()) return; + min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX); + /* * SME doesn't require any particular vector length be * supported but it does require at least one. We should have @@ -1272,9 +1274,8 @@ void __init sme_setup(void) * let's double check here. The bitmap is SVE_VQ_MAP sized for * sharing with SVE. */ - WARN_ON(bitmap_empty(info->vq_map, SVE_VQ_MAX)); + WARN_ON(min_bit >= SVE_VQ_MAX); - min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX); info->min_vl = sve_vl_from_vq(__bit_to_vq(min_bit)); max_bit = find_first_bit(info->vq_map, SVE_VQ_MAX); From 9cd2a7f1180f9b6fe5214abc90eaf5c053f545ee Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Sun, 24 Aug 2025 22:34:17 -0500 Subject: [PATCH 33/93] arm64: uaccess: Add additional userspace GCS accessors Uprobes need more advanced read, push, and pop userspace GCS functionality. Implement those features using the existing gcsstr() and copy_from_user(). Its important to note that GCS pages can be read by normal instructions, but the hardware validates that pages used by GCS specific operations, have a GCS privilege set. We aren't validating this in load_user_gcs because it requires stabilizing the VMA over the read which may fault. Signed-off-by: Jeremy Linton Reviewed-by: Catalin Marinas Reviewed-by: Mark Brown [will: Add '__force' to gcspr cast in pop_user_gcs()] Signed-off-by: Will Deacon --- arch/arm64/include/asm/gcs.h | 54 ++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index 10c68d3e6e30..8fa0707069e8 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -116,6 +116,47 @@ static inline void put_user_gcs(unsigned long val, unsigned long __user *addr, uaccess_ttbr0_disable(); } +static inline void push_user_gcs(unsigned long val, int *err) +{ + u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0); + + gcspr -= sizeof(u64); + put_user_gcs(val, (unsigned long __user *)gcspr, err); + if (!*err) + write_sysreg_s(gcspr, SYS_GCSPR_EL0); +} + +/* + * Unlike put/push_user_gcs() above, get/pop_user_gsc() doesn't + * validate the GCS permission is set on the page being read. This + * differs from how the hardware works when it consumes data stored at + * GCSPR. 
Callers should ensure this is acceptable. + */ +static inline u64 get_user_gcs(unsigned long __user *addr, int *err) +{ + unsigned long ret; + u64 load = 0; + + /* Ensure previous GCS operation are visible before we read the page */ + gcsb_dsync(); + ret = copy_from_user(&load, addr, sizeof(load)); + if (ret != 0) + *err = ret; + return load; +} + +static inline u64 pop_user_gcs(int *err) +{ + u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0); + u64 read_val; + + read_val = get_user_gcs((__force unsigned long __user *)gcspr, err); + if (!*err) + write_sysreg_s(gcspr + sizeof(u64), SYS_GCSPR_EL0); + + return read_val; +} + #else static inline bool task_gcs_el0_enabled(struct task_struct *task) @@ -126,6 +167,10 @@ static inline bool task_gcs_el0_enabled(struct task_struct *task) static inline void gcs_set_el0_mode(struct task_struct *task) { } static inline void gcs_free(struct task_struct *task) { } static inline void gcs_preserve_current_state(void) { } +static inline void put_user_gcs(unsigned long val, unsigned long __user *addr, + int *err) { } +static inline void push_user_gcs(unsigned long val, int *err) { } + static inline unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, const struct kernel_clone_args *args) { @@ -136,6 +181,15 @@ static inline int gcs_check_locked(struct task_struct *task, { return 0; } +static inline u64 get_user_gcs(unsigned long __user *addr, int *err) +{ + *err = -EFAULT; + return 0; +} +static inline u64 pop_user_gcs(int *err) +{ + return 0; +} #endif From efb07ac534e24e22a7eb32815fb50f69931cdeae Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Sun, 24 Aug 2025 22:34:18 -0500 Subject: [PATCH 34/93] arm64: probes: Add GCS support to bl/blr/ret The arm64 probe simulation doesn't currently have logic in place to deal with GCS and this results in core dumps if probes are inserted at control flow locations. Fix-up bl, blr and ret to manipulate the shadow stack as needed. While we manipulate and validate the shadow stack correctly, the hardware provides additional security by only allowing GCS operations against pages which are marked to support GCS. For writing there is gcssttr() which enforces this, but there isn't an equivalent for reading. This means that uprobe users should be aware that probing on control flow instructions which require reading the shadow stack (ex: ret) offers lower security guarantees than what is achieved without the uprobe active. 
Signed-off-by: Jeremy Linton Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/kernel/probes/simulate-insn.c | 42 +++++++++++++++++++----- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index 09a0b36122d0..97ed4db75417 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -13,6 +13,7 @@ #include #include "simulate-insn.h" +#include "asm/gcs.h" #define bbl_displacement(insn) \ sign_extend32(((insn) & 0x3ffffff) << 2, 27) @@ -49,6 +50,21 @@ static inline u32 get_w_reg(struct pt_regs *regs, int reg) return lower_32_bits(pt_regs_read_reg(regs, reg)); } +static inline int update_lr(struct pt_regs *regs, long addr) +{ + int err = 0; + + if (user_mode(regs) && task_gcs_el0_enabled(current)) { + push_user_gcs(addr, &err); + if (err) { + force_sig(SIGSEGV); + return err; + } + } + procedure_link_pointer_set(regs, addr); + return err; +} + static bool __kprobes check_cbz(u32 opcode, struct pt_regs *regs) { int xn = opcode & 0x1f; @@ -107,9 +123,9 @@ simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs) { int disp = bbl_displacement(opcode); - /* Link register is x30 */ if (opcode & (1 << 31)) - set_x_reg(regs, 30, addr + 4); + if (update_lr(regs, addr + 4)) + return; instruction_pointer_set(regs, addr + disp); } @@ -129,21 +145,31 @@ void __kprobes simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs) { int xn = (opcode >> 5) & 0x1f; + int b_target = get_x_reg(regs, xn); - /* update pc first in case we're doing a "blr lr" */ - instruction_pointer_set(regs, get_x_reg(regs, xn)); - - /* Link register is x30 */ if (((opcode >> 21) & 0x3) == 1) - set_x_reg(regs, 30, addr + 4); + if (update_lr(regs, addr + 4)) + return; + + instruction_pointer_set(regs, b_target); } void __kprobes simulate_ret(u32 opcode, long addr, struct pt_regs *regs) { + u64 ret_addr; + int err = 0; int xn = (opcode >> 5) & 0x1f; + unsigned long r_target = get_x_reg(regs, xn); - instruction_pointer_set(regs, get_x_reg(regs, xn)); + if (user_mode(regs) && task_gcs_el0_enabled(current)) { + ret_addr = pop_user_gcs(&err); + if (err || ret_addr != r_target) { + force_sig(SIGSEGV); + return; + } + } + instruction_pointer_set(regs, r_target); } void __kprobes From 4a601714bb24926507b2051c4a95f07e0e142004 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Sun, 24 Aug 2025 22:34:19 -0500 Subject: [PATCH 35/93] arm64: uprobes: Add GCS support to uretprobes Ret probes work by changing the value in the link register at the probe location to return to the probe rather than the calling routine. Thus the GCS needs to be updated with this address as well. Since its possible to insert probes at locations where the current value of the LR doesn't match the GCS state this needs to be detected and handled in order to maintain the existing no-fault behavior. 
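As an illustration of how this path is exercised (the binary and function names below are placeholders, not part of this change), a return probe can be placed on a function in a GCS-enabled binary with perf:

  $ perf probe -x ./gcs_app 'do_work%return'
  $ perf record -e probe_gcs_app:do_work__return -- ./gcs_app

With the shadow stack updated alongside the LR, the probed process should no longer take a GCS fault when do_work() returns through the uretprobe trampoline.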
Co-developed-by: Steve Capper Signed-off-by: Steve Capper Signed-off-by: Jeremy Linton Reviewed-by: Catalin Marinas [will: Add '__force' to gcspr casts in arch_uretprobe_hijack_return_addr()] Signed-off-by: Will Deacon --- arch/arm64/kernel/probes/uprobes.c | 33 ++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index 1f91fd2a8187..2799bdb2fb82 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "decode-insn.h" @@ -159,11 +160,43 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) { unsigned long orig_ret_vaddr; + unsigned long gcs_ret_vaddr; + int err = 0; + u64 gcspr; orig_ret_vaddr = procedure_link_pointer(regs); + + if (task_gcs_el0_enabled(current)) { + gcspr = read_sysreg_s(SYS_GCSPR_EL0); + gcs_ret_vaddr = get_user_gcs((__force unsigned long __user *)gcspr, &err); + if (err) { + force_sig(SIGSEGV); + goto out; + } + + /* + * If the LR and GCS return addr don't match, then some kind of PAC + * signing or control flow occurred since entering the probed function. + * Likely because the user is attempting to retprobe on an instruction + * that isn't a function boundary or inside a leaf function. Explicitly + * abort this retprobe because it will generate a GCS exception. + */ + if (gcs_ret_vaddr != orig_ret_vaddr) { + orig_ret_vaddr = -1; + goto out; + } + + put_user_gcs(trampoline_vaddr, (__force unsigned long __user *)gcspr, &err); + if (err) { + force_sig(SIGSEGV); + goto out; + } + } + /* Replace the return addr with trampoline addr */ procedure_link_pointer_set(regs, trampoline_vaddr); +out: return orig_ret_vaddr; } From cc66c711e58f5dd29da79c24ca699a0312e012e3 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Sun, 24 Aug 2025 22:34:20 -0500 Subject: [PATCH 36/93] arm64: Kconfig: Remove GCS restrictions on UPROBES Now that the uprobe paths have been made GCS compatible drop the Kconfig restriction. Signed-off-by: Jeremy Linton Acked-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a6..c61572bbe59b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2225,7 +2225,6 @@ config ARM64_GCS default y select ARCH_HAS_USER_SHADOW_STACK select ARCH_USES_HIGH_VMA_FLAGS - depends on !UPROBES help Guarded Control Stack (GCS) provides support for a separate stack with restricted access which contains only return From ba1afc94deb849eab843a372b969444581add2c9 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Sun, 24 Aug 2025 22:34:21 -0500 Subject: [PATCH 37/93] uprobes: uprobe_warn should use passed task uprobe_warn() is passed a task structure, yet its using current. For the most part this shouldn't matter, but since a task structure is provided, lets use it. 
Fixes: 248d3a7b2f10 ("uprobes: Change uprobe_copy_process() to dup return_instances") Signed-off-by: Jeremy Linton Reviewed-by: Catalin Marinas Acked-by: Oleg Nesterov Acked-by: Masami Hiramatsu (Google) Signed-off-by: Will Deacon --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7ca1940607bd..4b97d16f731c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -121,7 +121,7 @@ struct xol_area { static void uprobe_warn(struct task_struct *t, const char *msg) { - pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg); + pr_warn("uprobe: %s:%d failed to %s\n", t->comm, t->pid, msg); } /* From 5b7bdc4402b12bdad747cce305ecbc9737aed7ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 16 Sep 2025 18:51:35 +0200 Subject: [PATCH 38/93] kselftest/arm64/gcs/basic-gcs: Respect parent directory CFLAGS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit basic-gcs has it's own make rule to handle the special compiler invocation to build against nolibc. This rule does not respect the $(CFLAGS) passed by the Makefile from the parent directory. However these $(CFLAGS) set up the include path to include the UAPI headers from the current kernel. Due to this the asm/hwcap.h header is used from the toolchain instead of the UAPI and the definition of HWCAP_GCS is not found. Restructure the rule for basic-gcs to respect the $(CFLAGS). Also drop those options which are already provided by $(CFLAGS). Reported-by: Naresh Kamboju Closes: https://lore.kernel.org/lkml/CA+G9fYv77X+kKz2YT6xw7=9UrrotTbQ6fgNac7oohOg8BgGvtw@mail.gmail.com/ Fixes: a985fe638344 ("kselftest/arm64/gcs: Use nolibc's getauxval()") Tested-by: Linux Kernel Functional Testing Signed-off-by: Thomas Weißschuh Reviewed-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/gcs/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile index d2f3497a9103..1fbbf0ca1f02 100644 --- a/tools/testing/selftests/arm64/gcs/Makefile +++ b/tools/testing/selftests/arm64/gcs/Makefile @@ -14,11 +14,11 @@ LDLIBS+=-lpthread include ../../lib.mk $(OUTPUT)/basic-gcs: basic-gcs.c - $(CC) -g -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ - -static -include ../../../../include/nolibc/nolibc.h \ + $(CC) $(CFLAGS) -fno-asynchronous-unwind-tables -fno-ident -s -nostdlib -nostdinc \ + -static -I../../../../include/nolibc -include ../../../../include/nolibc/nolibc.h \ -I../../../../../usr/include \ -std=gnu99 -I../.. -g \ - -ffreestanding -Wall $^ -o $@ -lgcc + -ffreestanding $^ -o $@ -lgcc $(OUTPUT)/gcs-stress-thread: gcs-stress-thread.S $(CC) -nostdlib $^ -o $@ From f38c2c3e572ce0ce5c01de0358ed70328e0cb5af Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 18 Sep 2025 06:26:55 +0000 Subject: [PATCH 39/93] arm64: cputype: Add Cortex-A720AE definitions Add cputype definitions for Cortex-A720AE. These will be used for errata detection in subsequent patches. These values can be found in the Cortex-A720AE TRM: https://developer.arm.com/documentation/102828/0001/ ... 
in Table A-187 Signed-off-by: Kuninori Morimoto Signed-off-by: Will Deacon --- arch/arm64/include/asm/cputype.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 661735616787..b10eba7f5247 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -96,6 +96,7 @@ #define ARM_CPU_PART_NEOVERSE_V3 0xD84 #define ARM_CPU_PART_CORTEX_X925 0xD85 #define ARM_CPU_PART_CORTEX_A725 0xD87 +#define ARM_CPU_PART_CORTEX_A720AE 0xD89 #define ARM_CPU_PART_NEOVERSE_N3 0xD8E #define APM_CPU_PART_XGENE 0x000 @@ -185,6 +186,7 @@ #define MIDR_NEOVERSE_V3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3) #define MIDR_CORTEX_X925 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X925) #define MIDR_CORTEX_A725 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A725) +#define MIDR_CORTEX_A720AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A720AE) #define MIDR_NEOVERSE_N3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N3) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) From 3ba8d4aa42bd5dc6e2493ce4a73bb41c9cfd77ca Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 18 Sep 2025 06:27:12 +0000 Subject: [PATCH 40/93] arm64: errata: Expand speculative SSBS workaround for Cortex-A720AE It is same as Cortex-A720. Link: https://lore.kernel.org/all/aMlFwbDjJ6yKuxTv@J2N7QTR9R3.cambridge.arm.com/ Signed-off-by: Kuninori Morimoto Signed-off-by: Will Deacon --- arch/arm64/kernel/cpu_errata.c | 1 + arch/arm64/kernel/proton-pack.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 59d723c9ab8f..7ff6b49beaaf 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -531,6 +531,7 @@ static const struct midr_range erratum_spec_ssbs_list[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), MIDR_ALL_VERSIONS(MIDR_CORTEX_A715), MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE), MIDR_ALL_VERSIONS(MIDR_CORTEX_A725), MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C), diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index edf1783ffc81..f9a32dfde006 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -884,6 +884,7 @@ static u8 spectre_bhb_loop_affected(void) static const struct midr_range spectre_bhb_k38_list[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_A715), MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE), {}, }; static const struct midr_range spectre_bhb_k32_list[] = { From dd68f51febbd6eb8a40872724f4c9bcc84442114 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 20 Aug 2025 19:29:03 +0100 Subject: [PATCH 41/93] kselftest/arm64: Verify that we reject out of bounds VLs in sve-ptrace We do not currently have a test that asserts that we reject attempts to set a vector length smaller than SVE_VL_MIN or larger than SVE_VL_MAX, add one since that is our current behaviour. 
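When the selftest is run on SVE-capable hardware, the new checks should appear in the TAP output along the lines of the following (test numbers are illustrative; the second value is SVE_VL_MAX + SVE_VQ_BYTES, i.e. 8208 with the current UAPI headers):

  ok 5 SVE Set invalid VL 0
  ok 6 SVE Set invalid VL 8208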
Signed-off-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/sve-ptrace.c | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index 79bcc2369cdb..0ce841a7bb6e 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -66,7 +66,7 @@ static const struct vec_type vec_types[] = { }; #define VL_TESTS (((TEST_VQ_MAX - SVE_VQ_MIN) + 1) * 4) -#define FLAG_TESTS 2 +#define FLAG_TESTS 4 #define FPSIMD_TESTS 2 #define EXPECTED_TESTS ((VL_TESTS + FLAG_TESTS + FPSIMD_TESTS) * ARRAY_SIZE(vec_types)) @@ -286,6 +286,25 @@ static void check_u32(unsigned int vl, const char *reg, } } +/* Set out of range VLs */ +static void ptrace_set_vl_ranges(pid_t child, const struct vec_type *type) +{ + struct user_sve_header sve; + int ret; + + memset(&sve, 0, sizeof(sve)); + sve.flags = SVE_PT_REGS_SVE; + sve.size = sizeof(sve); + + ret = set_sve(child, type, &sve); + ksft_test_result(ret != 0, "%s Set invalid VL 0\n", type->name); + + sve.vl = SVE_VL_MAX + SVE_VQ_BYTES; + ret = set_sve(child, type, &sve); + ksft_test_result(ret != 0, "%s Set invalid VL %d\n", type->name, + SVE_VL_MAX + SVE_VQ_BYTES); +} + /* Access the FPSIMD registers via the SVE regset */ static void ptrace_sve_fpsimd(pid_t child, const struct vec_type *type) { @@ -719,6 +738,17 @@ static int do_parent(pid_t child) vec_types[i].name); } + /* Setting out of bounds VLs should fail */ + if (getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap) { + ptrace_set_vl_ranges(child, &vec_types[i]); + } else { + ksft_test_result_skip("%s Set invalid VL 0\n", + vec_types[i].name); + ksft_test_result_skip("%s Set invalid VL %d\n", + vec_types[i].name, + SVE_VL_MAX + SVE_VQ_BYTES); + } + /* Step through every possible VQ */ for (vq = SVE_VQ_MIN; vq <= TEST_VQ_MAX; vq++) { vl = sve_vl_from_vq(vq); From 09b5febf84262a303ecedf0821e03b8d8492a38b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 20 Aug 2025 19:29:04 +0100 Subject: [PATCH 42/93] kselftest/arm64: Check that unsupported regsets fail in sve-ptrace Add a test which verifies that NT_ARM_SVE and NT_ARM_SSVE reads and writes are rejected as expected when the relevant architecture feature is not supported. 
Signed-off-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/sve-ptrace.c | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index 0ce841a7bb6e..e0fc3a001e28 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -174,6 +174,38 @@ static int set_sve(pid_t pid, const struct vec_type *type, return ret; } +/* A read operation fails */ +static void read_fails(pid_t child, const struct vec_type *type) +{ + struct user_sve_header *new_sve = NULL; + size_t new_sve_size = 0; + void *ret; + + ret = get_sve(child, type, (void **)&new_sve, &new_sve_size); + + ksft_test_result(ret == NULL, "%s unsupported read fails\n", + type->name); + + free(new_sve); +} + +/* A write operation fails */ +static void write_fails(pid_t child, const struct vec_type *type) +{ + struct user_sve_header sve; + int ret; + + /* Just the header, no data */ + memset(&sve, 0, sizeof(sve)); + sve.size = sizeof(sve); + sve.flags = SVE_PT_REGS_SVE; + sve.vl = SVE_VL_MIN; + ret = set_sve(child, type, &sve); + + ksft_test_result(ret != 0, "%s unsupported write fails\n", + type->name); +} + /* Validate setting and getting the inherit flag */ static void ptrace_set_get_inherit(pid_t child, const struct vec_type *type) { @@ -718,6 +750,20 @@ static int do_parent(pid_t child) } for (i = 0; i < ARRAY_SIZE(vec_types); i++) { + /* + * If the vector type isn't supported reads and writes + * should fail. + */ + if (!(getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap)) { + read_fails(child, &vec_types[i]); + write_fails(child, &vec_types[i]); + } else { + ksft_test_result_skip("%s unsupported read fails\n", + vec_types[i].name); + ksft_test_result_skip("%s unsupported write fails\n", + vec_types[i].name); + } + /* FPSIMD via SVE regset */ if (getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap) { ptrace_sve_fpsimd(child, &vec_types[i]); From f8cc02321bfca71967db1620d684aa3abab59612 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 21 Aug 2025 19:01:49 +0800 Subject: [PATCH 43/93] dt-bindings: perf: fsl-imx-ddr: Add a compatible string fsl,imx94-ddr-pmu for i.MX94 i.MX94 has a DDR Performance Monitor Unit which is compatible with i.MX93. This will add a compatible for i.MX94. Reviewed-by: Peng Fan Reviewed-by: Frank Li Acked-by: Conor Dooley Signed-off-by: Xu Yang Signed-off-by: Will Deacon --- Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml index 8597ea625edb..d2e578d6b83b 100644 --- a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml +++ b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml @@ -33,6 +33,7 @@ properties: - items: - enum: - fsl,imx91-ddr-pmu + - fsl,imx94-ddr-pmu - fsl,imx95-ddr-pmu - const: fsl,imx93-ddr-pmu From e4d9e8fb406bef3936aceac85d3f400b1acdbe73 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 21 Aug 2025 19:01:50 +0800 Subject: [PATCH 44/93] perf: imx_perf: add support for i.MX94 platform Add compatible string and related devtype for i.MX94 platform. 
Reviewed-by: Peng Fan Reviewed-by: Frank Li Signed-off-by: Xu Yang Signed-off-by: Will Deacon --- drivers/perf/fsl_imx9_ddr_perf.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/perf/fsl_imx9_ddr_perf.c b/drivers/perf/fsl_imx9_ddr_perf.c index 267754fdf581..7050b48c0467 100644 --- a/drivers/perf/fsl_imx9_ddr_perf.c +++ b/drivers/perf/fsl_imx9_ddr_perf.c @@ -104,6 +104,11 @@ static const struct imx_ddr_devtype_data imx93_devtype_data = { .filter_ver = DDR_PERF_AXI_FILTER_V1 }; +static const struct imx_ddr_devtype_data imx94_devtype_data = { + .identifier = "imx94", + .filter_ver = DDR_PERF_AXI_FILTER_V2 +}; + static const struct imx_ddr_devtype_data imx95_devtype_data = { .identifier = "imx95", .filter_ver = DDR_PERF_AXI_FILTER_V2 @@ -122,6 +127,7 @@ static inline bool axi_filter_v2(struct ddr_pmu *pmu) static const struct of_device_id imx_ddr_pmu_dt_ids[] = { { .compatible = "fsl,imx91-ddr-pmu", .data = &imx91_devtype_data }, { .compatible = "fsl,imx93-ddr-pmu", .data = &imx93_devtype_data }, + { .compatible = "fsl,imx94-ddr-pmu", .data = &imx94_devtype_data }, { .compatible = "fsl,imx95-ddr-pmu", .data = &imx95_devtype_data }, { /* sentinel */ } }; From 2c599c68c43e65fa333b222effbeab61cbd35df5 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Thu, 21 Aug 2025 19:01:52 +0800 Subject: [PATCH 45/93] MAINTAINERS: include fsl_imx9_ddr_perf.c and some perf metric files The fsl_imx9_ddr_perf.c and some perf metric files under tools/perf/pmu-events/arch/arm64/freescale/ is missing in MAINTAINERS. Add them and add me as another maintainer. Reviewed-by: Frank Li Signed-off-by: Xu Yang Signed-off-by: Will Deacon --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa4..3815a2c4b3a8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9744,11 +9744,14 @@ F: drivers/video/fbdev/imxfb.c FREESCALE IMX DDR PMU DRIVER M: Frank Li +M: Xu Yang L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: Documentation/admin-guide/perf/imx-ddr.rst F: Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml F: drivers/perf/fsl_imx8_ddr_perf.c +F: drivers/perf/fsl_imx9_ddr_perf.c +F: tools/perf/pmu-events/arch/arm64/freescale/ FREESCALE IMX I2C DRIVER M: Oleksij Rempel From 1e558fb31bec3076500219cc417f477fe10a8463 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Wed, 13 Aug 2025 16:32:57 +0800 Subject: [PATCH 46/93] drivers: perf: use us_to_ktime() where appropriate The arm_ccn_pmu_poll_period_us are more suitable for using the us_to_ktime(). This can make the code more concise and enhance readability. Signed-off-by: Xichao Zhao Acked-by: Mark Rutland Signed-off-by: Will Deacon --- drivers/perf/arm-ccn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c index 1a0d0e1a2263..8af3563fdf60 100644 --- a/drivers/perf/arm-ccn.c +++ b/drivers/perf/arm-ccn.c @@ -565,7 +565,7 @@ module_param_named(pmu_poll_period_us, arm_ccn_pmu_poll_period_us, uint, static ktime_t arm_ccn_pmu_timer_period(void) { - return ns_to_ktime((u64)arm_ccn_pmu_poll_period_us * 1000); + return us_to_ktime((u64)arm_ccn_pmu_poll_period_us); } From 71396cfac97d0249fa7d8dcc8e649b6ba4c090e4 Mon Sep 17 00:00:00 2001 From: Ilkka Koskinen Date: Thu, 28 Aug 2025 15:35:19 -0700 Subject: [PATCH 47/93] perf/dwc_pcie: Support counting multiple lane events in parallel While Designware PCIe PMU allows to count only one time based event at a time, it allows to count all the lane events simultaneously. 
After the patch one is able to count a group of lane events: $ perf stat -e '{dwc_rootport/tx_memory_write,lane=1/,dwc_rootport/rx_memory_read,lane=0/}' dd if=/dev/nvme0n1 of=/dev/null bs=1M count=1 Earlier the events wouldn't have been counted successfully. Signed-off-by: Ilkka Koskinen Signed-off-by: Will Deacon --- .../admin-guide/perf/dwc_pcie_pmu.rst | 4 +- drivers/perf/dwc_pcie_pmu.c | 161 ++++++++++++++---- 2 files changed, 132 insertions(+), 33 deletions(-) diff --git a/Documentation/admin-guide/perf/dwc_pcie_pmu.rst b/Documentation/admin-guide/perf/dwc_pcie_pmu.rst index cb376f335f40..167f9281fbf5 100644 --- a/Documentation/admin-guide/perf/dwc_pcie_pmu.rst +++ b/Documentation/admin-guide/perf/dwc_pcie_pmu.rst @@ -16,8 +16,8 @@ provides the following two features: - one 64-bit counter for Time Based Analysis (RX/TX data throughput and time spent in each low-power LTSSM state) and -- one 32-bit counter for Event Counting (error and non-error events for - a specified lane) +- one 32-bit counter per event for Event Counting (error and non-error + events for a specified lane) Note: There is no interrupt for counter overflow. diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index 146ff57813fb..d77f767cde89 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -39,6 +39,10 @@ #define DWC_PCIE_EVENT_CLEAR GENMASK(1, 0) #define DWC_PCIE_EVENT_PER_CLEAR 0x1 +/* Event Selection Field has two subfields */ +#define DWC_PCIE_CNT_EVENT_SEL_GROUP GENMASK(11, 8) +#define DWC_PCIE_CNT_EVENT_SEL_EVID GENMASK(7, 0) + #define DWC_PCIE_EVENT_CNT_DATA 0xC #define DWC_PCIE_TIME_BASED_ANAL_CTL 0x10 @@ -73,6 +77,10 @@ enum dwc_pcie_event_type { DWC_PCIE_EVENT_TYPE_MAX, }; +#define DWC_PCIE_LANE_GROUP_6 6 +#define DWC_PCIE_LANE_GROUP_7 7 +#define DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP 256 + #define DWC_PCIE_LANE_EVENT_MAX_PERIOD GENMASK_ULL(31, 0) #define DWC_PCIE_MAX_PERIOD GENMASK_ULL(63, 0) @@ -82,8 +90,11 @@ struct dwc_pcie_pmu { u16 ras_des_offset; u32 nr_lanes; + /* Groups #6 and #7 */ + DECLARE_BITMAP(lane_events, 2 * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP); + struct perf_event *time_based_event; + struct hlist_node cpuhp_node; - struct perf_event *event[DWC_PCIE_EVENT_TYPE_MAX]; int on_cpu; }; @@ -246,19 +257,26 @@ static const struct attribute_group *dwc_pcie_attr_groups[] = { }; static void dwc_pcie_pmu_lane_event_enable(struct dwc_pcie_pmu *pcie_pmu, + struct perf_event *event, bool enable) { struct pci_dev *pdev = pcie_pmu->pdev; u16 ras_des_offset = pcie_pmu->ras_des_offset; + int event_id = DWC_PCIE_EVENT_ID(event); + int lane = DWC_PCIE_EVENT_LANE(event); + u32 ctrl; + + ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) | + FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) | + FIELD_PREP(DWC_PCIE_EVENT_CLEAR, DWC_PCIE_EVENT_PER_CLEAR); if (enable) - pci_clear_and_set_config_dword(pdev, - ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, - DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON); + ctrl |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON); else - pci_clear_and_set_config_dword(pdev, - ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, - DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF); + ctrl |= FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF); + + pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, + ctrl); } static void dwc_pcie_pmu_time_based_event_enable(struct dwc_pcie_pmu *pcie_pmu, @@ -276,11 +294,22 @@ static u64 dwc_pcie_pmu_read_lane_event_counter(struct perf_event *event) { struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu); struct 
pci_dev *pdev = pcie_pmu->pdev; + int event_id = DWC_PCIE_EVENT_ID(event); + int lane = DWC_PCIE_EVENT_LANE(event); u16 ras_des_offset = pcie_pmu->ras_des_offset; - u32 val; + u32 val, ctrl; + ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) | + FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) | + FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON); + pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, + ctrl); pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_DATA, &val); + ctrl |= FIELD_PREP(DWC_PCIE_EVENT_CLEAR, DWC_PCIE_EVENT_PER_CLEAR); + pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, + ctrl); + return val; } @@ -329,26 +358,77 @@ static void dwc_pcie_pmu_event_update(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event); - u64 delta, prev, now = 0; + u64 delta, prev, now; + + if (type == DWC_PCIE_LANE_EVENT) { + now = dwc_pcie_pmu_read_lane_event_counter(event) & + DWC_PCIE_LANE_EVENT_MAX_PERIOD; + local64_add(now, &event->count); + return; + } do { prev = local64_read(&hwc->prev_count); - - if (type == DWC_PCIE_LANE_EVENT) - now = dwc_pcie_pmu_read_lane_event_counter(event); - else if (type == DWC_PCIE_TIME_BASE_EVENT) - now = dwc_pcie_pmu_read_time_based_counter(event); + now = dwc_pcie_pmu_read_time_based_counter(event); } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev); delta = (now - prev) & DWC_PCIE_MAX_PERIOD; - /* 32-bit counter for Lane Event Counting */ - if (type == DWC_PCIE_LANE_EVENT) - delta &= DWC_PCIE_LANE_EVENT_MAX_PERIOD; - local64_add(delta, &event->count); } +static int dwc_pcie_pmu_validate_add_lane_event(struct perf_event *event, + unsigned long val_lane_events[]) +{ + int event_id, event_nr, group; + + event_id = DWC_PCIE_EVENT_ID(event); + event_nr = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_EVID, event_id); + group = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_GROUP, event_id); + + if (group != DWC_PCIE_LANE_GROUP_6 && group != DWC_PCIE_LANE_GROUP_7) + return -EINVAL; + + group -= DWC_PCIE_LANE_GROUP_6; + + if (test_and_set_bit(group * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP + event_nr, + val_lane_events)) + return -EINVAL; + + return 0; +} + +static int dwc_pcie_pmu_validate_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + DECLARE_BITMAP(val_lane_events, 2 * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP); + bool time_event; + int type; + + type = DWC_PCIE_EVENT_TYPE(leader); + if (type == DWC_PCIE_TIME_BASE_EVENT) + time_event = true; + else + if (dwc_pcie_pmu_validate_add_lane_event(leader, val_lane_events)) + return -ENOSPC; + + for_each_sibling_event(sibling, leader) { + type = DWC_PCIE_EVENT_TYPE(sibling); + if (type == DWC_PCIE_TIME_BASE_EVENT) { + if (time_event) + return -ENOSPC; + + time_event = true; + continue; + } + + if (dwc_pcie_pmu_validate_add_lane_event(sibling, val_lane_events)) + return -ENOSPC; + } + + return 0; +} + static int dwc_pcie_pmu_event_init(struct perf_event *event) { struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu); @@ -367,10 +447,6 @@ static int dwc_pcie_pmu_event_init(struct perf_event *event) if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK) return -EINVAL; - if (event->group_leader != event && - !is_software_event(event->group_leader)) - return -EINVAL; - for_each_sibling_event(sibling, event->group_leader) { if (sibling->pmu != event->pmu && !is_software_event(sibling)) return -EINVAL; @@ -385,6 +461,9 @@ static int dwc_pcie_pmu_event_init(struct 
perf_event *event) return -EINVAL; } + if (dwc_pcie_pmu_validate_group(event)) + return -ENOSPC; + event->cpu = pcie_pmu->on_cpu; return 0; @@ -400,7 +479,7 @@ static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags) local64_set(&hwc->prev_count, 0); if (type == DWC_PCIE_LANE_EVENT) - dwc_pcie_pmu_lane_event_enable(pcie_pmu, true); + dwc_pcie_pmu_lane_event_enable(pcie_pmu, event, true); else if (type == DWC_PCIE_TIME_BASE_EVENT) dwc_pcie_pmu_time_based_event_enable(pcie_pmu, true); } @@ -414,12 +493,13 @@ static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags) if (event->hw.state & PERF_HES_STOPPED) return; + dwc_pcie_pmu_event_update(event); + if (type == DWC_PCIE_LANE_EVENT) - dwc_pcie_pmu_lane_event_enable(pcie_pmu, false); + dwc_pcie_pmu_lane_event_enable(pcie_pmu, event, false); else if (type == DWC_PCIE_TIME_BASE_EVENT) dwc_pcie_pmu_time_based_event_enable(pcie_pmu, false); - dwc_pcie_pmu_event_update(event); hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; } @@ -434,14 +514,17 @@ static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags) u16 ras_des_offset = pcie_pmu->ras_des_offset; u32 ctrl; - /* one counter for each type and it is in use */ - if (pcie_pmu->event[type]) - return -ENOSPC; - - pcie_pmu->event[type] = event; hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; if (type == DWC_PCIE_LANE_EVENT) { + int event_nr = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_EVID, event_id); + int group = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_GROUP, event_id) - + DWC_PCIE_LANE_GROUP_6; + + if (test_and_set_bit(group * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP + event_nr, + pcie_pmu->lane_events)) + return -ENOSPC; + /* EVENT_COUNTER_DATA_REG needs clear manually */ ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) | FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) | @@ -450,6 +533,11 @@ static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags) pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, ctrl); } else if (type == DWC_PCIE_TIME_BASE_EVENT) { + if (pcie_pmu->time_based_event) + return -ENOSPC; + + pcie_pmu->time_based_event = event; + /* * TIME_BASED_ANAL_DATA_REG is a 64 bit register, we can safely * use it with any manually controlled duration. And it is @@ -478,7 +566,18 @@ static void dwc_pcie_pmu_event_del(struct perf_event *event, int flags) dwc_pcie_pmu_event_stop(event, flags | PERF_EF_UPDATE); perf_event_update_userpage(event); - pcie_pmu->event[type] = NULL; + + if (type == DWC_PCIE_TIME_BASE_EVENT) { + pcie_pmu->time_based_event = NULL; + } else { + int event_id = DWC_PCIE_EVENT_ID(event); + int event_nr = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_EVID, event_id); + int group = FIELD_GET(DWC_PCIE_CNT_EVENT_SEL_GROUP, event_id) - + DWC_PCIE_LANE_GROUP_6; + + clear_bit(group * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP + event_nr, + pcie_pmu->lane_events); + } } static void dwc_pcie_pmu_remove_cpuhp_instance(void *hotplug_node) From a7005ff2d0a5dfb15ba7152f4fb325ad9e00a472 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 1 Sep 2025 13:40:30 +0100 Subject: [PATCH 48/93] arm64: sysreg: Add new PMSFCR_EL1 fields and PMSDSFR_EL1 register Add new fields and register that are introduced for the features FEAT_SPE_EFT (extended filtering) and FEAT_SPE_FDS (data source filtering). 
Tested-by: Leo Yan Reviewed-by: Anshuman Khandual Acked-by: Will Deacon Signed-off-by: James Clark Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 696ab1f32a67..b743fc8ffe5d 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2994,11 +2994,20 @@ Field 0 RND EndSysreg Sysreg PMSFCR_EL1 3 0 9 9 4 -Res0 63:19 +Res0 63:53 +Field 52 SIMDm +Field 51 FPm +Field 50 STm +Field 49 LDm +Field 48 Bm +Res0 47:21 +Field 20 SIMD +Field 19 FP Field 18 ST Field 17 LD Field 16 B -Res0 15:4 +Res0 15:5 +Field 4 FDS Field 3 FnE Field 2 FL Field 1 FT From b4401403afb992844fa47513ac9c94520722c43d Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 1 Sep 2025 13:40:31 +0100 Subject: [PATCH 49/93] perf: arm_spe: Support FEAT_SPEv1p4 filters FEAT_SPEv1p4 (optional from Armv8.8) adds some new filter bits and also makes some previously available bits unavailable again e.g: E[30], bit [30] When FEAT_SPEv1p4 is _not_ implemented ... Continuing to hard code the valid filter bits for each version isn't scalable, and it also doesn't work for filter bits that aren't related to SPE version. For example most bits have a further condition: E[15], bit [15] When ... and filtering on event 15 is supported: Whether "filtering on event 15" is implemented or not is only discoverable from the TRM of that specific CPU or by probing PMSEVFR_EL1. Instead of hard coding them, write all 1s to the PMSEVFR_EL1 register and read it back to discover the RES0 bits. Unsupported bits are RAZ/WI so should read as 0s. For any hardware that doesn't strictly follow RAZ/WI for unsupported filters: Any bits that should have been supported in a specific SPE version but now incorrectly appear to be RES0 wouldn't have worked anyway, so it's better to fail to open events that request them rather than behaving unexpectedly. Bits that aren't implemented but also aren't RAZ/WI will be incorrectly reported as supported, but allowing them to be used is harmless. Testing on N1SDP shows the probed RES0 bits to be the same as the hard coded ones. The FVP with SPEv1p4 shows only additional new RES0 bits, i.e. no previously hard coded RES0 bits are missing. 
Tested-by: Leo Yan Signed-off-by: James Clark Signed-off-by: Will Deacon --- arch/arm64/include/asm/sysreg.h | 9 --------- drivers/perf/arm_spe_pmu.c | 23 +++++++---------------- 2 files changed, 7 insertions(+), 25 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index d5b5f2ae1afa..20cbd9860c8f 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -344,15 +344,6 @@ #define SYS_PAR_EL1_ATTR GENMASK_ULL(63, 56) #define SYS_PAR_EL1_F0_RES0 (GENMASK_ULL(6, 1) | GENMASK_ULL(55, 52)) -/*** Statistical Profiling Extension ***/ -#define PMSEVFR_EL1_RES0_IMP \ - (GENMASK_ULL(47, 32) | GENMASK_ULL(23, 16) | GENMASK_ULL(11, 8) |\ - BIT_ULL(6) | BIT_ULL(4) | BIT_ULL(2) | BIT_ULL(0)) -#define PMSEVFR_EL1_RES0_V1P1 \ - (PMSEVFR_EL1_RES0_IMP & ~(BIT_ULL(18) | BIT_ULL(17) | BIT_ULL(11))) -#define PMSEVFR_EL1_RES0_V1P2 \ - (PMSEVFR_EL1_RES0_V1P1 & ~BIT_ULL(6)) - /* Buffer error reporting */ #define PMBSR_EL1_FAULT_FSC_SHIFT PMBSR_EL1_MSS_SHIFT #define PMBSR_EL1_FAULT_FSC_MASK PMBSR_EL1_MSS_MASK diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 369e77ad5f13..86c9948ab5a0 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -89,6 +89,7 @@ struct arm_spe_pmu { #define SPE_PMU_FEAT_DEV_PROBED (1UL << 63) u64 features; + u64 pmsevfr_res0; u16 max_record_sz; u16 align; struct perf_output_handle __percpu *handle; @@ -697,20 +698,6 @@ static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev) return IRQ_HANDLED; } -static u64 arm_spe_pmsevfr_res0(u16 pmsver) -{ - switch (pmsver) { - case ID_AA64DFR0_EL1_PMSVer_IMP: - return PMSEVFR_EL1_RES0_IMP; - case ID_AA64DFR0_EL1_PMSVer_V1P1: - return PMSEVFR_EL1_RES0_V1P1; - case ID_AA64DFR0_EL1_PMSVer_V1P2: - /* Return the highest version we support in default */ - default: - return PMSEVFR_EL1_RES0_V1P2; - } -} - /* Perf callbacks */ static int arm_spe_pmu_event_init(struct perf_event *event) { @@ -726,10 +713,10 @@ static int arm_spe_pmu_event_init(struct perf_event *event) !cpumask_test_cpu(event->cpu, &spe_pmu->supported_cpus)) return -ENOENT; - if (arm_spe_event_to_pmsevfr(event) & arm_spe_pmsevfr_res0(spe_pmu->pmsver)) + if (arm_spe_event_to_pmsevfr(event) & spe_pmu->pmsevfr_res0) return -EOPNOTSUPP; - if (arm_spe_event_to_pmsnevfr(event) & arm_spe_pmsevfr_res0(spe_pmu->pmsver)) + if (arm_spe_event_to_pmsnevfr(event) & spe_pmu->pmsevfr_res0) return -EOPNOTSUPP; if (attr->exclude_idle) @@ -1107,6 +1094,10 @@ static void __arm_spe_pmu_dev_probe(void *info) spe_pmu->counter_sz = 16; } + /* Write all 1s and then read back. Unsupported filter bits are RAZ/WI. */ + write_sysreg_s(U64_MAX, SYS_PMSEVFR_EL1); + spe_pmu->pmsevfr_res0 = ~read_sysreg_s(SYS_PMSEVFR_EL1); + dev_info(dev, "probed SPEv1.%d for CPUs %*pbl [max_record_sz %u, align %u, features 0x%llx]\n", spe_pmu->pmsver - 1, cpumask_pr_args(&spe_pmu->supported_cpus), From 51b9f16697cda2229aacc10bbb0216b6900cbca1 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Mon, 1 Sep 2025 13:40:32 +0100 Subject: [PATCH 50/93] perf: arm_spe: Expose event filter Expose an "event_filter" entry in the caps folder to inform user space about which events can be filtered. Change the return type of arm_spe_pmu_cap_get() from u32 to u64 to accommodate the added event filter entry. 
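The new attribute can be read back like the existing caps entries; the instance name below follows the usual arm_spe_N naming and the value is illustrative only (it is simply the complement of the probed PMSEVFR_EL1 RES0 bits, e.g. 0xffff0000ff00f0aa for a baseline FEAT_SPE implementation):

  $ cat /sys/bus/event_source/devices/arm_spe_0/caps/event_filter
  0xffff0000ff00f0aa

This lets userspace discover which event filter bits may be set in the event_filter/inv_event_filter config fields without probing by trial and error.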
Signed-off-by: Leo Yan Tested-by: Leo Yan Signed-off-by: James Clark Signed-off-by: Will Deacon --- drivers/perf/arm_spe_pmu.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 86c9948ab5a0..ba55bc3db708 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -116,6 +116,7 @@ enum arm_spe_pmu_capabilities { SPE_PMU_CAP_FEAT_MAX, SPE_PMU_CAP_CNT_SZ = SPE_PMU_CAP_FEAT_MAX, SPE_PMU_CAP_MIN_IVAL, + SPE_PMU_CAP_EVENT_FILTER, }; static int arm_spe_pmu_feat_caps[SPE_PMU_CAP_FEAT_MAX] = { @@ -123,7 +124,7 @@ static int arm_spe_pmu_feat_caps[SPE_PMU_CAP_FEAT_MAX] = { [SPE_PMU_CAP_ERND] = SPE_PMU_FEAT_ERND, }; -static u32 arm_spe_pmu_cap_get(struct arm_spe_pmu *spe_pmu, int cap) +static u64 arm_spe_pmu_cap_get(struct arm_spe_pmu *spe_pmu, int cap) { if (cap < SPE_PMU_CAP_FEAT_MAX) return !!(spe_pmu->features & arm_spe_pmu_feat_caps[cap]); @@ -133,6 +134,8 @@ static u32 arm_spe_pmu_cap_get(struct arm_spe_pmu *spe_pmu, int cap) return spe_pmu->counter_sz; case SPE_PMU_CAP_MIN_IVAL: return spe_pmu->min_period; + case SPE_PMU_CAP_EVENT_FILTER: + return ~spe_pmu->pmsevfr_res0; default: WARN(1, "unknown cap %d\n", cap); } @@ -149,7 +152,19 @@ static ssize_t arm_spe_pmu_cap_show(struct device *dev, container_of(attr, struct dev_ext_attribute, attr); int cap = (long)ea->var; - return sysfs_emit(buf, "%u\n", arm_spe_pmu_cap_get(spe_pmu, cap)); + return sysfs_emit(buf, "%llu\n", arm_spe_pmu_cap_get(spe_pmu, cap)); +} + +static ssize_t arm_spe_pmu_cap_show_hex(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct arm_spe_pmu *spe_pmu = dev_get_drvdata(dev); + struct dev_ext_attribute *ea = + container_of(attr, struct dev_ext_attribute, attr); + int cap = (long)ea->var; + + return sysfs_emit(buf, "0x%llx\n", arm_spe_pmu_cap_get(spe_pmu, cap)); } #define SPE_EXT_ATTR_ENTRY(_name, _func, _var) \ @@ -159,12 +174,15 @@ static ssize_t arm_spe_pmu_cap_show(struct device *dev, #define SPE_CAP_EXT_ATTR_ENTRY(_name, _var) \ SPE_EXT_ATTR_ENTRY(_name, arm_spe_pmu_cap_show, _var) +#define SPE_CAP_EXT_ATTR_ENTRY_HEX(_name, _var) \ + SPE_EXT_ATTR_ENTRY(_name, arm_spe_pmu_cap_show_hex, _var) static struct attribute *arm_spe_pmu_cap_attr[] = { SPE_CAP_EXT_ATTR_ENTRY(arch_inst, SPE_PMU_CAP_ARCH_INST), SPE_CAP_EXT_ATTR_ENTRY(ernd, SPE_PMU_CAP_ERND), SPE_CAP_EXT_ATTR_ENTRY(count_size, SPE_PMU_CAP_CNT_SZ), SPE_CAP_EXT_ATTR_ENTRY(min_interval, SPE_PMU_CAP_MIN_IVAL), + SPE_CAP_EXT_ATTR_ENTRY_HEX(event_filter, SPE_PMU_CAP_EVENT_FILTER), NULL, }; From dad9603c5ea308a7b26af66ff4824dccd438af5d Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 1 Sep 2025 13:40:33 +0100 Subject: [PATCH 51/93] perf: arm_spe: Add support for FEAT_SPE_EFT extended filtering FEAT_SPE_EFT (optional from Armv9.4) adds mask bits for the existing load, store and branch filters. It also adds two new filter bits for SIMD and floating point with their own associated mask bits. The current filters only allow OR filtering on samples that are load OR store etc, and the new mask bits allow setting part of the filter to an AND, for example filtering samples that are store AND SIMD. With mask bits set to 0, the OR behavior is preserved, so the unless any masks are explicitly set old filters will behave the same. Add them all and make them behave the same way as existing format bits, hidden and return EOPNOTSUPP if set when the feature doesn't exist. 
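As a usage sketch (assuming an SPE instance named arm_spe_0 on a CPU with FEAT_SPE_EFT), the "store AND SIMD" example above would be requested with something like:

  $ perf record -e arm_spe_0/store_filter=1,store_filter_mask=1,simd_filter=1,simd_filter_mask=1/ -- ./workload

Leaving the *_mask fields at 0 keeps the existing OR behaviour, and on hardware without FEAT_SPE_EFT the new fields are hidden and setting them is rejected with EOPNOTSUPP, as described above.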
Reviewed-by: Leo Yan Tested-by: Leo Yan Signed-off-by: James Clark Signed-off-by: Will Deacon --- drivers/perf/arm_spe_pmu.c | 66 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index ba55bc3db708..591f72fa0327 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -86,6 +86,7 @@ struct arm_spe_pmu { #define SPE_PMU_FEAT_ERND (1UL << 5) #define SPE_PMU_FEAT_INV_FILT_EVT (1UL << 6) #define SPE_PMU_FEAT_DISCARD (1UL << 7) +#define SPE_PMU_FEAT_EFT (1UL << 8) #define SPE_PMU_FEAT_DEV_PROBED (1UL << 63) u64 features; @@ -216,6 +217,27 @@ static const struct attribute_group arm_spe_pmu_cap_group = { #define ATTR_CFG_FLD_discard_CFG config /* PMBLIMITR_EL1.FM = DISCARD */ #define ATTR_CFG_FLD_discard_LO 35 #define ATTR_CFG_FLD_discard_HI 35 +#define ATTR_CFG_FLD_branch_filter_mask_CFG config /* PMSFCR_EL1.Bm */ +#define ATTR_CFG_FLD_branch_filter_mask_LO 36 +#define ATTR_CFG_FLD_branch_filter_mask_HI 36 +#define ATTR_CFG_FLD_load_filter_mask_CFG config /* PMSFCR_EL1.LDm */ +#define ATTR_CFG_FLD_load_filter_mask_LO 37 +#define ATTR_CFG_FLD_load_filter_mask_HI 37 +#define ATTR_CFG_FLD_store_filter_mask_CFG config /* PMSFCR_EL1.STm */ +#define ATTR_CFG_FLD_store_filter_mask_LO 38 +#define ATTR_CFG_FLD_store_filter_mask_HI 38 +#define ATTR_CFG_FLD_simd_filter_CFG config /* PMSFCR_EL1.SIMD */ +#define ATTR_CFG_FLD_simd_filter_LO 39 +#define ATTR_CFG_FLD_simd_filter_HI 39 +#define ATTR_CFG_FLD_simd_filter_mask_CFG config /* PMSFCR_EL1.SIMDm */ +#define ATTR_CFG_FLD_simd_filter_mask_LO 40 +#define ATTR_CFG_FLD_simd_filter_mask_HI 40 +#define ATTR_CFG_FLD_float_filter_CFG config /* PMSFCR_EL1.FP */ +#define ATTR_CFG_FLD_float_filter_LO 41 +#define ATTR_CFG_FLD_float_filter_HI 41 +#define ATTR_CFG_FLD_float_filter_mask_CFG config /* PMSFCR_EL1.FPm */ +#define ATTR_CFG_FLD_float_filter_mask_LO 42 +#define ATTR_CFG_FLD_float_filter_mask_HI 42 #define ATTR_CFG_FLD_event_filter_CFG config1 /* PMSEVFR_EL1 */ #define ATTR_CFG_FLD_event_filter_LO 0 @@ -234,8 +256,15 @@ GEN_PMU_FORMAT_ATTR(pa_enable); GEN_PMU_FORMAT_ATTR(pct_enable); GEN_PMU_FORMAT_ATTR(jitter); GEN_PMU_FORMAT_ATTR(branch_filter); +GEN_PMU_FORMAT_ATTR(branch_filter_mask); GEN_PMU_FORMAT_ATTR(load_filter); +GEN_PMU_FORMAT_ATTR(load_filter_mask); GEN_PMU_FORMAT_ATTR(store_filter); +GEN_PMU_FORMAT_ATTR(store_filter_mask); +GEN_PMU_FORMAT_ATTR(simd_filter); +GEN_PMU_FORMAT_ATTR(simd_filter_mask); +GEN_PMU_FORMAT_ATTR(float_filter); +GEN_PMU_FORMAT_ATTR(float_filter_mask); GEN_PMU_FORMAT_ATTR(event_filter); GEN_PMU_FORMAT_ATTR(inv_event_filter); GEN_PMU_FORMAT_ATTR(min_latency); @@ -247,8 +276,15 @@ static struct attribute *arm_spe_pmu_formats_attr[] = { &format_attr_pct_enable.attr, &format_attr_jitter.attr, &format_attr_branch_filter.attr, + &format_attr_branch_filter_mask.attr, &format_attr_load_filter.attr, + &format_attr_load_filter_mask.attr, &format_attr_store_filter.attr, + &format_attr_store_filter_mask.attr, + &format_attr_simd_filter.attr, + &format_attr_simd_filter_mask.attr, + &format_attr_float_filter.attr, + &format_attr_float_filter_mask.attr, &format_attr_event_filter.attr, &format_attr_inv_event_filter.attr, &format_attr_min_latency.attr, @@ -269,6 +305,16 @@ static umode_t arm_spe_pmu_format_attr_is_visible(struct kobject *kobj, if (attr == &format_attr_inv_event_filter.attr && !(spe_pmu->features & SPE_PMU_FEAT_INV_FILT_EVT)) return 0; + if ((attr == &format_attr_branch_filter_mask.attr || + attr == 
&format_attr_load_filter_mask.attr || + attr == &format_attr_store_filter_mask.attr || + attr == &format_attr_simd_filter.attr || + attr == &format_attr_simd_filter_mask.attr || + attr == &format_attr_float_filter.attr || + attr == &format_attr_float_filter_mask.attr) && + !(spe_pmu->features & SPE_PMU_FEAT_EFT)) + return 0; + return attr->mode; } @@ -364,8 +410,15 @@ static u64 arm_spe_event_to_pmsfcr(struct perf_event *event) u64 reg = 0; reg |= FIELD_PREP(PMSFCR_EL1_LD, ATTR_CFG_GET_FLD(attr, load_filter)); + reg |= FIELD_PREP(PMSFCR_EL1_LDm, ATTR_CFG_GET_FLD(attr, load_filter_mask)); reg |= FIELD_PREP(PMSFCR_EL1_ST, ATTR_CFG_GET_FLD(attr, store_filter)); + reg |= FIELD_PREP(PMSFCR_EL1_STm, ATTR_CFG_GET_FLD(attr, store_filter_mask)); reg |= FIELD_PREP(PMSFCR_EL1_B, ATTR_CFG_GET_FLD(attr, branch_filter)); + reg |= FIELD_PREP(PMSFCR_EL1_Bm, ATTR_CFG_GET_FLD(attr, branch_filter_mask)); + reg |= FIELD_PREP(PMSFCR_EL1_SIMD, ATTR_CFG_GET_FLD(attr, simd_filter)); + reg |= FIELD_PREP(PMSFCR_EL1_SIMDm, ATTR_CFG_GET_FLD(attr, simd_filter_mask)); + reg |= FIELD_PREP(PMSFCR_EL1_FP, ATTR_CFG_GET_FLD(attr, float_filter)); + reg |= FIELD_PREP(PMSFCR_EL1_FPm, ATTR_CFG_GET_FLD(attr, float_filter_mask)); if (reg) reg |= PMSFCR_EL1_FT; @@ -767,6 +820,16 @@ static int arm_spe_pmu_event_init(struct perf_event *event) !(spe_pmu->features & SPE_PMU_FEAT_FILT_LAT)) return -EOPNOTSUPP; + if ((FIELD_GET(PMSFCR_EL1_LDm, reg) || + FIELD_GET(PMSFCR_EL1_STm, reg) || + FIELD_GET(PMSFCR_EL1_Bm, reg) || + FIELD_GET(PMSFCR_EL1_SIMD, reg) || + FIELD_GET(PMSFCR_EL1_SIMDm, reg) || + FIELD_GET(PMSFCR_EL1_FP, reg) || + FIELD_GET(PMSFCR_EL1_FPm, reg)) && + !(spe_pmu->features & SPE_PMU_FEAT_EFT)) + return -EOPNOTSUPP; + if (ATTR_CFG_GET_FLD(&event->attr, discard) && !(spe_pmu->features & SPE_PMU_FEAT_DISCARD)) return -EOPNOTSUPP; @@ -1058,6 +1121,9 @@ static void __arm_spe_pmu_dev_probe(void *info) if (spe_pmu->pmsver >= ID_AA64DFR0_EL1_PMSVer_V1P2) spe_pmu->features |= SPE_PMU_FEAT_DISCARD; + if (FIELD_GET(PMSIDR_EL1_EFT, reg)) + spe_pmu->features |= SPE_PMU_FEAT_EFT; + /* This field has a spaced out encoding, so just use a look-up */ fld = FIELD_GET(PMSIDR_EL1_INTERVAL, reg); switch (fld) { From 510a8fa49dc1d18b120e2d3992fa2aff7fc5c46b Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 1 Sep 2025 13:40:34 +0100 Subject: [PATCH 52/93] arm64/boot: Factor out a macro to check SPE version We check the version of SPE twice, and we'll add one more check in the next commit so factor out a macro to do this. Change the #3 magic number to the actual SPE version define (V1p2) to make it more readable. No functional changes intended. 
Tested-by: Leo Yan Reviewed-by: Leo Yan Signed-off-by: James Clark Signed-off-by: Will Deacon --- arch/arm64/include/asm/el2_setup.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 46033027510c..a305386eb2e3 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -91,6 +91,14 @@ msr cntvoff_el2, xzr // Clear virtual offset .endm +/* Branch to skip_label if SPE version is less than given version */ +.macro __spe_vers_imp skip_label, version, tmp + mrs \tmp, id_aa64dfr0_el1 + ubfx \tmp, \tmp, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4 + cmp \tmp, \version + b.lt \skip_label +.endm + .macro __init_el2_debug mrs x1, id_aa64dfr0_el1 ubfx x0, x1, #ID_AA64DFR0_EL1_PMUVer_SHIFT, #4 @@ -103,8 +111,7 @@ csel x2, xzr, x0, eq // all PMU counters from EL1 /* Statistical profiling */ - ubfx x0, x1, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4 - cbz x0, .Lskip_spe_\@ // Skip if SPE not present + __spe_vers_imp .Lskip_spe_\@, ID_AA64DFR0_EL1_PMSVer_IMP, x0 // Skip if SPE not present mrs_s x0, SYS_PMBIDR_EL1 // If SPE available at EL2, and x0, x0, #(1 << PMBIDR_EL1_P_SHIFT) @@ -263,10 +270,8 @@ mov x0, xzr mov x2, xzr - mrs x1, id_aa64dfr0_el1 - ubfx x1, x1, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4 - cmp x1, #3 - b.lt .Lskip_spe_fgt_\@ + /* If SPEv1p2 is implemented, */ + __spe_vers_imp .Lskip_spe_fgt_\@, #ID_AA64DFR0_EL1_PMSVer_V1P2, x1 /* Disable PMSNEVFR_EL1 read and write traps */ orr x0, x0, #HDFGRTR_EL2_nPMSNEVFR_EL1_MASK orr x2, x2, #HDFGWTR_EL2_nPMSNEVFR_EL1_MASK From 00d7a1af5ab58d89c2f0af27485b2d710c862dfc Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 1 Sep 2025 13:40:35 +0100 Subject: [PATCH 53/93] arm64/boot: Enable EL2 requirements for SPE_FEAT_FDS SPE data source filtering (optional from Armv8.8) requires that traps to the filter register PMSDSFR be disabled. Document the requirements and disable the traps if the feature is present. Tested-by: Leo Yan Reviewed-by: Leo Yan Signed-off-by: James Clark Signed-off-by: Will Deacon --- Documentation/arch/arm64/booting.rst | 11 +++++++++++ arch/arm64/include/asm/el2_setup.h | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst index 2f666a7c303c..e4f953839f71 100644 --- a/Documentation/arch/arm64/booting.rst +++ b/Documentation/arch/arm64/booting.rst @@ -466,6 +466,17 @@ Before jumping into the kernel, the following conditions must be met: - HDFGWTR2_EL2.nPMICFILTR_EL0 (bit 3) must be initialised to 0b1. - HDFGWTR2_EL2.nPMUACR_EL1 (bit 4) must be initialised to 0b1. + For CPUs with SPE data source filtering (FEAT_SPE_FDS): + + - If EL3 is present: + + - MDCR_EL3.EnPMS3 (bit 42) must be initialised to 0b1. + + - If the kernel is entered at EL1 and EL2 is present: + + - HDFGRTR2_EL2.nPMSDSFR_EL1 (bit 19) must be initialised to 0b1. + - HDFGWTR2_EL2.nPMSDSFR_EL1 (bit 19) must be initialised to 0b1. 
+ For CPUs with Memory Copy and Memory Set instructions (FEAT_MOPS): - If the kernel is entered at EL1 and EL2 is present: diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index a305386eb2e3..b37da3ee8529 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -392,6 +392,17 @@ orr x0, x0, #HDFGRTR2_EL2_nPMICFILTR_EL0 orr x0, x0, #HDFGRTR2_EL2_nPMUACR_EL1 .Lskip_pmuv3p9_\@: + /* If SPE is implemented, */ + __spe_vers_imp .Lskip_spefds_\@, ID_AA64DFR0_EL1_PMSVer_IMP, x1 + /* we can read PMSIDR and */ + mrs_s x1, SYS_PMSIDR_EL1 + and x1, x1, #PMSIDR_EL1_FDS + /* if FEAT_SPE_FDS is implemented, */ + cbz x1, .Lskip_spefds_\@ + /* disable traps of PMSDSFR to EL2. */ + orr x0, x0, #HDFGRTR2_EL2_nPMSDSFR_EL1 + +.Lskip_spefds_\@: msr_s SYS_HDFGRTR2_EL2, x0 msr_s SYS_HDFGWTR2_EL2, x0 msr_s SYS_HFGRTR2_EL2, xzr From f8f89e8cf3d668a40106444276d8c448c114e963 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Wed, 20 Aug 2025 16:45:33 +0800 Subject: [PATCH 54/93] perf: arm_pmuv3: Factor out PMCCNTR_EL0 use conditions PMCCNTR_EL0 is preferred for counting CPU_CYCLES under certain conditions. Factor out the condition check to a separate function for further extension. Add documents for better understanding. No functional changes intended. Reviewed-by: James Clark Acked-by: Mark Rutland Signed-off-by: Yicong Yang Signed-off-by: Will Deacon --- drivers/perf/arm_pmuv3.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index f6d7bab5d555..69c5cc8f5606 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -978,6 +978,32 @@ static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc, return -EAGAIN; } +static bool armv8pmu_can_use_pmccntr(struct pmu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT; + + if (evtype != ARMV8_PMUV3_PERFCTR_CPU_CYCLES) + return false; + + /* + * A CPU_CYCLES event with threshold counting cannot use PMCCNTR_EL0 + * since it lacks threshold support. + */ + if (armv8pmu_event_get_threshold(&event->attr)) + return false; + + /* + * PMCCNTR_EL0 is not affected by BRBE controls like BRBCR_ELx.FZP. + * So don't use it for branch events. + */ + if (has_branch_stack(event)) + return false; + + return true; +} + static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, struct perf_event *event) { @@ -986,8 +1012,7 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT; /* Always prefer to place a cycle counter into the cycle counter. */ - if ((evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) && - !armv8pmu_event_get_threshold(&event->attr) && !has_branch_stack(event)) { + if (armv8pmu_can_use_pmccntr(cpuc, event)) { if (!test_and_set_bit(ARMV8_PMU_CYCLE_IDX, cpuc->used_mask)) return ARMV8_PMU_CYCLE_IDX; else if (armv8pmu_event_is_64bit(event) && From e31c0eb10388e2af0502b77e61453efbc8a4f974 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Thu, 14 Aug 2025 17:16:20 +0800 Subject: [PATCH 55/93] drivers/perf: hisi: Add support for HiSilicon NoC PMU Adds the support for HiSilicon NoC (Network on Chip) PMU which will be used to monitor the events on the system bus. The PMU device will be named after the SCL ID (either Super CPU cluster or Super IO cluster) and the index ID, just similar to other HiSilicon Uncore PMUs. 
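As an illustration (the topology IDs here are hypothetical), such a device
appears as hisi_scl3_noc0_0 under /sys/bus/event_source/devices/ and can be
counted with e.g. "perf stat -e hisi_scl3_noc0_0/cycles/ sleep 1".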
Below PMU formats are provided besides the event: - ch: the transaction channel (data, request, response, etc) which can be used to filter the counting. - tt_en: tracetag filtering enable. Just as other HiSilicon Uncore PMUs the NoC PMU supports only counting the transactions with tracetag. The NoC PMU doesn't have an interrupt to indicate the overflow. However we have a 64 bit counter which is large enough and it's nearly impossible to overflow. Reviewed-by: Jonathan Cameron Signed-off-by: Yicong Yang Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/hisi-pmu.rst | 11 + drivers/perf/hisilicon/Makefile | 3 +- drivers/perf/hisilicon/hisi_uncore_noc_pmu.c | 443 +++++++++++++++++++ 3 files changed, 456 insertions(+), 1 deletion(-) create mode 100644 drivers/perf/hisilicon/hisi_uncore_noc_pmu.c diff --git a/Documentation/admin-guide/perf/hisi-pmu.rst b/Documentation/admin-guide/perf/hisi-pmu.rst index 48992a0b8e94..6f0ea4f641cc 100644 --- a/Documentation/admin-guide/perf/hisi-pmu.rst +++ b/Documentation/admin-guide/perf/hisi-pmu.rst @@ -112,6 +112,17 @@ uring channel. It is 2 bits. Some important codes are as follows: - 2'b00: default value, count the events which sent to the both uring and uring_ext channel; +6. ch: NoC PMU supports filtering the event counts of certain transaction +channel with this option. The current supported channels are as follows: + +- 3'b010: Request channel +- 3'b100: Snoop channel +- 3'b110: Response channel +- 3'b111: Data channel + +7. tt_en: NoC PMU supports counting only transactions that have tracetag set +if this option is set. See the 2nd list for more information about tracetag. + Users could configure IDs to count data come from specific CCL/ICL, by setting srcid_cmd & srcid_msk, and data desitined for specific CCL/ICL by setting tgtid_cmd & tgtid_msk. A set bit in srcid_msk/tgtid_msk means the PMU will not diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile index 48dcc8381ea7..dcec8f39719d 100644 --- a/drivers/perf/hisilicon/Makefile +++ b/drivers/perf/hisilicon/Makefile @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o \ hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o hisi_uncore_sllc_pmu.o \ - hisi_uncore_pa_pmu.o hisi_uncore_cpa_pmu.o hisi_uncore_uc_pmu.o + hisi_uncore_pa_pmu.o hisi_uncore_cpa_pmu.o hisi_uncore_uc_pmu.o \ + hisi_uncore_noc_pmu.o obj-$(CONFIG_HISI_PCIE_PMU) += hisi_pcie_pmu.o obj-$(CONFIG_HNS3_PMU) += hns3_pmu.o diff --git a/drivers/perf/hisilicon/hisi_uncore_noc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_noc_pmu.c new file mode 100644 index 000000000000..de3b9cc7aada --- /dev/null +++ b/drivers/perf/hisilicon/hisi_uncore_noc_pmu.c @@ -0,0 +1,443 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Driver for HiSilicon Uncore NoC (Network on Chip) PMU device + * + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. + * Author: Yicong Yang + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hisi_uncore_pmu.h" + +#define NOC_PMU_VERSION 0x1e00 +#define NOC_PMU_GLOBAL_CTRL 0x1e04 +#define NOC_PMU_GLOBAL_CTRL_PMU_EN BIT(0) +#define NOC_PMU_GLOBAL_CTRL_TT_EN BIT(1) +#define NOC_PMU_CNT_INFO 0x1e08 +#define NOC_PMU_CNT_INFO_OVERFLOW(n) BIT(n) +#define NOC_PMU_EVENT_CTRL0 0x1e20 +#define NOC_PMU_EVENT_CTRL_TYPE GENMASK(4, 0) +/* + * Note channel of 0x0 will reset the counter value, so don't do it before + * we read out the counter. 
+ */ +#define NOC_PMU_EVENT_CTRL_CHANNEL GENMASK(10, 8) +#define NOC_PMU_EVENT_CTRL_EN BIT(11) +#define NOC_PMU_EVENT_COUNTER0 0x1e80 + +#define NOC_PMU_NR_COUNTERS 4 +#define NOC_PMU_CH_DEFAULT 0x7 + +#define NOC_PMU_EVENT_CTRLn(ctrl0, n) ((ctrl0) + 4 * (n)) +#define NOC_PMU_EVENT_CNTRn(cntr0, n) ((cntr0) + 8 * (n)) + +HISI_PMU_EVENT_ATTR_EXTRACTOR(ch, config1, 2, 0); +HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_en, config1, 3, 3); + +/* Dynamic CPU hotplug state used by this PMU driver */ +static enum cpuhp_state hisi_noc_pmu_cpuhp_state; + +struct hisi_noc_pmu_regs { + u32 version; + u32 pmu_ctrl; + u32 event_ctrl0; + u32 event_cntr0; + u32 overflow_status; +}; + +/* + * Tracetag filtering is not per event and all the events should keep + * the consistence. Return true if the new comer doesn't match the + * tracetag filtering configuration of the current scheduled events. + */ +static bool hisi_noc_pmu_check_global_filter(struct perf_event *curr, + struct perf_event *new) +{ + return hisi_get_tt_en(curr) == hisi_get_tt_en(new); +} + +static void hisi_noc_pmu_write_evtype(struct hisi_pmu *noc_pmu, int idx, u32 type) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 reg; + + reg = readl(noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, idx)); + reg &= ~NOC_PMU_EVENT_CTRL_TYPE; + reg |= FIELD_PREP(NOC_PMU_EVENT_CTRL_TYPE, type); + writel(reg, noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, idx)); +} + +static int hisi_noc_pmu_get_event_idx(struct perf_event *event) +{ + struct hisi_pmu *noc_pmu = to_hisi_pmu(event->pmu); + struct hisi_pmu_hwevents *pmu_events = &noc_pmu->pmu_events; + int cur_idx; + + cur_idx = find_first_bit(pmu_events->used_mask, noc_pmu->num_counters); + if (cur_idx != noc_pmu->num_counters && + !hisi_noc_pmu_check_global_filter(pmu_events->hw_events[cur_idx], event)) + return -EAGAIN; + + return hisi_uncore_pmu_get_event_idx(event); +} + +static u64 hisi_noc_pmu_read_counter(struct hisi_pmu *noc_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + + return readq(noc_pmu->base + NOC_PMU_EVENT_CNTRn(reg_info->event_cntr0, hwc->idx)); +} + +static void hisi_noc_pmu_write_counter(struct hisi_pmu *noc_pmu, + struct hw_perf_event *hwc, u64 val) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + + writeq(val, noc_pmu->base + NOC_PMU_EVENT_CNTRn(reg_info->event_cntr0, hwc->idx)); +} + +static void hisi_noc_pmu_enable_counter(struct hisi_pmu *noc_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 reg; + + reg = readl(noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx)); + reg |= NOC_PMU_EVENT_CTRL_EN; + writel(reg, noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx)); +} + +static void hisi_noc_pmu_disable_counter(struct hisi_pmu *noc_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 reg; + + reg = readl(noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx)); + reg &= ~NOC_PMU_EVENT_CTRL_EN; + writel(reg, noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx)); +} + +static void hisi_noc_pmu_enable_counter_int(struct hisi_pmu *noc_pmu, + struct hw_perf_event *hwc) +{ + /* We don't support interrupt, so a stub here. 
*/ +} + +static void hisi_noc_pmu_disable_counter_int(struct hisi_pmu *noc_pmu, + struct hw_perf_event *hwc) +{ +} + +static void hisi_noc_pmu_start_counters(struct hisi_pmu *noc_pmu) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 reg; + + reg = readl(noc_pmu->base + reg_info->pmu_ctrl); + reg |= NOC_PMU_GLOBAL_CTRL_PMU_EN; + writel(reg, noc_pmu->base + reg_info->pmu_ctrl); +} + +static void hisi_noc_pmu_stop_counters(struct hisi_pmu *noc_pmu) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 reg; + + reg = readl(noc_pmu->base + reg_info->pmu_ctrl); + reg &= ~NOC_PMU_GLOBAL_CTRL_PMU_EN; + writel(reg, noc_pmu->base + reg_info->pmu_ctrl); +} + +static u32 hisi_noc_pmu_get_int_status(struct hisi_pmu *noc_pmu) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + + return readl(noc_pmu->base + reg_info->overflow_status); +} + +static void hisi_noc_pmu_clear_int_status(struct hisi_pmu *noc_pmu, int idx) +{ + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 reg; + + reg = readl(noc_pmu->base + reg_info->overflow_status); + reg &= ~NOC_PMU_CNT_INFO_OVERFLOW(idx); + writel(reg, noc_pmu->base + reg_info->overflow_status); +} + +static void hisi_noc_pmu_enable_filter(struct perf_event *event) +{ + struct hisi_pmu *noc_pmu = to_hisi_pmu(event->pmu); + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + struct hw_perf_event *hwc = &event->hw; + u32 tt_en = hisi_get_tt_en(event); + u32 ch = hisi_get_ch(event); + u32 reg; + + if (!ch) + ch = NOC_PMU_CH_DEFAULT; + + reg = readl(noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx)); + reg &= ~NOC_PMU_EVENT_CTRL_CHANNEL; + reg |= FIELD_PREP(NOC_PMU_EVENT_CTRL_CHANNEL, ch); + writel(reg, noc_pmu->base + NOC_PMU_EVENT_CTRLn(reg_info->event_ctrl0, hwc->idx)); + + /* + * Since tracetag filter applies to all the counters, don't touch it + * if user doesn't specify it explicitly. + */ + if (tt_en) { + reg = readl(noc_pmu->base + reg_info->pmu_ctrl); + reg |= NOC_PMU_GLOBAL_CTRL_TT_EN; + writel(reg, noc_pmu->base + reg_info->pmu_ctrl); + } +} + +static void hisi_noc_pmu_disable_filter(struct perf_event *event) +{ + struct hisi_pmu *noc_pmu = to_hisi_pmu(event->pmu); + struct hisi_noc_pmu_regs *reg_info = noc_pmu->dev_info->private; + u32 tt_en = hisi_get_tt_en(event); + u32 reg; + + /* + * If we're not the last counter, don't touch the global tracetag + * configuration. 
+ */ + if (bitmap_weight(noc_pmu->pmu_events.used_mask, noc_pmu->num_counters) > 1) + return; + + if (tt_en) { + reg = readl(noc_pmu->base + reg_info->pmu_ctrl); + reg &= ~NOC_PMU_GLOBAL_CTRL_TT_EN; + writel(reg, noc_pmu->base + reg_info->pmu_ctrl); + } +} + +static const struct hisi_uncore_ops hisi_uncore_noc_ops = { + .write_evtype = hisi_noc_pmu_write_evtype, + .get_event_idx = hisi_noc_pmu_get_event_idx, + .read_counter = hisi_noc_pmu_read_counter, + .write_counter = hisi_noc_pmu_write_counter, + .enable_counter = hisi_noc_pmu_enable_counter, + .disable_counter = hisi_noc_pmu_disable_counter, + .enable_counter_int = hisi_noc_pmu_enable_counter_int, + .disable_counter_int = hisi_noc_pmu_disable_counter_int, + .start_counters = hisi_noc_pmu_start_counters, + .stop_counters = hisi_noc_pmu_stop_counters, + .get_int_status = hisi_noc_pmu_get_int_status, + .clear_int_status = hisi_noc_pmu_clear_int_status, + .enable_filter = hisi_noc_pmu_enable_filter, + .disable_filter = hisi_noc_pmu_disable_filter, +}; + +static struct attribute *hisi_noc_pmu_format_attrs[] = { + HISI_PMU_FORMAT_ATTR(event, "config:0-7"), + HISI_PMU_FORMAT_ATTR(ch, "config1:0-2"), + HISI_PMU_FORMAT_ATTR(tt_en, "config1:3"), + NULL +}; + +static const struct attribute_group hisi_noc_pmu_format_group = { + .name = "format", + .attrs = hisi_noc_pmu_format_attrs, +}; + +static struct attribute *hisi_noc_pmu_events_attrs[] = { + HISI_PMU_EVENT_ATTR(cycles, 0x0e), + /* Flux on/off the ring */ + HISI_PMU_EVENT_ATTR(ingress_flow_sum, 0x1a), + HISI_PMU_EVENT_ATTR(egress_flow_sum, 0x17), + /* Buffer full duration on/off the ring */ + HISI_PMU_EVENT_ATTR(ingress_buf_full, 0x19), + HISI_PMU_EVENT_ATTR(egress_buf_full, 0x12), + /* Failure packets count on/off the ring */ + HISI_PMU_EVENT_ATTR(cw_ingress_fail, 0x01), + HISI_PMU_EVENT_ATTR(cc_ingress_fail, 0x09), + HISI_PMU_EVENT_ATTR(cw_egress_fail, 0x03), + HISI_PMU_EVENT_ATTR(cc_egress_fail, 0x0b), + /* Flux of the ring */ + HISI_PMU_EVENT_ATTR(cw_main_flow_sum, 0x05), + HISI_PMU_EVENT_ATTR(cc_main_flow_sum, 0x0d), + NULL +}; + +static const struct attribute_group hisi_noc_pmu_events_group = { + .name = "events", + .attrs = hisi_noc_pmu_events_attrs, +}; + +static const struct attribute_group *hisi_noc_pmu_attr_groups[] = { + &hisi_noc_pmu_format_group, + &hisi_noc_pmu_events_group, + &hisi_pmu_cpumask_attr_group, + &hisi_pmu_identifier_group, + NULL +}; + +static int hisi_noc_pmu_dev_init(struct platform_device *pdev, struct hisi_pmu *noc_pmu) +{ + struct hisi_noc_pmu_regs *reg_info; + + hisi_uncore_pmu_init_topology(noc_pmu, &pdev->dev); + + if (noc_pmu->topo.scl_id < 0) + return dev_err_probe(&pdev->dev, -EINVAL, "failed to get scl-id\n"); + + if (noc_pmu->topo.index_id < 0) + return dev_err_probe(&pdev->dev, -EINVAL, "failed to get idx-id\n"); + + if (noc_pmu->topo.sub_id < 0) + return dev_err_probe(&pdev->dev, -EINVAL, "failed to get sub-id\n"); + + noc_pmu->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(noc_pmu->base)) + return dev_err_probe(&pdev->dev, PTR_ERR(noc_pmu->base), + "fail to remap io memory\n"); + + noc_pmu->dev_info = device_get_match_data(&pdev->dev); + if (!noc_pmu->dev_info) + return -ENODEV; + + noc_pmu->pmu_events.attr_groups = noc_pmu->dev_info->attr_groups; + noc_pmu->counter_bits = noc_pmu->dev_info->counter_bits; + noc_pmu->check_event = noc_pmu->dev_info->check_event; + noc_pmu->num_counters = NOC_PMU_NR_COUNTERS; + noc_pmu->ops = &hisi_uncore_noc_ops; + noc_pmu->dev = &pdev->dev; + noc_pmu->on_cpu = -1; + + reg_info = 
noc_pmu->dev_info->private; + noc_pmu->identifier = readl(noc_pmu->base + reg_info->version); + + return 0; +} + +static void hisi_noc_pmu_remove_cpuhp_instance(void *hotplug_node) +{ + cpuhp_state_remove_instance_nocalls(hisi_noc_pmu_cpuhp_state, hotplug_node); +} + +static void hisi_noc_pmu_unregister_pmu(void *pmu) +{ + perf_pmu_unregister(pmu); +} + +static int hisi_noc_pmu_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct hisi_pmu *noc_pmu; + char *name; + int ret; + + noc_pmu = devm_kzalloc(dev, sizeof(*noc_pmu), GFP_KERNEL); + if (!noc_pmu) + return -ENOMEM; + + /* + * HiSilicon Uncore PMU framework needs to get common hisi_pmu device + * from device's drvdata. + */ + platform_set_drvdata(pdev, noc_pmu); + + ret = hisi_noc_pmu_dev_init(pdev, noc_pmu); + if (ret) + return ret; + + ret = cpuhp_state_add_instance(hisi_noc_pmu_cpuhp_state, &noc_pmu->node); + if (ret) + return dev_err_probe(dev, ret, "Fail to register cpuhp instance\n"); + + ret = devm_add_action_or_reset(dev, hisi_noc_pmu_remove_cpuhp_instance, + &noc_pmu->node); + if (ret) + return ret; + + hisi_pmu_init(noc_pmu, THIS_MODULE); + + name = devm_kasprintf(dev, GFP_KERNEL, "hisi_scl%d_noc%d_%d", + noc_pmu->topo.scl_id, noc_pmu->topo.index_id, + noc_pmu->topo.sub_id); + if (!name) + return -ENOMEM; + + ret = perf_pmu_register(&noc_pmu->pmu, name, -1); + if (ret) + return dev_err_probe(dev, ret, "Fail to register PMU\n"); + + return devm_add_action_or_reset(dev, hisi_noc_pmu_unregister_pmu, + &noc_pmu->pmu); +} + +static struct hisi_noc_pmu_regs hisi_noc_v1_pmu_regs = { + .version = NOC_PMU_VERSION, + .pmu_ctrl = NOC_PMU_GLOBAL_CTRL, + .event_ctrl0 = NOC_PMU_EVENT_CTRL0, + .event_cntr0 = NOC_PMU_EVENT_COUNTER0, + .overflow_status = NOC_PMU_CNT_INFO, +}; + +static const struct hisi_pmu_dev_info hisi_noc_v1 = { + .attr_groups = hisi_noc_pmu_attr_groups, + .counter_bits = 64, + .check_event = NOC_PMU_EVENT_CTRL_TYPE, + .private = &hisi_noc_v1_pmu_regs, +}; + +static const struct acpi_device_id hisi_noc_pmu_ids[] = { + { "HISI04E0", (kernel_ulong_t) &hisi_noc_v1 }, + { } +}; +MODULE_DEVICE_TABLE(acpi, hisi_noc_pmu_ids); + +static struct platform_driver hisi_noc_pmu_driver = { + .driver = { + .name = "hisi_noc_pmu", + .acpi_match_table = hisi_noc_pmu_ids, + .suppress_bind_attrs = true, + }, + .probe = hisi_noc_pmu_probe, +}; + +static int __init hisi_noc_pmu_module_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "perf/hisi/noc:online", + hisi_uncore_pmu_online_cpu, + hisi_uncore_pmu_offline_cpu); + if (ret < 0) { + pr_err("hisi_noc_pmu: Fail to setup cpuhp callbacks, ret = %d\n", ret); + return ret; + } + hisi_noc_pmu_cpuhp_state = ret; + + ret = platform_driver_register(&hisi_noc_pmu_driver); + if (ret) + cpuhp_remove_multi_state(hisi_noc_pmu_cpuhp_state); + + return ret; +} +module_init(hisi_noc_pmu_module_init); + +static void __exit hisi_noc_pmu_module_exit(void) +{ + platform_driver_unregister(&hisi_noc_pmu_driver); + cpuhp_remove_multi_state(hisi_noc_pmu_cpuhp_state); +} +module_exit(hisi_noc_pmu_module_exit); + +MODULE_IMPORT_NS("HISI_PMU"); +MODULE_DESCRIPTION("HiSilicon SoC Uncore NoC PMU driver"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yicong Yang "); From 2257798498b3b069e5ff46ad957c32a9a06b5fc9 Mon Sep 17 00:00:00 2001 From: Junhao He Date: Thu, 14 Aug 2025 17:16:21 +0800 Subject: [PATCH 56/93] drivers/perf: hisi: Add support for HiSilicon MN PMU driver MN (Miscellaneous Node) is a hybrid node in ARM CHI. 
It broadcasts the following two types of requests: DVM operations and PCIe configuration. MN PMU devices exist on both SCCL and SICL, so we named the MN pmu driver after SCL (Super cluster) ID. The MN PMU driver using the HiSilicon uncore PMU framework. And only the event parameter is supported. Signed-off-by: Junhao He Reviewed-by: Jonathan Cameron Signed-off-by: Yicong Yang Signed-off-by: Will Deacon --- drivers/perf/hisilicon/Makefile | 2 +- drivers/perf/hisilicon/hisi_uncore_mn_pmu.c | 411 ++++++++++++++++++++ 2 files changed, 412 insertions(+), 1 deletion(-) create mode 100644 drivers/perf/hisilicon/hisi_uncore_mn_pmu.c diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile index dcec8f39719d..186be3d02238 100644 --- a/drivers/perf/hisilicon/Makefile +++ b/drivers/perf/hisilicon/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o \ hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o hisi_uncore_sllc_pmu.o \ hisi_uncore_pa_pmu.o hisi_uncore_cpa_pmu.o hisi_uncore_uc_pmu.o \ - hisi_uncore_noc_pmu.o + hisi_uncore_noc_pmu.o hisi_uncore_mn_pmu.o obj-$(CONFIG_HISI_PCIE_PMU) += hisi_pcie_pmu.o obj-$(CONFIG_HNS3_PMU) += hns3_pmu.o diff --git a/drivers/perf/hisilicon/hisi_uncore_mn_pmu.c b/drivers/perf/hisilicon/hisi_uncore_mn_pmu.c new file mode 100644 index 000000000000..4df4eebe243e --- /dev/null +++ b/drivers/perf/hisilicon/hisi_uncore_mn_pmu.c @@ -0,0 +1,411 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * HiSilicon SoC MN uncore Hardware event counters support + * + * Copyright (c) 2025 HiSilicon Technologies Co., Ltd. + */ +#include +#include +#include +#include +#include +#include +#include + +#include "hisi_uncore_pmu.h" + +/* Dynamic CPU hotplug state used by MN PMU */ +static enum cpuhp_state hisi_mn_pmu_online; + +/* MN register definition */ +#define HISI_MN_DYNAMIC_CTRL_REG 0x400 +#define HISI_MN_DYNAMIC_CTRL_EN BIT(0) +#define HISI_MN_PERF_CTRL_REG 0x408 +#define HISI_MN_PERF_CTRL_EN BIT(6) +#define HISI_MN_INT_MASK_REG 0x800 +#define HISI_MN_INT_STATUS_REG 0x808 +#define HISI_MN_INT_CLEAR_REG 0x80C +#define HISI_MN_EVENT_CTRL_REG 0x1C00 +#define HISI_MN_VERSION_REG 0x1C04 +#define HISI_MN_EVTYPE0_REG 0x1d00 +#define HISI_MN_EVTYPE_MASK GENMASK(7, 0) +#define HISI_MN_CNTR0_REG 0x1e00 +#define HISI_MN_EVTYPE_REGn(evtype0, n) ((evtype0) + (n) * 4) +#define HISI_MN_CNTR_REGn(cntr0, n) ((cntr0) + (n) * 8) + +#define HISI_MN_NR_COUNTERS 4 +#define HISI_MN_TIMEOUT_US 500U + +struct hisi_mn_pmu_regs { + u32 version; + u32 dyn_ctrl; + u32 perf_ctrl; + u32 int_mask; + u32 int_clear; + u32 int_status; + u32 event_ctrl; + u32 event_type0; + u32 event_cntr0; +}; + +/* + * Each event request takes a certain amount of time to complete. If + * we counting the latency related event, we need to wait for the all + * requests complete. Otherwise, the value of counter is slightly larger. 
+ */ +static void hisi_mn_pmu_counter_flush(struct hisi_pmu *mn_pmu) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + int ret; + u32 val; + + val = readl(mn_pmu->base + reg_info->dyn_ctrl); + val |= HISI_MN_DYNAMIC_CTRL_EN; + writel(val, mn_pmu->base + reg_info->dyn_ctrl); + + ret = readl_poll_timeout_atomic(mn_pmu->base + reg_info->dyn_ctrl, + val, !(val & HISI_MN_DYNAMIC_CTRL_EN), + 1, HISI_MN_TIMEOUT_US); + if (ret) + dev_warn(mn_pmu->dev, "Counter flush timeout\n"); +} + +static u64 hisi_mn_pmu_read_counter(struct hisi_pmu *mn_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + + return readq(mn_pmu->base + HISI_MN_CNTR_REGn(reg_info->event_cntr0, hwc->idx)); +} + +static void hisi_mn_pmu_write_counter(struct hisi_pmu *mn_pmu, + struct hw_perf_event *hwc, u64 val) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + + writeq(val, mn_pmu->base + HISI_MN_CNTR_REGn(reg_info->event_cntr0, hwc->idx)); +} + +static void hisi_mn_pmu_write_evtype(struct hisi_pmu *mn_pmu, int idx, u32 type) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + /* + * Select the appropriate event select register. + * There are 2 32-bit event select registers for the + * 8 hardware counters, each event code is 8-bit wide. + */ + val = readl(mn_pmu->base + HISI_MN_EVTYPE_REGn(reg_info->event_type0, idx / 4)); + val &= ~(HISI_MN_EVTYPE_MASK << HISI_PMU_EVTYPE_SHIFT(idx)); + val |= (type << HISI_PMU_EVTYPE_SHIFT(idx)); + writel(val, mn_pmu->base + HISI_MN_EVTYPE_REGn(reg_info->event_type0, idx / 4)); +} + +static void hisi_mn_pmu_start_counters(struct hisi_pmu *mn_pmu) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + val = readl(mn_pmu->base + reg_info->perf_ctrl); + val |= HISI_MN_PERF_CTRL_EN; + writel(val, mn_pmu->base + reg_info->perf_ctrl); +} + +static void hisi_mn_pmu_stop_counters(struct hisi_pmu *mn_pmu) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + val = readl(mn_pmu->base + reg_info->perf_ctrl); + val &= ~HISI_MN_PERF_CTRL_EN; + writel(val, mn_pmu->base + reg_info->perf_ctrl); + + hisi_mn_pmu_counter_flush(mn_pmu); +} + +static void hisi_mn_pmu_enable_counter(struct hisi_pmu *mn_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + val = readl(mn_pmu->base + reg_info->event_ctrl); + val |= BIT(hwc->idx); + writel(val, mn_pmu->base + reg_info->event_ctrl); +} + +static void hisi_mn_pmu_disable_counter(struct hisi_pmu *mn_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + val = readl(mn_pmu->base + reg_info->event_ctrl); + val &= ~BIT(hwc->idx); + writel(val, mn_pmu->base + reg_info->event_ctrl); +} + +static void hisi_mn_pmu_enable_counter_int(struct hisi_pmu *mn_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + val = readl(mn_pmu->base + reg_info->int_mask); + val &= ~BIT(hwc->idx); + writel(val, mn_pmu->base + reg_info->int_mask); +} + +static void hisi_mn_pmu_disable_counter_int(struct hisi_pmu *mn_pmu, + struct hw_perf_event *hwc) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + u32 val; + + val = readl(mn_pmu->base + reg_info->int_mask); + val |= BIT(hwc->idx); + writel(val, mn_pmu->base + reg_info->int_mask); +} + +static u32 hisi_mn_pmu_get_int_status(struct hisi_pmu *mn_pmu) +{ + struct 
hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + + return readl(mn_pmu->base + reg_info->int_status); +} + +static void hisi_mn_pmu_clear_int_status(struct hisi_pmu *mn_pmu, int idx) +{ + struct hisi_mn_pmu_regs *reg_info = mn_pmu->dev_info->private; + + writel(BIT(idx), mn_pmu->base + reg_info->int_clear); +} + +static struct attribute *hisi_mn_pmu_format_attr[] = { + HISI_PMU_FORMAT_ATTR(event, "config:0-7"), + NULL +}; + +static const struct attribute_group hisi_mn_pmu_format_group = { + .name = "format", + .attrs = hisi_mn_pmu_format_attr, +}; + +static struct attribute *hisi_mn_pmu_events_attr[] = { + HISI_PMU_EVENT_ATTR(req_eobarrier_num, 0x00), + HISI_PMU_EVENT_ATTR(req_ecbarrier_num, 0x01), + HISI_PMU_EVENT_ATTR(req_dvmop_num, 0x02), + HISI_PMU_EVENT_ATTR(req_dvmsync_num, 0x03), + HISI_PMU_EVENT_ATTR(req_retry_num, 0x04), + HISI_PMU_EVENT_ATTR(req_writenosnp_num, 0x05), + HISI_PMU_EVENT_ATTR(req_readnosnp_num, 0x06), + HISI_PMU_EVENT_ATTR(snp_dvm_num, 0x07), + HISI_PMU_EVENT_ATTR(snp_dvmsync_num, 0x08), + HISI_PMU_EVENT_ATTR(l3t_req_dvm_num, 0x09), + HISI_PMU_EVENT_ATTR(l3t_req_dvmsync_num, 0x0A), + HISI_PMU_EVENT_ATTR(mn_req_dvm_num, 0x0B), + HISI_PMU_EVENT_ATTR(mn_req_dvmsync_num, 0x0C), + HISI_PMU_EVENT_ATTR(pa_req_dvm_num, 0x0D), + HISI_PMU_EVENT_ATTR(pa_req_dvmsync_num, 0x0E), + HISI_PMU_EVENT_ATTR(snp_dvm_latency, 0x80), + HISI_PMU_EVENT_ATTR(snp_dvmsync_latency, 0x81), + HISI_PMU_EVENT_ATTR(l3t_req_dvm_latency, 0x82), + HISI_PMU_EVENT_ATTR(l3t_req_dvmsync_latency, 0x83), + HISI_PMU_EVENT_ATTR(mn_req_dvm_latency, 0x84), + HISI_PMU_EVENT_ATTR(mn_req_dvmsync_latency, 0x85), + HISI_PMU_EVENT_ATTR(pa_req_dvm_latency, 0x86), + HISI_PMU_EVENT_ATTR(pa_req_dvmsync_latency, 0x87), + NULL +}; + +static const struct attribute_group hisi_mn_pmu_events_group = { + .name = "events", + .attrs = hisi_mn_pmu_events_attr, +}; + +static const struct attribute_group *hisi_mn_pmu_attr_groups[] = { + &hisi_mn_pmu_format_group, + &hisi_mn_pmu_events_group, + &hisi_pmu_cpumask_attr_group, + &hisi_pmu_identifier_group, + NULL +}; + +static const struct hisi_uncore_ops hisi_uncore_mn_ops = { + .write_evtype = hisi_mn_pmu_write_evtype, + .get_event_idx = hisi_uncore_pmu_get_event_idx, + .start_counters = hisi_mn_pmu_start_counters, + .stop_counters = hisi_mn_pmu_stop_counters, + .enable_counter = hisi_mn_pmu_enable_counter, + .disable_counter = hisi_mn_pmu_disable_counter, + .enable_counter_int = hisi_mn_pmu_enable_counter_int, + .disable_counter_int = hisi_mn_pmu_disable_counter_int, + .write_counter = hisi_mn_pmu_write_counter, + .read_counter = hisi_mn_pmu_read_counter, + .get_int_status = hisi_mn_pmu_get_int_status, + .clear_int_status = hisi_mn_pmu_clear_int_status, +}; + +static int hisi_mn_pmu_dev_init(struct platform_device *pdev, + struct hisi_pmu *mn_pmu) +{ + struct hisi_mn_pmu_regs *reg_info; + int ret; + + hisi_uncore_pmu_init_topology(mn_pmu, &pdev->dev); + + if (mn_pmu->topo.scl_id < 0) + return dev_err_probe(&pdev->dev, -EINVAL, + "Failed to read MN scl id\n"); + + if (mn_pmu->topo.index_id < 0) + return dev_err_probe(&pdev->dev, -EINVAL, + "Failed to read MN index id\n"); + + mn_pmu->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(mn_pmu->base)) + return dev_err_probe(&pdev->dev, PTR_ERR(mn_pmu->base), + "Failed to ioremap resource\n"); + + ret = hisi_uncore_pmu_init_irq(mn_pmu, pdev); + if (ret) + return ret; + + mn_pmu->dev_info = device_get_match_data(&pdev->dev); + if (!mn_pmu->dev_info) + return -ENODEV; + + mn_pmu->pmu_events.attr_groups = 
mn_pmu->dev_info->attr_groups; + mn_pmu->counter_bits = mn_pmu->dev_info->counter_bits; + mn_pmu->check_event = mn_pmu->dev_info->check_event; + mn_pmu->num_counters = HISI_MN_NR_COUNTERS; + mn_pmu->ops = &hisi_uncore_mn_ops; + mn_pmu->dev = &pdev->dev; + mn_pmu->on_cpu = -1; + + reg_info = mn_pmu->dev_info->private; + mn_pmu->identifier = readl(mn_pmu->base + reg_info->version); + + return 0; +} + +static void hisi_mn_pmu_remove_cpuhp(void *hotplug_node) +{ + cpuhp_state_remove_instance_nocalls(hisi_mn_pmu_online, hotplug_node); +} + +static void hisi_mn_pmu_unregister(void *pmu) +{ + perf_pmu_unregister(pmu); +} + +static int hisi_mn_pmu_probe(struct platform_device *pdev) +{ + struct hisi_pmu *mn_pmu; + char *name; + int ret; + + mn_pmu = devm_kzalloc(&pdev->dev, sizeof(*mn_pmu), GFP_KERNEL); + if (!mn_pmu) + return -ENOMEM; + + platform_set_drvdata(pdev, mn_pmu); + + ret = hisi_mn_pmu_dev_init(pdev, mn_pmu); + if (ret) + return ret; + + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_scl%d_mn%d", + mn_pmu->topo.scl_id, mn_pmu->topo.index_id); + if (!name) + return -ENOMEM; + + ret = cpuhp_state_add_instance(hisi_mn_pmu_online, &mn_pmu->node); + if (ret) + return dev_err_probe(&pdev->dev, ret, "Failed to register cpu hotplug\n"); + + ret = devm_add_action_or_reset(&pdev->dev, hisi_mn_pmu_remove_cpuhp, &mn_pmu->node); + if (ret) + return ret; + + hisi_pmu_init(mn_pmu, THIS_MODULE); + + ret = perf_pmu_register(&mn_pmu->pmu, name, -1); + if (ret) + return dev_err_probe(mn_pmu->dev, ret, "Failed to register MN PMU\n"); + + return devm_add_action_or_reset(&pdev->dev, hisi_mn_pmu_unregister, &mn_pmu->pmu); +} + +static struct hisi_mn_pmu_regs hisi_mn_v1_pmu_regs = { + .version = HISI_MN_VERSION_REG, + .dyn_ctrl = HISI_MN_DYNAMIC_CTRL_REG, + .perf_ctrl = HISI_MN_PERF_CTRL_REG, + .int_mask = HISI_MN_INT_MASK_REG, + .int_clear = HISI_MN_INT_CLEAR_REG, + .int_status = HISI_MN_INT_STATUS_REG, + .event_ctrl = HISI_MN_EVENT_CTRL_REG, + .event_type0 = HISI_MN_EVTYPE0_REG, + .event_cntr0 = HISI_MN_CNTR0_REG, +}; + +static const struct hisi_pmu_dev_info hisi_mn_v1 = { + .attr_groups = hisi_mn_pmu_attr_groups, + .counter_bits = 48, + .check_event = HISI_MN_EVTYPE_MASK, + .private = &hisi_mn_v1_pmu_regs, +}; + +static const struct acpi_device_id hisi_mn_pmu_acpi_match[] = { + { "HISI0222", (kernel_ulong_t) &hisi_mn_v1 }, + { } +}; +MODULE_DEVICE_TABLE(acpi, hisi_mn_pmu_acpi_match); + +static struct platform_driver hisi_mn_pmu_driver = { + .driver = { + .name = "hisi_mn_pmu", + .acpi_match_table = hisi_mn_pmu_acpi_match, + /* + * We have not worked out a safe bind/unbind process, + * Forcefully unbinding during sampling will lead to a + * kernel panic, so this is not supported yet. 
+ */ + .suppress_bind_attrs = true, + }, + .probe = hisi_mn_pmu_probe, +}; + +static int __init hisi_mn_pmu_module_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "perf/hisi/mn:online", + hisi_uncore_pmu_online_cpu, + hisi_uncore_pmu_offline_cpu); + if (ret < 0) { + pr_err("hisi_mn_pmu: Failed to setup MN PMU hotplug: %d\n", ret); + return ret; + } + hisi_mn_pmu_online = ret; + + ret = platform_driver_register(&hisi_mn_pmu_driver); + if (ret) + cpuhp_remove_multi_state(hisi_mn_pmu_online); + + return ret; +} +module_init(hisi_mn_pmu_module_init); + +static void __exit hisi_mn_pmu_module_exit(void) +{ + platform_driver_unregister(&hisi_mn_pmu_driver); + cpuhp_remove_multi_state(hisi_mn_pmu_online); +} +module_exit(hisi_mn_pmu_module_exit); + +MODULE_IMPORT_NS("HISI_PMU"); +MODULE_DESCRIPTION("HiSilicon SoC MN uncore PMU driver"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Junhao He "); From 542342d27122e4a0dc90025748f7821f8e15920a Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Thu, 14 Aug 2025 17:16:22 +0800 Subject: [PATCH 57/93] MAINTAINERS: Remove myself from HiSilicon PMU maintainers Remove myself as I'm leaving HiSilicon and not suitable for maintaining this. Thanks for the journey. Signed-off-by: Yicong Yang Acked-by: Jonathan Cameron Signed-off-by: Will Deacon --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 3815a2c4b3a8..c482d641fbe1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11062,7 +11062,6 @@ F: Documentation/devicetree/bindings/net/hisilicon*.txt F: drivers/net/ethernet/hisilicon/ HISILICON PMU DRIVER -M: Yicong Yang M: Jonathan Cameron S: Supported W: http://www.hisilicon.com From 105f56877f2d5f82d71e20b45eb7be7c24c3d908 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 17 Sep 2025 18:41:38 +0100 Subject: [PATCH 58/93] coresight: trbe: Prevent overflow in PERF_IDX2OFF() Cast nr_pages to unsigned long to avoid overflow when handling large AUX buffer sizes (>= 2 GiB). Fixes: 3fbf7f011f24 ("coresight: sink: Add TRBE driver") Signed-off-by: Leo Yan Signed-off-by: Will Deacon --- drivers/hwtracing/coresight/coresight-trbe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c index 8267dd1a2130..8f426f94e32a 100644 --- a/drivers/hwtracing/coresight/coresight-trbe.c +++ b/drivers/hwtracing/coresight/coresight-trbe.c @@ -23,7 +23,8 @@ #include "coresight-self-hosted-trace.h" #include "coresight-trbe.h" -#define PERF_IDX2OFF(idx, buf) ((idx) % ((buf)->nr_pages << PAGE_SHIFT)) +#define PERF_IDX2OFF(idx, buf) \ + ((idx) % ((unsigned long)(buf)->nr_pages << PAGE_SHIFT)) /* * A padding packet that will help the user space tools From a29fea30dd93da16652930162b177941abd8c75e Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 17 Sep 2025 18:41:39 +0100 Subject: [PATCH 59/93] perf: arm_spe: Prevent overflow in PERF_IDX2OFF() Cast nr_pages to unsigned long to avoid overflow when handling large AUX buffer sizes (>= 2 GiB). 
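A worked example of the overflow (illustrative; assumes 4 KiB pages and
nr_pages held in a 32-bit int, as in the AUX buffer bookkeeping here):

  int nr_pages = SZ_2G >> PAGE_SHIFT;		/* 524288 pages for a 2 GiB buffer */
  unsigned long bad = nr_pages << PAGE_SHIFT;	/* shift done in 32 bits: overflows */
  unsigned long ok  = (unsigned long)nr_pages << PAGE_SHIFT;	/* 0x80000000 */

so without the cast the modulo in PERF_IDX2OFF() is taken against a bogus
buffer size.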
Fixes: d5d9696b0380 ("drivers/perf: Add support for ARMv8.2 Statistical Profiling Extension") Signed-off-by: Leo Yan Signed-off-by: Will Deacon --- drivers/perf/arm_spe_pmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 591f72fa0327..fa50645fedda 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -99,7 +99,8 @@ struct arm_spe_pmu { #define to_spe_pmu(p) (container_of(p, struct arm_spe_pmu, pmu)) /* Convert a free-running index from perf into an SPE buffer offset */ -#define PERF_IDX2OFF(idx, buf) ((idx) % ((buf)->nr_pages << PAGE_SHIFT)) +#define PERF_IDX2OFF(idx, buf) \ + ((idx) % ((unsigned long)(buf)->nr_pages << PAGE_SHIFT)) /* Keep track of our dynamic hotplug state */ static enum cpuhp_state arm_spe_pmu_online; From 52b49bd6de29a89a040fa33e980270904c734d69 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 18 Sep 2025 15:11:31 +0100 Subject: [PATCH 60/93] arm64: cputype: Remove duplicate Cortex-X1C definitions We currently have duplicate definitions for ARM_CPU_PART_CORTEX_X1C and MIDR_CORTEX_X1C as a result of commits: 58d245e03c324d08 ("arm64: cputype: Add Cortex-X1C definitions") efe676a1a7554219 ("arm64: proton-pack: Add new CPUs 'k' values for branch mitigation") Due to inconsistent sorting when adding entries, there was no textual conflict between the two patches. Delete the duplicate definitions added by the latter commit. The definitions in general are largely (but not entirely) in order of the MIDR_EL1.PartNum value rather than by CPU name, and the remaining Cortex-X1C definitions appear later in the list. For now I haven't sorted the remaining MIDR definitions to minimize churn. I intend to perform some larger cleanup of these in the near future which should supersede that anyhow. Signed-off-by: Mark Rutland Cc: James Morse Cc: Will Deacon Cc: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/include/asm/cputype.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index b10eba7f5247..98bedc3706ea 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -81,7 +81,6 @@ #define ARM_CPU_PART_CORTEX_A78AE 0xD42 #define ARM_CPU_PART_CORTEX_X1 0xD44 #define ARM_CPU_PART_CORTEX_A510 0xD46 -#define ARM_CPU_PART_CORTEX_X1C 0xD4C #define ARM_CPU_PART_CORTEX_A520 0xD80 #define ARM_CPU_PART_CORTEX_A710 0xD47 #define ARM_CPU_PART_CORTEX_A715 0xD4D @@ -171,7 +170,6 @@ #define MIDR_CORTEX_A78AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78AE) #define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1) #define MIDR_CORTEX_A510 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A510) -#define MIDR_CORTEX_X1C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1C) #define MIDR_CORTEX_A520 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A520) #define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710) #define MIDR_CORTEX_A715 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A715) From b3fe1c83a56f3cb7c475747ee1c6ec5a9dd5f60e Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 18 Sep 2025 17:25:31 +0100 Subject: [PATCH 61/93] perf/arm-cmn: Fix CMN S3 DTM offset CMN S3's DTM offset is different between r0px and r1p0, and it turns out this was not a error in the earlier documentation, but does actually exist in the design. Lovely. 
Cc: stable@vger.kernel.org Fixes: 0dc2f4963f7e ("perf/arm-cmn: Support CMN S3") Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 11fb2234b10f..23245352a3fc 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -65,7 +65,7 @@ /* PMU registers occupy the 3rd 4KB page of each node's region */ #define CMN_PMU_OFFSET 0x2000 /* ...except when they don't :( */ -#define CMN_S3_DTM_OFFSET 0xa000 +#define CMN_S3_R1_DTM_OFFSET 0xa000 #define CMN_S3_PMU_OFFSET 0xd900 /* For most nodes, this is all there is */ @@ -233,6 +233,9 @@ enum cmn_revision { REV_CMN700_R1P0, REV_CMN700_R2P0, REV_CMN700_R3P0, + REV_CMNS3_R0P0 = 0, + REV_CMNS3_R0P1, + REV_CMNS3_R1P0, REV_CI700_R0P0 = 0, REV_CI700_R1P0, REV_CI700_R2P0, @@ -425,8 +428,8 @@ static enum cmn_model arm_cmn_model(const struct arm_cmn *cmn) static int arm_cmn_pmu_offset(const struct arm_cmn *cmn, const struct arm_cmn_node *dn) { if (cmn->part == PART_CMN_S3) { - if (dn->type == CMN_TYPE_XP) - return CMN_S3_DTM_OFFSET; + if (cmn->rev >= REV_CMNS3_R1P0 && dn->type == CMN_TYPE_XP) + return CMN_S3_R1_DTM_OFFSET; return CMN_S3_PMU_OFFSET; } return CMN_PMU_OFFSET; From e185c8a0d84236d14af61faff8147c953a878a77 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Thu, 18 Sep 2025 08:25:47 -0500 Subject: [PATCH 62/93] arm64: cputype: Add NVIDIA Olympus definitions Add cpu part and model macro definitions for NVIDIA Olympus core. Signed-off-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/include/asm/cputype.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 98bedc3706ea..67ac757bc9c0 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -129,6 +129,7 @@ #define NVIDIA_CPU_PART_DENVER 0x003 #define NVIDIA_CPU_PART_CARMEL 0x004 +#define NVIDIA_CPU_PART_OLYMPUS 0x010 #define FUJITSU_CPU_PART_A64FX 0x001 @@ -220,6 +221,7 @@ #define MIDR_NVIDIA_DENVER MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_DENVER) #define MIDR_NVIDIA_CARMEL MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_CARMEL) +#define MIDR_NVIDIA_OLYMPUS MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_OLYMPUS) #define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX) #define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110) #define MIDR_HISI_HIP09 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP09) From cc80537caaa789e097f35b2032ddc693a4e136f9 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Thu, 18 Sep 2025 08:25:48 -0500 Subject: [PATCH 63/93] arm64: cpufeature: Add Olympus MIDR to BBML2 allow list The NVIDIA Olympus core supports BBML2 without conflict abort. Add its MIDR to the allow list to enable FEAT_BBM. 
Signed-off-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index b1219f14459f..c6cbeee48c57 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2235,6 +2235,7 @@ static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int sco static const struct midr_range supports_bbml2_noabort_list[] = { MIDR_REV_RANGE(MIDR_CORTEX_X4, 0, 3, 0xf), MIDR_REV_RANGE(MIDR_NEOVERSE_V3, 0, 2, 0xf), + MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS), {} }; From ea87c5536aa8c2b5bcd2fb482df6f11e5517df06 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Thu, 18 Sep 2025 12:54:24 -0500 Subject: [PATCH 64/93] arm64: probes: Fix incorrect bl/blr address and register usage The pt_regs registers are 64-bit on arm64, and should be u64 when manipulated. Correct this so that we aren't truncating the address during br/blr sequences. Fixes: efb07ac534e2 ("arm64: probes: Add GCS support to bl/blr/ret") Signed-off-by: Jeremy Linton Signed-off-by: Will Deacon --- arch/arm64/kernel/probes/simulate-insn.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index 97ed4db75417..89fbeb32107e 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -145,7 +145,7 @@ void __kprobes simulate_br_blr(u32 opcode, long addr, struct pt_regs *regs) { int xn = (opcode >> 5) & 0x1f; - int b_target = get_x_reg(regs, xn); + u64 b_target = get_x_reg(regs, xn); if (((opcode >> 21) & 0x3) == 1) if (update_lr(regs, addr + 4)) @@ -160,7 +160,7 @@ simulate_ret(u32 opcode, long addr, struct pt_regs *regs) u64 ret_addr; int err = 0; int xn = (opcode >> 5) & 0x1f; - unsigned long r_target = get_x_reg(regs, xn); + u64 r_target = get_x_reg(regs, xn); if (user_mode(regs) && task_gcs_el0_enabled(current)) { ret_addr = pop_user_gcs(&err); From 13efe932d2fc3f4f753e7662d550e903b7ce8e88 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 17 Sep 2025 12:02:08 -0700 Subject: [PATCH 65/93] arm64: cpufeature: add AmpereOne to BBML2 allow list AmpereOne supports BBML2 without conflict abort, add to the allow list. Reviewed-by: Christoph Lameter (Ampere) Reviewed-by: Ryan Roberts Acked-by: Catalin Marinas Signed-off-by: Yang Shi Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index c6cbeee48c57..cf2dd5ea173f 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2236,6 +2236,8 @@ static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int sco MIDR_REV_RANGE(MIDR_CORTEX_X4, 0, 3, 0xf), MIDR_REV_RANGE(MIDR_NEOVERSE_V3, 0, 2, 0xf), MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS), + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + MIDR_ALL_VERSIONS(MIDR_AMPERE1A), {} }; From a660194dd101e937c319171ad99c3fbe466fd825 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Wed, 17 Sep 2025 12:02:07 -0700 Subject: [PATCH 66/93] arm64: Enable permission change on arm64 kernel block mappings This patch paves the path to enable huge mappings in vmalloc space and linear map space by default on arm64. For this we must ensure that we can handle any permission games on the kernel (init_mm) pagetable. 
Previously, __change_memory_common() used apply_to_page_range() which does not support changing permissions for block mappings. We move away from this by using the pagewalk API, similar to what riscv does right now. It is the responsibility of the caller to ensure that the range over which permissions are being changed falls on leaf mapping boundaries. For systems with BBML2, this will be handled in future patches by dyanmically splitting the mappings when required. Unlike apply_to_page_range(), the pagewalk API currently enforces the init_mm.mmap_lock to be held. To avoid the unnecessary bottleneck of the mmap_lock for our usecase, this patch extends this generic API to be used locklessly, so as to retain the existing behaviour for changing permissions. Apart from this reason, it is noted at [1] that KFENCE can manipulate kernel pgtable entries during softirqs. It does this by calling set_memory_valid() -> __change_memory_common(). This being a non-sleepable context, we cannot take the init_mm mmap lock. Add comments to highlight the conditions under which we can use the lockless variant - no underlying VMA, and the user having exclusive control over the range, thus guaranteeing no concurrent access. We require that the start and end of a given range do not partially overlap block mappings, or cont mappings. Return -EINVAL in case a partial block mapping is detected in any of the PGD/P4D/PUD/PMD levels; add a corresponding comment in update_range_prot() to warn that eliminating such a condition is the responsibility of the caller. Note that, the pte level callback may change permissions for a whole contpte block, and that will be done one pte at a time, as opposed to an atomic operation for the block mappings. This is fine as any access will decode either the old or the new permission until the TLBI. apply_to_page_range() currently performs all pte level callbacks while in lazy mmu mode. Since arm64 can optimize performance by batching barriers when modifying kernel pgtables in lazy mmu mode, we would like to continue to benefit from this optimisation. Unfortunately walk_kernel_page_table_range() does not use lazy mmu mode. However, since the pagewalk framework is not allocating any memory, we can safely bracket the whole operation inside lazy mmu mode ourselves. Therefore, wrap the call to walk_kernel_page_table_range() with the lazy MMU helpers. 
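To make the "exclusive control over the range" condition concrete, here is a
minimal sketch of the sort of caller the lockless walk is intended for
(hypothetical driver code, error handling trimmed; "blob" is a stand-in for
whatever data is being sealed):

  /*
   * The mapping is private to this code path, so nothing else can be
   * concurrently changing or freeing the kernel page tables for it.
   */
  void *buf = vmalloc(4 * PAGE_SIZE);

  if (!buf)
	  return -ENOMEM;
  memcpy(buf, blob, blob_size);
  set_memory_ro((unsigned long)buf, 4);	/* ends up in __change_memory_common() */
  /* ... consume the now read-only data ... */
  set_memory_rw((unsigned long)buf, 4);
  vfree(buf);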
Link: https://lore.kernel.org/linux-arm-kernel/89d0ad18-4772-4d8f-ae8a-7c48d26a927e@arm.com/ [1] Signed-off-by: Dev Jain Signed-off-by: Yang Shi Reviewed-by: Ryan Roberts Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/mm/pageattr.c | 123 ++++++++++++++++++++++++++++----------- include/linux/pagewalk.h | 3 + mm/pagewalk.c | 36 ++++++++---- 3 files changed, 117 insertions(+), 45 deletions(-) diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 667aff1efe49..c0648764c403 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -20,6 +21,65 @@ struct page_change_data { pgprot_t clear_mask; }; +static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk) +{ + struct page_change_data *masks = walk->private; + + val &= ~(pgprot_val(masks->clear_mask)); + val |= (pgprot_val(masks->set_mask)); + + return val; +} + +static int pageattr_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pud_t val = pudp_get(pud); + + if (pud_sect(val)) { + if (WARN_ON_ONCE((next - addr) != PUD_SIZE)) + return -EINVAL; + val = __pud(set_pageattr_masks(pud_val(val), walk)); + set_pud(pud, val); + walk->action = ACTION_CONTINUE; + } + + return 0; +} + +static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pmd_t val = pmdp_get(pmd); + + if (pmd_sect(val)) { + if (WARN_ON_ONCE((next - addr) != PMD_SIZE)) + return -EINVAL; + val = __pmd(set_pageattr_masks(pmd_val(val), walk)); + set_pmd(pmd, val); + walk->action = ACTION_CONTINUE; + } + + return 0; +} + +static int pageattr_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t val = __ptep_get(pte); + + val = __pte(set_pageattr_masks(pte_val(val), walk)); + __set_pte(pte, val); + + return 0; +} + +static const struct mm_walk_ops pageattr_ops = { + .pud_entry = pageattr_pud_entry, + .pmd_entry = pageattr_pmd_entry, + .pte_entry = pageattr_pte_entry, +}; + bool rodata_full __ro_after_init = true; bool can_set_direct_map(void) @@ -37,23 +97,8 @@ bool can_set_direct_map(void) arm64_kfence_can_set_direct_map() || is_realm_world(); } -static int change_page_range(pte_t *ptep, unsigned long addr, void *data) -{ - struct page_change_data *cdata = data; - pte_t pte = __ptep_get(ptep); - - pte = clear_pte_bit(pte, cdata->clear_mask); - pte = set_pte_bit(pte, cdata->set_mask); - - __set_pte(ptep, pte); - return 0; -} - -/* - * This function assumes that the range is mapped with PAGE_SIZE pages. - */ -static int __change_memory_common(unsigned long start, unsigned long size, - pgprot_t set_mask, pgprot_t clear_mask) +static int update_range_prot(unsigned long start, unsigned long size, + pgprot_t set_mask, pgprot_t clear_mask) { struct page_change_data data; int ret; @@ -61,8 +106,26 @@ static int __change_memory_common(unsigned long start, unsigned long size, data.set_mask = set_mask; data.clear_mask = clear_mask; - ret = apply_to_page_range(&init_mm, start, size, change_page_range, - &data); + arch_enter_lazy_mmu_mode(); + + /* + * The caller must ensure that the range we are operating on does not + * partially overlap a block mapping, or a cont mapping. Any such case + * must be eliminated by splitting the mapping. 
+ */ + ret = walk_kernel_page_table_range_lockless(start, start + size, + &pageattr_ops, NULL, &data); + arch_leave_lazy_mmu_mode(); + + return ret; +} + +static int __change_memory_common(unsigned long start, unsigned long size, + pgprot_t set_mask, pgprot_t clear_mask) +{ + int ret; + + ret = update_range_prot(start, size, set_mask, clear_mask); /* * If the memory is being made valid without changing any other bits @@ -174,32 +237,26 @@ int set_memory_valid(unsigned long addr, int numpages, int enable) int set_direct_map_invalid_noflush(struct page *page) { - struct page_change_data data = { - .set_mask = __pgprot(0), - .clear_mask = __pgprot(PTE_VALID), - }; + pgprot_t clear_mask = __pgprot(PTE_VALID); + pgprot_t set_mask = __pgprot(0); if (!can_set_direct_map()) return 0; - return apply_to_page_range(&init_mm, - (unsigned long)page_address(page), - PAGE_SIZE, change_page_range, &data); + return update_range_prot((unsigned long)page_address(page), + PAGE_SIZE, set_mask, clear_mask); } int set_direct_map_default_noflush(struct page *page) { - struct page_change_data data = { - .set_mask = __pgprot(PTE_VALID | PTE_WRITE), - .clear_mask = __pgprot(PTE_RDONLY), - }; + pgprot_t set_mask = __pgprot(PTE_VALID | PTE_WRITE); + pgprot_t clear_mask = __pgprot(PTE_RDONLY); if (!can_set_direct_map()) return 0; - return apply_to_page_range(&init_mm, - (unsigned long)page_address(page), - PAGE_SIZE, change_page_range, &data); + return update_range_prot((unsigned long)page_address(page), + PAGE_SIZE, set_mask, clear_mask); } static int __set_memory_enc_dec(unsigned long addr, diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 682472c15495..88e18615dd72 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -134,6 +134,9 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, int walk_kernel_page_table_range(unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private); +int walk_kernel_page_table_range_lockless(unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + pgd_t *pgd, void *private); int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 648038247a8d..936689d8bcac 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -606,10 +606,32 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, int walk_kernel_page_table_range(unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private) { - struct mm_struct *mm = &init_mm; + /* + * Kernel intermediate page tables are usually not freed, so the mmap + * read lock is sufficient. But there are some exceptions. + * E.g. memory hot-remove. In which case, the mmap lock is insufficient + * to prevent the intermediate kernel pages tables belonging to the + * specified address range from being freed. The caller should take + * other actions to prevent this race. + */ + mmap_assert_locked(&init_mm); + + return walk_kernel_page_table_range_lockless(start, end, ops, pgd, + private); +} + +/* + * Use this function to walk the kernel page tables locklessly. It should be + * guaranteed that the caller has exclusive access over the range they are + * operating on - that there should be no concurrent access, for example, + * changing permissions for vmalloc objects. 
+ */ +int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end, + const struct mm_walk_ops *ops, pgd_t *pgd, void *private) +{ struct mm_walk walk = { .ops = ops, - .mm = mm, + .mm = &init_mm, .pgd = pgd, .private = private, .no_vma = true @@ -620,16 +642,6 @@ int walk_kernel_page_table_range(unsigned long start, unsigned long end, if (!check_ops_valid(ops)) return -EINVAL; - /* - * Kernel intermediate page tables are usually not freed, so the mmap - * read lock is sufficient. But there are some exceptions. - * E.g. memory hot-remove. In which case, the mmap lock is insufficient - * to prevent the intermediate kernel pages tables belonging to the - * specified address range from being freed. The caller should take - * other actions to prevent this race. - */ - mmap_assert_locked(mm); - return walk_pgd_range(start, end, &walk); } From a166563e7ec375b38a0fd3a58f7b77e50a6bc6a8 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 17 Sep 2025 12:02:09 -0700 Subject: [PATCH 67/93] arm64: mm: support large block mapping when rodata=full When rodata=full is specified, kernel linear mapping has to be mapped at PTE level since large page table can't be split due to break-before-make rule on ARM64. This resulted in a couple of problems: - performance degradation - more TLB pressure - memory waste for kernel page table With FEAT_BBM level 2 support, splitting large block page table to smaller ones doesn't need to make the page table entry invalid anymore. This allows kernel split large block mapping on the fly. Add kernel page table split support and use large block mapping by default when FEAT_BBM level 2 is supported for rodata=full. When changing permissions for kernel linear mapping, the page table will be split to smaller size. The machine without FEAT_BBM level 2 will fallback to have kernel linear mapping PTE-mapped when rodata=full. With this we saw significant performance boost with some benchmarks and much less memory consumption on my AmpereOne machine (192 cores, 1P) with 256GB memory. * Memory use after boot Before: MemTotal: 258988984 kB MemFree: 254821700 kB After: MemTotal: 259505132 kB MemFree: 255410264 kB Around 500MB more memory are free to use. The larger the machine, the more memory saved. * Memcached We saw performance degradation when running Memcached benchmark with rodata=full vs rodata=on. Our profiling pointed to kernel TLB pressure. With this patchset we saw ops/sec is increased by around 3.5%, P99 latency is reduced by around 9.6%. The gain mainly came from reduced kernel TLB misses. The kernel TLB MPKI is reduced by 28.5%. The benchmark data is now on par with rodata=on too. * Disk encryption (dm-crypt) benchmark Ran fio benchmark with the below command on a 128G ramdisk (ext4) with disk encryption (by dm-crypt). fio --directory=/data --random_generator=lfsr --norandommap \ --randrepeat 1 --status-interval=999 --rw=write --bs=4k --loops=1 \ --ioengine=sync --iodepth=1 --numjobs=1 --fsync_on_close=1 \ --group_reporting --thread --name=iops-test-job --eta-newline=1 \ --size 100G The IOPS is increased by 90% - 150% (the variance is high, but the worst number of good case is around 90% more than the best number of bad case). The bandwidth is increased and the avg clat is reduced proportionally. * Sequential file read Read 100G file sequentially on XFS (xfs_io read with page cache populated). The bandwidth is increased by 150%. 
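The page-table saving also matches simple arithmetic (illustrative, for 4 KiB
pages): PTE-mapping 256 GiB of linear map needs 256 GiB / 4 KiB = 64M entries,
i.e. 64M * 8 bytes = 512 MiB of last-level tables, whereas PMD block mappings
need only 256 GiB / 2 MiB * 8 bytes = 1 MiB, which lines up with the ~500 MB
difference in MemFree above.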
Co-developed-by: Ryan Roberts Signed-off-by: Ryan Roberts Reviewed-by: Catalin Marinas Signed-off-by: Yang Shi Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpufeature.h | 2 + arch/arm64/include/asm/mmu.h | 1 + arch/arm64/include/asm/pgtable.h | 5 + arch/arm64/kernel/cpufeature.c | 7 +- arch/arm64/mm/mmu.c | 264 +++++++++++++++++++++++++++- arch/arm64/mm/pageattr.c | 4 + 6 files changed, 277 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index bf13d676aae2..e223cbf350e4 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -871,6 +871,8 @@ static inline bool system_supports_pmuv3(void) return cpus_have_final_cap(ARM64_HAS_PMUV3); } +bool cpu_supports_bbml2_noabort(void); + static inline bool system_supports_bbml2_noabort(void) { return alternative_has_cap_unlikely(ARM64_HAS_BBML2_NOABORT); diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 6e8aa8e72601..56fca81f60ad 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -71,6 +71,7 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, pgprot_t prot, bool page_mappings_only); extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot); extern void mark_linear_text_alias_ro(void); +extern int split_kernel_leaf_mapping(unsigned long start, unsigned long end); /* * This check is triggered during the early boot before the cpufeature diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index abd2dee416b3..aa89c2e67ebc 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -371,6 +371,11 @@ static inline pmd_t pmd_mkcont(pmd_t pmd) return __pmd(pmd_val(pmd) | PMD_SECT_CONT); } +static inline pmd_t pmd_mknoncont(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) & ~PMD_SECT_CONT); +} + #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pte_uffd_wp(pte_t pte) { diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9ad065f15f1d..e15472beff3f 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2217,7 +2217,7 @@ static bool hvhe_possible(const struct arm64_cpu_capabilities *entry, return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_HVHE); } -static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int scope) +bool cpu_supports_bbml2_noabort(void) { /* * We want to allow usage of BBML2 in as wide a range of kernel contexts @@ -2249,6 +2249,11 @@ static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int sco return true; } +static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int scope) +{ + return cpu_supports_bbml2_noabort(); +} + #ifdef CONFIG_ARM64_PAN static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused) { diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index de463040582c..a7b29daf1a38 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -481,6 +481,8 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, int flags); #endif +#define INVALID_PHYS_ADDR (-1ULL) + static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, enum pgtable_type pgtable_type) { @@ -488,7 +490,9 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0); phys_addr_t pa; - BUG_ON(!ptdesc); + if (!ptdesc) + return INVALID_PHYS_ADDR; + pa = 
page_to_phys(ptdesc_page(ptdesc)); switch (pgtable_type) { @@ -509,16 +513,256 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, return pa; } +static phys_addr_t +try_pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type) +{ + return __pgd_pgtable_alloc(&init_mm, pgtable_type); +} + static phys_addr_t __maybe_unused pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type) { - return __pgd_pgtable_alloc(&init_mm, pgtable_type); + phys_addr_t pa; + + pa = __pgd_pgtable_alloc(&init_mm, pgtable_type); + BUG_ON(pa == INVALID_PHYS_ADDR); + return pa; } static phys_addr_t pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type) { - return __pgd_pgtable_alloc(NULL, pgtable_type); + phys_addr_t pa; + + pa = __pgd_pgtable_alloc(NULL, pgtable_type); + BUG_ON(pa == INVALID_PHYS_ADDR); + return pa; +} + +static void split_contpte(pte_t *ptep) +{ + int i; + + ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); + for (i = 0; i < CONT_PTES; i++, ptep++) + __set_pte(ptep, pte_mknoncont(__ptep_get(ptep))); +} + +static int split_pmd(pmd_t *pmdp, pmd_t pmd) +{ + pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF; + unsigned long pfn = pmd_pfn(pmd); + pgprot_t prot = pmd_pgprot(pmd); + phys_addr_t pte_phys; + pte_t *ptep; + int i; + + pte_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PTE); + if (pte_phys == INVALID_PHYS_ADDR) + return -ENOMEM; + ptep = (pte_t *)phys_to_virt(pte_phys); + + if (pgprot_val(prot) & PMD_SECT_PXN) + tableprot |= PMD_TABLE_PXN; + + prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) | PTE_TYPE_PAGE); + prot = __pgprot(pgprot_val(prot) | PTE_CONT); + + for (i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++) + __set_pte(ptep, pfn_pte(pfn, prot)); + + /* + * Ensure the pte entries are visible to the table walker by the time + * the pmd entry that points to the ptes is visible. + */ + dsb(ishst); + __pmd_populate(pmdp, pte_phys, tableprot); + + return 0; +} + +static void split_contpmd(pmd_t *pmdp) +{ + int i; + + pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS); + for (i = 0; i < CONT_PMDS; i++, pmdp++) + set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp))); +} + +static int split_pud(pud_t *pudp, pud_t pud) +{ + pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF; + unsigned int step = PMD_SIZE >> PAGE_SHIFT; + unsigned long pfn = pud_pfn(pud); + pgprot_t prot = pud_pgprot(pud); + phys_addr_t pmd_phys; + pmd_t *pmdp; + int i; + + pmd_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PMD); + if (pmd_phys == INVALID_PHYS_ADDR) + return -ENOMEM; + pmdp = (pmd_t *)phys_to_virt(pmd_phys); + + if (pgprot_val(prot) & PMD_SECT_PXN) + tableprot |= PUD_TABLE_PXN; + + prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT); + prot = __pgprot(pgprot_val(prot) | PTE_CONT); + + for (i = 0; i < PTRS_PER_PMD; i++, pmdp++, pfn += step) + set_pmd(pmdp, pfn_pmd(pfn, prot)); + + /* + * Ensure the pmd entries are visible to the table walker by the time + * the pud entry that points to the pmds is visible. + */ + dsb(ishst); + __pud_populate(pudp, pmd_phys, tableprot); + + return 0; +} + +static int split_kernel_leaf_mapping_locked(unsigned long addr) +{ + pgd_t *pgdp, pgd; + p4d_t *p4dp, p4d; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; + int ret = 0; + + /* + * PGD: If addr is PGD aligned then addr already describes a leaf + * boundary. If not present then there is nothing to split. 
+ */ + if (ALIGN_DOWN(addr, PGDIR_SIZE) == addr) + goto out; + pgdp = pgd_offset_k(addr); + pgd = pgdp_get(pgdp); + if (!pgd_present(pgd)) + goto out; + + /* + * P4D: If addr is P4D aligned then addr already describes a leaf + * boundary. If not present then there is nothing to split. + */ + if (ALIGN_DOWN(addr, P4D_SIZE) == addr) + goto out; + p4dp = p4d_offset(pgdp, addr); + p4d = p4dp_get(p4dp); + if (!p4d_present(p4d)) + goto out; + + /* + * PUD: If addr is PUD aligned then addr already describes a leaf + * boundary. If not present then there is nothing to split. Otherwise, + * if we have a pud leaf, split to contpmd. + */ + if (ALIGN_DOWN(addr, PUD_SIZE) == addr) + goto out; + pudp = pud_offset(p4dp, addr); + pud = pudp_get(pudp); + if (!pud_present(pud)) + goto out; + if (pud_leaf(pud)) { + ret = split_pud(pudp, pud); + if (ret) + goto out; + } + + /* + * CONTPMD: If addr is CONTPMD aligned then addr already describes a + * leaf boundary. If not present then there is nothing to split. + * Otherwise, if we have a contpmd leaf, split to pmd. + */ + if (ALIGN_DOWN(addr, CONT_PMD_SIZE) == addr) + goto out; + pmdp = pmd_offset(pudp, addr); + pmd = pmdp_get(pmdp); + if (!pmd_present(pmd)) + goto out; + if (pmd_leaf(pmd)) { + if (pmd_cont(pmd)) + split_contpmd(pmdp); + /* + * PMD: If addr is PMD aligned then addr already describes a + * leaf boundary. Otherwise, split to contpte. + */ + if (ALIGN_DOWN(addr, PMD_SIZE) == addr) + goto out; + ret = split_pmd(pmdp, pmd); + if (ret) + goto out; + } + + /* + * CONTPTE: If addr is CONTPTE aligned then addr already describes a + * leaf boundary. If not present then there is nothing to split. + * Otherwise, if we have a contpte leaf, split to pte. + */ + if (ALIGN_DOWN(addr, CONT_PTE_SIZE) == addr) + goto out; + ptep = pte_offset_kernel(pmdp, addr); + pte = __ptep_get(ptep); + if (!pte_present(pte)) + goto out; + if (pte_cont(pte)) + split_contpte(ptep); + +out: + return ret; +} + +static DEFINE_MUTEX(pgtable_split_lock); + +int split_kernel_leaf_mapping(unsigned long start, unsigned long end) +{ + int ret; + + /* + * !BBML2_NOABORT systems should not be trying to change permissions on + * anything that is not pte-mapped in the first place. Just return early + * and let the permission change code raise a warning if not already + * pte-mapped. + */ + if (!system_supports_bbml2_noabort()) + return 0; + + /* + * Ensure start and end are at least page-aligned since this is the + * finest granularity we can split to. + */ + if (start != PAGE_ALIGN(start) || end != PAGE_ALIGN(end)) + return -EINVAL; + + mutex_lock(&pgtable_split_lock); + arch_enter_lazy_mmu_mode(); + + /* + * The split_kernel_leaf_mapping_locked() may sleep, it is not a + * problem for ARM64 since ARM64's lazy MMU implementation allows + * sleeping. + * + * Optimize for the common case of splitting out a single page from a + * larger mapping. Here we can just split on the "least aligned" of + * start and end and this will guarantee that there must also be a split + * on the more aligned address since the both addresses must be in the + * same contpte block and it must have been split to ptes. + */ + if (end - start == PAGE_SIZE) { + start = __ffs(start) < __ffs(end) ? 
start : end; + ret = split_kernel_leaf_mapping_locked(start); + } else { + ret = split_kernel_leaf_mapping_locked(start); + if (!ret) + ret = split_kernel_leaf_mapping_locked(end); + } + + arch_leave_lazy_mmu_mode(); + mutex_unlock(&pgtable_split_lock); + return ret; } /* @@ -640,6 +884,16 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) { #endif /* CONFIG_KFENCE */ +static inline bool force_pte_mapping(void) +{ + bool bbml2 = system_capabilities_finalized() ? + system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort(); + + return (!bbml2 && (rodata_full || arm64_kfence_can_set_direct_map() || + is_realm_world())) || + debug_pagealloc_enabled(); +} + static void __init map_mem(pgd_t *pgdp) { static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN); @@ -665,7 +919,7 @@ static void __init map_mem(pgd_t *pgdp) early_kfence_pool = arm64_kfence_alloc_pool(); - if (can_set_direct_map()) + if (force_pte_mapping()) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; /* @@ -1368,7 +1622,7 @@ int arch_add_memory(int nid, u64 start, u64 size, VM_BUG_ON(!mhp_range_allowed(start, size, true)); - if (can_set_direct_map()) + if (force_pte_mapping()) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index c0648764c403..5135f2d66958 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -106,6 +106,10 @@ static int update_range_prot(unsigned long start, unsigned long size, data.set_mask = set_mask; data.clear_mask = clear_mask; + ret = split_kernel_leaf_mapping(start, start + size); + if (WARN_ON_ONCE(ret)) + return ret; + arch_enter_lazy_mmu_mode(); /* From fa84e534c3ec2904d8718a83180294f7b5afecc7 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 18 Sep 2025 13:56:16 +0100 Subject: [PATCH 68/93] arm64: realm: ioremap: Allow mapping memory as encrypted For ioremap(), so far we only checked if it was a device (RIPAS_DEV) to choose an encrypted vs decrypted mapping. However, we may have firmware reserved memory regions exposed to the OS (e.g., EFI Coco Secret Securityfs, ACPI CCEL). We need to make sure that anything that is RIPAS_RAM (i.e., Guest protected memory with RMM guarantees) are also mapped as encrypted. Rephrasing the above, anything that is not RIPAS_EMPTY is guaranteed to be protected by the RMM. Thus we choose encrypted mapping for anything that is not RIPAS_EMPTY. While at it, rename the helper function __arm64_is_protected_mmio => arm64_rsi_is_protected to clearly indicate that this not an arm64 generic helper, but something to do with Realms. 
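In effect, the ioremap hook updated in rsi.c below reduces to the
following policy for a Realm guest (sketch only, see the hunk for the
real code):

	if (arm64_rsi_is_protected(phys, size))	/* any RIPAS other than EMPTY */
		*prot = pgprot_encrypted(*prot);
	else
		*prot = pgprot_decrypted(*prot);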
Cc: Sami Mujawar Cc: Will Deacon Cc: Catalin Marinas Cc: Aneesh Kumar K.V Cc: Steven Price Reviewed-by: Gavin Shan Reviewed-by: Steven Price Tested-by: Sami Mujawar Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon --- arch/arm64/include/asm/io.h | 2 +- arch/arm64/include/asm/rsi.h | 2 +- arch/arm64/kernel/rsi.c | 26 ++++++++++++++++++++++---- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 9b96840fb979..82276282a3c7 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -311,7 +311,7 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, static inline bool arm64_is_protected_mmio(phys_addr_t phys_addr, size_t size) { if (unlikely(is_realm_world())) - return __arm64_is_protected_mmio(phys_addr, size); + return arm64_rsi_is_protected(phys_addr, size); return false; } diff --git a/arch/arm64/include/asm/rsi.h b/arch/arm64/include/asm/rsi.h index b42aeac05340..88b50d660e85 100644 --- a/arch/arm64/include/asm/rsi.h +++ b/arch/arm64/include/asm/rsi.h @@ -16,7 +16,7 @@ DECLARE_STATIC_KEY_FALSE(rsi_present); void __init arm64_rsi_init(void); -bool __arm64_is_protected_mmio(phys_addr_t base, size_t size); +bool arm64_rsi_is_protected(phys_addr_t base, size_t size); static inline bool is_realm_world(void) { diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index ce4778141ec7..c64a06f58c0b 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -84,7 +84,25 @@ static void __init arm64_rsi_setup_memory(void) } } -bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) +/* + * Check if a given PA range is Trusted (e.g., Protected memory, a Trusted Device + * mapping, or an MMIO emulated in the Realm world). + * + * We can rely on the RIPAS value of the region to detect if a given region is + * protected. + * + * RIPAS_DEV - A trusted device memory or a trusted emulated MMIO (in the Realm + * world + * RIPAS_RAM - Memory (RAM), protected by the RMM guarantees. (e.g., Firmware + * reserved regions for data sharing). + * + * RIPAS_DESTROYED is a special case of one of the above, where the host did + * something without our permission and as such we can't do anything about it. + * + * The only case where something is emulated by the untrusted hypervisor or is + * backed by shared memory is indicated by RSI_RIPAS_EMPTY. + */ +bool arm64_rsi_is_protected(phys_addr_t base, size_t size) { enum ripas ripas; phys_addr_t end, top; @@ -101,18 +119,18 @@ bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) break; if (WARN_ON(top <= base)) break; - if (ripas != RSI_RIPAS_DEV) + if (ripas == RSI_RIPAS_EMPTY) break; base = top; } return base >= end; } -EXPORT_SYMBOL(__arm64_is_protected_mmio); +EXPORT_SYMBOL(arm64_rsi_is_protected); static int realm_ioremap_hook(phys_addr_t phys, size_t size, pgprot_t *prot) { - if (__arm64_is_protected_mmio(phys, size)) + if (arm64_rsi_is_protected(phys, size)) *prot = pgprot_encrypted(*prot); else *prot = pgprot_decrypted(*prot); From 9e8a3df3e7f762966762a6fbf3282b9da2074127 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 18 Sep 2025 13:56:17 +0100 Subject: [PATCH 69/93] arm64: Enable EFI secret area Securityfs support Enable EFI COCO secrets support. Provide the ioremap_encrypted() support required by the driver. 
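With the definition added below, code that needs a protected (encrypted)
mapping of such firmware-reserved memory can simply do, for example:

	void __iomem *base = ioremap_encrypted(phys, size);
	/* on arm64 this expands to ioremap_prot(phys, size, PAGE_KERNEL) */

which is the interface the EFI secret area driver requires.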
Cc: Sami Mujawar Cc: Will Deacon Cc: Catalin Marinas Cc: Aneesh Kumar K.V Cc: Steven Price Reviewed-by: Gavin Shan Tested-by: Sami Mujawar Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon --- arch/arm64/include/asm/io.h | 4 ++++ drivers/virt/coco/efi_secret/Kconfig | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 82276282a3c7..83e03abbb2ca 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -274,6 +274,10 @@ int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook); #define ioremap_np(addr, size) \ ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRnE)) + +#define ioremap_encrypted(addr, size) \ + ioremap_prot((addr), (size), PAGE_KERNEL) + /* * io{read,write}{16,32,64}be() macros */ diff --git a/drivers/virt/coco/efi_secret/Kconfig b/drivers/virt/coco/efi_secret/Kconfig index 4404d198f3b2..94d88e5da707 100644 --- a/drivers/virt/coco/efi_secret/Kconfig +++ b/drivers/virt/coco/efi_secret/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config EFI_SECRET tristate "EFI secret area securityfs support" - depends on EFI && X86_64 + depends on EFI && (X86_64 || ARM64) select EFI_COCO_SECRET select SECURITYFS help From d02c2e45b1e7767b177f6854026e4ad0d70b4a4d Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 18 Sep 2025 13:56:18 +0100 Subject: [PATCH 70/93] arm64: acpi: Enable ACPI CCEL support Add support for ACPI CCEL by handling the EfiACPIMemoryNVS type memory. As per UEFI specifications NVS memory is reserved for Firmware use even after exiting boot services. Thus map the region as read-only. Cc: Sami Mujawar Cc: Will Deacon Cc: Catalin Marinas Cc: Aneesh Kumar K.V Cc: Steven Price Cc: Sudeep Holla Cc: Gavin Shan Reviewed-by: Gavin Shan Tested-by: Sami Mujawar Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon --- arch/arm64/kernel/acpi.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 4d529ff7ba51..b3195b3b895f 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -357,6 +357,16 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) * as long as we take care not to create a writable * mapping for executable code. */ + fallthrough; + + case EFI_ACPI_MEMORY_NVS: + /* + * ACPI NVS marks an area reserved for use by the + * firmware, even after exiting the boot service. + * This may be used by the firmware for sharing dynamic + * tables/data (e.g., ACPI CCEL) with the OS. Map it + * as read-only. + */ prot = PAGE_KERNEL_RO; break; From 777fb19ed8d6f193c2811b76371ffb41acbca1da Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 18 Sep 2025 20:42:07 +0100 Subject: [PATCH 71/93] kselftest/arm64: Add lsfe to the hwcaps test This feature has no traps associated with it so the SIGILL is not reliable. 
Signed-off-by: Mark Brown Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/hwcap.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index 27d4790c2f0c..3b96d090c5eb 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -17,6 +17,8 @@ #include #include +#include + #include "../../kselftest.h" #define TESTS_PER_HWCAP 3 @@ -168,6 +170,18 @@ static void lse128_sigill(void) : "cc", "memory"); } +static void lsfe_sigill(void) +{ + float __attribute__ ((aligned (16))) mem; + register float *memp asm ("x0") = &mem; + + /* STFADD H0, [X0] */ + asm volatile(".inst 0x7c20801f" + : "+r" (memp) + : + : "memory"); +} + static void lut_sigill(void) { /* LUTI2 V0.16B, { V0.16B }, V[0] */ @@ -761,6 +775,13 @@ static const struct hwcap_data { .cpuinfo = "lse128", .sigill_fn = lse128_sigill, }, + { + .name = "LSFE", + .at_hwcap = AT_HWCAP3, + .hwcap_bit = HWCAP3_LSFE, + .cpuinfo = "lsfe", + .sigill_fn = lsfe_sigill, + }, { .name = "LUT", .at_hwcap = AT_HWCAP2, From 92d051a1c1e30d6f8655a3fab91e19fdad72040b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 19 Sep 2025 14:56:43 +0100 Subject: [PATCH 72/93] arm64: Kconfig: Spell out "ARMv9.4" in menuconfig text The menuconfig entries to configure various architectural features are all formatted as "ARMvx.y architecture features" with the unusual exception of 9.4, which omits the "ARM" prefix. Add the "ARM" prefix to the menuconfig entry for the ARMv9.4 architectural features. Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a6..514038b18eba 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2218,7 +2218,7 @@ config ARM64_HAFT endmenu # "ARMv8.9 architectural features" -menu "v9.4 architectural features" +menu "ARMv9.4 architectural features" config ARM64_GCS bool "Enable support for Guarded Control Stack (GCS)" @@ -2237,7 +2237,7 @@ config ARM64_GCS The feature is detected at runtime, and will remain disabled if the system does not implement the feature. -endmenu # "v9.4 architectural features" +endmenu # "ARMv9.4 architectural features" config ARM64_SVE bool "ARM Scalable Vector Extension support" From 3df6979d222b8638a60aa6921b73cb5f3e4f5dcb Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 17 Sep 2025 12:02:10 -0700 Subject: [PATCH 73/93] arm64: mm: split linear mapping if BBML2 unsupported on secondary CPUs The kernel linear mapping is painted in very early stage of system boot. The cpufeature has not been finalized yet at this point. So the linear mapping is determined by the capability of boot CPU only. If the boot CPU supports BBML2, large block mappings will be used for linear mapping. But the secondary CPUs may not support BBML2, so repaint the linear mapping if large block mapping is used and the secondary CPUs don't support BBML2 once cpufeature is finalized on all CPUs. If the boot CPU doesn't support BBML2 or the secondary CPUs have the same BBML2 capability with the boot CPU, repainting the linear mapping is not needed. Repainting is implemented by the boot CPU, which we know supports BBML2, so it is safe for the live mapping size to change for this CPU. The linear map region is walked using the pagewalk API and any discovered large leaf mappings are split to pte mappings using the existing helper functions. 
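Concretely (names as introduced by the mmu.c hunks below, shown here
only as a sketch), the repaint plugs the existing split helpers into a
set of mm_walk_ops and walks the linear map, minus the kernel text
alias, without taking the mmap lock:

	static const struct mm_walk_ops split_to_ptes_ops = {
		.pud_entry = split_to_ptes_pud_entry,	/* PUD leaf -> table of PMDs */
		.pmd_entry = split_to_ptes_pmd_entry,	/* (cont)PMD leaf -> table of PTEs */
		.pte_entry = split_to_ptes_pte_entry,	/* clear the CONT bit on contptes */
	};

	walk_kernel_page_table_range_lockless(lstart, kstart, &split_to_ptes_ops, NULL, NULL);
	walk_kernel_page_table_range_lockless(kend, lend, &split_to_ptes_ops, NULL, NULL);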
Since the repainting is performed inside of a stop_machine(), we must use GFP_ATOMIC to allocate the extra intermediate pgtables. But since we are still early in boot, it is expected that there is plenty of memory available so we will never need to sleep for reclaim, and so GFP_ATOMIC is acceptable here. The secondary CPUs are all put into a waiting area with the idmap in TTBR0 and reserved map in TTBR1 while this is performed since they cannot be allowed to observe any size changes on the live mappings. Some of this infrastructure is reused from the kpti case. Specifically we share the same flag (was __idmap_kpti_flag, now idmap_kpti_bbml2_flag) since it means we don't have to reserve any extra pgtable memory to idmap the extra flag. Co-developed-by: Yang Shi Signed-off-by: Yang Shi Signed-off-by: Ryan Roberts Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/include/asm/mmu.h | 2 + arch/arm64/kernel/cpufeature.c | 3 + arch/arm64/mm/mmu.c | 182 +++++++++++++++++++++++++++++---- arch/arm64/mm/proc.S | 27 +++-- 4 files changed, 187 insertions(+), 27 deletions(-) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 56fca81f60ad..2acfa7801d02 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -72,6 +72,8 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot); extern void mark_linear_text_alias_ro(void); extern int split_kernel_leaf_mapping(unsigned long start, unsigned long end); +extern void init_idmap_kpti_bbml2_flag(void); +extern void linear_map_maybe_split_to_ptes(void); /* * This check is triggered during the early boot before the cpufeature diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index e15472beff3f..3780f343fd2c 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -85,6 +85,7 @@ #include #include #include +#include #include #include #include @@ -2027,6 +2028,7 @@ static void __init kpti_install_ng_mappings(void) if (arm64_use_ng_mappings) return; + init_idmap_kpti_bbml2_flag(); stop_machine(__kpti_install_ng_mappings, NULL, cpu_online_mask); } @@ -3928,6 +3930,7 @@ void __init setup_system_features(void) { setup_system_capabilities(); + linear_map_maybe_split_to_ptes(); kpti_install_ng_mappings(); sve_setup(); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a7b29daf1a38..431ed90914bb 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include #include @@ -483,11 +485,11 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, #define INVALID_PHYS_ADDR (-1ULL) -static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, +static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp, enum pgtable_type pgtable_type) { /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. 
*/ - struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0); + struct ptdesc *ptdesc = pagetable_alloc(gfp & ~__GFP_ZERO, 0); phys_addr_t pa; if (!ptdesc) @@ -514,9 +516,9 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, } static phys_addr_t -try_pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type) +try_pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type, gfp_t gfp) { - return __pgd_pgtable_alloc(&init_mm, pgtable_type); + return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_type); } static phys_addr_t __maybe_unused @@ -524,7 +526,7 @@ pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type) { phys_addr_t pa; - pa = __pgd_pgtable_alloc(&init_mm, pgtable_type); + pa = __pgd_pgtable_alloc(&init_mm, GFP_PGTABLE_KERNEL, pgtable_type); BUG_ON(pa == INVALID_PHYS_ADDR); return pa; } @@ -534,7 +536,7 @@ pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type) { phys_addr_t pa; - pa = __pgd_pgtable_alloc(NULL, pgtable_type); + pa = __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type); BUG_ON(pa == INVALID_PHYS_ADDR); return pa; } @@ -548,7 +550,7 @@ static void split_contpte(pte_t *ptep) __set_pte(ptep, pte_mknoncont(__ptep_get(ptep))); } -static int split_pmd(pmd_t *pmdp, pmd_t pmd) +static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont) { pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF; unsigned long pfn = pmd_pfn(pmd); @@ -557,7 +559,7 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd) pte_t *ptep; int i; - pte_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PTE); + pte_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PTE, gfp); if (pte_phys == INVALID_PHYS_ADDR) return -ENOMEM; ptep = (pte_t *)phys_to_virt(pte_phys); @@ -566,7 +568,9 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd) tableprot |= PMD_TABLE_PXN; prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) | PTE_TYPE_PAGE); - prot = __pgprot(pgprot_val(prot) | PTE_CONT); + prot = __pgprot(pgprot_val(prot) & ~PTE_CONT); + if (to_cont) + prot = __pgprot(pgprot_val(prot) | PTE_CONT); for (i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++) __set_pte(ptep, pfn_pte(pfn, prot)); @@ -590,7 +594,7 @@ static void split_contpmd(pmd_t *pmdp) set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp))); } -static int split_pud(pud_t *pudp, pud_t pud) +static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont) { pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF; unsigned int step = PMD_SIZE >> PAGE_SHIFT; @@ -600,7 +604,7 @@ static int split_pud(pud_t *pudp, pud_t pud) pmd_t *pmdp; int i; - pmd_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PMD); + pmd_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PMD, gfp); if (pmd_phys == INVALID_PHYS_ADDR) return -ENOMEM; pmdp = (pmd_t *)phys_to_virt(pmd_phys); @@ -609,7 +613,9 @@ static int split_pud(pud_t *pudp, pud_t pud) tableprot |= PUD_TABLE_PXN; prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT); - prot = __pgprot(pgprot_val(prot) | PTE_CONT); + prot = __pgprot(pgprot_val(prot) & ~PTE_CONT); + if (to_cont) + prot = __pgprot(pgprot_val(prot) | PTE_CONT); for (i = 0; i < PTRS_PER_PMD; i++, pmdp++, pfn += step) set_pmd(pmdp, pfn_pmd(pfn, prot)); @@ -667,7 +673,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr) if (!pud_present(pud)) goto out; if (pud_leaf(pud)) { - ret = split_pud(pudp, pud); + ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true); if (ret) goto out; } @@ -692,7 +698,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr) */ if (ALIGN_DOWN(addr, PMD_SIZE) == addr) goto out; - 
ret = split_pmd(pmdp, pmd); + ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true); if (ret) goto out; } @@ -765,6 +771,138 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end) return ret; } +static int __init split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr, + unsigned long next, + struct mm_walk *walk) +{ + pud_t pud = pudp_get(pudp); + int ret = 0; + + if (pud_leaf(pud)) + ret = split_pud(pudp, pud, GFP_ATOMIC, false); + + return ret; +} + +static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr, + unsigned long next, + struct mm_walk *walk) +{ + pmd_t pmd = pmdp_get(pmdp); + int ret = 0; + + if (pmd_leaf(pmd)) { + if (pmd_cont(pmd)) + split_contpmd(pmdp); + ret = split_pmd(pmdp, pmd, GFP_ATOMIC, false); + + /* + * We have split the pmd directly to ptes so there is no need to + * visit each pte to check if they are contpte. + */ + walk->action = ACTION_CONTINUE; + } + + return ret; +} + +static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr, + unsigned long next, + struct mm_walk *walk) +{ + pte_t pte = __ptep_get(ptep); + + if (pte_cont(pte)) + split_contpte(ptep); + + return 0; +} + +static const struct mm_walk_ops split_to_ptes_ops __initconst = { + .pud_entry = split_to_ptes_pud_entry, + .pmd_entry = split_to_ptes_pmd_entry, + .pte_entry = split_to_ptes_pte_entry, +}; + +static bool linear_map_requires_bbml2 __initdata; + +u32 idmap_kpti_bbml2_flag; + +void __init init_idmap_kpti_bbml2_flag(void) +{ + WRITE_ONCE(idmap_kpti_bbml2_flag, 1); + /* Must be visible to other CPUs before stop_machine() is called. */ + smp_mb(); +} + +static int __init linear_map_split_to_ptes(void *__unused) +{ + /* + * Repainting the linear map must be done by CPU0 (the boot CPU) because + * that's the only CPU that we know supports BBML2. The other CPUs will + * be held in a waiting area with the idmap active. + */ + if (!smp_processor_id()) { + unsigned long lstart = _PAGE_OFFSET(vabits_actual); + unsigned long lend = PAGE_END; + unsigned long kstart = (unsigned long)lm_alias(_stext); + unsigned long kend = (unsigned long)lm_alias(__init_begin); + int ret; + + /* + * Wait for all secondary CPUs to be put into the waiting area. + */ + smp_cond_load_acquire(&idmap_kpti_bbml2_flag, VAL == num_online_cpus()); + + /* + * Walk all of the linear map [lstart, lend), except the kernel + * linear map alias [kstart, kend), and split all mappings to + * PTE. The kernel alias remains static throughout runtime so + * can continue to be safely mapped with large mappings. + */ + ret = walk_kernel_page_table_range_lockless(lstart, kstart, + &split_to_ptes_ops, NULL, NULL); + if (!ret) + ret = walk_kernel_page_table_range_lockless(kend, lend, + &split_to_ptes_ops, NULL, NULL); + if (ret) + panic("Failed to split linear map\n"); + flush_tlb_kernel_range(lstart, lend); + + /* + * Relies on dsb in flush_tlb_kernel_range() to avoid reordering + * before any page table split operations. + */ + WRITE_ONCE(idmap_kpti_bbml2_flag, 0); + } else { + typedef void (wait_split_fn)(void); + extern wait_split_fn wait_linear_map_split_to_ptes; + wait_split_fn *wait_fn; + + wait_fn = (void *)__pa_symbol(wait_linear_map_split_to_ptes); + + /* + * At least one secondary CPU doesn't support BBML2 so cannot + * tolerate the size of the live mappings changing. So have the + * secondary CPUs wait for the boot CPU to make the changes + * with the idmap active and init_mm inactive. 
+ */ + cpu_install_idmap(); + wait_fn(); + cpu_uninstall_idmap(); + } + + return 0; +} + +void __init linear_map_maybe_split_to_ptes(void) +{ + if (linear_map_requires_bbml2 && !system_supports_bbml2_noabort()) { + init_idmap_kpti_bbml2_flag(); + stop_machine(linear_map_split_to_ptes, NULL, cpu_online_mask); + } +} + /* * This function can only be used to modify existing table entries, * without allocating new levels of table. Note that this permits the @@ -919,6 +1057,8 @@ static void __init map_mem(pgd_t *pgdp) early_kfence_pool = arm64_kfence_alloc_pool(); + linear_map_requires_bbml2 = !force_pte_mapping() && can_set_direct_map(); + if (force_pte_mapping()) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; @@ -1053,7 +1193,7 @@ void __pi_map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, u64 va_offset); static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init, - kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init; + kpti_bbml2_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init; static void __init create_idmap(void) { @@ -1065,15 +1205,17 @@ static void __init create_idmap(void) IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, __phys_to_virt(ptep) - ptep); - if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings) { - extern u32 __idmap_kpti_flag; - phys_addr_t pa = __pa_symbol(&__idmap_kpti_flag); + if (linear_map_requires_bbml2 || + (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings)) { + phys_addr_t pa = __pa_symbol(&idmap_kpti_bbml2_flag); /* * The KPTI G-to-nG conversion code needs a read-write mapping - * of its synchronization flag in the ID map. + * of its synchronization flag in the ID map. This is also used + * when splitting the linear map to ptes if a secondary CPU + * doesn't support bbml2. */ - ptep = __pa_symbol(kpti_ptes); + ptep = __pa_symbol(kpti_bbml2_ptes); __pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL, IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, __phys_to_virt(ptep) - ptep); diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 8c75965afc9e..86818511962b 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -245,10 +245,6 @@ SYM_FUNC_ALIAS(__pi_idmap_cpu_replace_ttbr1, idmap_cpu_replace_ttbr1) * * Called exactly once from stop_machine context by each CPU found during boot. 
*/ - .pushsection ".data", "aw", %progbits -SYM_DATA(__idmap_kpti_flag, .long 1) - .popsection - SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) cpu .req w0 temp_pte .req x0 @@ -273,7 +269,7 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) mov x5, x3 // preserve temp_pte arg mrs swapper_ttb, ttbr1_el1 - adr_l flag_ptr, __idmap_kpti_flag + adr_l flag_ptr, idmap_kpti_bbml2_flag cbnz cpu, __idmap_kpti_secondary @@ -416,7 +412,25 @@ alternative_else_nop_endif __idmap_kpti_secondary: /* Uninstall swapper before surgery begins */ __idmap_cpu_set_reserved_ttbr1 x16, x17 + b scondary_cpu_wait + .unreq swapper_ttb + .unreq flag_ptr +SYM_FUNC_END(idmap_kpti_install_ng_mappings) + .popsection +#endif + + .pushsection ".idmap.text", "a" +SYM_TYPED_FUNC_START(wait_linear_map_split_to_ptes) + /* Must be same registers as in idmap_kpti_install_ng_mappings */ + swapper_ttb .req x3 + flag_ptr .req x4 + + mrs swapper_ttb, ttbr1_el1 + adr_l flag_ptr, idmap_kpti_bbml2_flag + __idmap_cpu_set_reserved_ttbr1 x16, x17 + +scondary_cpu_wait: /* Increment the flag to let the boot CPU we're ready */ 1: ldxr w16, [flag_ptr] add w16, w16, #1 @@ -436,9 +450,8 @@ __idmap_kpti_secondary: .unreq swapper_ttb .unreq flag_ptr -SYM_FUNC_END(idmap_kpti_install_ng_mappings) +SYM_FUNC_END(wait_linear_map_split_to_ptes) .popsection -#endif /* * __cpu_setup From 3bbf004c4808e2c3241e5c1ad6cc102f38a03c39 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 19 Sep 2025 15:58:28 +0100 Subject: [PATCH 74/93] arm64: cputype: Add Neoverse-V3AE definitions Add cputype definitions for Neoverse-V3AE. These will be used for errata detection in subsequent patches. These values can be found in the Neoverse-V3AE TRM: https://developer.arm.com/documentation/SDEN-2615521/9-0/ ... in section A.6.1 ("MIDR_EL1, Main ID Register"). 
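For reference, MIDR_EL1 places the implementer in bits [31:24] and the
part number in bits [15:4], so the definitions added below compose to
the MIDR value matched by the errata detection code:

	#define ARM_CPU_PART_NEOVERSE_V3AE	0xD83
	#define MIDR_NEOVERSE_V3AE	MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3AE)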
Signed-off-by: Mark Rutland Cc: James Morse Cc: Will Deacon Cc: Catalin Marinas Signed-off-by: Ryan Roberts Signed-off-by: Will Deacon --- arch/arm64/include/asm/cputype.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 67ac757bc9c0..9b00b75acbf2 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -92,6 +92,7 @@ #define ARM_CPU_PART_NEOVERSE_V2 0xD4F #define ARM_CPU_PART_CORTEX_A720 0xD81 #define ARM_CPU_PART_CORTEX_X4 0xD82 +#define ARM_CPU_PART_NEOVERSE_V3AE 0xD83 #define ARM_CPU_PART_NEOVERSE_V3 0xD84 #define ARM_CPU_PART_CORTEX_X925 0xD85 #define ARM_CPU_PART_CORTEX_A725 0xD87 @@ -182,6 +183,7 @@ #define MIDR_NEOVERSE_V2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V2) #define MIDR_CORTEX_A720 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A720) #define MIDR_CORTEX_X4 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X4) +#define MIDR_NEOVERSE_V3AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3AE) #define MIDR_NEOVERSE_V3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3) #define MIDR_CORTEX_X925 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X925) #define MIDR_CORTEX_A725 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A725) From 0c33aa1804d101c11ba1992504f17a42233f0e11 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 19 Sep 2025 15:58:29 +0100 Subject: [PATCH 75/93] arm64: errata: Apply workarounds for Neoverse-V3AE Neoverse-V3AE is also affected by erratum #3312417, as described in its Software Developer Errata Notice (SDEN) document: Neoverse V3AE (MP172) SDEN v9.0, erratum 3312417 https://developer.arm.com/documentation/SDEN-2615521/9-0/ Enable the workaround for Neoverse-V3AE, and document this. Signed-off-by: Mark Rutland Cc: James Morse Cc: Will Deacon Cc: Catalin Marinas Signed-off-by: Ryan Roberts Signed-off-by: Will Deacon --- Documentation/arch/arm64/silicon-errata.rst | 2 ++ arch/arm64/Kconfig | 1 + arch/arm64/kernel/cpu_errata.c | 1 + 3 files changed, 4 insertions(+) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index b18ef4064bc0..a7ec57060f64 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -200,6 +200,8 @@ stable kernels. 
+----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-V3 | #3312417 | ARM64_ERRATUM_3194386 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Neoverse-V3AE | #3312417 | ARM64_ERRATUM_3194386 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | MMU-500 | #841119,826419 | ARM_SMMU_MMU_500_CPRE_ERRATA| | | | #562869,1047329 | | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a6..93f391e67af1 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1138,6 +1138,7 @@ config ARM64_ERRATUM_3194386 * ARM Neoverse-V1 erratum 3324341 * ARM Neoverse V2 erratum 3324336 * ARM Neoverse-V3 erratum 3312417 + * ARM Neoverse-V3AE erratum 3312417 On affected cores "MSR SSBS, #0" instructions may not affect subsequent speculative instructions, which may permit unexepected diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 7ff6b49beaaf..8cb3b575a031 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -546,6 +546,7 @@ static const struct midr_range erratum_spec_ssbs_list[] = { MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3AE), {} }; #endif From 8fca3852e33d762b8d8beed5458c99ffb7fd5975 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 19 Sep 2025 15:58:30 +0100 Subject: [PATCH 76/93] arm64: cpufeature: add Neoverse-V3AE to BBML2 allow list Neoverse-V3AE advertises support for BBML2 and is known to not raise conflict aborts. So add it to the BBML2_NOABORT allow list. However, just like Neoverse-V3, Neoverse-V3AE r0p0 and r0p1 suffer from erratum #3053180, for which the workaround is to always observe break-before-make requirements for affected revisions. Therefore only add to the allow list from r0p2 onwards. For more details see Software Developer Errata Notice (SDEN) document: Neoverse V3AE (MP172) SDEN v9.0, erratum 3053180 https://developer.arm.com/documentation/SDEN-2615521/9-0/ Signed-off-by: Ryan Roberts Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index cf2dd5ea173f..30244c1f833c 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2235,6 +2235,7 @@ static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int sco static const struct midr_range supports_bbml2_noabort_list[] = { MIDR_REV_RANGE(MIDR_CORTEX_X4, 0, 3, 0xf), MIDR_REV_RANGE(MIDR_NEOVERSE_V3, 0, 2, 0xf), + MIDR_REV_RANGE(MIDR_NEOVERSE_V3AE, 0, 2, 0xf), MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS), MIDR_ALL_VERSIONS(MIDR_AMPERE1), MIDR_ALL_VERSIONS(MIDR_AMPERE1A), From fa93b45fd397e25265ff618de26dd5c74ee403d3 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Mon, 22 Sep 2025 12:11:26 +0530 Subject: [PATCH 77/93] arm64: Enable vmalloc-huge with ptdump Our goal is to move towards enabling vmalloc-huge by default on arm64 so as to reduce TLB pressure. Therefore, we need a way to analyze the portion of block mappings in vmalloc space we can get on a production system; this can be done through ptdump, but currently we disable vmalloc-huge if CONFIG_PTDUMP_DEBUGFS is on. 
The reason is that lazy freeing of kernel pagetables via vmap_try_huge_pxd() may race with ptdump, so ptdump may dereference a bogus address. To solve this, we need to synchronize ptdump_walk() and ptdump_check_wx() with pud_free_pmd_page() and pmd_free_pte_page(). Since this race is very unlikely to happen in practice, we do not want to penalize the vmalloc pagetable tearing path by taking the init_mm mmap_lock. Therefore, we use static keys. ptdump_walk() and ptdump_check_wx() are the pagetable walkers; they will enable the static key - upon observing that, the vmalloc pagetable tearing path will get patched in with an mmap_read_lock/unlock sequence. A combination of the patched-in mmap_read_lock/unlock, the acquire semantics of static_branch_inc(), and the barriers in __flush_tlb_kernel_pgtable() ensures that ptdump will never get a hold on the address of a freed PMD or PTE table. We can verify the correctness of the algorithm via the following litmus test (thanks to James Houghton and Will Deacon): AArch64 ptdump Variant=Ifetch { uint64_t pud=0xa110c; uint64_t pmd; 0:X0=label:"P1:L0"; 0:X1=instr:"NOP"; 0:X2=lock; 0:X3=pud; 0:X4=pmd; 1:X1=0xdead; 1:X2=lock; 1:X3=pud; 1:X4=pmd; } P0 | P1 ; (* static_key_enable *) | (* pud_free_pmd_page *) ; STR W1, [X0] | LDR X9, [X3] ; DC CVAU,X0 | STR XZR, [X3] ; DSB ISH | DSB ISH ; IC IVAU,X0 | ISB ; DSB ISH | ; ISB | (* static key *) ; | L0: ; (* mmap_lock *) | B out1 ; Lwlock: | ; MOV W7, #1 | (* mmap_lock *) ; SWPA W7, W8, [X2] | Lrlock: ; | MOV W7, #1 ; | SWPA W7, W8, [X2] ; (* walk pgtable *) | ; LDR X9, [X3] | (* mmap_unlock *) ; CBZ X9, out0 | STLR WZR, [X2] ; EOR X10, X9, X9 | ; LDR X11, [X4, X10] | out1: ; | EOR X10, X9, X9 ; out0: | STR X1, [X4, X10] ; exists (0:X8=0 /\ 1:X8=0 /\ (* Lock acquisitions succeed *) 0:X9=0xa110c /\ (* P0 sees the valid PUD ...*) 0:X11=0xdead) (* ... but the freed PMD *) For an approximate written proof of why this algorithm works, please read the code comment in [1], which is now removed for the sake of simplicity. mm-selftests pass. No issues were observed while parallelly running test_vmalloc.sh (which stresses the vmalloc subsystem), and cat /sys/kernel/debug/{kernel_page_tables, check_wx_pages} in a loop. Link: https://lore.kernel.org/all/20250723161827.15802-1-dev.jain@arm.com/ [1] Reviewed-by: Ryan Roberts Signed-off-by: Dev Jain Signed-off-by: Will Deacon --- arch/arm64/include/asm/ptdump.h | 2 ++ arch/arm64/include/asm/vmalloc.h | 9 ++----- arch/arm64/mm/mmu.c | 43 +++++++++++++++++++++++++++++--- arch/arm64/mm/ptdump.c | 11 ++++++-- 4 files changed, 52 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h index fded5358641f..baff24004459 100644 --- a/arch/arm64/include/asm/ptdump.h +++ b/arch/arm64/include/asm/ptdump.h @@ -7,6 +7,8 @@ #include +DECLARE_STATIC_KEY_FALSE(arm64_ptdump_lock_key); + #ifdef CONFIG_PTDUMP #include diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h index 12f534e8f3ed..4ec1acd3c1b3 100644 --- a/arch/arm64/include/asm/vmalloc.h +++ b/arch/arm64/include/asm/vmalloc.h @@ -9,18 +9,13 @@ #define arch_vmap_pud_supported arch_vmap_pud_supported static inline bool arch_vmap_pud_supported(pgprot_t prot) { - /* - * SW table walks can't handle removal of intermediate entries. 
- */ - return pud_sect_supported() && - !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); + return pud_sect_supported(); } #define arch_vmap_pmd_supported arch_vmap_pmd_supported static inline bool arch_vmap_pmd_supported(pgprot_t prot) { - /* See arch_vmap_pud_supported() */ - return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); + return true; } #define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 431ed90914bb..0ba1a15e7e74 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -56,6 +56,8 @@ enum pgtable_type { TABLE_P4D, }; +DEFINE_STATIC_KEY_FALSE(arm64_ptdump_lock_key); + u64 kimage_voffset __ro_after_init; EXPORT_SYMBOL(kimage_voffset); @@ -1665,7 +1667,8 @@ int pmd_clear_huge(pmd_t *pmdp) return 1; } -int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) +static int __pmd_free_pte_page(pmd_t *pmdp, unsigned long addr, + bool acquire_mmap_lock) { pte_t *table; pmd_t pmd; @@ -1677,13 +1680,25 @@ int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) return 1; } + /* See comment in pud_free_pmd_page for static key logic */ table = pte_offset_kernel(pmdp, addr); pmd_clear(pmdp); __flush_tlb_kernel_pgtable(addr); + if (static_branch_unlikely(&arm64_ptdump_lock_key) && acquire_mmap_lock) { + mmap_read_lock(&init_mm); + mmap_read_unlock(&init_mm); + } + pte_free_kernel(NULL, table); return 1; } +int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) +{ + /* If ptdump is walking the pagetables, acquire init_mm.mmap_lock */ + return __pmd_free_pte_page(pmdp, addr, /* acquire_mmap_lock = */ true); +} + int pud_free_pmd_page(pud_t *pudp, unsigned long addr) { pmd_t *table; @@ -1699,16 +1714,36 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr) } table = pmd_offset(pudp, addr); + + /* + * Our objective is to prevent ptdump from reading a PMD table which has + * been freed. In this race, if pud_free_pmd_page observes the key on + * (which got flipped by ptdump) then the mmap lock sequence here will, + * as a result of the mmap write lock/unlock sequence in ptdump, give + * us the correct synchronization. If not, this means that ptdump has + * yet not started walking the pagetables - the sequence of barriers + * issued by __flush_tlb_kernel_pgtable() guarantees that ptdump will + * observe an empty PUD. + */ + pud_clear(pudp); + __flush_tlb_kernel_pgtable(addr); + if (static_branch_unlikely(&arm64_ptdump_lock_key)) { + mmap_read_lock(&init_mm); + mmap_read_unlock(&init_mm); + } + pmdp = table; next = addr; end = addr + PUD_SIZE; do { if (pmd_present(pmdp_get(pmdp))) - pmd_free_pte_page(pmdp, next); + /* + * PMD has been isolated, so ptdump won't see it. No + * need to acquire init_mm.mmap_lock. 
+ */ + __pmd_free_pte_page(pmdp, next, /* acquire_mmap_lock = */ false); } while (pmdp++, next += PMD_SIZE, next != end); - pud_clear(pudp); - __flush_tlb_kernel_pgtable(addr); pmd_free(NULL, table); return 1; } diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 421a5de806c6..ab9899ca1e5f 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -283,6 +283,13 @@ void note_page_flush(struct ptdump_state *pt_st) note_page(pt_st, 0, -1, pte_val(pte_zero)); } +static void arm64_ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm) +{ + static_branch_inc(&arm64_ptdump_lock_key); + ptdump_walk_pgd(st, mm, NULL); + static_branch_dec(&arm64_ptdump_lock_key); +} + void ptdump_walk(struct seq_file *s, struct ptdump_info *info) { unsigned long end = ~0UL; @@ -311,7 +318,7 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info) } }; - ptdump_walk_pgd(&st.ptdump, info->mm, NULL); + arm64_ptdump_walk_pgd(&st.ptdump, info->mm); } static void __init ptdump_initialize(void) @@ -353,7 +360,7 @@ bool ptdump_check_wx(void) } }; - ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + arm64_ptdump_walk_pgd(&st.ptdump, &init_mm); if (st.wx_pages || st.uxn_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n", From 14f158552eec700ae0e52b91aa17168a7b168c0c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Sun, 21 Sep 2025 06:22:58 +0530 Subject: [PATCH 78/93] arm64/sysreg: Update TCR_EL1 register Update TCR_EL1 register fields as per latest ARM ARM DDI 0487 L.B and while here drop an explicit sysreg definition SYS_TCR_EL1 from sysreg.h, which is now redundant. Cc: Catalin Marinas Cc: Will Deacon Cc: Marc Zyngier Cc: Mark Brown Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Mark Brown Signed-off-by: Anshuman Khandual Signed-off-by: Will Deacon --- arch/arm64/include/asm/sysreg.h | 2 -- arch/arm64/tools/sysreg | 52 ++++++++++++++++++++++++++++----- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index d5b5f2ae1afa..ad5c901af229 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -281,8 +281,6 @@ #define SYS_RGSR_EL1 sys_reg(3, 0, 1, 0, 5) #define SYS_GCR_EL1 sys_reg(3, 0, 1, 0, 6) -#define SYS_TCR_EL1 sys_reg(3, 0, 2, 0, 2) - #define SYS_APIAKEYLO_EL1 sys_reg(3, 0, 2, 1, 0) #define SYS_APIAKEYHI_EL1 sys_reg(3, 0, 2, 1, 1) #define SYS_APIBKEYLO_EL1 sys_reg(3, 0, 2, 1, 2) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index d396fa587ec1..d5eb5b67145f 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -4748,17 +4748,53 @@ Field 37 TBI0 Field 36 AS Res0 35 Field 34:32 IPS -Field 31:30 TG1 -Field 29:28 SH1 -Field 27:26 ORGN1 -Field 25:24 IRGN1 +Enum 31:30 TG1 + 0b01 16K + 0b10 4K + 0b11 64K +EndEnum +Enum 29:28 SH1 + 0b00 NONE + 0b10 OUTER + 0b11 INNER +EndEnum +Enum 27:26 ORGN1 + 0b00 NC + 0b01 WBWA + 0b10 WT + 0b11 WBnWA +EndEnum +Enum 25:24 IRGN1 + 0b00 NC + 0b01 WBWA + 0b10 WT + 0b11 WBnWA +EndEnum Field 23 EPD1 Field 22 A1 Field 21:16 T1SZ -Field 15:14 TG0 -Field 13:12 SH0 -Field 11:10 ORGN0 -Field 9:8 IRGN0 +Enum 15:14 TG0 + 0b00 4K + 0b01 64K + 0b10 16K +EndEnum +Enum 13:12 SH0 + 0b00 NONE + 0b10 OUTER + 0b11 INNER +EndEnum +Enum 11:10 ORGN0 + 0b00 NC + 0b01 WBWA + 0b10 WT + 0b11 WBnWA +EndEnum +Enum 9:8 IRGN0 + 0b00 NC + 0b01 WBWA + 0b10 WT + 0b11 WBnWA +EndEnum Field 7 EPD0 Res0 6 Field 5:0 T0SZ From 5973a62efa34c80c9a4e5eac1fca6f6209b902af 
Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 19 Sep 2025 14:27:51 -0700 Subject: [PATCH 79/93] arm64: map [_text, _stext) virtual address range non-executable+read-only Since the referenced fixes commit, the kernel's .text section is only mapped starting from _stext; the region [_text, _stext) is omitted. As a result, other vmalloc/vmap allocations may use the virtual addresses nominally in the range [_text, _stext). This address reuse confuses multiple things: 1. crash_prepare_elf64_headers() sets up a segment in /proc/vmcore mapping the entire range [_text, _end) to [__pa_symbol(_text), __pa_symbol(_end)). Reading an address in [_text, _stext) from /proc/vmcore therefore gives the incorrect result. 2. Tools doing symbolization (either by reading /proc/kallsyms or based on the vmlinux ELF file) will incorrectly identify vmalloc/vmap allocations in [_text, _stext) as kernel symbols. In practice, both of these issues affect the drgn debugger. Specifically, there were cases where the vmap IRQ stacks for some CPUs were allocated in [_text, _stext). As a result, drgn could not get the stack trace for a crash in an IRQ handler because the core dump contained invalid data for the IRQ stack address. The stack addresses were also symbolized as being in the _text symbol. Fix this by bringing back the mapping of [_text, _stext), but now make it non-executable and read-only. This prevents other allocations from using it while still achieving the original goal of not mapping unpredictable data as executable. Other than the changed protection, this is effectively a revert of the fixes commit. Fixes: e2a073dde921 ("arm64: omit [_text, _stext) from permanent kernel mapping") Cc: stable@vger.kernel.org Signed-off-by: Omar Sandoval Signed-off-by: Will Deacon --- arch/arm64/kernel/pi/map_kernel.c | 6 ++++++ arch/arm64/kernel/setup.c | 4 ++-- arch/arm64/mm/init.c | 2 +- arch/arm64/mm/mmu.c | 14 +++++++++----- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index e6d35eff1486..e8ddbde31a83 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -78,6 +78,12 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) twopass |= enable_scs; prot = twopass ? data_prot : text_prot; + /* + * [_stext, _text) isn't executed after boot and contains some + * non-executable, unpredictable data, so map it non-executable. 
+ */ + map_segment(init_pg_dir, &pgdp, va_offset, _text, _stext, data_prot, + false, root_level); map_segment(init_pg_dir, &pgdp, va_offset, _stext, _etext, prot, !twopass, root_level); map_segment(init_pg_dir, &pgdp, va_offset, __start_rodata, diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 77c7926a4df6..23c05dc7a8f2 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -214,7 +214,7 @@ static void __init request_standard_resources(void) unsigned long i = 0; size_t res_size; - kernel_code.start = __pa_symbol(_stext); + kernel_code.start = __pa_symbol(_text); kernel_code.end = __pa_symbol(__init_begin - 1); kernel_data.start = __pa_symbol(_sdata); kernel_data.end = __pa_symbol(_end - 1); @@ -280,7 +280,7 @@ u64 cpu_logical_map(unsigned int cpu) void __init __no_sanitize_address setup_arch(char **cmdline_p) { - setup_initial_init_mm(_stext, _etext, _edata, _end); + setup_initial_init_mm(_text, _etext, _edata, _end); *cmdline_p = boot_command_line; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 70c2ca813c18..524d34a0e921 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -279,7 +279,7 @@ void __init arm64_memblock_init(void) * Register the kernel text, kernel data, initrd, and initial * pagetables with memblock. */ - memblock_reserve(__pa_symbol(_stext), _end - _stext); + memblock_reserve(__pa_symbol(_text), _end - _text); if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) { /* the generic initrd code expects virtual addresses */ initrd_start = __phys_to_virt(phys_initrd_start); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 0ba1a15e7e74..10c258099581 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -965,8 +965,8 @@ void __init mark_linear_text_alias_ro(void) /* * Remove the write permissions from the linear alias of .text/.rodata */ - update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext), - (unsigned long)__init_begin - (unsigned long)_stext, + update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text), + (unsigned long)__init_begin - (unsigned long)_text, PAGE_KERNEL_RO); } @@ -1037,7 +1037,7 @@ static inline bool force_pte_mapping(void) static void __init map_mem(pgd_t *pgdp) { static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN); - phys_addr_t kernel_start = __pa_symbol(_stext); + phys_addr_t kernel_start = __pa_symbol(_text); phys_addr_t kernel_end = __pa_symbol(__init_begin); phys_addr_t start, end; phys_addr_t early_kfence_pool; @@ -1086,7 +1086,7 @@ static void __init map_mem(pgd_t *pgdp) } /* - * Map the linear alias of the [_stext, __init_begin) interval + * Map the linear alias of the [_text, __init_begin) interval * as non-executable now, and remove the write permission in * mark_linear_text_alias_ro() below (which will be called after * alternative patching has completed). This makes the contents @@ -1113,6 +1113,10 @@ void mark_rodata_ro(void) WRITE_ONCE(rodata_is_rw, false); update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); + /* mark the range between _text and _stext as read only. 
*/ + update_mapping_prot(__pa_symbol(_text), (unsigned long)_text, + (unsigned long)_stext - (unsigned long)_text, + PAGE_KERNEL_RO); } static void __init declare_vma(struct vm_struct *vma, @@ -1183,7 +1187,7 @@ static void __init declare_kernel_vmas(void) { static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT]; - declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD); + declare_vma(&vmlinux_seg[0], _text, _etext, VM_NO_GUARD); declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD); declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD); declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD); From bad11557eed2592c017a06752e58df49080b4a6a Mon Sep 17 00:00:00 2001 From: Koichi Okuno Date: Tue, 9 Sep 2025 12:02:50 +0900 Subject: [PATCH 80/93] perf: Fujitsu: Add the Uncore PMU driver This adds a new dynamic PMU to the Perf Events framework to program and control the Uncore PMUs in Fujitsu chips. This driver exports formatting and event information to sysfs so it can be used by the perf user space tools with the syntaxes: perf stat -e pci_iod0_pci0/ea-pci/ ls perf stat -e pci_iod0_pci0/event=0x80/ ls perf stat -e mac_iod0_mac0_ch0/ea-mac/ ls perf stat -e mac_iod0_mac0_ch0/event=0x80/ ls FUJITSU-MONAKA PMU Events Specification v1.1 URL: https://github.com/fujitsu/FUJITSU-MONAKA Reviewed-by: Yicong Yang Signed-off-by: Koichi Okuno Signed-off-by: Will Deacon --- .../admin-guide/perf/fujitsu_uncore_pmu.rst | 110 ++++ Documentation/admin-guide/perf/index.rst | 1 + drivers/perf/Kconfig | 9 + drivers/perf/Makefile | 1 + drivers/perf/fujitsu_uncore_pmu.c | 613 ++++++++++++++++++ 5 files changed, 734 insertions(+) create mode 100644 Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst create mode 100644 drivers/perf/fujitsu_uncore_pmu.c diff --git a/Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst b/Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst new file mode 100644 index 000000000000..46595b788d3a --- /dev/null +++ b/Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst @@ -0,0 +1,110 @@ +.. SPDX-License-Identifier: GPL-2.0-only + +================================================ +Fujitsu Uncore Performance Monitoring Unit (PMU) +================================================ + +This driver supports the Uncore MAC PMUs and the Uncore PCI PMUs found +in Fujitsu chips. +Each MAC PMU on these chips is exposed as a uncore perf PMU with device name +mac_iod_mac_ch. +And each PCI PMU on these chips is exposed as a uncore perf PMU with device name +pci_iod_pci. + +The driver provides a description of its available events and configuration +options in sysfs, see /sys/bus/event_sources/devices/mac_iod_mac_ch/ +and /sys/bus/event_sources/devices/pci_iod_pci/. +This driver exports: +- formats, used by perf user space and other tools to configure events +- events, used by perf user space and other tools to create events + symbolically, e.g.: + perf stat -a -e mac_iod0_mac0_ch0/event=0x21/ ls + perf stat -a -e pci_iod0_pci0/event=0x24/ ls +- cpumask, used by perf user space and other tools to know on which CPUs + to open the events + +This driver supports the following events for MAC: +- cycles + This event counts MAC cycles at MAC frequency. +- read-count + This event counts the number of read requests to MAC. +- read-count-request + This event counts the number of read requests including retry to MAC. +- read-count-return + This event counts the number of responses to read requests to MAC. 
+- read-count-request-pftgt + This event counts the number of read requests including retry with PFTGT + flag. +- read-count-request-normal + This event counts the number of read requests including retry without PFTGT + flag. +- read-count-return-pftgt-hit + This event counts the number of responses to read requests which hit the + PFTGT buffer. +- read-count-return-pftgt-miss + This event counts the number of responses to read requests which miss the + PFTGT buffer. +- read-wait + This event counts outstanding read requests issued by DDR memory controller + per cycle. +- write-count + This event counts the number of write requests to MAC (including zero write, + full write, partial write, write cancel). +- write-count-write + This event counts the number of full write requests to MAC (not including + zero write). +- write-count-pwrite + This event counts the number of partial write requests to MAC. +- memory-read-count + This event counts the number of read requests from MAC to memory. +- memory-write-count + This event counts the number of full write requests from MAC to memory. +- memory-pwrite-count + This event counts the number of partial write requests from MAC to memory. +- ea-mac + This event counts energy consumption of MAC. +- ea-memory + This event counts energy consumption of memory. +- ea-memory-mac-write + This event counts the number of write requests from MAC to memory. +- ea-ha + This event counts energy consumption of HA. + + 'ea' is the abbreviation for 'Energy Analyzer'. + +Examples for use with perf:: + + perf stat -e mac_iod0_mac0_ch0/ea-mac/ ls + +And, this driver supports the following events for PCI: +- pci-port0-cycles + This event counts PCI cycles at PCI frequency in port0. +- pci-port0-read-count + This event counts read transactions for data transfer in port0. +- pci-port0-read-count-bus + This event counts read transactions for bus usage in port0. +- pci-port0-write-count + This event counts write transactions for data transfer in port0. +- pci-port0-write-count-bus + This event counts write transactions for bus usage in port0. +- pci-port1-cycles + This event counts PCI cycles at PCI frequency in port1. +- pci-port1-read-count + This event counts read transactions for data transfer in port1. +- pci-port1-read-count-bus + This event counts read transactions for bus usage in port1. +- pci-port1-write-count + This event counts write transactions for data transfer in port1. +- pci-port1-write-count-bus + This event counts write transactions for bus usage in port1. +- ea-pci + This event counts energy consumption of PCI. + + 'ea' is the abbreviation for 'Energy Analyzer'. + +Examples for use with perf:: + + perf stat -e pci_iod0_pci0/ea-pci/ ls + +Given that these are uncore PMUs the driver does not support sampling, therefore +"perf record" will not work. Per-task perf sessions are not supported. diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst index 072b510385c4..47d9a3df6329 100644 --- a/Documentation/admin-guide/perf/index.rst +++ b/Documentation/admin-guide/perf/index.rst @@ -29,3 +29,4 @@ Performance monitor support cxl ampere_cspmu mrvl-pem-pmu + fujitsu_uncore_pmu diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index a9188dec36fe..638321fc9800 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -178,6 +178,15 @@ config FSL_IMX9_DDR_PMU can give information about memory throughput and other related events. 
+config FUJITSU_UNCORE_PMU + tristate "Fujitsu Uncore PMU" + depends on (ARM64 && ACPI) || (COMPILE_TEST && 64BIT) + help + Provides support for the Uncore performance monitor unit (PMU) + in Fujitsu processors. + Adds the Uncore PMU into the perf events subsystem for + monitoring Uncore events. + config QCOM_L2_PMU bool "Qualcomm Technologies L2-cache PMU" depends on ARCH_QCOM && ARM64 && ACPI diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index 192fc8b16204..ea52711a87e3 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_ARM_XSCALE_PMU) += arm_xscale_pmu.o obj-$(CONFIG_ARM_SMMU_V3_PMU) += arm_smmuv3_pmu.o obj-$(CONFIG_FSL_IMX8_DDR_PMU) += fsl_imx8_ddr_perf.o obj-$(CONFIG_FSL_IMX9_DDR_PMU) += fsl_imx9_ddr_perf.o +obj-$(CONFIG_FUJITSU_UNCORE_PMU) += fujitsu_uncore_pmu.o obj-$(CONFIG_HISI_PMU) += hisilicon/ obj-$(CONFIG_QCOM_L2_PMU) += qcom_l2_pmu.o obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o diff --git a/drivers/perf/fujitsu_uncore_pmu.c b/drivers/perf/fujitsu_uncore_pmu.c new file mode 100644 index 000000000000..c3c6f56474ad --- /dev/null +++ b/drivers/perf/fujitsu_uncore_pmu.c @@ -0,0 +1,613 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Driver for the Uncore PMUs in Fujitsu chips. + * + * See Documentation/admin-guide/perf/fujitsu_uncore_pmu.rst for more details. + * + * Copyright (c) 2025 Fujitsu. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Number of counters on each PMU */ +#define MAC_NUM_COUNTERS 8 +#define PCI_NUM_COUNTERS 8 +/* Mask for the event type field within perf_event_attr.config and EVTYPE reg */ +#define UNCORE_EVTYPE_MASK 0xFF + +/* Perfmon registers */ +#define PM_EVCNTR(__cntr) (0x000 + (__cntr) * 8) +#define PM_CNTCTL(__cntr) (0x100 + (__cntr) * 8) +#define PM_CNTCTL_RESET 0 +#define PM_EVTYPE(__cntr) (0x200 + (__cntr) * 8) +#define PM_EVTYPE_EVSEL(__val) FIELD_GET(UNCORE_EVTYPE_MASK, __val) +#define PM_CR 0x400 +#define PM_CR_RESET BIT(1) +#define PM_CR_ENABLE BIT(0) +#define PM_CNTENSET 0x410 +#define PM_CNTENSET_IDX(__cntr) BIT(__cntr) +#define PM_CNTENCLR 0x418 +#define PM_CNTENCLR_IDX(__cntr) BIT(__cntr) +#define PM_CNTENCLR_RESET 0xFF +#define PM_INTENSET 0x420 +#define PM_INTENSET_IDX(__cntr) BIT(__cntr) +#define PM_INTENCLR 0x428 +#define PM_INTENCLR_IDX(__cntr) BIT(__cntr) +#define PM_INTENCLR_RESET 0xFF +#define PM_OVSR 0x440 +#define PM_OVSR_OVSRCLR_RESET 0xFF + +enum fujitsu_uncore_pmu { + FUJITSU_UNCORE_PMU_MAC = 1, + FUJITSU_UNCORE_PMU_PCI = 2, +}; + +struct uncore_pmu { + int num_counters; + struct pmu pmu; + struct hlist_node node; + void __iomem *regs; + struct perf_event **events; + unsigned long *used_mask; + int cpu; + int irq; + struct device *dev; +}; + +#define to_uncore_pmu(p) (container_of(p, struct uncore_pmu, pmu)) + +static int uncore_pmu_cpuhp_state; + +static void fujitsu_uncore_counter_start(struct perf_event *event) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + int idx = event->hw.idx; + + /* Initialize the hardware counter and reset prev_count*/ + local64_set(&event->hw.prev_count, 0); + writeq_relaxed(0, uncorepmu->regs + PM_EVCNTR(idx)); + + /* Set the event type */ + writeq_relaxed(PM_EVTYPE_EVSEL(event->attr.config), uncorepmu->regs + PM_EVTYPE(idx)); + + /* Enable interrupt generation by this counter */ + writeq_relaxed(PM_INTENSET_IDX(idx), uncorepmu->regs + PM_INTENSET); + + /* Finally, enable the counter */ + writeq_relaxed(PM_CNTCTL_RESET, uncorepmu->regs + 
PM_CNTCTL(idx)); + writeq_relaxed(PM_CNTENSET_IDX(idx), uncorepmu->regs + PM_CNTENSET); +} + +static void fujitsu_uncore_counter_stop(struct perf_event *event) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + int idx = event->hw.idx; + + /* Disable the counter */ + writeq_relaxed(PM_CNTENCLR_IDX(idx), uncorepmu->regs + PM_CNTENCLR); + + /* Disable interrupt generation by this counter */ + writeq_relaxed(PM_INTENCLR_IDX(idx), uncorepmu->regs + PM_INTENCLR); +} + +static void fujitsu_uncore_counter_update(struct perf_event *event) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + int idx = event->hw.idx; + u64 prev, new; + + do { + prev = local64_read(&event->hw.prev_count); + new = readq_relaxed(uncorepmu->regs + PM_EVCNTR(idx)); + } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); + + local64_add(new - prev, &event->count); +} + +static inline void fujitsu_uncore_init(struct uncore_pmu *uncorepmu) +{ + int i; + + writeq_relaxed(PM_CR_RESET, uncorepmu->regs + PM_CR); + + writeq_relaxed(PM_CNTENCLR_RESET, uncorepmu->regs + PM_CNTENCLR); + writeq_relaxed(PM_INTENCLR_RESET, uncorepmu->regs + PM_INTENCLR); + writeq_relaxed(PM_OVSR_OVSRCLR_RESET, uncorepmu->regs + PM_OVSR); + + for (i = 0; i < uncorepmu->num_counters; ++i) { + writeq_relaxed(PM_CNTCTL_RESET, uncorepmu->regs + PM_CNTCTL(i)); + writeq_relaxed(PM_EVTYPE_EVSEL(0), uncorepmu->regs + PM_EVTYPE(i)); + } + writeq_relaxed(PM_CR_ENABLE, uncorepmu->regs + PM_CR); +} + +static irqreturn_t fujitsu_uncore_handle_irq(int irq_num, void *data) +{ + struct uncore_pmu *uncorepmu = data; + /* Read the overflow status register */ + long status = readq_relaxed(uncorepmu->regs + PM_OVSR); + int idx; + + if (status == 0) + return IRQ_NONE; + + /* Clear the bits we read on the overflow status register */ + writeq_relaxed(status, uncorepmu->regs + PM_OVSR); + + for_each_set_bit(idx, &status, uncorepmu->num_counters) { + struct perf_event *event; + + event = uncorepmu->events[idx]; + if (!event) + continue; + + fujitsu_uncore_counter_update(event); + } + + return IRQ_HANDLED; +} + +static void fujitsu_uncore_pmu_enable(struct pmu *pmu) +{ + writeq_relaxed(PM_CR_ENABLE, to_uncore_pmu(pmu)->regs + PM_CR); +} + +static void fujitsu_uncore_pmu_disable(struct pmu *pmu) +{ + writeq_relaxed(0, to_uncore_pmu(pmu)->regs + PM_CR); +} + +static bool fujitsu_uncore_validate_event_group(struct perf_event *event) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + struct perf_event *leader = event->group_leader; + struct perf_event *sibling; + int counters = 1; + + if (leader == event) + return true; + + if (leader->pmu == event->pmu) + counters++; + + for_each_sibling_event(sibling, leader) { + if (sibling->pmu == event->pmu) + counters++; + } + + /* + * If the group requires more counters than the HW has, it + * cannot ever be scheduled. + */ + return counters <= uncorepmu->num_counters; +} + +static int fujitsu_uncore_event_init(struct perf_event *event) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + /* Is the event for this PMU? */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* + * Sampling not supported since these events are not + * core-attributable. + */ + if (is_sampling_event(event)) + return -EINVAL; + + /* + * Task mode not available, we run the counters as socket counters, + * not attributable to any CPU and therefore cannot attribute per-task. 
+ */ + if (event->cpu < 0) + return -EINVAL; + + /* Validate the group */ + if (!fujitsu_uncore_validate_event_group(event)) + return -EINVAL; + + hwc->idx = -1; + + event->cpu = uncorepmu->cpu; + + return 0; +} + +static void fujitsu_uncore_event_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->state = 0; + fujitsu_uncore_counter_start(event); +} + +static void fujitsu_uncore_event_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->state & PERF_HES_STOPPED) + return; + + fujitsu_uncore_counter_stop(event); + if (flags & PERF_EF_UPDATE) + fujitsu_uncore_counter_update(event); + hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; +} + +static int fujitsu_uncore_event_add(struct perf_event *event, int flags) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int idx; + + /* Try to allocate a counter. */ + idx = bitmap_find_free_region(uncorepmu->used_mask, uncorepmu->num_counters, 0); + if (idx < 0) + /* The counters are all in use. */ + return -EAGAIN; + + hwc->idx = idx; + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + uncorepmu->events[idx] = event; + + if (flags & PERF_EF_START) + fujitsu_uncore_event_start(event, 0); + + /* Propagate changes to the userspace mapping. */ + perf_event_update_userpage(event); + + return 0; +} + +static void fujitsu_uncore_event_del(struct perf_event *event, int flags) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + /* Stop and clean up */ + fujitsu_uncore_event_stop(event, flags | PERF_EF_UPDATE); + uncorepmu->events[hwc->idx] = NULL; + bitmap_release_region(uncorepmu->used_mask, hwc->idx, 0); + + /* Propagate changes to the userspace mapping. 
*/ + perf_event_update_userpage(event); +} + +static void fujitsu_uncore_event_read(struct perf_event *event) +{ + fujitsu_uncore_counter_update(event); +} + +#define UNCORE_PMU_FORMAT_ATTR(_name, _config) \ + (&((struct dev_ext_attribute[]) { \ + { .attr = __ATTR(_name, 0444, device_show_string, NULL), \ + .var = (void *)_config, } \ + })[0].attr.attr) + +static struct attribute *fujitsu_uncore_pmu_formats[] = { + UNCORE_PMU_FORMAT_ATTR(event, "config:0-7"), + NULL +}; + +static const struct attribute_group fujitsu_uncore_pmu_format_group = { + .name = "format", + .attrs = fujitsu_uncore_pmu_formats, +}; + +static ssize_t fujitsu_uncore_pmu_event_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); + return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id); +} + +#define MAC_EVENT_ATTR(_name, _id) \ + PMU_EVENT_ATTR_ID(_name, fujitsu_uncore_pmu_event_show, _id) + +static struct attribute *fujitsu_uncore_mac_pmu_events[] = { + MAC_EVENT_ATTR(cycles, 0x00), + MAC_EVENT_ATTR(read-count, 0x10), + MAC_EVENT_ATTR(read-count-request, 0x11), + MAC_EVENT_ATTR(read-count-return, 0x12), + MAC_EVENT_ATTR(read-count-request-pftgt, 0x13), + MAC_EVENT_ATTR(read-count-request-normal, 0x14), + MAC_EVENT_ATTR(read-count-return-pftgt-hit, 0x15), + MAC_EVENT_ATTR(read-count-return-pftgt-miss, 0x16), + MAC_EVENT_ATTR(read-wait, 0x17), + MAC_EVENT_ATTR(write-count, 0x20), + MAC_EVENT_ATTR(write-count-write, 0x21), + MAC_EVENT_ATTR(write-count-pwrite, 0x22), + MAC_EVENT_ATTR(memory-read-count, 0x40), + MAC_EVENT_ATTR(memory-write-count, 0x50), + MAC_EVENT_ATTR(memory-pwrite-count, 0x60), + MAC_EVENT_ATTR(ea-mac, 0x80), + MAC_EVENT_ATTR(ea-memory, 0x90), + MAC_EVENT_ATTR(ea-memory-mac-write, 0x92), + MAC_EVENT_ATTR(ea-ha, 0xa0), + NULL +}; + +#define PCI_EVENT_ATTR(_name, _id) \ + PMU_EVENT_ATTR_ID(_name, fujitsu_uncore_pmu_event_show, _id) + +static struct attribute *fujitsu_uncore_pci_pmu_events[] = { + PCI_EVENT_ATTR(pci-port0-cycles, 0x00), + PCI_EVENT_ATTR(pci-port0-read-count, 0x10), + PCI_EVENT_ATTR(pci-port0-read-count-bus, 0x14), + PCI_EVENT_ATTR(pci-port0-write-count, 0x20), + PCI_EVENT_ATTR(pci-port0-write-count-bus, 0x24), + PCI_EVENT_ATTR(pci-port1-cycles, 0x40), + PCI_EVENT_ATTR(pci-port1-read-count, 0x50), + PCI_EVENT_ATTR(pci-port1-read-count-bus, 0x54), + PCI_EVENT_ATTR(pci-port1-write-count, 0x60), + PCI_EVENT_ATTR(pci-port1-write-count-bus, 0x64), + PCI_EVENT_ATTR(ea-pci, 0x80), + NULL +}; + +static const struct attribute_group fujitsu_uncore_mac_pmu_events_group = { + .name = "events", + .attrs = fujitsu_uncore_mac_pmu_events, +}; + +static const struct attribute_group fujitsu_uncore_pci_pmu_events_group = { + .name = "events", + .attrs = fujitsu_uncore_pci_pmu_events, +}; + +static ssize_t cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct uncore_pmu *uncorepmu = to_uncore_pmu(dev_get_drvdata(dev)); + + return cpumap_print_to_pagebuf(true, buf, cpumask_of(uncorepmu->cpu)); +} +static DEVICE_ATTR_RO(cpumask); + +static struct attribute *fujitsu_uncore_pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL +}; + +static const struct attribute_group fujitsu_uncore_pmu_cpumask_attr_group = { + .attrs = fujitsu_uncore_pmu_cpumask_attrs, +}; + +static const struct attribute_group *fujitsu_uncore_mac_pmu_attr_grps[] = { + &fujitsu_uncore_pmu_format_group, + &fujitsu_uncore_mac_pmu_events_group, + 
&fujitsu_uncore_pmu_cpumask_attr_group, + NULL +}; + +static const struct attribute_group *fujitsu_uncore_pci_pmu_attr_grps[] = { + &fujitsu_uncore_pmu_format_group, + &fujitsu_uncore_pci_pmu_events_group, + &fujitsu_uncore_pmu_cpumask_attr_group, + NULL +}; + +static void fujitsu_uncore_pmu_migrate(struct uncore_pmu *uncorepmu, unsigned int cpu) +{ + perf_pmu_migrate_context(&uncorepmu->pmu, uncorepmu->cpu, cpu); + irq_set_affinity(uncorepmu->irq, cpumask_of(cpu)); + uncorepmu->cpu = cpu; +} + +static int fujitsu_uncore_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_node) +{ + struct uncore_pmu *uncorepmu; + int node; + + uncorepmu = hlist_entry_safe(cpuhp_node, struct uncore_pmu, node); + node = dev_to_node(uncorepmu->dev); + if (cpu_to_node(uncorepmu->cpu) != node && cpu_to_node(cpu) == node) + fujitsu_uncore_pmu_migrate(uncorepmu, cpu); + + return 0; +} + +static int fujitsu_uncore_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_node) +{ + struct uncore_pmu *uncorepmu; + unsigned int target; + int node; + + uncorepmu = hlist_entry_safe(cpuhp_node, struct uncore_pmu, node); + if (cpu != uncorepmu->cpu) + return 0; + + node = dev_to_node(uncorepmu->dev); + target = cpumask_any_and_but(cpumask_of_node(node), cpu_online_mask, cpu); + if (target >= nr_cpu_ids) + target = cpumask_any_but(cpu_online_mask, cpu); + + if (target < nr_cpu_ids) + fujitsu_uncore_pmu_migrate(uncorepmu, target); + + return 0; +} + +static int fujitsu_uncore_pmu_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + unsigned long device_type = (unsigned long)device_get_match_data(dev); + const struct attribute_group **attr_groups; + struct uncore_pmu *uncorepmu; + struct resource *memrc; + size_t alloc_size; + char *name; + int ret; + int irq; + u64 uid; + + ret = acpi_dev_uid_to_integer(ACPI_COMPANION(dev), &uid); + if (ret) + return dev_err_probe(dev, ret, "unable to read ACPI uid\n"); + + uncorepmu = devm_kzalloc(dev, sizeof(*uncorepmu), GFP_KERNEL); + if (!uncorepmu) + return -ENOMEM; + uncorepmu->dev = dev; + uncorepmu->cpu = cpumask_local_spread(0, dev_to_node(dev)); + platform_set_drvdata(pdev, uncorepmu); + + switch (device_type) { + case FUJITSU_UNCORE_PMU_MAC: + uncorepmu->num_counters = MAC_NUM_COUNTERS; + attr_groups = fujitsu_uncore_mac_pmu_attr_grps; + name = devm_kasprintf(dev, GFP_KERNEL, "mac_iod%llu_mac%llu_ch%llu", + (uid >> 8) & 0xF, (uid >> 4) & 0xF, uid & 0xF); + break; + case FUJITSU_UNCORE_PMU_PCI: + uncorepmu->num_counters = PCI_NUM_COUNTERS; + attr_groups = fujitsu_uncore_pci_pmu_attr_grps; + name = devm_kasprintf(dev, GFP_KERNEL, "pci_iod%llu_pci%llu", + (uid >> 4) & 0xF, uid & 0xF); + break; + default: + return dev_err_probe(dev, -EINVAL, "illegal device type: %lu\n", device_type); + } + if (!name) + return -ENOMEM; + + uncorepmu->pmu = (struct pmu) { + .parent = dev, + .task_ctx_nr = perf_invalid_context, + + .attr_groups = attr_groups, + + .pmu_enable = fujitsu_uncore_pmu_enable, + .pmu_disable = fujitsu_uncore_pmu_disable, + .event_init = fujitsu_uncore_event_init, + .add = fujitsu_uncore_event_add, + .del = fujitsu_uncore_event_del, + .start = fujitsu_uncore_event_start, + .stop = fujitsu_uncore_event_stop, + .read = fujitsu_uncore_event_read, + + .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, + }; + + alloc_size = sizeof(uncorepmu->events[0]) * uncorepmu->num_counters; + uncorepmu->events = devm_kzalloc(dev, alloc_size, GFP_KERNEL); + if (!uncorepmu->events) + return -ENOMEM; + + alloc_size = 
sizeof(uncorepmu->used_mask[0]) * BITS_TO_LONGS(uncorepmu->num_counters); + uncorepmu->used_mask = devm_kzalloc(dev, alloc_size, GFP_KERNEL); + if (!uncorepmu->used_mask) + return -ENOMEM; + + uncorepmu->regs = devm_platform_get_and_ioremap_resource(pdev, 0, &memrc); + if (IS_ERR(uncorepmu->regs)) + return PTR_ERR(uncorepmu->regs); + + fujitsu_uncore_init(uncorepmu); + + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; + + ret = devm_request_irq(dev, irq, fujitsu_uncore_handle_irq, + IRQF_NOBALANCING | IRQF_NO_THREAD, + name, uncorepmu); + if (ret) + return dev_err_probe(dev, ret, "Failed to request IRQ:%d\n", irq); + + ret = irq_set_affinity(irq, cpumask_of(uncorepmu->cpu)); + if (ret) + return dev_err_probe(dev, ret, "Failed to set irq affinity:%d\n", irq); + + uncorepmu->irq = irq; + + /* Add this instance to the list used by the offline callback */ + ret = cpuhp_state_add_instance(uncore_pmu_cpuhp_state, &uncorepmu->node); + if (ret) + return dev_err_probe(dev, ret, "Error registering hotplug"); + + ret = perf_pmu_register(&uncorepmu->pmu, name, -1); + if (ret < 0) { + cpuhp_state_remove_instance_nocalls(uncore_pmu_cpuhp_state, &uncorepmu->node); + return dev_err_probe(dev, ret, "Failed to register %s PMU\n", name); + } + + dev_dbg(dev, "Registered %s, type: %d\n", name, uncorepmu->pmu.type); + + return 0; +} + +static void fujitsu_uncore_pmu_remove(struct platform_device *pdev) +{ + struct uncore_pmu *uncorepmu = platform_get_drvdata(pdev); + + writeq_relaxed(0, uncorepmu->regs + PM_CR); + + perf_pmu_unregister(&uncorepmu->pmu); + cpuhp_state_remove_instance_nocalls(uncore_pmu_cpuhp_state, &uncorepmu->node); +} + +static const struct acpi_device_id fujitsu_uncore_pmu_acpi_match[] = { + { "FUJI200C", FUJITSU_UNCORE_PMU_MAC }, + { "FUJI200D", FUJITSU_UNCORE_PMU_PCI }, + { } +}; +MODULE_DEVICE_TABLE(acpi, fujitsu_uncore_pmu_acpi_match); + +static struct platform_driver fujitsu_uncore_pmu_driver = { + .driver = { + .name = "fujitsu-uncore-pmu", + .acpi_match_table = fujitsu_uncore_pmu_acpi_match, + .suppress_bind_attrs = true, + }, + .probe = fujitsu_uncore_pmu_probe, + .remove = fujitsu_uncore_pmu_remove, +}; + +static int __init fujitsu_uncore_pmu_init(void) +{ + int ret; + + /* Install a hook to update the reader CPU in case it goes offline */ + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "perf/fujitsu/uncore:online", + fujitsu_uncore_pmu_online_cpu, + fujitsu_uncore_pmu_offline_cpu); + if (ret < 0) + return ret; + + uncore_pmu_cpuhp_state = ret; + + ret = platform_driver_register(&fujitsu_uncore_pmu_driver); + if (ret) + cpuhp_remove_multi_state(uncore_pmu_cpuhp_state); + + return ret; +} + +static void __exit fujitsu_uncore_pmu_exit(void) +{ + platform_driver_unregister(&fujitsu_uncore_pmu_driver); + cpuhp_remove_multi_state(uncore_pmu_cpuhp_state); +} + +module_init(fujitsu_uncore_pmu_init); +module_exit(fujitsu_uncore_pmu_exit); + +MODULE_AUTHOR("Koichi Okuno "); +MODULE_DESCRIPTION("Fujitsu Uncore PMU driver"); +MODULE_LICENSE("GPL"); From 43de0ac332b815cf56dbdce63687de9acfd35d49 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Fri, 29 Aug 2025 18:14:19 +0800 Subject: [PATCH 81/93] drivers/perf: hisi: Relax the event ID check in the framework Event ID is only using the attr::config bit [7, 0] but we check the event range using the whole 64bit field. It blocks the usage of the rest field of attr::config. Relax the check by only using the bit [7, 0]. 
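As a rough illustration (editor's sketch, not part of the patch; the names below are the ones used in the hunk that follows), the range check is narrowed to the event ID field only:

    #define HISI_EVENTID_MASK	GENMASK(7, 0)

    /* Compare only config[7:0]; the upper config bits stay free for other uses. */
    if ((event->attr.config & HISI_EVENTID_MASK) > hisi_pmu->check_event)
        return -EINVAL;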
Acked-by: Jonathan Cameron Signed-off-by: Yicong Yang Signed-off-by: Yushan Wang Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_pmu.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c index a449651f79c9..6594d64b03a9 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c @@ -234,7 +234,7 @@ int hisi_uncore_pmu_event_init(struct perf_event *event) return -EINVAL; hisi_pmu = to_hisi_pmu(event->pmu); - if (event->attr.config > hisi_pmu->check_event) + if ((event->attr.config & HISI_EVENTID_MASK) > hisi_pmu->check_event) return -EINVAL; if (hisi_pmu->on_cpu == -1) diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.h b/drivers/perf/hisilicon/hisi_uncore_pmu.h index 777675838b80..e69660f72be6 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.h +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.h @@ -43,7 +43,8 @@ return FIELD_GET(GENMASK_ULL(hi, lo), event->attr.config); \ } -#define HISI_GET_EVENTID(ev) (ev->hw.config_base & 0xff) +#define HISI_EVENTID_MASK GENMASK(7, 0) +#define HISI_GET_EVENTID(ev) ((ev)->hw.config_base & HISI_EVENTID_MASK) #define HISI_PMU_EVTYPE_BITS 8 #define HISI_PMU_EVTYPE_SHIFT(idx) ((idx) % 4 * HISI_PMU_EVTYPE_BITS) From 4550244b53b7ef81607c0d52c72f835718497218 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Fri, 29 Aug 2025 18:14:20 +0800 Subject: [PATCH 82/93] drivers/perf: hisi: Export hisi_uncore_pmu_isr() Currently the Uncore PMU framework assumes that a PMU device has only one interrupt and registers the interrupt handler on the driver's behalf. It cannot support a PMU with multiple interrupt resources. An uncore PMU may have multiple interrupts that share the same handler. Export hisi_uncore_pmu_isr() so that drivers can register the IRQ handler themselves.
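A minimal sketch of the intended usage (illustrative only; the variable names are made up, while the flags and the exported handler match how the later L3C PMU v3 patch in this series requests its extra interrupts):

    /* request each additional interrupt line with the common uncore ISR */
    ret = devm_request_irq(&pdev->dev, ext_irq, hisi_uncore_pmu_isr,
                           IRQF_NOBALANCING | IRQF_NO_THREAD,
                           dev_name(&pdev->dev), hisi_pmu);
    if (ret)
        return ret;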
Acked-by: Jonathan Cameron Signed-off-by: Yicong Yang Signed-off-by: Yushan Wang Reviewed-by: Jonathan Cameron Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_pmu.c | 3 ++- drivers/perf/hisilicon/hisi_uncore_pmu.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c index 6594d64b03a9..de71dcf11653 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c @@ -149,7 +149,7 @@ static void hisi_uncore_pmu_clear_event_idx(struct hisi_pmu *hisi_pmu, int idx) clear_bit(idx, hisi_pmu->pmu_events.used_mask); } -static irqreturn_t hisi_uncore_pmu_isr(int irq, void *data) +irqreturn_t hisi_uncore_pmu_isr(int irq, void *data) { struct hisi_pmu *hisi_pmu = data; struct perf_event *event; @@ -178,6 +178,7 @@ static irqreturn_t hisi_uncore_pmu_isr(int irq, void *data) return IRQ_HANDLED; } +EXPORT_SYMBOL_NS_GPL(hisi_uncore_pmu_isr, "HISI_PMU"); int hisi_uncore_pmu_init_irq(struct hisi_pmu *hisi_pmu, struct platform_device *pdev) diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.h b/drivers/perf/hisilicon/hisi_uncore_pmu.h index e69660f72be6..8649be6f716a 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.h +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.h @@ -165,6 +165,7 @@ int hisi_uncore_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node); ssize_t hisi_uncore_pmu_identifier_attr_show(struct device *dev, struct device_attribute *attr, char *page); +irqreturn_t hisi_uncore_pmu_isr(int irq, void *data); int hisi_uncore_pmu_init_irq(struct hisi_pmu *hisi_pmu, struct platform_device *pdev); void hisi_uncore_pmu_init_topology(struct hisi_pmu *hisi_pmu, struct device *dev); From 0960e535be5403954701b06e722b68b53463cbe0 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Fri, 29 Aug 2025 18:14:21 +0800 Subject: [PATCH 83/93] drivers/perf: hisi: Simplify the probe process of each L3C PMU version Versions 1 and 2 of the L3C PMU also use different HIDs. Make use of struct acpi_device_id::driver_data for version-specific information rather than checking the version register. This helps to simplify the probe process and also makes future extension a bit easier.
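Condensed from the diff below, the pattern looks roughly like this:

    static const struct acpi_device_id hisi_l3c_pmu_acpi_match[] = {
        { "HISI0213", (kernel_ulong_t)&hisi_l3c_pmu_v1 },
        { "HISI0214", (kernel_ulong_t)&hisi_l3c_pmu_v2 },
        {}
    };

    /* probe: fetch the version-specific data selected by the matched HID */
    l3c_pmu->dev_info = device_get_match_data(&pdev->dev);
    if (!l3c_pmu->dev_info)
        return -ENODEV;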
Acked-by: Jonathan Cameron Signed-off-by: Yicong Yang Signed-off-by: Yushan Wang Reviewed-by: Jonathan Cameron Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 43 ++++++++++++-------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c index 412fc3a97963..db683dd7375c 100644 --- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c @@ -345,13 +345,6 @@ static void hisi_l3c_pmu_clear_int_status(struct hisi_pmu *l3c_pmu, int idx) writel(1 << idx, l3c_pmu->base + L3C_INT_CLEAR); } -static const struct acpi_device_id hisi_l3c_pmu_acpi_match[] = { - { "HISI0213", }, - { "HISI0214", }, - {} -}; -MODULE_DEVICE_TABLE(acpi, hisi_l3c_pmu_acpi_match); - static int hisi_l3c_pmu_init_data(struct platform_device *pdev, struct hisi_pmu *l3c_pmu) { @@ -371,6 +364,10 @@ static int hisi_l3c_pmu_init_data(struct platform_device *pdev, return -EINVAL; } + l3c_pmu->dev_info = device_get_match_data(&pdev->dev); + if (!l3c_pmu->dev_info) + return -ENODEV; + l3c_pmu->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(l3c_pmu->base)) { dev_err(&pdev->dev, "ioremap failed for l3c_pmu resource\n"); @@ -457,6 +454,18 @@ static const struct attribute_group *hisi_l3c_pmu_v2_attr_groups[] = { NULL }; +static const struct hisi_pmu_dev_info hisi_l3c_pmu_v1 = { + .attr_groups = hisi_l3c_pmu_v1_attr_groups, + .counter_bits = 48, + .check_event = L3C_V1_NR_EVENTS, +}; + +static const struct hisi_pmu_dev_info hisi_l3c_pmu_v2 = { + .attr_groups = hisi_l3c_pmu_v2_attr_groups, + .counter_bits = 64, + .check_event = L3C_V2_NR_EVENTS, +}; + static const struct hisi_uncore_ops hisi_uncore_l3c_ops = { .write_evtype = hisi_l3c_pmu_write_evtype, .get_event_idx = hisi_uncore_pmu_get_event_idx, @@ -487,16 +496,9 @@ static int hisi_l3c_pmu_dev_probe(struct platform_device *pdev, if (ret) return ret; - if (l3c_pmu->identifier >= HISI_PMU_V2) { - l3c_pmu->counter_bits = 64; - l3c_pmu->check_event = L3C_V2_NR_EVENTS; - l3c_pmu->pmu_events.attr_groups = hisi_l3c_pmu_v2_attr_groups; - } else { - l3c_pmu->counter_bits = 48; - l3c_pmu->check_event = L3C_V1_NR_EVENTS; - l3c_pmu->pmu_events.attr_groups = hisi_l3c_pmu_v1_attr_groups; - } - + l3c_pmu->pmu_events.attr_groups = l3c_pmu->dev_info->attr_groups; + l3c_pmu->counter_bits = l3c_pmu->dev_info->counter_bits; + l3c_pmu->check_event = l3c_pmu->dev_info->check_event; l3c_pmu->num_counters = L3C_NR_COUNTERS; l3c_pmu->ops = &hisi_uncore_l3c_ops; l3c_pmu->dev = &pdev->dev; @@ -554,6 +556,13 @@ static void hisi_l3c_pmu_remove(struct platform_device *pdev) &l3c_pmu->node); } +static const struct acpi_device_id hisi_l3c_pmu_acpi_match[] = { + { "HISI0213", (kernel_ulong_t)&hisi_l3c_pmu_v1 }, + { "HISI0214", (kernel_ulong_t)&hisi_l3c_pmu_v2 }, + {} +}; +MODULE_DEVICE_TABLE(acpi, hisi_l3c_pmu_acpi_match); + static struct platform_driver hisi_l3c_pmu_driver = { .driver = { .name = "hisi_l3c_pmu", From 2271f1634243897cf18763386994d613a0594d98 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Fri, 29 Aug 2025 18:14:22 +0800 Subject: [PATCH 84/93] drivers/perf: hisi: Extract the event filter check of L3C PMU The L3C PMU has 4 filter options which share perf_event_attr::config1. The driver checks config1 to see whether a certain event has a filter setting. This would become incorrect if other bits in config1 were used for non-filter options. So check whether each filter option is set directly, in a separate function, instead.
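In other words, the filter check becomes an explicit test of the known filter fields rather than a test of the whole config1 value (sketch of the helper introduced below):

    static bool hisi_l3c_pmu_have_filter(struct perf_event *event)
    {
        /* Only the four documented filter options matter, not all of config1. */
        return hisi_get_tt_req(event) || hisi_get_tt_core(event) ||
               hisi_get_datasrc_cfg(event) || hisi_get_datasrc_skt(event);
    }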
Acked-by: Jonathan Cameron Signed-off-by: Yicong Yang Signed-off-by: Yushan Wang Reviewed-by: Jonathan Cameron Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c index db683dd7375c..a372dd2c07b5 100644 --- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c @@ -204,9 +204,15 @@ static void hisi_l3c_pmu_clear_core_tracetag(struct perf_event *event) } } +static bool hisi_l3c_pmu_have_filter(struct perf_event *event) +{ + return hisi_get_tt_req(event) || hisi_get_tt_core(event) || + hisi_get_datasrc_cfg(event) || hisi_get_datasrc_skt(event); +} + static void hisi_l3c_pmu_enable_filter(struct perf_event *event) { - if (event->attr.config1 != 0x0) { + if (hisi_l3c_pmu_have_filter(event)) { hisi_l3c_pmu_config_req_tracetag(event); hisi_l3c_pmu_config_core_tracetag(event); hisi_l3c_pmu_config_ds(event); @@ -215,7 +221,7 @@ static void hisi_l3c_pmu_enable_filter(struct perf_event *event) static void hisi_l3c_pmu_disable_filter(struct perf_event *event) { - if (event->attr.config1 != 0x0) { + if (hisi_l3c_pmu_have_filter(event)) { hisi_l3c_pmu_clear_ds(event); hisi_l3c_pmu_clear_core_tracetag(event); hisi_l3c_pmu_clear_req_tracetag(event); } } From ede339ff61c61a1ac3bedd01528e4d701d4aea22 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Fri, 29 Aug 2025 18:14:23 +0800 Subject: [PATCH 85/93] drivers/perf: hisi: Extend the field of tt_core Currently tt_core uses config1 bits [7, 0] and cannot be extended. On some platforms more than 8 CPUs share the L3 cache. So make tt_core use config2 bits [15, 0]; the remaining bits in config2 are reserved for future extension. Acked-by: Jonathan Cameron Signed-off-by: Yicong Yang Signed-off-by: Yushan Wang Reviewed-by: Jonathan Cameron Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c index a372dd2c07b5..39444f11cbad 100644 --- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c @@ -55,10 +55,10 @@ #define L3C_V1_NR_EVENTS 0x59 #define L3C_V2_NR_EVENTS 0xFF -HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_core, config1, 7, 0); HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_req, config1, 10, 8); HISI_PMU_EVENT_ATTR_EXTRACTOR(datasrc_cfg, config1, 15, 11); HISI_PMU_EVENT_ATTR_EXTRACTOR(datasrc_skt, config1, 16, 16); +HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_core, config2, 15, 0); static void hisi_l3c_pmu_config_req_tracetag(struct perf_event *event) { @@ -397,7 +397,7 @@ static const struct attribute_group hisi_l3c_pmu_v1_format_group = { static struct attribute *hisi_l3c_pmu_v2_format_attr[] = { HISI_PMU_FORMAT_ATTR(event, "config:0-7"), - HISI_PMU_FORMAT_ATTR(tt_core, "config1:0-7"), + HISI_PMU_FORMAT_ATTR(tt_core, "config2:0-15"), HISI_PMU_FORMAT_ATTR(tt_req, "config1:8-10"), HISI_PMU_FORMAT_ATTR(datasrc_cfg, "config1:11-15"), HISI_PMU_FORMAT_ATTR(datasrc_skt, "config1:16"), From b3abb08d6f628a76c36bf7da9508e1a67bf186a0 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Fri, 29 Aug 2025 18:14:24 +0800 Subject: [PATCH 86/93] drivers/perf: hisi: Refactor the event configuration of L3C PMU The event register is configured using hisi_pmu::base directly since only one address space is supported for L3C PMU.
We need to extend if events configuration locates in different address space. In order to make preparation for such hardware, extract the event register configuration to separate function using hw_perf_event::event_base as each event's base address. Implement a private hisi_uncore_ops::get_event_idx() callback for initialize the event_base besides get the hardware index. No functional changes intended. Acked-by: Jonathan Cameron Signed-off-by: Yicong Yang Signed-off-by: Yushan Wang Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 129 ++++++++++++------- 1 file changed, 84 insertions(+), 45 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c index 39444f11cbad..7928b9bb3e7e 100644 --- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c @@ -60,51 +60,87 @@ HISI_PMU_EVENT_ATTR_EXTRACTOR(datasrc_cfg, config1, 15, 11); HISI_PMU_EVENT_ATTR_EXTRACTOR(datasrc_skt, config1, 16, 16); HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_core, config2, 15, 0); -static void hisi_l3c_pmu_config_req_tracetag(struct perf_event *event) +static int hisi_l3c_pmu_get_event_idx(struct perf_event *event) { struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + unsigned long *used_mask = l3c_pmu->pmu_events.used_mask; + u32 num_counters = l3c_pmu->num_counters; + int idx; + + idx = find_first_zero_bit(used_mask, num_counters); + if (idx == num_counters) + return -EAGAIN; + + set_bit(idx, used_mask); + event->hw.event_base = (unsigned long)l3c_pmu->base; + + return idx; +} + +static u32 hisi_l3c_pmu_event_readl(struct hw_perf_event *hwc, u32 reg) +{ + return readl((void __iomem *)hwc->event_base + reg); +} + +static void hisi_l3c_pmu_event_writel(struct hw_perf_event *hwc, u32 reg, u32 val) +{ + writel(val, (void __iomem *)hwc->event_base + reg); +} + +static u64 hisi_l3c_pmu_event_readq(struct hw_perf_event *hwc, u32 reg) +{ + return readq((void __iomem *)hwc->event_base + reg); +} + +static void hisi_l3c_pmu_event_writeq(struct hw_perf_event *hwc, u32 reg, u64 val) +{ + writeq(val, (void __iomem *)hwc->event_base + reg); +} + +static void hisi_l3c_pmu_config_req_tracetag(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; u32 tt_req = hisi_get_tt_req(event); if (tt_req) { u32 val; /* Set request-type for tracetag */ - val = readl(l3c_pmu->base + L3C_TRACETAG_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_TRACETAG_CTRL); val |= tt_req << L3C_TRACETAG_REQ_SHIFT; val |= L3C_TRACETAG_REQ_EN; - writel(val, l3c_pmu->base + L3C_TRACETAG_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_TRACETAG_CTRL, val); /* Enable request-tracetag statistics */ - val = readl(l3c_pmu->base + L3C_PERF_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_PERF_CTRL); val |= L3C_TRACETAG_EN; - writel(val, l3c_pmu->base + L3C_PERF_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_PERF_CTRL, val); } } static void hisi_l3c_pmu_clear_req_tracetag(struct perf_event *event) { - struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; u32 tt_req = hisi_get_tt_req(event); if (tt_req) { u32 val; /* Clear request-type */ - val = readl(l3c_pmu->base + L3C_TRACETAG_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_TRACETAG_CTRL); val &= ~(tt_req << L3C_TRACETAG_REQ_SHIFT); val &= ~L3C_TRACETAG_REQ_EN; - writel(val, l3c_pmu->base + L3C_TRACETAG_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_TRACETAG_CTRL, val); /* Disable request-tracetag statistics */ - val = readl(l3c_pmu->base + 
L3C_PERF_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_PERF_CTRL); val &= ~L3C_TRACETAG_EN; - writel(val, l3c_pmu->base + L3C_PERF_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_PERF_CTRL, val); } } static void hisi_l3c_pmu_write_ds(struct perf_event *event, u32 ds_cfg) { - struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; u32 reg, reg_idx, shift, val; int idx = hwc->idx; @@ -120,15 +156,15 @@ static void hisi_l3c_pmu_write_ds(struct perf_event *event, u32 ds_cfg) reg_idx = idx % 4; shift = 8 * reg_idx; - val = readl(l3c_pmu->base + reg); + val = hisi_l3c_pmu_event_readl(hwc, reg); val &= ~(L3C_DATSRC_MASK << shift); val |= ds_cfg << shift; - writel(val, l3c_pmu->base + reg); + hisi_l3c_pmu_event_writel(hwc, reg, val); } static void hisi_l3c_pmu_config_ds(struct perf_event *event) { - struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; u32 ds_cfg = hisi_get_datasrc_cfg(event); u32 ds_skt = hisi_get_datasrc_skt(event); @@ -138,15 +174,15 @@ static void hisi_l3c_pmu_config_ds(struct perf_event *event) if (ds_skt) { u32 val; - val = readl(l3c_pmu->base + L3C_DATSRC_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_DATSRC_CTRL); val |= L3C_DATSRC_SKT_EN; - writel(val, l3c_pmu->base + L3C_DATSRC_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_DATSRC_CTRL, val); } } static void hisi_l3c_pmu_clear_ds(struct perf_event *event) { - struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; u32 ds_cfg = hisi_get_datasrc_cfg(event); u32 ds_skt = hisi_get_datasrc_skt(event); @@ -156,51 +192,51 @@ static void hisi_l3c_pmu_clear_ds(struct perf_event *event) if (ds_skt) { u32 val; - val = readl(l3c_pmu->base + L3C_DATSRC_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_DATSRC_CTRL); val &= ~L3C_DATSRC_SKT_EN; - writel(val, l3c_pmu->base + L3C_DATSRC_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_DATSRC_CTRL, val); } } static void hisi_l3c_pmu_config_core_tracetag(struct perf_event *event) { - struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; u32 core = hisi_get_tt_core(event); if (core) { u32 val; /* Config and enable core information */ - writel(core, l3c_pmu->base + L3C_CORE_CTRL); - val = readl(l3c_pmu->base + L3C_PERF_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_CORE_CTRL, core); + val = hisi_l3c_pmu_event_readl(hwc, L3C_PERF_CTRL); val |= L3C_CORE_EN; - writel(val, l3c_pmu->base + L3C_PERF_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_PERF_CTRL, val); /* Enable core-tracetag statistics */ - val = readl(l3c_pmu->base + L3C_TRACETAG_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_TRACETAG_CTRL); val |= L3C_TRACETAG_CORE_EN; - writel(val, l3c_pmu->base + L3C_TRACETAG_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_TRACETAG_CTRL, val); } } static void hisi_l3c_pmu_clear_core_tracetag(struct perf_event *event) { - struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; u32 core = hisi_get_tt_core(event); if (core) { u32 val; /* Clear core information */ - writel(L3C_COER_NONE, l3c_pmu->base + L3C_CORE_CTRL); - val = readl(l3c_pmu->base + L3C_PERF_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_CORE_CTRL, L3C_COER_NONE); + val = hisi_l3c_pmu_event_readl(hwc, L3C_PERF_CTRL); val &= ~L3C_CORE_EN; - writel(val, l3c_pmu->base + L3C_PERF_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_PERF_CTRL, val); /* Disable core-tracetag statistics */ - val = readl(l3c_pmu->base + L3C_TRACETAG_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, 
L3C_TRACETAG_CTRL); val &= ~L3C_TRACETAG_CORE_EN; - writel(val, l3c_pmu->base + L3C_TRACETAG_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_TRACETAG_CTRL, val); } } @@ -239,18 +275,19 @@ static u32 hisi_l3c_pmu_get_counter_offset(int cntr_idx) static u64 hisi_l3c_pmu_read_counter(struct hisi_pmu *l3c_pmu, struct hw_perf_event *hwc) { - return readq(l3c_pmu->base + hisi_l3c_pmu_get_counter_offset(hwc->idx)); + return hisi_l3c_pmu_event_readq(hwc, hisi_l3c_pmu_get_counter_offset(hwc->idx)); } static void hisi_l3c_pmu_write_counter(struct hisi_pmu *l3c_pmu, struct hw_perf_event *hwc, u64 val) { - writeq(val, l3c_pmu->base + hisi_l3c_pmu_get_counter_offset(hwc->idx)); + hisi_l3c_pmu_event_writeq(hwc, hisi_l3c_pmu_get_counter_offset(hwc->idx), val); } static void hisi_l3c_pmu_write_evtype(struct hisi_pmu *l3c_pmu, int idx, u32 type) { + struct hw_perf_event *hwc = &l3c_pmu->pmu_events.hw_events[idx]->hw; u32 reg, reg_idx, shift, val; /* @@ -265,10 +302,10 @@ static void hisi_l3c_pmu_write_evtype(struct hisi_pmu *l3c_pmu, int idx, shift = 8 * reg_idx; /* Write event code to L3C_EVENT_TYPEx Register */ - val = readl(l3c_pmu->base + reg); + val = hisi_l3c_pmu_event_readl(hwc, reg); val &= ~(L3C_EVTYPE_NONE << shift); val |= (type << shift); - writel(val, l3c_pmu->base + reg); + hisi_l3c_pmu_event_writel(hwc, reg, val); } static void hisi_l3c_pmu_start_counters(struct hisi_pmu *l3c_pmu) @@ -303,9 +340,9 @@ static void hisi_l3c_pmu_enable_counter(struct hisi_pmu *l3c_pmu, u32 val; /* Enable counter index in L3C_EVENT_CTRL register */ - val = readl(l3c_pmu->base + L3C_EVENT_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_EVENT_CTRL); val |= (1 << hwc->idx); - writel(val, l3c_pmu->base + L3C_EVENT_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_EVENT_CTRL, val); } static void hisi_l3c_pmu_disable_counter(struct hisi_pmu *l3c_pmu, @@ -314,9 +351,9 @@ static void hisi_l3c_pmu_disable_counter(struct hisi_pmu *l3c_pmu, u32 val; /* Clear counter index in L3C_EVENT_CTRL register */ - val = readl(l3c_pmu->base + L3C_EVENT_CTRL); + val = hisi_l3c_pmu_event_readl(hwc, L3C_EVENT_CTRL); val &= ~(1 << hwc->idx); - writel(val, l3c_pmu->base + L3C_EVENT_CTRL); + hisi_l3c_pmu_event_writel(hwc, L3C_EVENT_CTRL, val); } static void hisi_l3c_pmu_enable_counter_int(struct hisi_pmu *l3c_pmu, @@ -324,10 +361,10 @@ static void hisi_l3c_pmu_enable_counter_int(struct hisi_pmu *l3c_pmu, { u32 val; - val = readl(l3c_pmu->base + L3C_INT_MASK); + val = hisi_l3c_pmu_event_readl(hwc, L3C_INT_MASK); /* Write 0 to enable interrupt */ val &= ~(1 << hwc->idx); - writel(val, l3c_pmu->base + L3C_INT_MASK); + hisi_l3c_pmu_event_writel(hwc, L3C_INT_MASK, val); } static void hisi_l3c_pmu_disable_counter_int(struct hisi_pmu *l3c_pmu, @@ -335,10 +372,10 @@ static void hisi_l3c_pmu_disable_counter_int(struct hisi_pmu *l3c_pmu, { u32 val; - val = readl(l3c_pmu->base + L3C_INT_MASK); + val = hisi_l3c_pmu_event_readl(hwc, L3C_INT_MASK); /* Write 1 to mask interrupt */ val |= (1 << hwc->idx); - writel(val, l3c_pmu->base + L3C_INT_MASK); + hisi_l3c_pmu_event_writel(hwc, L3C_INT_MASK, val); } static u32 hisi_l3c_pmu_get_int_status(struct hisi_pmu *l3c_pmu) @@ -348,7 +385,9 @@ static u32 hisi_l3c_pmu_get_int_status(struct hisi_pmu *l3c_pmu) static void hisi_l3c_pmu_clear_int_status(struct hisi_pmu *l3c_pmu, int idx) { - writel(1 << idx, l3c_pmu->base + L3C_INT_CLEAR); + struct hw_perf_event *hwc = &l3c_pmu->pmu_events.hw_events[idx]->hw; + + hisi_l3c_pmu_event_writel(hwc, L3C_INT_CLEAR, 1 << idx); } static int hisi_l3c_pmu_init_data(struct platform_device 
*pdev, @@ -474,7 +513,7 @@ static const struct hisi_pmu_dev_info hisi_l3c_pmu_v2 = { static const struct hisi_uncore_ops hisi_uncore_l3c_ops = { .write_evtype = hisi_l3c_pmu_write_evtype, - .get_event_idx = hisi_uncore_pmu_get_event_idx, + .get_event_idx = hisi_l3c_pmu_get_event_idx, .start_counters = hisi_l3c_pmu_start_counters, .stop_counters = hisi_l3c_pmu_stop_counters, .enable_counter = hisi_l3c_pmu_enable_counter, From 475d94dfe7c635b4311b6c9d87f49534eab6589c Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Fri, 29 Aug 2025 18:14:25 +0800 Subject: [PATCH 87/93] drivers/perf: hisi: Add support for L3C PMU v3 This patch adds support for L3C PMU v3. The v3 L3C PMU supports an extended event space which can be controlled through up to 2 extra address spaces with separate overflow interrupts. The layout of the control/event registers is kept the same. The extended events together with the original ones cover the monitoring of all transactions on the L3C. An extended event is specified with the `ext=[1|2]` option so the driver can distinguish it, like below: perf stat -e hisi_sccl0_l3c0_0/event=,ext=1/ Currently only the event option uses config bits [7, 0], so there is still plenty of unused space. Make ext use config bits [16, 17] and reserve bits [15, 8] for future extension of the event option. With the extra counters, the number of counters for a HiSilicon uncore PMU can reach up to 24, and the used counter mask is extended accordingly. hw_perf_event::event_base is initialized to the base MMIO address of the event and is used later for counter control, overflow handling and count readout. We still make use of the Uncore PMU framework for handling the events and interrupt migration on CPU hotplug. The framework's cpuhp callback handles the event migration and the interrupt migration of the original events; if the PMU supports extended events, the interrupts of the extended events are migrated to the same CPU chosen by the framework. A new HID of HISI0215 is used for this version of the L3C PMU. Acked-by: Jonathan Cameron Signed-off-by: Yicong Yang Co-developed-by: Yushan Wang Signed-off-by: Yushan Wang Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 352 +++++++++++++++++-- drivers/perf/hisilicon/hisi_uncore_pmu.h | 2 +- 2 files changed, 324 insertions(+), 30 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c index 7928b9bb3e7e..bbd81a43047d 100644 --- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c @@ -39,6 +39,7 @@ /* L3C has 8-counters */ #define L3C_NR_COUNTERS 0x8 +#define L3C_MAX_EXT 2 #define L3C_PERF_CTRL_EN 0x10000 #define L3C_TRACETAG_EN BIT(31) @@ -55,24 +56,81 @@ #define L3C_V1_NR_EVENTS 0x59 #define L3C_V2_NR_EVENTS 0xFF +HISI_PMU_EVENT_ATTR_EXTRACTOR(ext, config, 17, 16); HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_req, config1, 10, 8); HISI_PMU_EVENT_ATTR_EXTRACTOR(datasrc_cfg, config1, 15, 11); HISI_PMU_EVENT_ATTR_EXTRACTOR(datasrc_skt, config1, 16, 16); HISI_PMU_EVENT_ATTR_EXTRACTOR(tt_core, config2, 15, 0); +struct hisi_l3c_pmu { + struct hisi_pmu l3c_pmu; + + /* MMIO and IRQ resources for extension events */ + void __iomem *ext_base[L3C_MAX_EXT]; + int ext_irq[L3C_MAX_EXT]; + int ext_num; +}; + +#define to_hisi_l3c_pmu(_l3c_pmu) \ + container_of(_l3c_pmu, struct hisi_l3c_pmu, l3c_pmu) + +/* + * The hardware counter idx used in counter enable/disable, + * interrupt enable/disable and status check, etc.
+ */ +#define L3C_HW_IDX(_cntr_idx) ((_cntr_idx) % L3C_NR_COUNTERS) + +/* Range of ext counters in used mask. */ +#define L3C_CNTR_EXT_L(_ext) (((_ext) + 1) * L3C_NR_COUNTERS) +#define L3C_CNTR_EXT_H(_ext) (((_ext) + 2) * L3C_NR_COUNTERS) + +struct hisi_l3c_pmu_ext { + bool support_ext; +}; + +static bool support_ext(struct hisi_l3c_pmu *pmu) +{ + struct hisi_l3c_pmu_ext *l3c_pmu_ext = pmu->l3c_pmu.dev_info->private; + + return l3c_pmu_ext->support_ext; +} + static int hisi_l3c_pmu_get_event_idx(struct perf_event *event) { struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); unsigned long *used_mask = l3c_pmu->pmu_events.used_mask; - u32 num_counters = l3c_pmu->num_counters; + int ext = hisi_get_ext(event); int idx; - idx = find_first_zero_bit(used_mask, num_counters); - if (idx == num_counters) + /* + * For an L3C PMU that supports extension events, we can monitor + * maximum 2 * num_counters to 3 * num_counters events, depending on + * the number of ext regions supported by hardware. Thus use bit + * [0, num_counters - 1] for normal events and bit + * [ext * num_counters, (ext + 1) * num_counters - 1] for extension + * events. The idx allocation will keep unchanged for normal events and + * we can also use the idx to distinguish whether it's an extension + * event or not. + * + * Since normal events and extension events locates on the different + * address space, save the base address to the event->hw.event_base. + */ + if (ext && !support_ext(hisi_l3c_pmu)) + return -EOPNOTSUPP; + + if (ext) + event->hw.event_base = (unsigned long)hisi_l3c_pmu->ext_base[ext - 1]; + else + event->hw.event_base = (unsigned long)l3c_pmu->base; + + ext -= 1; + idx = find_next_zero_bit(used_mask, L3C_CNTR_EXT_H(ext), L3C_CNTR_EXT_L(ext)); + + if (idx >= L3C_CNTR_EXT_H(ext)) return -EAGAIN; set_bit(idx, used_mask); - event->hw.event_base = (unsigned long)l3c_pmu->base; return idx; } @@ -143,7 +201,7 @@ static void hisi_l3c_pmu_write_ds(struct perf_event *event, u32 ds_cfg) { struct hw_perf_event *hwc = &event->hw; u32 reg, reg_idx, shift, val; - int idx = hwc->idx; + int idx = L3C_HW_IDX(hwc->idx); /* * Select the appropriate datasource register(L3C_DATSRC_TYPE0/1). @@ -264,12 +322,24 @@ static void hisi_l3c_pmu_disable_filter(struct perf_event *event) } } +static int hisi_l3c_pmu_check_filter(struct perf_event *event) +{ + struct hisi_pmu *l3c_pmu = to_hisi_pmu(event->pmu); + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + int ext = hisi_get_ext(event); + + if (ext < 0 || ext > hisi_l3c_pmu->ext_num) + return -EINVAL; + + return 0; +} + /* * Select the counter register offset using the counter index */ static u32 hisi_l3c_pmu_get_counter_offset(int cntr_idx) { - return (L3C_CNTR0_LOWER + (cntr_idx * 8)); + return L3C_CNTR0_LOWER + L3C_HW_IDX(cntr_idx) * 8; } static u64 hisi_l3c_pmu_read_counter(struct hisi_pmu *l3c_pmu, @@ -290,6 +360,8 @@ static void hisi_l3c_pmu_write_evtype(struct hisi_pmu *l3c_pmu, int idx, struct hw_perf_event *hwc = &l3c_pmu->pmu_events.hw_events[idx]->hw; u32 reg, reg_idx, shift, val; + idx = L3C_HW_IDX(idx); + /* * Select the appropriate event select register(L3C_EVENT_TYPE0/1). * There are 2 event select registers for the 8 hardware counters. 
@@ -304,34 +376,70 @@ static void hisi_l3c_pmu_write_evtype(struct hisi_pmu *l3c_pmu, int idx, /* Write event code to L3C_EVENT_TYPEx Register */ val = hisi_l3c_pmu_event_readl(hwc, reg); val &= ~(L3C_EVTYPE_NONE << shift); - val |= (type << shift); + val |= type << shift; hisi_l3c_pmu_event_writel(hwc, reg, val); } static void hisi_l3c_pmu_start_counters(struct hisi_pmu *l3c_pmu) { + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + unsigned long *used_mask = l3c_pmu->pmu_events.used_mask; + unsigned long used_cntr = find_first_bit(used_mask, l3c_pmu->num_counters); u32 val; + int i; /* - * Set perf_enable bit in L3C_PERF_CTRL register to start counting - * for all enabled counters. + * Check if any counter belongs to the normal range (instead of ext + * range). If so, enable it. */ - val = readl(l3c_pmu->base + L3C_PERF_CTRL); - val |= L3C_PERF_CTRL_EN; - writel(val, l3c_pmu->base + L3C_PERF_CTRL); + if (used_cntr < L3C_NR_COUNTERS) { + val = readl(l3c_pmu->base + L3C_PERF_CTRL); + val |= L3C_PERF_CTRL_EN; + writel(val, l3c_pmu->base + L3C_PERF_CTRL); + } + + /* If not, do enable it on ext ranges. */ + for (i = 0; i < hisi_l3c_pmu->ext_num; i++) { + /* Find used counter in this ext range, skip the range if not. */ + used_cntr = find_next_bit(used_mask, L3C_CNTR_EXT_H(i), L3C_CNTR_EXT_L(i)); + if (used_cntr >= L3C_CNTR_EXT_H(i)) + continue; + + val = readl(hisi_l3c_pmu->ext_base[i] + L3C_PERF_CTRL); + val |= L3C_PERF_CTRL_EN; + writel(val, hisi_l3c_pmu->ext_base[i] + L3C_PERF_CTRL); + } } static void hisi_l3c_pmu_stop_counters(struct hisi_pmu *l3c_pmu) { + struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu); + unsigned long *used_mask = l3c_pmu->pmu_events.used_mask; + unsigned long used_cntr = find_first_bit(used_mask, l3c_pmu->num_counters); u32 val; + int i; /* - * Clear perf_enable bit in L3C_PERF_CTRL register to stop counting - * for all enabled counters. + * Check if any counter belongs to the normal range (instead of ext + * range). If so, stop it. */ - val = readl(l3c_pmu->base + L3C_PERF_CTRL); - val &= ~(L3C_PERF_CTRL_EN); - writel(val, l3c_pmu->base + L3C_PERF_CTRL); + if (used_cntr < L3C_NR_COUNTERS) { + val = readl(l3c_pmu->base + L3C_PERF_CTRL); + val &= ~L3C_PERF_CTRL_EN; + writel(val, l3c_pmu->base + L3C_PERF_CTRL); + } + + /* If not, do stop it on ext ranges. */ + for (i = 0; i < hisi_l3c_pmu->ext_num; i++) { + /* Find used counter in this ext range, skip the range if not. 
+		used_cntr = find_next_bit(used_mask, L3C_CNTR_EXT_H(i), L3C_CNTR_EXT_L(i));
+		if (used_cntr >= L3C_CNTR_EXT_H(i))
+			continue;
+
+		val = readl(hisi_l3c_pmu->ext_base[i] + L3C_PERF_CTRL);
+		val &= ~L3C_PERF_CTRL_EN;
+		writel(val, hisi_l3c_pmu->ext_base[i] + L3C_PERF_CTRL);
+	}
 }
 
 static void hisi_l3c_pmu_enable_counter(struct hisi_pmu *l3c_pmu,
@@ -341,7 +449,7 @@ static void hisi_l3c_pmu_enable_counter(struct hisi_pmu *l3c_pmu,
 
 	/* Enable counter index in L3C_EVENT_CTRL register */
 	val = hisi_l3c_pmu_event_readl(hwc, L3C_EVENT_CTRL);
-	val |= (1 << hwc->idx);
+	val |= 1 << L3C_HW_IDX(hwc->idx);
 	hisi_l3c_pmu_event_writel(hwc, L3C_EVENT_CTRL, val);
 }
 
@@ -352,7 +460,7 @@ static void hisi_l3c_pmu_disable_counter(struct hisi_pmu *l3c_pmu,
 
 	/* Clear counter index in L3C_EVENT_CTRL register */
 	val = hisi_l3c_pmu_event_readl(hwc, L3C_EVENT_CTRL);
-	val &= ~(1 << hwc->idx);
+	val &= ~(1 << L3C_HW_IDX(hwc->idx));
 	hisi_l3c_pmu_event_writel(hwc, L3C_EVENT_CTRL, val);
 }
 
@@ -363,7 +471,7 @@ static void hisi_l3c_pmu_enable_counter_int(struct hisi_pmu *l3c_pmu,
 
 	val = hisi_l3c_pmu_event_readl(hwc, L3C_INT_MASK);
 	/* Write 0 to enable interrupt */
-	val &= ~(1 << hwc->idx);
+	val &= ~(1 << L3C_HW_IDX(hwc->idx));
 	hisi_l3c_pmu_event_writel(hwc, L3C_INT_MASK, val);
 }
 
@@ -374,20 +482,34 @@ static void hisi_l3c_pmu_disable_counter_int(struct hisi_pmu *l3c_pmu,
 
 	val = hisi_l3c_pmu_event_readl(hwc, L3C_INT_MASK);
 	/* Write 1 to mask interrupt */
-	val |= (1 << hwc->idx);
+	val |= 1 << L3C_HW_IDX(hwc->idx);
 	hisi_l3c_pmu_event_writel(hwc, L3C_INT_MASK, val);
 }
 
 static u32 hisi_l3c_pmu_get_int_status(struct hisi_pmu *l3c_pmu)
 {
-	return readl(l3c_pmu->base + L3C_INT_STATUS);
+	struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu);
+	u32 ext_int, status, status_ext = 0;
+	int i;
+
+	status = readl(l3c_pmu->base + L3C_INT_STATUS);
+
+	if (!support_ext(hisi_l3c_pmu))
+		return status;
+
+	for (i = 0; i < hisi_l3c_pmu->ext_num; i++) {
+		ext_int = readl(hisi_l3c_pmu->ext_base[i] + L3C_INT_STATUS);
+		status_ext |= ext_int << (L3C_NR_COUNTERS * i);
+	}
+
+	return status | (status_ext << L3C_NR_COUNTERS);
 }
 
 static void hisi_l3c_pmu_clear_int_status(struct hisi_pmu *l3c_pmu, int idx)
 {
 	struct hw_perf_event *hwc = &l3c_pmu->pmu_events.hw_events[idx]->hw;
 
-	hisi_l3c_pmu_event_writel(hwc, L3C_INT_CLEAR, 1 << idx);
+	hisi_l3c_pmu_event_writel(hwc, L3C_INT_CLEAR, 1 << L3C_HW_IDX(idx));
 }
 
 static int hisi_l3c_pmu_init_data(struct platform_device *pdev,
@@ -424,6 +546,50 @@ static int hisi_l3c_pmu_init_data(struct platform_device *pdev,
 	return 0;
 }
 
+static int hisi_l3c_pmu_init_ext(struct hisi_pmu *l3c_pmu, struct platform_device *pdev)
+{
+	struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu);
+	int ret, irq, ext_num, i;
+	char *irqname;
+
+	/* A HiSilicon L3C PMU supporting ext should have more than one IRQ resource. */
+	ext_num = platform_irq_count(pdev);
+	if (ext_num < L3C_MAX_EXT)
+		return -ENODEV;
+
+	/*
+	 * The number of ext regions supported equals the number of IRQs - 1,
+	 * since one of the IRQs belongs to the normal part of the PMU.
+	 */
+	hisi_l3c_pmu->ext_num = ext_num - 1;
+
+	for (i = 0; i < hisi_l3c_pmu->ext_num; i++) {
+		hisi_l3c_pmu->ext_base[i] = devm_platform_ioremap_resource(pdev, i + 1);
+		if (IS_ERR(hisi_l3c_pmu->ext_base[i]))
+			return PTR_ERR(hisi_l3c_pmu->ext_base[i]);
+
+		irq = platform_get_irq(pdev, i + 1);
+		if (irq < 0)
+			return irq;
+
+		irqname = devm_kasprintf(&pdev->dev, GFP_KERNEL, "%s ext%d",
+					 dev_name(&pdev->dev), i + 1);
+		if (!irqname)
+			return -ENOMEM;
+
+		ret = devm_request_irq(&pdev->dev, irq, hisi_uncore_pmu_isr,
+				       IRQF_NOBALANCING | IRQF_NO_THREAD,
+				       irqname, l3c_pmu);
+		if (ret < 0)
+			return dev_err_probe(&pdev->dev, ret,
+					     "Fail to request EXT IRQ: %d.\n", irq);
+
+		hisi_l3c_pmu->ext_irq[i] = irq;
+	}
+
+	return 0;
+}
+
 static struct attribute *hisi_l3c_pmu_v1_format_attr[] = {
 	HISI_PMU_FORMAT_ATTR(event, "config:0-7"),
 	NULL,
@@ -448,6 +614,19 @@ static const struct attribute_group hisi_l3c_pmu_v2_format_group = {
 	.attrs = hisi_l3c_pmu_v2_format_attr,
 };
 
+static struct attribute *hisi_l3c_pmu_v3_format_attr[] = {
+	HISI_PMU_FORMAT_ATTR(event, "config:0-7"),
+	HISI_PMU_FORMAT_ATTR(ext, "config:16-17"),
+	HISI_PMU_FORMAT_ATTR(tt_req, "config1:8-10"),
+	HISI_PMU_FORMAT_ATTR(tt_core, "config2:0-15"),
+	NULL
+};
+
+static const struct attribute_group hisi_l3c_pmu_v3_format_group = {
+	.name = "format",
+	.attrs = hisi_l3c_pmu_v3_format_attr,
+};
+
 static struct attribute *hisi_l3c_pmu_v1_events_attr[] = {
 	HISI_PMU_EVENT_ATTR(rd_cpipe, 0x00),
 	HISI_PMU_EVENT_ATTR(wr_cpipe, 0x01),
@@ -483,6 +662,26 @@ static const struct attribute_group hisi_l3c_pmu_v2_events_group = {
 	.attrs = hisi_l3c_pmu_v2_events_attr,
 };
 
+static struct attribute *hisi_l3c_pmu_v3_events_attr[] = {
+	HISI_PMU_EVENT_ATTR(rd_spipe, 0x18),
+	HISI_PMU_EVENT_ATTR(rd_hit_spipe, 0x19),
+	HISI_PMU_EVENT_ATTR(wr_spipe, 0x1a),
+	HISI_PMU_EVENT_ATTR(wr_hit_spipe, 0x1b),
+	HISI_PMU_EVENT_ATTR(io_rd_spipe, 0x1c),
+	HISI_PMU_EVENT_ATTR(io_rd_hit_spipe, 0x1d),
+	HISI_PMU_EVENT_ATTR(io_wr_spipe, 0x1e),
+	HISI_PMU_EVENT_ATTR(io_wr_hit_spipe, 0x1f),
+	HISI_PMU_EVENT_ATTR(cycles, 0x7f),
+	HISI_PMU_EVENT_ATTR(l3c_ref, 0xbc),
+	HISI_PMU_EVENT_ATTR(l3c2ring, 0xbd),
+	NULL
+};
+
+static const struct attribute_group hisi_l3c_pmu_v3_events_group = {
+	.name = "events",
+	.attrs = hisi_l3c_pmu_v3_events_attr,
+};
+
 static const struct attribute_group *hisi_l3c_pmu_v1_attr_groups[] = {
 	&hisi_l3c_pmu_v1_format_group,
 	&hisi_l3c_pmu_v1_events_group,
@@ -499,16 +698,41 @@ static const struct attribute_group *hisi_l3c_pmu_v2_attr_groups[] = {
 	NULL
 };
 
+static const struct attribute_group *hisi_l3c_pmu_v3_attr_groups[] = {
+	&hisi_l3c_pmu_v3_format_group,
+	&hisi_l3c_pmu_v3_events_group,
+	&hisi_pmu_cpumask_attr_group,
+	&hisi_pmu_identifier_group,
+	NULL
+};
+
+static struct hisi_l3c_pmu_ext hisi_l3c_pmu_support_ext = {
+	.support_ext = true,
+};
+
+static struct hisi_l3c_pmu_ext hisi_l3c_pmu_not_support_ext = {
+	.support_ext = false,
+};
+
 static const struct hisi_pmu_dev_info hisi_l3c_pmu_v1 = {
 	.attr_groups = hisi_l3c_pmu_v1_attr_groups,
 	.counter_bits = 48,
 	.check_event = L3C_V1_NR_EVENTS,
+	.private = &hisi_l3c_pmu_not_support_ext,
 };
 
 static const struct hisi_pmu_dev_info hisi_l3c_pmu_v2 = {
 	.attr_groups = hisi_l3c_pmu_v2_attr_groups,
 	.counter_bits = 64,
 	.check_event = L3C_V2_NR_EVENTS,
+	.private = &hisi_l3c_pmu_not_support_ext,
+};
+
+static const struct hisi_pmu_dev_info hisi_l3c_pmu_v3 = {
+	.attr_groups = hisi_l3c_pmu_v3_attr_groups,
+	.counter_bits = 64,
+	.check_event = L3C_V2_NR_EVENTS,
+	.private = &hisi_l3c_pmu_support_ext,
 };
 
 static const struct hisi_uncore_ops hisi_uncore_l3c_ops = {
@@ -526,11 +750,14 @@ static const struct hisi_uncore_ops hisi_uncore_l3c_ops = {
 	.clear_int_status	= hisi_l3c_pmu_clear_int_status,
 	.enable_filter		= hisi_l3c_pmu_enable_filter,
 	.disable_filter		= hisi_l3c_pmu_disable_filter,
+	.check_filter		= hisi_l3c_pmu_check_filter,
 };
 
 static int hisi_l3c_pmu_dev_probe(struct platform_device *pdev,
 				  struct hisi_pmu *l3c_pmu)
 {
+	struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu);
+	struct hisi_l3c_pmu_ext *l3c_pmu_dev_ext;
 	int ret;
 
 	ret = hisi_l3c_pmu_init_data(pdev, l3c_pmu);
@@ -549,27 +776,47 @@ static int hisi_l3c_pmu_dev_probe(struct platform_device *pdev,
 	l3c_pmu->dev = &pdev->dev;
 	l3c_pmu->on_cpu = -1;
 
+	l3c_pmu_dev_ext = l3c_pmu->dev_info->private;
+	if (l3c_pmu_dev_ext->support_ext) {
+		ret = hisi_l3c_pmu_init_ext(l3c_pmu, pdev);
+		if (ret)
+			return ret;
+		/*
+		 * The extension events have their own counters, equal in
+		 * number to the normal event counters, so at maximum
+		 * num_counters * ext events can be monitored.
+		 */
+		l3c_pmu->num_counters += hisi_l3c_pmu->ext_num * L3C_NR_COUNTERS;
+	}
+
 	return 0;
 }
 
 static int hisi_l3c_pmu_probe(struct platform_device *pdev)
 {
+	struct hisi_l3c_pmu *hisi_l3c_pmu;
 	struct hisi_pmu *l3c_pmu;
 	char *name;
 	int ret;
 
-	l3c_pmu = devm_kzalloc(&pdev->dev, sizeof(*l3c_pmu), GFP_KERNEL);
-	if (!l3c_pmu)
+	hisi_l3c_pmu = devm_kzalloc(&pdev->dev, sizeof(*hisi_l3c_pmu), GFP_KERNEL);
+	if (!hisi_l3c_pmu)
 		return -ENOMEM;
 
+	l3c_pmu = &hisi_l3c_pmu->l3c_pmu;
 	platform_set_drvdata(pdev, l3c_pmu);
 
 	ret = hisi_l3c_pmu_dev_probe(pdev, l3c_pmu);
 	if (ret)
 		return ret;
 
-	name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sccl%d_l3c%d",
-			      l3c_pmu->topo.sccl_id, l3c_pmu->topo.ccl_id);
+	if (l3c_pmu->topo.sub_id >= 0)
+		name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sccl%d_l3c%d_%d",
+				      l3c_pmu->topo.sccl_id, l3c_pmu->topo.ccl_id,
+				      l3c_pmu->topo.sub_id);
+	else
+		name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sccl%d_l3c%d",
+				      l3c_pmu->topo.sccl_id, l3c_pmu->topo.ccl_id);
 	if (!name)
 		return -ENOMEM;
 
@@ -604,6 +851,7 @@ static void hisi_l3c_pmu_remove(struct platform_device *pdev)
 static const struct acpi_device_id hisi_l3c_pmu_acpi_match[] = {
 	{ "HISI0213", (kernel_ulong_t)&hisi_l3c_pmu_v1 },
 	{ "HISI0214", (kernel_ulong_t)&hisi_l3c_pmu_v2 },
+	{ "HISI0215", (kernel_ulong_t)&hisi_l3c_pmu_v3 },
 	{}
 };
 MODULE_DEVICE_TABLE(acpi, hisi_l3c_pmu_acpi_match);
@@ -618,14 +866,60 @@ static struct platform_driver hisi_l3c_pmu_driver = {
 	.remove = hisi_l3c_pmu_remove,
 };
 
+static int hisi_l3c_pmu_online_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct hisi_pmu *l3c_pmu = hlist_entry_safe(node, struct hisi_pmu, node);
+	struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu);
+	int ret, i;
+
+	ret = hisi_uncore_pmu_online_cpu(cpu, node);
+	if (ret)
+		return ret;
+
+	/* Skip ext IRQ migration for an L3C PMU that does not support ext. */
+	if (!support_ext(hisi_l3c_pmu))
+		return 0;
+
+	for (i = 0; i < hisi_l3c_pmu->ext_num; i++)
+		WARN_ON(irq_set_affinity(hisi_l3c_pmu->ext_irq[i],
+					 cpumask_of(l3c_pmu->on_cpu)));
+
+	return 0;
+}
+
+static int hisi_l3c_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct hisi_pmu *l3c_pmu = hlist_entry_safe(node, struct hisi_pmu, node);
+	struct hisi_l3c_pmu *hisi_l3c_pmu = to_hisi_l3c_pmu(l3c_pmu);
+	int ret, i;
+
+	ret = hisi_uncore_pmu_offline_cpu(cpu, node);
+	if (ret)
+		return ret;
+
+	/* If no available CPU was found, skip IRQ migration. */
+	if (l3c_pmu->on_cpu < 0)
+		return 0;
+
+	/* Skip ext IRQ migration for an L3C PMU that does not support ext. */
+	if (!support_ext(hisi_l3c_pmu))
+		return 0;
+
+	for (i = 0; i < hisi_l3c_pmu->ext_num; i++)
+		WARN_ON(irq_set_affinity(hisi_l3c_pmu->ext_irq[i],
+					 cpumask_of(l3c_pmu->on_cpu)));
+
+	return 0;
+}
+
 static int __init hisi_l3c_pmu_module_init(void)
 {
 	int ret;
 
 	ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_L3_ONLINE,
 				      "AP_PERF_ARM_HISI_L3_ONLINE",
-				      hisi_uncore_pmu_online_cpu,
-				      hisi_uncore_pmu_offline_cpu);
+				      hisi_l3c_pmu_online_cpu,
+				      hisi_l3c_pmu_offline_cpu);
 	if (ret) {
 		pr_err("L3C PMU: Error setup hotplug, ret = %d\n", ret);
 		return ret;
diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.h b/drivers/perf/hisilicon/hisi_uncore_pmu.h
index 8649be6f716a..3ffe6acda653 100644
--- a/drivers/perf/hisilicon/hisi_uncore_pmu.h
+++ b/drivers/perf/hisilicon/hisi_uncore_pmu.h
@@ -24,7 +24,7 @@
 #define pr_fmt(fmt)     "hisi_pmu: " fmt
 
 #define HISI_PMU_V2		0x30
-#define HISI_MAX_COUNTERS	0x10
+#define HISI_MAX_COUNTERS	0x18
 #define to_hisi_pmu(p)	(container_of(p, struct hisi_pmu, pmu))
 
 #define HISI_PMU_ATTR(_name, _func, _config)			\

From 272dd0e5e58d9c216771aa4a9dc1e36a662792da Mon Sep 17 00:00:00 2001
From: Yushan Wang
Date: Fri, 29 Aug 2025 18:14:26 +0800
Subject: [PATCH 88/93] Documentation: hisi-pmu: Fix of minor format error

The inline sysfs path should be placed in a literal block to make the
documentation look better.

Acked-by: Jonathan Cameron
Acked-by: Yicong Yang
Signed-off-by: Yushan Wang
Signed-off-by: Will Deacon
---
 Documentation/admin-guide/perf/hisi-pmu.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/perf/hisi-pmu.rst b/Documentation/admin-guide/perf/hisi-pmu.rst
index 6f0ea4f641cc..8df048c26498 100644
--- a/Documentation/admin-guide/perf/hisi-pmu.rst
+++ b/Documentation/admin-guide/perf/hisi-pmu.rst
@@ -18,9 +18,10 @@ HiSilicon SoC uncore PMU driver
 Each device PMU has separate registers for event counting, control and
 interrupt, and the PMU driver shall register perf PMU drivers like L3C,
 HHA and DDRC etc. The available events and configuration options shall
-be described in the sysfs, see:
+be described in the sysfs, see::
+
+/sys/bus/event_source/devices/hisi_sccl{X}_
 
-/sys/bus/event_source/devices/hisi_sccl{X}_.
 The "perf list" command shall list the available events from sysfs.
 
 Each L3C, HHA and DDRC is registered as a separate PMU with perf. The PMU

From 6d2f913fda5683fbd4c3580262e10386c1263dfb Mon Sep 17 00:00:00 2001
From: Yushan Wang
Date: Fri, 29 Aug 2025 18:14:27 +0800
Subject: [PATCH 89/93] Documentation: hisi-pmu: Add introduction to HiSilicon V3 PMU

Some HiSilicon V3 PMU hardware is divided into parts, each monitoring a
specific part of a device. Add a description of that, as well as of the
newly added ext option for the L3C PMU.

Acked-by: Jonathan Cameron
Signed-off-by: Yushan Wang
Reviewed-by: Yicong Yang
Signed-off-by: Will Deacon
---
 Documentation/admin-guide/perf/hisi-pmu.rst | 33 +++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/Documentation/admin-guide/perf/hisi-pmu.rst b/Documentation/admin-guide/perf/hisi-pmu.rst
index 8df048c26498..c4c2cbbf88cb 100644
--- a/Documentation/admin-guide/perf/hisi-pmu.rst
+++ b/Documentation/admin-guide/perf/hisi-pmu.rst
@@ -124,6 +124,39 @@ channel with this option. The current supported channels are as follows:
 7. tt_en: NoC PMU supports counting only transactions that have tracetag set
 if this option is set. See the 2nd list for more information about tracetag.
 
+For HiSilicon uncore PMU v3, whose identifier is 0x40, some uncore PMUs are
+further divided into parts for finer-grained tracing. Each part has its own
+dedicated PMU, and all such PMUs together cover the monitoring of events on a
+particular uncore device. Such PMUs are described in sysfs with a slightly
+changed name format::
+
+/sys/bus/event_source/devices/hisi_sccl{X}_
+
+Z is the sub-id, indicating different PMUs for parts of the hardware device.
+
+Usage of most PMUs with different sub-ids is identical. In particular, the L3C
+PMU provides an ``ext`` option to allow exploration of even finer-grained L3C
+PMU statistics. The L3C PMU driver uses it as a hint of which part of the L3C
+the perf command should be delivered to:
+
+- ext=0: Default; can be used with event names.
+- ext=1 and ext=2: Must be used with event codes; event names are not supported.
+
+An example perf command could be::
+
+  $# perf stat -a -e hisi_sccl0_l3c1_0/rd_spipe/ sleep 5
+
+or::
+
+  $# perf stat -a -e hisi_sccl0_l3c1_0/event=0x1,ext=1/ sleep 5
+
+As above, ``hisi_sccl0_l3c1_0`` locates the PMU of Super CPU CLuster 0, L3
+cache 1, pipe 0.
+
+The first command counts on the first part of the L3C since ``ext=0`` is
+implied by default. The second command counts on another part of the L3C with
+event ``0x1``.
+
 Users could configure IDs to count data come from specific CCL/ICL, by
 setting srcid_cmd & srcid_msk, and data destined for specific CCL/ICL by
 setting tgtid_cmd & tgtid_msk. A set bit in srcid_msk/tgtid_msk means the PMU will not

From da9e5c04be589524101aac31746902b6803581e4 Mon Sep 17 00:00:00 2001
From: Can Peng
Date: Fri, 19 Sep 2025 18:00:42 +0800
Subject: [PATCH 90/93] arm/syscalls: mark syscall invocation as likely in invoke_syscall

The invoke_syscall() function is overwhelmingly called for valid system
call entries. Annotate the main path with likely() to help the compiler
generate better branch prediction hints, reducing CPU pipeline stalls
due to mispredictions. This is a micro-optimization targeting
syscall-heavy workloads [1].

Link: https://lore.kernel.org/r/20250922121730.986761-1-pengcan@kylinos.cn [1]
Signed-off-by: Can Peng
Signed-off-by: Will Deacon
---
 arch/arm64/kernel/syscall.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c
index c442fcec6b9e..aba7ca6bca2d 100644
--- a/arch/arm64/kernel/syscall.c
+++ b/arch/arm64/kernel/syscall.c
@@ -43,7 +43,7 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno,
 
 	add_random_kstack_offset();
 
-	if (scno < sc_nr) {
+	if (likely(scno < sc_nr)) {
 		syscall_fn_t syscall_fn;
 		syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)];
 		ret = __invoke_syscall(regs, syscall_fn);

From 2084660ad288c998b6f0c885e266deb364f65fba Mon Sep 17 00:00:00 2001
From: Ilkka Koskinen
Date: Tue, 23 Sep 2025 14:31:36 -0700
Subject: [PATCH 91/93] perf/dwc_pcie: Fix use of uninitialized variable

Fix use of uninitialized variable in group validation code.
Fixes: 71396cfac97d ("perf/dwc_pcie: Support counting multiple lane events in parallel")
Reported-by: kernel test robot
Reported-by: Dan Carpenter
Closes: https://lore.kernel.org/r/202509231223.gZsX6Eio-lkp@intel.com/
Signed-off-by: Ilkka Koskinen
Signed-off-by: Will Deacon
---
 drivers/perf/dwc_pcie_pmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c
index d77f767cde89..22f73ac894e9 100644
--- a/drivers/perf/dwc_pcie_pmu.c
+++ b/drivers/perf/dwc_pcie_pmu.c
@@ -402,7 +402,7 @@ static int dwc_pcie_pmu_validate_group(struct perf_event *event)
 {
 	struct perf_event *sibling, *leader = event->group_leader;
 	DECLARE_BITMAP(val_lane_events, 2 * DWC_PCIE_LANE_MAX_EVENTS_PER_GROUP);
-	bool time_event;
+	bool time_event = false;
 	int type;
 
 	type = DWC_PCIE_EVENT_TYPE(leader);

From 1cf89b6bf660c2e9fa137b3e160c7b1001937a78 Mon Sep 17 00:00:00 2001
From: Will Deacon
Date: Fri, 19 Sep 2025 19:40:25 +0100
Subject: [PATCH 92/93] arm64: Kconfig: Make CPU_BIG_ENDIAN depend on BROKEN

Big-endian arm64 configurations are vanishingly rare, yet we still claim
to support them in Linux despite very limited testing or visible
interest. Supporting big-endian adds unnecessary burden to reviewers and
contributors which, without any known active users, is hard to justify.
For example, recent work to improve our futex routines and to implement
nested virtualisation support is non-trivially complicated by having to
support both big- and little-endianness.

Back in 2019 [1], it was claimed that Huawei were using arm64 big-endian
machines in their telecommunication products but I don't know whether
that's still the case and certainly haven't seen any patch contributions
to help support or maintain it.

Make CPU_BIG_ENDIAN depend on BROKEN as an initial deprecation step
towards its removal.

Cc: Catalin Marinas
Cc: Marc Zyngier
Cc: Ard Biesheuvel
Cc: Arnd Bergmann
Cc: Hanjun Guo
Cc: Jonathan Cameron
Cc: Guenter Roeck
Link: https://lore.kernel.org/linux-arm-kernel/73701e9f-bee1-7ae8-2277-7a3576171cd4@huawei.com/ [1]
Acked-by: Catalin Marinas
Acked-by: Marc Zyngier
Acked-by: Ard Biesheuvel
Signed-off-by: Will Deacon
---
 arch/arm64/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 514038b18eba..0633520a85f2 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1493,7 +1493,7 @@ choice
 config CPU_BIG_ENDIAN
 	bool "Build big-endian kernel"
 	# https://github.com/llvm/llvm-project/commit/1379b150991f70a5782e9a143c2ba5308da1161c
-	depends on AS_IS_GNU || AS_VERSION >= 150000
+	depends on (AS_IS_GNU || AS_VERSION >= 150000) && BROKEN
 	help
 	  Say Y if you plan on running a kernel with a big-endian userspace.

From ea0b39168d3a2313eabd145fb3440c946ccff4d1 Mon Sep 17 00:00:00 2001
From: Jiapeng Chong
Date: Tue, 23 Sep 2025 11:50:23 +0800
Subject: [PATCH 93/93] arm64: cpufeature: Remove duplicate asm/mmu.h header

./arch/arm64/kernel/cpufeature.c: asm/mmu.h is included more than once.

Reported-by: Abaci Robot
Signed-off-by: Jiapeng Chong
Signed-off-by: Will Deacon
---
 arch/arm64/kernel/cpufeature.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 5dabb349986c..10362b296f2d 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -86,7 +86,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 