Merge branches 'for-next/misc', 'for-next/tlbflush', 'for-next/ttbr-macros-cleanup', 'for-next/kselftest', 'for-next/feat_lsui', 'for-next/mpam', 'for-next/hotplug-batched-tlbi', 'for-next/bbml2-fixes', 'for-next/sysreg', 'for-next/generic-entry' and 'for-next/acpi', remote-tracking branches 'arm64/for-next/perf' and 'arm64/for-next/read-once' into for-next/core

* arm64/for-next/perf:
  : Perf updates
  perf/arm-cmn: Fix resource_size_t printk specifier in arm_cmn_init_dtc()
  perf/arm-cmn: Fix incorrect error check for devm_ioremap()
  perf: add NVIDIA Tegra410 C2C PMU
  perf: add NVIDIA Tegra410 CPU Memory Latency PMU
  perf/arm_cspmu: nvidia: Add Tegra410 PCIE-TGT PMU
  perf/arm_cspmu: nvidia: Add Tegra410 PCIE PMU
  perf/arm_cspmu: Add arm_cspmu_acpi_dev_get
  perf/arm_cspmu: nvidia: Add Tegra410 UCF PMU
  perf/arm_cspmu: nvidia: Rename doc to Tegra241
  perf/arm-cmn: Stop claiming entire iomem region
  arm64: cpufeature: Use pmuv3_implemented() function
  arm64: cpufeature: Make PMUVer and PerfMon unsigned
  KVM: arm64: Read PMUVer as unsigned

* arm64/for-next/read-once:
  : Fixes for __READ_ONCE() with CONFIG_LTO=y
  arm64, compiler-context-analysis: Permit alias analysis through __READ_ONCE() with CONFIG_LTO=y
  arm64: Optimize __READ_ONCE() with CONFIG_LTO=y

* for-next/misc:
  : Miscellaneous cleanups/fixes
  arm64: rsi: use linear-map alias for realm config buffer
  arm64: Kconfig: fix duplicate word in CMDLINE help text
  arm64: mte: Skip TFSR_EL1 checks and barriers in synchronous tag check mode
  arm64/hwcap: Generate the KERNEL_HWCAP_ definitions for the hwcaps
  arm64: kexec: Remove duplicate allocation for trans_pgd
  arm64: mm: Use generic enum pgtable_level
  arm64: scs: Remove redundant save/restore of SCS SP on entry to/from EL0
  arm64: remove ARCH_INLINE_*

* for-next/tlbflush:
  : Refactor the arm64 TLB invalidation API and implementation
  arm64: mm: __ptep_set_access_flags must hint correct TTL
  arm64: mm: Provide level hint for flush_tlb_page()
  arm64: mm: Wrap flush_tlb_page() around __do_flush_tlb_range()
  arm64: mm: More flags for __flush_tlb_range()
  arm64: mm: Refactor __flush_tlb_range() to take flags
  arm64: mm: Refactor flush_tlb_page() to use __tlbi_level_asid()
  arm64: mm: Simplify __flush_tlb_range_limit_excess()
  arm64: mm: Simplify __TLBI_RANGE_NUM() macro
  arm64: mm: Re-implement the __flush_tlb_range_op macro in C
  arm64: mm: Inline __TLBI_VADDR_RANGE() into __tlbi_range()
  arm64: mm: Push __TLBI_VADDR() into __tlbi_level()
  arm64: mm: Implicitly invalidate user ASID based on TLBI operation
  arm64: mm: Introduce a C wrapper for by-range TLB invalidation
  arm64: mm: Re-implement the __tlbi_level macro as a C function

* for-next/ttbr-macros-cleanup:
  : Cleanups of the TTBR1_* macros
  arm64/mm: Directly use TTBRx_EL1_CnP
  arm64/mm: Directly use TTBRx_EL1_ASID_MASK
  arm64/mm: Describe TTBR1_BADDR_4852_OFFSET

* for-next/kselftest:
  : arm64 kselftest updates
  selftests/arm64: Implement cmpbr_sigill() to hwcap test

* for-next/feat_lsui:
  : Futex support using FEAT_LSUI instructions to avoid toggling PAN
  arm64: armv8_deprecated: Disable swp emulation when FEAT_LSUI present
  arm64: Kconfig: Add support for LSUI
  KVM: arm64: Use CAST instruction for swapping guest descriptor
  arm64: futex: Support futex with FEAT_LSUI
  arm64: futex: Refactor futex atomic operation
  KVM: arm64: kselftest: set_id_regs: Add test for FEAT_LSUI
  KVM: arm64: Expose FEAT_LSUI to guests
  arm64: cpufeature: Add FEAT_LSUI

* for-next/mpam: (40 commits)
  : Expose MPAM to user-space via resctrl:
  :  - Add architecture context-switch and hiding of the feature from KVM.
  :  - Add interface to allow MPAM to be exposed to user-space using resctrl.
  :  - Add errata workaround for some existing platforms.
  :  - Add documentation for using MPAM and what shape of platforms can use resctrl
  arm64: mpam: Add initial MPAM documentation
  arm_mpam: Quirk CMN-650's CSU NRDY behaviour
  arm_mpam: Add workaround for T241-MPAM-6
  arm_mpam: Add workaround for T241-MPAM-4
  arm_mpam: Add workaround for T241-MPAM-1
  arm_mpam: Add quirk framework
  arm_mpam: resctrl: Call resctrl_init() on platforms that can support resctrl
  arm64: mpam: Select ARCH_HAS_CPU_RESCTRL
  arm_mpam: resctrl: Add empty definitions for assorted resctrl functions
  arm_mpam: resctrl: Update the rmid reallocation limit
  arm_mpam: resctrl: Add resctrl_arch_rmid_read()
  arm_mpam: resctrl: Allow resctrl to allocate monitors
  arm_mpam: resctrl: Add support for csu counters
  arm_mpam: resctrl: Add monitor initialisation and domain boilerplate
  arm_mpam: resctrl: Add kunit test for control format conversions
  arm_mpam: resctrl: Add support for 'MB' resource
  arm_mpam: resctrl: Wait for cacheinfo to be ready
  arm_mpam: resctrl: Add rmid index helpers
  arm_mpam: resctrl: Convert to/from MPAMs fixed-point formats
  arm_mpam: resctrl: Hide CDP emulation behind CONFIG_EXPERT
  ...

* for-next/hotplug-batched-tlbi:
  : arm64/mm: Enable batched TLB flush in unmap_hotplug_range()
  arm64/mm: Reject memory removal that splits a kernel leaf mapping
  arm64/mm: Enable batched TLB flush in unmap_hotplug_range()

* for-next/bbml2-fixes:
  : Fixes for realm guest and BBML2_NOABORT
  arm64: mm: Remove pmd_sect() and pud_sect()
  arm64: mm: Handle invalid large leaf mappings correctly
  arm64: mm: Fix rodata=full block mapping support for realm guests

* for-next/sysreg:
  : arm64 sysreg updates
  arm64/sysreg: Update ID_AA64SMFR0_EL1 description to DDI0601 2025-12
  arm64/sysreg: Update ID_AA64ZFR0_EL1 description to DDI0601 2025-12
  arm64/sysreg: Update ID_AA64FPFR0_EL1 description to DDI0601 2025-12
  arm64/sysreg: Update ID_AA64ISAR2_EL1 description to DDI0601 2025-12
  arm64/sysreg: Update ID_AA64ISAR0_EL1 description to DDI0601 2025-12
  arm64/sysreg: Update SMIDR_EL1 to DDI0601 2025-06

* for-next/generic-entry:
  : More arm64 refactoring towards using the generic entry code
  arm64: Check DAIF (and PMR) at task-switch time
  arm64: entry: Use split preemption logic
  arm64: entry: Use irqentry_{enter_from,exit_to}_kernel_mode()
  arm64: entry: Consistently prefix arm64-specific wrappers
  arm64: entry: Don't preempt with SError or Debug masked
  entry: Split preemption from irqentry_exit_to_kernel_mode()
  entry: Split kernel mode logic from irqentry_{enter,exit}()
  entry: Move irqentry_enter() prototype later
  entry: Remove local_irq_{enable,disable}_exit_to_user()
  entry: Fix stale comment for irqentry_enter()

* for-next/acpi:
  : arm64 ACPI updates
  ACPI: AGDI: fix missing newline in error message
This commit is contained in:
70 changed files with 4094 additions and 881 deletions

View File

@@ -23,6 +23,7 @@ ARM64 Architecture
memory
memory-tagging-extension
mops
mpam
perf
pointer-authentication
ptdump

View File

@@ -0,0 +1,72 @@
.. SPDX-License-Identifier: GPL-2.0
====
MPAM
====
What is MPAM
============
MPAM (Memory Partitioning and Monitoring) is a feature in the CPUs and memory
system components such as the caches or memory controllers that allow memory
traffic to be labelled, partitioned and monitored.
Traffic is labelled by the CPU, based on the control or monitor group the
current task is assigned to using resctrl. Partitioning policy can be set
using the schemata file in resctrl, and monitor values read via resctrl.
See Documentation/filesystems/resctrl.rst for more details.
This allows tasks that share memory system resources, such as caches, to be
isolated from each other according to the partitioning policy (so called noisy
neighbours).
Supported Platforms
===================
Use of this feature requires CPU support, support in the memory system
components, and a description from firmware of where the MPAM device controls
are in the MMIO address space. (e.g. the 'MPAM' ACPI table).
The MMIO device that provides MPAM controls/monitors for a memory system
component is called a memory system component (MSC).
Because the user interface to MPAM is via resctrl, only MPAM features that are
compatible with resctrl can be exposed to user-space.
MSC are considered as a group based on the topology. MSC that correspond with
the L3 cache are considered together; it is not possible to mix MSC between L2
and L3 to 'cover' a resctrl schema.
The supported features are:
* Cache portion bitmap controls (CPOR) on the L2 or L3 caches. To expose
CPOR at L2 or L3, every CPU must have a corresponding CPU cache at this
level that also supports the feature. Mismatched big/little platforms are
not supported as resctrl's controls would then also depend on task
placement.
* Memory bandwidth maximum controls (MBW_MAX) on or after the L3 cache.
resctrl uses the L3 cache-id to identify where the memory bandwidth
control is applied. For this reason the platform must have an L3 cache
with cache-id's supplied by firmware. (It doesn't need to support MPAM.)
To be exported as the 'MB' schema, the topology of the group of MSC chosen
must match the topology of the L3 cache so that the cache-id's can be
repainted. For example: Platforms with Memory bandwidth maximum controls
on CPU-less NUMA nodes cannot expose the 'MB' schema to resctrl as these
nodes do not have a corresponding L3 cache. If the memory bandwidth
control is on the memory rather than the L3 then there must be a single
global L3 as otherwise it is unknown which L3 the traffic came from. There
must be no caches between the L3 and the memory so that the two ends of
the path have equivalent traffic.
When the MPAM driver finds multiple groups of MSC it can use for the 'MB'
schema, it prefers the group closest to the L3 cache.
* Cache Storage Usage (CSU) counters can expose the 'llc_occupancy' provided
there is at least one CSU monitor on each MSC that makes up the L3 group.
Exposing CSU counters from other caches or devices is not supported.
Reporting Bugs
==============
If you are not seeing the counters or controls you expect, please share the
debug messages produced when enabling dynamic debug and booting with:
dyndbg="file mpam_resctrl.c +pl"

View File

@@ -214,6 +214,9 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| ARM | SI L1 | #4311569 | ARM64_ERRATUM_4311569 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | CMN-650 | #3642720 | N/A |
+----------------+-----------------+-----------------+-----------------------------+
+----------------+-----------------+-----------------+-----------------------------+
| Broadcom | Brahma-B53 | N/A | ARM64_ERRATUM_845719 |
+----------------+-----------------+-----------------+-----------------------------+
| Broadcom | Brahma-B53 | N/A | ARM64_ERRATUM_843419 |
@@ -247,6 +250,12 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| NVIDIA | T241 GICv3/4.x | T241-FABRIC-4 | N/A |
+----------------+-----------------+-----------------+-----------------------------+
| NVIDIA | T241 MPAM | T241-MPAM-1 | N/A |
+----------------+-----------------+-----------------+-----------------------------+
| NVIDIA | T241 MPAM | T241-MPAM-4 | N/A |
+----------------+-----------------+-----------------+-----------------------------+
| NVIDIA | T241 MPAM | T241-MPAM-6 | N/A |
+----------------+-----------------+-----------------+-----------------------------+
+----------------+-----------------+-----------------+-----------------------------+
| Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 |
+----------------+-----------------+-----------------+-----------------------------+

View File

@@ -61,32 +61,6 @@ config ARM64
select ARCH_HAVE_ELF_PROT
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_HAVE_TRACE_MMIO_ACCESS
select ARCH_INLINE_READ_LOCK if !PREEMPTION
select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION
select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPTION
select ARCH_INLINE_READ_LOCK_IRQSAVE if !PREEMPTION
select ARCH_INLINE_READ_UNLOCK if !PREEMPTION
select ARCH_INLINE_READ_UNLOCK_BH if !PREEMPTION
select ARCH_INLINE_READ_UNLOCK_IRQ if !PREEMPTION
select ARCH_INLINE_READ_UNLOCK_IRQRESTORE if !PREEMPTION
select ARCH_INLINE_WRITE_LOCK if !PREEMPTION
select ARCH_INLINE_WRITE_LOCK_BH if !PREEMPTION
select ARCH_INLINE_WRITE_LOCK_IRQ if !PREEMPTION
select ARCH_INLINE_WRITE_LOCK_IRQSAVE if !PREEMPTION
select ARCH_INLINE_WRITE_UNLOCK if !PREEMPTION
select ARCH_INLINE_WRITE_UNLOCK_BH if !PREEMPTION
select ARCH_INLINE_WRITE_UNLOCK_IRQ if !PREEMPTION
select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE if !PREEMPTION
select ARCH_INLINE_SPIN_TRYLOCK if !PREEMPTION
select ARCH_INLINE_SPIN_TRYLOCK_BH if !PREEMPTION
select ARCH_INLINE_SPIN_LOCK if !PREEMPTION
select ARCH_INLINE_SPIN_LOCK_BH if !PREEMPTION
select ARCH_INLINE_SPIN_LOCK_IRQ if !PREEMPTION
select ARCH_INLINE_SPIN_LOCK_IRQSAVE if !PREEMPTION
select ARCH_INLINE_SPIN_UNLOCK if !PREEMPTION
select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPTION
select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION
select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION
select ARCH_KEEP_MEMBLOCK
select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
select ARCH_USE_CMPXCHG_LOCKREF
@@ -2016,8 +1990,8 @@ config ARM64_TLB_RANGE
config ARM64_MPAM
bool "Enable support for MPAM"
select ARM64_MPAM_DRIVER if EXPERT # does nothing yet
select ACPI_MPAM if ACPI
select ARM64_MPAM_DRIVER
select ARCH_HAS_CPU_RESCTRL
help
Memory System Resource Partitioning and Monitoring (MPAM) is an
optional extension to the Arm architecture that allows each
@@ -2039,6 +2013,8 @@ config ARM64_MPAM
MPAM is exposed to user-space via the resctrl pseudo filesystem.
This option enables the extra context switch code.
endmenu # "ARMv8.4 architectural features"
menu "ARMv8.5 architectural features"
@@ -2215,6 +2191,26 @@ config ARM64_GCS
endmenu # "ARMv9.4 architectural features"
config AS_HAS_LSUI
def_bool $(as-instr,.arch_extension lsui)
help
Supported by LLVM 20+ and binutils 2.45+.
menu "ARMv9.6 architectural features"
config ARM64_LSUI
bool "Support Unprivileged Load Store Instructions (LSUI)"
default y
depends on AS_HAS_LSUI && !CPU_BIG_ENDIAN
help
The Unprivileged Load Store Instructions (LSUI) provides
variants load/store instructions that access user-space memory
from the kernel without clearing PSTATE.PAN bit.
This feature is supported by LLVM 20+ and binutils 2.45+.
endmenu # "ARMv9.6 architectural feature"
config ARM64_SVE
bool "ARM Scalable Vector Extension support"
default y
@@ -2372,7 +2368,7 @@ config CMDLINE
default ""
help
Provide a set of default command-line options at build time by
entering them here. As a minimum, you should specify the the
entering them here. As a minimum, you should specify the
root device (e.g. root=/dev/nfs).
choice

View File

@@ -15,7 +15,7 @@
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
.macro __uaccess_ttbr0_disable, tmp1
mrs \tmp1, ttbr1_el1 // swapper_pg_dir
bic \tmp1, \tmp1, #TTBR_ASID_MASK
bic \tmp1, \tmp1, #TTBRx_EL1_ASID_MASK
sub \tmp1, \tmp1, #RESERVED_SWAPPER_OFFSET // reserved_pg_dir
msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1
add \tmp1, \tmp1, #RESERVED_SWAPPER_OFFSET

View File

@@ -71,6 +71,8 @@ cpucap_is_possible(const unsigned int cap)
return true;
case ARM64_HAS_PMUV3:
return IS_ENABLED(CONFIG_HW_PERF_EVENTS);
case ARM64_HAS_LSUI:
return IS_ENABLED(CONFIG_ARM64_LSUI);
}
return true;

View File

@@ -513,7 +513,8 @@
check_override id_aa64pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT, .Linit_mpam_\@, .Lskip_mpam_\@, x1, x2
.Linit_mpam_\@:
msr_s SYS_MPAM2_EL2, xzr // use the default partition
mov x0, #MPAM2_EL2_EnMPAMSM_MASK
msr_s SYS_MPAM2_EL2, x0 // use the default partition,
// and disable lower traps
mrs_s x0, SYS_MPAMIDR_EL1
tbz x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@ // skip if no MPAMHCR reg

View File

@@ -9,71 +9,292 @@
#include <linux/uaccess.h>
#include <asm/errno.h>
#include <asm/lsui.h>
#define FUTEX_MAX_LOOPS 128 /* What's the largest number you can think of? */
#define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \
do { \
#define LLSC_FUTEX_ATOMIC_OP(op, insn) \
static __always_inline int \
__llsc_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval) \
{ \
unsigned int loops = FUTEX_MAX_LOOPS; \
int ret, oldval, newval; \
\
uaccess_enable_privileged(); \
asm volatile( \
" prfm pstl1strm, %2\n" \
"1: ldxr %w1, %2\n" \
asm volatile("// __llsc_futex_atomic_" #op "\n" \
" prfm pstl1strm, %[uaddr]\n" \
"1: ldxr %w[oldval], %[uaddr]\n" \
insn "\n" \
"2: stlxr %w0, %w3, %2\n" \
" cbz %w0, 3f\n" \
" sub %w4, %w4, %w0\n" \
" cbnz %w4, 1b\n" \
" mov %w0, %w6\n" \
"2: stlxr %w[ret], %w[newval], %[uaddr]\n" \
" cbz %w[ret], 3f\n" \
" sub %w[loops], %w[loops], %w[ret]\n" \
" cbnz %w[loops], 1b\n" \
" mov %w[ret], %w[err]\n" \
"3:\n" \
" dmb ish\n" \
_ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0) \
_ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w0) \
: "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp), \
"+r" (loops) \
: "r" (oparg), "Ir" (-EAGAIN) \
_ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret]) \
_ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret]) \
: [ret] "=&r" (ret), [oldval] "=&r" (oldval), \
[uaddr] "+Q" (*uaddr), [newval] "=&r" (newval), \
[loops] "+r" (loops) \
: [oparg] "r" (oparg), [err] "Ir" (-EAGAIN) \
: "memory"); \
uaccess_disable_privileged(); \
} while (0)
\
if (!ret) \
*oval = oldval; \
\
return ret; \
}
LLSC_FUTEX_ATOMIC_OP(add, "add %w[newval], %w[oldval], %w[oparg]")
LLSC_FUTEX_ATOMIC_OP(or, "orr %w[newval], %w[oldval], %w[oparg]")
LLSC_FUTEX_ATOMIC_OP(and, "and %w[newval], %w[oldval], %w[oparg]")
LLSC_FUTEX_ATOMIC_OP(eor, "eor %w[newval], %w[oldval], %w[oparg]")
LLSC_FUTEX_ATOMIC_OP(set, "mov %w[newval], %w[oparg]")
static __always_inline int
__llsc_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
{
int ret = 0;
unsigned int loops = FUTEX_MAX_LOOPS;
u32 val, tmp;
uaccess_enable_privileged();
asm volatile("//__llsc_futex_cmpxchg\n"
" prfm pstl1strm, %[uaddr]\n"
"1: ldxr %w[curval], %[uaddr]\n"
" eor %w[tmp], %w[curval], %w[oldval]\n"
" cbnz %w[tmp], 4f\n"
"2: stlxr %w[tmp], %w[newval], %[uaddr]\n"
" cbz %w[tmp], 3f\n"
" sub %w[loops], %w[loops], %w[tmp]\n"
" cbnz %w[loops], 1b\n"
" mov %w[ret], %w[err]\n"
"3:\n"
" dmb ish\n"
"4:\n"
_ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w[ret])
_ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w[ret])
: [ret] "+r" (ret), [curval] "=&r" (val),
[uaddr] "+Q" (*uaddr), [tmp] "=&r" (tmp),
[loops] "+r" (loops)
: [oldval] "r" (oldval), [newval] "r" (newval),
[err] "Ir" (-EAGAIN)
: "memory");
uaccess_disable_privileged();
if (!ret)
*oval = val;
return ret;
}
#ifdef CONFIG_ARM64_LSUI
/*
* Wrap LSUI instructions with uaccess_ttbr0_enable()/disable(), as
* PAN toggling is not required.
*/
#define LSUI_FUTEX_ATOMIC_OP(op, asm_op) \
static __always_inline int \
__lsui_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval) \
{ \
int ret = 0; \
int oldval; \
\
uaccess_ttbr0_enable(); \
\
asm volatile("// __lsui_futex_atomic_" #op "\n" \
__LSUI_PREAMBLE \
"1: " #asm_op "al %w[oparg], %w[oldval], %[uaddr]\n" \
"2:\n" \
_ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret]) \
: [ret] "+r" (ret), [uaddr] "+Q" (*uaddr), \
[oldval] "=r" (oldval) \
: [oparg] "r" (oparg) \
: "memory"); \
\
uaccess_ttbr0_disable(); \
\
if (!ret) \
*oval = oldval; \
return ret; \
}
LSUI_FUTEX_ATOMIC_OP(add, ldtadd)
LSUI_FUTEX_ATOMIC_OP(or, ldtset)
LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr)
LSUI_FUTEX_ATOMIC_OP(set, swpt)
static __always_inline int
__lsui_cmpxchg64(u64 __user *uaddr, u64 *oldval, u64 newval)
{
int ret = 0;
uaccess_ttbr0_enable();
asm volatile("// __lsui_cmpxchg64\n"
__LSUI_PREAMBLE
"1: casalt %[oldval], %[newval], %[uaddr]\n"
"2:\n"
_ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret])
: [ret] "+r" (ret), [uaddr] "+Q" (*uaddr),
[oldval] "+r" (*oldval)
: [newval] "r" (newval)
: "memory");
uaccess_ttbr0_disable();
return ret;
}
static __always_inline int
__lsui_cmpxchg32(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
{
u64 __user *uaddr64;
bool futex_pos, other_pos;
u32 other, orig_other;
union {
u32 futex[2];
u64 raw;
} oval64, orig64, nval64;
uaddr64 = (u64 __user *)PTR_ALIGN_DOWN(uaddr, sizeof(u64));
futex_pos = !IS_ALIGNED((unsigned long)uaddr, sizeof(u64));
other_pos = !futex_pos;
oval64.futex[futex_pos] = oldval;
if (get_user(oval64.futex[other_pos], (u32 __user *)uaddr64 + other_pos))
return -EFAULT;
orig64.raw = oval64.raw;
nval64.futex[futex_pos] = newval;
nval64.futex[other_pos] = oval64.futex[other_pos];
if (__lsui_cmpxchg64(uaddr64, &oval64.raw, nval64.raw))
return -EFAULT;
oldval = oval64.futex[futex_pos];
other = oval64.futex[other_pos];
orig_other = orig64.futex[other_pos];
if (other != orig_other)
return -EAGAIN;
*oval = oldval;
return 0;
}
static __always_inline int
__lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval)
{
/*
* Undo the bitwise negation applied to the oparg passed from
* arch_futex_atomic_op_inuser() with FUTEX_OP_ANDN.
*/
return __lsui_futex_atomic_andnot(~oparg, uaddr, oval);
}
static __always_inline int
__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval)
{
u32 oldval, newval, val;
int ret, i;
if (get_user(oldval, uaddr))
return -EFAULT;
/*
* there are no ldteor/stteor instructions...
*/
for (i = 0; i < FUTEX_MAX_LOOPS; i++) {
newval = oldval ^ oparg;
ret = __lsui_cmpxchg32(uaddr, oldval, newval, &val);
switch (ret) {
case -EFAULT:
return ret;
case -EAGAIN:
continue;
}
if (val == oldval) {
*oval = val;
return 0;
}
oldval = val;
}
return -EAGAIN;
}
static __always_inline int
__lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
{
/*
* Callers of futex_atomic_cmpxchg_inatomic() already retry on
* -EAGAIN, no need for another loop of max retries.
*/
return __lsui_cmpxchg32(uaddr, oldval, newval, oval);
}
#endif /* CONFIG_ARM64_LSUI */
#define FUTEX_ATOMIC_OP(op) \
static __always_inline int \
__futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval) \
{ \
return __lsui_llsc_body(futex_atomic_##op, oparg, uaddr, oval); \
}
FUTEX_ATOMIC_OP(add)
FUTEX_ATOMIC_OP(or)
FUTEX_ATOMIC_OP(and)
FUTEX_ATOMIC_OP(eor)
FUTEX_ATOMIC_OP(set)
static __always_inline int
__futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
{
return __lsui_llsc_body(futex_cmpxchg, uaddr, oldval, newval, oval);
}
static inline int
arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *_uaddr)
{
int oldval = 0, ret, tmp;
u32 __user *uaddr = __uaccess_mask_ptr(_uaddr);
int ret;
u32 __user *uaddr;
if (!access_ok(_uaddr, sizeof(u32)))
return -EFAULT;
uaddr = __uaccess_mask_ptr(_uaddr);
switch (op) {
case FUTEX_OP_SET:
__futex_atomic_op("mov %w3, %w5",
ret, oldval, uaddr, tmp, oparg);
ret = __futex_atomic_set(oparg, uaddr, oval);
break;
case FUTEX_OP_ADD:
__futex_atomic_op("add %w3, %w1, %w5",
ret, oldval, uaddr, tmp, oparg);
ret = __futex_atomic_add(oparg, uaddr, oval);
break;
case FUTEX_OP_OR:
__futex_atomic_op("orr %w3, %w1, %w5",
ret, oldval, uaddr, tmp, oparg);
ret = __futex_atomic_or(oparg, uaddr, oval);
break;
case FUTEX_OP_ANDN:
__futex_atomic_op("and %w3, %w1, %w5",
ret, oldval, uaddr, tmp, ~oparg);
ret = __futex_atomic_and(~oparg, uaddr, oval);
break;
case FUTEX_OP_XOR:
__futex_atomic_op("eor %w3, %w1, %w5",
ret, oldval, uaddr, tmp, oparg);
ret = __futex_atomic_eor(oparg, uaddr, oval);
break;
default:
ret = -ENOSYS;
}
if (!ret)
*oval = oldval;
return ret;
}
@@ -81,40 +302,14 @@ static inline int
futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr,
u32 oldval, u32 newval)
{
int ret = 0;
unsigned int loops = FUTEX_MAX_LOOPS;
u32 val, tmp;
u32 __user *uaddr;
if (!access_ok(_uaddr, sizeof(u32)))
return -EFAULT;
uaddr = __uaccess_mask_ptr(_uaddr);
uaccess_enable_privileged();
asm volatile("// futex_atomic_cmpxchg_inatomic\n"
" prfm pstl1strm, %2\n"
"1: ldxr %w1, %2\n"
" sub %w3, %w1, %w5\n"
" cbnz %w3, 4f\n"
"2: stlxr %w3, %w6, %2\n"
" cbz %w3, 3f\n"
" sub %w4, %w4, %w3\n"
" cbnz %w4, 1b\n"
" mov %w0, %w7\n"
"3:\n"
" dmb ish\n"
"4:\n"
_ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)
_ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)
: "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops)
: "r" (oldval), "r" (newval), "Ir" (-EAGAIN)
: "memory");
uaccess_disable_privileged();
if (!ret)
*uval = val;
return ret;
return __futex_cmpxchg(uaddr, oldval, newval, uval);
}
#endif /* __ASM_FUTEX_H */

View File

@@ -71,23 +71,23 @@ static inline void __flush_hugetlb_tlb_range(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
unsigned long stride,
bool last_level)
tlbf_t flags)
{
switch (stride) {
#ifndef __PAGETABLE_PMD_FOLDED
case PUD_SIZE:
__flush_tlb_range(vma, start, end, PUD_SIZE, last_level, 1);
__flush_tlb_range(vma, start, end, PUD_SIZE, 1, flags);
break;
#endif
case CONT_PMD_SIZE:
case PMD_SIZE:
__flush_tlb_range(vma, start, end, PMD_SIZE, last_level, 2);
__flush_tlb_range(vma, start, end, PMD_SIZE, 2, flags);
break;
case CONT_PTE_SIZE:
__flush_tlb_range(vma, start, end, PAGE_SIZE, last_level, 3);
__flush_tlb_range(vma, start, end, PAGE_SIZE, 3, flags);
break;
default:
__flush_tlb_range(vma, start, end, PAGE_SIZE, last_level, TLBI_TTL_UNKNOWN);
__flush_tlb_range(vma, start, end, PAGE_SIZE, TLBI_TTL_UNKNOWN, flags);
}
}
@@ -98,7 +98,7 @@ static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma,
{
unsigned long stride = huge_page_size(hstate_vma(vma));
__flush_hugetlb_tlb_range(vma, start, end, stride, false);
__flush_hugetlb_tlb_range(vma, start, end, stride, TLBF_NONE);
}
#endif /* __ASM_HUGETLB_H */

View File

@@ -60,126 +60,10 @@
* of KERNEL_HWCAP_{feature}.
*/
#define __khwcap_feature(x) const_ilog2(HWCAP_ ## x)
#define KERNEL_HWCAP_FP __khwcap_feature(FP)
#define KERNEL_HWCAP_ASIMD __khwcap_feature(ASIMD)
#define KERNEL_HWCAP_EVTSTRM __khwcap_feature(EVTSTRM)
#define KERNEL_HWCAP_AES __khwcap_feature(AES)
#define KERNEL_HWCAP_PMULL __khwcap_feature(PMULL)
#define KERNEL_HWCAP_SHA1 __khwcap_feature(SHA1)
#define KERNEL_HWCAP_SHA2 __khwcap_feature(SHA2)
#define KERNEL_HWCAP_CRC32 __khwcap_feature(CRC32)
#define KERNEL_HWCAP_ATOMICS __khwcap_feature(ATOMICS)
#define KERNEL_HWCAP_FPHP __khwcap_feature(FPHP)
#define KERNEL_HWCAP_ASIMDHP __khwcap_feature(ASIMDHP)
#define KERNEL_HWCAP_CPUID __khwcap_feature(CPUID)
#define KERNEL_HWCAP_ASIMDRDM __khwcap_feature(ASIMDRDM)
#define KERNEL_HWCAP_JSCVT __khwcap_feature(JSCVT)
#define KERNEL_HWCAP_FCMA __khwcap_feature(FCMA)
#define KERNEL_HWCAP_LRCPC __khwcap_feature(LRCPC)
#define KERNEL_HWCAP_DCPOP __khwcap_feature(DCPOP)
#define KERNEL_HWCAP_SHA3 __khwcap_feature(SHA3)
#define KERNEL_HWCAP_SM3 __khwcap_feature(SM3)
#define KERNEL_HWCAP_SM4 __khwcap_feature(SM4)
#define KERNEL_HWCAP_ASIMDDP __khwcap_feature(ASIMDDP)
#define KERNEL_HWCAP_SHA512 __khwcap_feature(SHA512)
#define KERNEL_HWCAP_SVE __khwcap_feature(SVE)
#define KERNEL_HWCAP_ASIMDFHM __khwcap_feature(ASIMDFHM)
#define KERNEL_HWCAP_DIT __khwcap_feature(DIT)
#define KERNEL_HWCAP_USCAT __khwcap_feature(USCAT)
#define KERNEL_HWCAP_ILRCPC __khwcap_feature(ILRCPC)
#define KERNEL_HWCAP_FLAGM __khwcap_feature(FLAGM)
#define KERNEL_HWCAP_SSBS __khwcap_feature(SSBS)
#define KERNEL_HWCAP_SB __khwcap_feature(SB)
#define KERNEL_HWCAP_PACA __khwcap_feature(PACA)
#define KERNEL_HWCAP_PACG __khwcap_feature(PACG)
#define KERNEL_HWCAP_GCS __khwcap_feature(GCS)
#define KERNEL_HWCAP_CMPBR __khwcap_feature(CMPBR)
#define KERNEL_HWCAP_FPRCVT __khwcap_feature(FPRCVT)
#define KERNEL_HWCAP_F8MM8 __khwcap_feature(F8MM8)
#define KERNEL_HWCAP_F8MM4 __khwcap_feature(F8MM4)
#define KERNEL_HWCAP_SVE_F16MM __khwcap_feature(SVE_F16MM)
#define KERNEL_HWCAP_SVE_ELTPERM __khwcap_feature(SVE_ELTPERM)
#define KERNEL_HWCAP_SVE_AES2 __khwcap_feature(SVE_AES2)
#define KERNEL_HWCAP_SVE_BFSCALE __khwcap_feature(SVE_BFSCALE)
#define KERNEL_HWCAP_SVE2P2 __khwcap_feature(SVE2P2)
#define KERNEL_HWCAP_SME2P2 __khwcap_feature(SME2P2)
#define KERNEL_HWCAP_SME_SBITPERM __khwcap_feature(SME_SBITPERM)
#define KERNEL_HWCAP_SME_AES __khwcap_feature(SME_AES)
#define KERNEL_HWCAP_SME_SFEXPA __khwcap_feature(SME_SFEXPA)
#define KERNEL_HWCAP_SME_STMOP __khwcap_feature(SME_STMOP)
#define KERNEL_HWCAP_SME_SMOP4 __khwcap_feature(SME_SMOP4)
#define __khwcap2_feature(x) (const_ilog2(HWCAP2_ ## x) + 64)
#define KERNEL_HWCAP_DCPODP __khwcap2_feature(DCPODP)
#define KERNEL_HWCAP_SVE2 __khwcap2_feature(SVE2)
#define KERNEL_HWCAP_SVEAES __khwcap2_feature(SVEAES)
#define KERNEL_HWCAP_SVEPMULL __khwcap2_feature(SVEPMULL)
#define KERNEL_HWCAP_SVEBITPERM __khwcap2_feature(SVEBITPERM)
#define KERNEL_HWCAP_SVESHA3 __khwcap2_feature(SVESHA3)
#define KERNEL_HWCAP_SVESM4 __khwcap2_feature(SVESM4)
#define KERNEL_HWCAP_FLAGM2 __khwcap2_feature(FLAGM2)
#define KERNEL_HWCAP_FRINT __khwcap2_feature(FRINT)
#define KERNEL_HWCAP_SVEI8MM __khwcap2_feature(SVEI8MM)
#define KERNEL_HWCAP_SVEF32MM __khwcap2_feature(SVEF32MM)
#define KERNEL_HWCAP_SVEF64MM __khwcap2_feature(SVEF64MM)
#define KERNEL_HWCAP_SVEBF16 __khwcap2_feature(SVEBF16)
#define KERNEL_HWCAP_I8MM __khwcap2_feature(I8MM)
#define KERNEL_HWCAP_BF16 __khwcap2_feature(BF16)
#define KERNEL_HWCAP_DGH __khwcap2_feature(DGH)
#define KERNEL_HWCAP_RNG __khwcap2_feature(RNG)
#define KERNEL_HWCAP_BTI __khwcap2_feature(BTI)
#define KERNEL_HWCAP_MTE __khwcap2_feature(MTE)
#define KERNEL_HWCAP_ECV __khwcap2_feature(ECV)
#define KERNEL_HWCAP_AFP __khwcap2_feature(AFP)
#define KERNEL_HWCAP_RPRES __khwcap2_feature(RPRES)
#define KERNEL_HWCAP_MTE3 __khwcap2_feature(MTE3)
#define KERNEL_HWCAP_SME __khwcap2_feature(SME)
#define KERNEL_HWCAP_SME_I16I64 __khwcap2_feature(SME_I16I64)
#define KERNEL_HWCAP_SME_F64F64 __khwcap2_feature(SME_F64F64)
#define KERNEL_HWCAP_SME_I8I32 __khwcap2_feature(SME_I8I32)
#define KERNEL_HWCAP_SME_F16F32 __khwcap2_feature(SME_F16F32)
#define KERNEL_HWCAP_SME_B16F32 __khwcap2_feature(SME_B16F32)
#define KERNEL_HWCAP_SME_F32F32 __khwcap2_feature(SME_F32F32)
#define KERNEL_HWCAP_SME_FA64 __khwcap2_feature(SME_FA64)
#define KERNEL_HWCAP_WFXT __khwcap2_feature(WFXT)
#define KERNEL_HWCAP_EBF16 __khwcap2_feature(EBF16)
#define KERNEL_HWCAP_SVE_EBF16 __khwcap2_feature(SVE_EBF16)
#define KERNEL_HWCAP_CSSC __khwcap2_feature(CSSC)
#define KERNEL_HWCAP_RPRFM __khwcap2_feature(RPRFM)
#define KERNEL_HWCAP_SVE2P1 __khwcap2_feature(SVE2P1)
#define KERNEL_HWCAP_SME2 __khwcap2_feature(SME2)
#define KERNEL_HWCAP_SME2P1 __khwcap2_feature(SME2P1)
#define KERNEL_HWCAP_SME_I16I32 __khwcap2_feature(SME_I16I32)
#define KERNEL_HWCAP_SME_BI32I32 __khwcap2_feature(SME_BI32I32)
#define KERNEL_HWCAP_SME_B16B16 __khwcap2_feature(SME_B16B16)
#define KERNEL_HWCAP_SME_F16F16 __khwcap2_feature(SME_F16F16)
#define KERNEL_HWCAP_MOPS __khwcap2_feature(MOPS)
#define KERNEL_HWCAP_HBC __khwcap2_feature(HBC)
#define KERNEL_HWCAP_SVE_B16B16 __khwcap2_feature(SVE_B16B16)
#define KERNEL_HWCAP_LRCPC3 __khwcap2_feature(LRCPC3)
#define KERNEL_HWCAP_LSE128 __khwcap2_feature(LSE128)
#define KERNEL_HWCAP_FPMR __khwcap2_feature(FPMR)
#define KERNEL_HWCAP_LUT __khwcap2_feature(LUT)
#define KERNEL_HWCAP_FAMINMAX __khwcap2_feature(FAMINMAX)
#define KERNEL_HWCAP_F8CVT __khwcap2_feature(F8CVT)
#define KERNEL_HWCAP_F8FMA __khwcap2_feature(F8FMA)
#define KERNEL_HWCAP_F8DP4 __khwcap2_feature(F8DP4)
#define KERNEL_HWCAP_F8DP2 __khwcap2_feature(F8DP2)
#define KERNEL_HWCAP_F8E4M3 __khwcap2_feature(F8E4M3)
#define KERNEL_HWCAP_F8E5M2 __khwcap2_feature(F8E5M2)
#define KERNEL_HWCAP_SME_LUTV2 __khwcap2_feature(SME_LUTV2)
#define KERNEL_HWCAP_SME_F8F16 __khwcap2_feature(SME_F8F16)
#define KERNEL_HWCAP_SME_F8F32 __khwcap2_feature(SME_F8F32)
#define KERNEL_HWCAP_SME_SF8FMA __khwcap2_feature(SME_SF8FMA)
#define KERNEL_HWCAP_SME_SF8DP4 __khwcap2_feature(SME_SF8DP4)
#define KERNEL_HWCAP_SME_SF8DP2 __khwcap2_feature(SME_SF8DP2)
#define KERNEL_HWCAP_POE __khwcap2_feature(POE)
#define __khwcap3_feature(x) (const_ilog2(HWCAP3_ ## x) + 128)
#define KERNEL_HWCAP_MTE_FAR __khwcap3_feature(MTE_FAR)
#define KERNEL_HWCAP_MTE_STORE_ONLY __khwcap3_feature(MTE_STORE_ONLY)
#define KERNEL_HWCAP_LSFE __khwcap3_feature(LSFE)
#define KERNEL_HWCAP_LS64 __khwcap3_feature(LS64)
#include "asm/kernel-hwcap.h"
/*
* This yields a mask that user programs can use to figure out what

View File

@@ -0,0 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_LSUI_H
#define __ASM_LSUI_H
#include <linux/compiler_types.h>
#include <linux/stringify.h>
#include <asm/alternative.h>
#include <asm/alternative-macros.h>
#include <asm/cpucaps.h>
/*
 * Tell the assembler that the LSUI instructions may be emitted inside an
 * inline-asm body that is prefixed with this string.
 */
#define __LSUI_PREAMBLE ".arch_extension lsui\n"
#ifdef CONFIG_ARM64_LSUI
/*
 * Expand to either the LSUI implementation of 'op' or the LL/SC fallback,
 * selected at runtime on whether this system has the ARM64_HAS_LSUI
 * capability. The check is patched via the alternatives framework, so it
 * costs no more than a predicted branch after boot.
 */
#define __lsui_llsc_body(op, ...) \
({ \
	alternative_has_cap_unlikely(ARM64_HAS_LSUI) ? \
	__lsui_##op(__VA_ARGS__) : __llsc_##op(__VA_ARGS__); \
})
#else /* CONFIG_ARM64_LSUI */
/* LSUI support compiled out: always use the LL/SC implementation. */
#define __lsui_llsc_body(op, ...) __llsc_##op(__VA_ARGS__)
#endif /* CONFIG_ARM64_LSUI */
#endif /* __ASM_LSUI_H */

View File

@@ -10,20 +10,12 @@
#define MMCF_AARCH32 0x1 /* mm context flag for AArch32 executables */
#define USER_ASID_BIT 48
#define USER_ASID_FLAG (UL(1) << USER_ASID_BIT)
#define TTBR_ASID_MASK (UL(0xffff) << 48)
#ifndef __ASSEMBLER__
#include <linux/refcount.h>
#include <asm/cpufeature.h>
enum pgtable_type {
TABLE_PTE,
TABLE_PMD,
TABLE_PUD,
TABLE_P4D,
};
typedef struct {
atomic64_t id;
#ifdef CONFIG_COMPAT
@@ -112,5 +104,7 @@ void kpti_install_ng_mappings(void);
static inline void kpti_install_ng_mappings(void) {}
#endif
extern bool page_alloc_available;
#endif /* !__ASSEMBLER__ */
#endif

View File

@@ -210,7 +210,8 @@ static inline void update_saved_ttbr0(struct task_struct *tsk,
if (mm == &init_mm)
ttbr = phys_to_ttbr(__pa_symbol(reserved_pg_dir));
else
ttbr = phys_to_ttbr(virt_to_phys(mm->pgd)) | ASID(mm) << 48;
ttbr = phys_to_ttbr(virt_to_phys(mm->pgd)) |
FIELD_PREP(TTBRx_EL1_ASID_MASK, ASID(mm));
WRITE_ONCE(task_thread_info(tsk)->ttbr0, ttbr);
}

View File

@@ -0,0 +1,96 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (C) 2025 Arm Ltd. */
#ifndef __ASM__MPAM_H
#define __ASM__MPAM_H
#include <linux/arm_mpam.h>
#include <linux/bitfield.h>
#include <linux/jump_label.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <asm/sysreg.h>
DECLARE_STATIC_KEY_FALSE(mpam_enabled);
DECLARE_PER_CPU(u64, arm64_mpam_default);
DECLARE_PER_CPU(u64, arm64_mpam_current);
/*
* The value of the MPAM0_EL1 sysreg when a task is in resctrl's default group.
* This is used by the context switch code to use the resctrl CPU property
* instead. The value is modified when CDP is enabled/disabled by mounting
* the resctrl filesystem.
*/
extern u64 arm64_mpam_global_default;
#ifdef CONFIG_ARM64_MPAM
/*
 * Pack the data/instruction PARTIDs and PMGs into a single value with the
 * field layout of the MPAM0_EL1 system register.
 */
static inline u64 __mpam_regval(u16 partid_d, u16 partid_i, u8 pmg_d, u8 pmg_i)
{
	u64 regval;

	regval  = FIELD_PREP(MPAM0_EL1_PMG_I, pmg_i);
	regval |= FIELD_PREP(MPAM0_EL1_PMG_D, pmg_d);
	regval |= FIELD_PREP(MPAM0_EL1_PARTID_I, partid_i);
	regval |= FIELD_PREP(MPAM0_EL1_PARTID_D, partid_d);

	return regval;
}
/*
 * Record the MPAM configuration a CPU should use for tasks that are in
 * resctrl's default group. The stored value is consumed by
 * mpam_thread_switch() on that CPU; WRITE_ONCE() pairs with its
 * READ_ONCE() so a concurrent context switch sees either the old or the
 * new value, never a torn mix.
 */
static inline void mpam_set_cpu_defaults(int cpu, u16 partid_d, u16 partid_i,
					 u8 pmg_d, u8 pmg_i)
{
	u64 default_val = __mpam_regval(partid_d, partid_i, pmg_d, pmg_i);

	WRITE_ONCE(per_cpu(arm64_mpam_default, cpu), default_val);
}
/*
* The resctrl filesystem writes to the partid/pmg values for threads and CPUs,
* which may race with reads in mpam_thread_switch(). Ensure only one of the old
* or new values are used. Particular care should be taken with the pmg field as
* mpam_thread_switch() may read a partid and pmg that don't match, causing this
* value to be stored with cache allocations, despite being considered 'free' by
* resctrl.
*/
/*
 * Read tsk's packed partid/pmg value. READ_ONCE() pairs with the
 * WRITE_ONCE() in mpam_set_task_partid_pmg(), per the race described in
 * the comment above.
 */
static inline u64 mpam_get_regval(struct task_struct *tsk)
{
	return READ_ONCE(task_thread_info(tsk)->mpam_partid_pmg);
}
/*
 * Set tsk's MPAM configuration. The four fields are packed into a single
 * u64 so the whole configuration is published atomically: WRITE_ONCE()
 * pairs with the READ_ONCE() in mpam_get_regval(), ensuring a racing
 * context switch never observes a partid from one write and a pmg from
 * another.
 */
static inline void mpam_set_task_partid_pmg(struct task_struct *tsk,
					    u16 partid_d, u16 partid_i,
					    u8 pmg_d, u8 pmg_i)
{
	u64 regval = __mpam_regval(partid_d, partid_i, pmg_d, pmg_i);

	WRITE_ONCE(task_thread_info(tsk)->mpam_partid_pmg, regval);
}
/*
 * Install tsk's MPAM configuration on the current CPU at context-switch
 * time. No-op unless MPAM has been enabled (static key). The sysreg
 * writes are skipped when the incoming value matches what is already
 * programmed, which is tracked in the per-CPU arm64_mpam_current cache.
 */
static inline void mpam_thread_switch(struct task_struct *tsk)
{
	u64 oldregval;
	int cpu = smp_processor_id();
	u64 regval = mpam_get_regval(tsk);

	if (!static_branch_likely(&mpam_enabled))
		return;

	/*
	 * Tasks in resctrl's default group use the CPU's configuration
	 * rather than the global default stored in thread_info.
	 */
	if (regval == READ_ONCE(arm64_mpam_global_default))
		regval = READ_ONCE(per_cpu(arm64_mpam_default, cpu));

	oldregval = READ_ONCE(per_cpu(arm64_mpam_current, cpu));
	if (oldregval == regval)
		return;

	/* EL1 (and, with SME, streaming-mode) config, synchronised by isb() */
	write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1);
	if (system_supports_sme())
		write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), SYS_MPAMSM_EL1);
	isb();

	/* Synchronising the EL0 write is left until the ERET to EL0 */
	write_sysreg_s(regval, SYS_MPAM0_EL1);

	WRITE_ONCE(per_cpu(arm64_mpam_current, cpu), regval);
}
#else
static inline void mpam_thread_switch(struct task_struct *tsk) {}
#endif /* CONFIG_ARM64_MPAM */
#endif /* __ASM__MPAM_H */

View File

@@ -252,6 +252,9 @@ static inline void mte_check_tfsr_entry(void)
if (!kasan_hw_tags_enabled())
return;
if (!system_uses_mte_async_or_asymm_mode())
return;
mte_check_tfsr_el1();
}
@@ -260,6 +263,9 @@ static inline void mte_check_tfsr_exit(void)
if (!kasan_hw_tags_enabled())
return;
if (!system_uses_mte_async_or_asymm_mode())
return;
/*
* The asynchronous faults are sync'ed automatically with
* TFSR_EL1 on kernel entry but for exit an explicit dsb()

View File

@@ -223,8 +223,6 @@
*/
#define S1_TABLE_AP (_AT(pmdval_t, 3) << 61)
#define TTBR_CNP_BIT (UL(1) << 0)
/*
* TCR flags.
*/
@@ -287,9 +285,12 @@
#endif
#ifdef CONFIG_ARM64_VA_BITS_52
#define PTRS_PER_PGD_52_VA (UL(1) << (52 - PGDIR_SHIFT))
#define PTRS_PER_PGD_48_VA (UL(1) << (48 - PGDIR_SHIFT))
#define PTRS_PER_PGD_EXTRA (PTRS_PER_PGD_52_VA - PTRS_PER_PGD_48_VA)
/* Must be at least 64-byte aligned to prevent corruption of the TTBR */
#define TTBR1_BADDR_4852_OFFSET (((UL(1) << (52 - PGDIR_SHIFT)) - \
(UL(1) << (48 - PGDIR_SHIFT))) * 8)
#define TTBR1_BADDR_4852_OFFSET (PTRS_PER_PGD_EXTRA << PTDESC_ORDER)
#endif
#endif

View File

@@ -25,6 +25,8 @@
*/
#define PTE_PRESENT_INVALID (PTE_NG) /* only when !PTE_VALID */
#define PTE_PRESENT_VALID_KERNEL (PTE_VALID | PTE_MAYBE_NG)
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
#define PTE_UFFD_WP (_AT(pteval_t, 1) << 58) /* uffd-wp tracking */
#define PTE_SWP_UFFD_WP (_AT(pteval_t, 1) << 3) /* only for swp ptes */

View File

@@ -89,9 +89,9 @@ static inline void arch_leave_lazy_mmu_mode(void)
/* Set stride and tlb_level in flush_*_tlb_range */
#define flush_pmd_tlb_range(vma, addr, end) \
__flush_tlb_range(vma, addr, end, PMD_SIZE, false, 2)
__flush_tlb_range(vma, addr, end, PMD_SIZE, 2, TLBF_NONE)
#define flush_pud_tlb_range(vma, addr, end) \
__flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1)
__flush_tlb_range(vma, addr, end, PUD_SIZE, 1, TLBF_NONE)
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
@@ -101,10 +101,11 @@ static inline void arch_leave_lazy_mmu_mode(void)
* entries exist.
*/
#define flush_tlb_fix_spurious_fault(vma, address, ptep) \
local_flush_tlb_page_nonotify(vma, address)
__flush_tlb_page(vma, address, TLBF_NOBROADCAST | TLBF_NONOTIFY)
#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) \
local_flush_tlb_page_nonotify(vma, address)
#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) \
__flush_tlb_range(vma, address, address + PMD_SIZE, PMD_SIZE, 2, \
TLBF_NOBROADCAST | TLBF_NONOTIFY | TLBF_NOWALKCACHE)
/*
* ZERO_PAGE is a global shared page that is always zero: used
@@ -322,9 +323,11 @@ static inline pte_t pte_mknoncont(pte_t pte)
return clear_pte_bit(pte, __pgprot(PTE_CONT));
}
static inline pte_t pte_mkvalid(pte_t pte)
static inline pte_t pte_mkvalid_k(pte_t pte)
{
return set_pte_bit(pte, __pgprot(PTE_VALID));
pte = clear_pte_bit(pte, __pgprot(PTE_PRESENT_INVALID));
pte = set_pte_bit(pte, __pgprot(PTE_PRESENT_VALID_KERNEL));
return pte;
}
static inline pte_t pte_mkinvalid(pte_t pte)
@@ -594,6 +597,7 @@ static inline int pmd_protnone(pmd_t pmd)
#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
#define pmd_mkvalid_k(pmd) pte_pmd(pte_mkvalid_k(pmd_pte(pmd)))
#define pmd_mkinvalid(pmd) pte_pmd(pte_mkinvalid(pmd_pte(pmd)))
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
#define pmd_uffd_wp(pmd) pte_uffd_wp(pmd_pte(pmd))
@@ -635,6 +639,8 @@ static inline pmd_t pmd_mkspecial(pmd_t pmd)
#define pud_young(pud) pte_young(pud_pte(pud))
#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud)))
#define pud_mkwrite_novma(pud) pte_pud(pte_mkwrite_novma(pud_pte(pud)))
#define pud_mkvalid_k(pud) pte_pud(pte_mkvalid_k(pud_pte(pud)))
#define pud_write(pud) pte_write(pud_pte(pud))
static inline pud_t pud_mkhuge(pud_t pud)
@@ -779,9 +785,13 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
#define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \
PMD_TYPE_TABLE)
#define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \
PMD_TYPE_SECT)
#define pmd_leaf(pmd) (pmd_present(pmd) && !pmd_table(pmd))
#define pmd_leaf pmd_leaf
static inline bool pmd_leaf(pmd_t pmd)
{
return pmd_present(pmd) && !pmd_table(pmd);
}
#define pmd_bad(pmd) (!pmd_table(pmd))
#define pmd_leaf_size(pmd) (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE)
@@ -799,11 +809,8 @@ static inline int pmd_trans_huge(pmd_t pmd)
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
static inline bool pud_sect(pud_t pud) { return false; }
static inline bool pud_table(pud_t pud) { return true; }
#else
#define pud_sect(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \
PUD_TYPE_SECT)
#define pud_table(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \
PUD_TYPE_TABLE)
#endif
@@ -873,7 +880,11 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
PUD_TYPE_TABLE)
#define pud_present(pud) pte_present(pud_pte(pud))
#ifndef __PAGETABLE_PMD_FOLDED
#define pud_leaf(pud) (pud_present(pud) && !pud_table(pud))
#define pud_leaf pud_leaf
static inline bool pud_leaf(pud_t pud)
{
return pud_present(pud) && !pud_table(pud);
}
#else
#define pud_leaf(pud) false
#endif
@@ -1247,9 +1258,18 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
return pte_pmd(pte_modify(pmd_pte(pmd), newprot));
}
extern int __ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty);
extern int __ptep_set_access_flags_anysz(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty,
unsigned long pgsize);
/*
 * Page-sized wrapper around __ptep_set_access_flags_anysz(): passing the
 * true mapping size lets the TLB maintenance use the correct translation
 * level hint. Returns non-zero if the entry was changed (same contract
 * as the generic ptep_set_access_flags()).
 */
static inline int __ptep_set_access_flags(struct vm_area_struct *vma,
					  unsigned long address, pte_t *ptep,
					  pte_t entry, int dirty)
{
	return __ptep_set_access_flags_anysz(vma, address, ptep, entry, dirty,
					     PAGE_SIZE);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
@@ -1257,8 +1277,8 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
pmd_t entry, int dirty)
{
return __ptep_set_access_flags(vma, address, (pte_t *)pmdp,
pmd_pte(entry), dirty);
return __ptep_set_access_flags_anysz(vma, address, (pte_t *)pmdp,
pmd_pte(entry), dirty, PMD_SIZE);
}
#endif
@@ -1320,7 +1340,7 @@ static inline int __ptep_clear_flush_young(struct vm_area_struct *vma,
* context-switch, which provides a DSB to complete the TLB
* invalidation.
*/
flush_tlb_page_nosync(vma, address);
__flush_tlb_page(vma, address, TLBF_NOSYNC);
}
return young;

View File

@@ -0,0 +1,2 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/arm_mpam.h>

View File

@@ -19,6 +19,17 @@
"ldapr" #sfx "\t" #regs, \
ARM64_HAS_LDAPR)
/*
* Replace this with typeof_unqual() when minimum compiler versions are
* increased to GCC 14 and Clang 19. For the time being, we need this
* workaround, which relies on function return values dropping qualifiers.
*/
#define __rwonce_typeof_unqual(x) typeof(({ \
__diag_push() \
__diag_ignore_all("-Wignored-qualifiers", "") \
((typeof(x)(*)(void))0)(); \
__diag_pop() }))
/*
* When building with LTO, there is an increased risk of the compiler
* converting an address dependency headed by a READ_ONCE() invocation
@@ -31,9 +42,12 @@
*/
#define __READ_ONCE(x) \
({ \
typeof(&(x)) __x = &(x); \
int atomic = 1; \
union { __unqual_scalar_typeof(*__x) __val; char __c[1]; } __u; \
auto __x = &(x); \
auto __ret = (__rwonce_typeof_unqual(*__x) *)__x; \
/* Hides alias reassignment from Clang's -Wthread-safety. */ \
auto __retp = &__ret; \
union { typeof(*__ret) __val; char __c[1]; } __u; \
*__retp = &__u.__val; \
switch (sizeof(x)) { \
case 1: \
asm volatile(__LOAD_RCPC(b, %w0, %1) \
@@ -56,9 +70,9 @@
: "Q" (*__x) : "memory"); \
break; \
default: \
atomic = 0; \
__u.__val = *(volatile typeof(*__x) *)__x; \
} \
atomic ? (typeof(*__x))__u.__val : (*(volatile typeof(*__x) *)__x);\
*__ret; \
})
#endif /* !BUILD_VDSO */

View File

@@ -10,6 +10,11 @@
#ifdef CONFIG_SHADOW_CALL_STACK
scs_sp .req x18
.macro scs_load_current_base
get_current_task scs_sp
ldr scs_sp, [scs_sp, #TSK_TI_SCS_BASE]
.endm
.macro scs_load_current
get_current_task scs_sp
ldr scs_sp, [scs_sp, #TSK_TI_SCS_SP]
@@ -19,6 +24,9 @@
str scs_sp, [\tsk, #TSK_TI_SCS_SP]
.endm
#else
.macro scs_load_current_base
.endm
.macro scs_load_current
.endm

View File

@@ -41,6 +41,9 @@ struct thread_info {
#ifdef CONFIG_SHADOW_CALL_STACK
void *scs_base;
void *scs_sp;
#endif
#ifdef CONFIG_ARM64_MPAM
u64 mpam_partid_pmg;
#endif
u32 cpu;
};

View File

@@ -53,7 +53,7 @@ static inline int tlb_get_level(struct mmu_gather *tlb)
static inline void tlb_flush(struct mmu_gather *tlb)
{
struct vm_area_struct vma = TLB_FLUSH_VMA(tlb->mm, 0);
bool last_level = !tlb->freed_tables;
tlbf_t flags = tlb->freed_tables ? TLBF_NONE : TLBF_NOWALKCACHE;
unsigned long stride = tlb_get_unmap_size(tlb);
int tlb_level = tlb_get_level(tlb);
@@ -63,13 +63,13 @@ static inline void tlb_flush(struct mmu_gather *tlb)
* reallocate our ASID without invalidating the entire TLB.
*/
if (tlb->fullmm) {
if (!last_level)
if (tlb->freed_tables)
flush_tlb_mm(tlb->mm);
return;
}
__flush_tlb_range(&vma, tlb->start, tlb->end, stride,
last_level, tlb_level);
tlb_level, flags);
}
static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,

View File

@@ -97,24 +97,69 @@ static inline unsigned long get_trans_granule(void)
#define TLBI_TTL_UNKNOWN INT_MAX
#define __tlbi_level(op, addr, level) do { \
u64 arg = addr; \
\
if (alternative_has_cap_unlikely(ARM64_HAS_ARMv8_4_TTL) && \
level >= 0 && level <= 3) { \
u64 ttl = level & 3; \
ttl |= get_trans_granule() << 2; \
arg &= ~TLBI_TTL_MASK; \
arg |= FIELD_PREP(TLBI_TTL_MASK, ttl); \
} \
\
__tlbi(op, arg); \
} while(0)
typedef void (*tlbi_op)(u64 arg);
#define __tlbi_user_level(op, arg, level) do { \
if (arm64_kernel_unmapped_at_el0()) \
__tlbi_level(op, (arg | USER_ASID_FLAG), level); \
} while (0)
/*
 * Thin wrappers around individual TLBI instructions so that each can be
 * passed around as a 'tlbi_op' function pointer. The EL1 VA variants
 * additionally issue the operation via __tlbi_user(), which presumably
 * repeats it with the user ASID flag when the kernel is unmapped at EL0
 * (KPTI) — __tlbi_user()'s definition is outside this file's view.
 */
static __always_inline void vae1is(u64 arg)
{
	__tlbi(vae1is, arg);
	__tlbi_user(vae1is, arg);
}

static __always_inline void vae2is(u64 arg)
{
	__tlbi(vae2is, arg);
}

/* Last-level only, non-broadcast (no 'is' shareability suffix). */
static __always_inline void vale1(u64 arg)
{
	__tlbi(vale1, arg);
	__tlbi_user(vale1, arg);
}

static __always_inline void vale1is(u64 arg)
{
	__tlbi(vale1is, arg);
	__tlbi_user(vale1is, arg);
}

static __always_inline void vale2is(u64 arg)
{
	__tlbi(vale2is, arg);
}

/* All ASIDs; used for kernel mappings, so no user-ASID repeat. */
static __always_inline void vaale1is(u64 arg)
{
	__tlbi(vaale1is, arg);
}

/* IPA (stage-2) operations take no ASID. */
static __always_inline void ipas2e1(u64 arg)
{
	__tlbi(ipas2e1, arg);
}

static __always_inline void ipas2e1is(u64 arg)
{
	__tlbi(ipas2e1is, arg);
}
/*
 * Issue 'op' for a single address, encoding the translation-table level
 * hint (TTL field) into the operand when the CPU implements FEAT_TTL.
 * Levels above 3 (e.g. TLBI_TTL_UNKNOWN) leave the operand unhinted.
 */
static __always_inline void __tlbi_level_asid(tlbi_op op, u64 addr, u32 level,
					      u16 asid)
{
	u64 arg = __TLBI_VADDR(addr, asid);

	if (alternative_has_cap_unlikely(ARM64_HAS_ARMv8_4_TTL) && level <= 3) {
		u64 ttl = level | (get_trans_granule() << 2);

		FIELD_MODIFY(TLBI_TTL_MASK, &arg, ttl);
	}

	op(arg);
}

/* As __tlbi_level_asid(), with ASID 0 (kernel/global operations). */
static inline void __tlbi_level(tlbi_op op, u64 addr, u32 level)
{
	__tlbi_level_asid(op, addr, level, 0);
}
/*
* This macro creates a properly formatted VA operand for the TLB RANGE. The
@@ -141,19 +186,6 @@ static inline unsigned long get_trans_granule(void)
#define TLBIR_TTL_MASK GENMASK_ULL(38, 37)
#define TLBIR_BADDR_MASK GENMASK_ULL(36, 0)
#define __TLBI_VADDR_RANGE(baddr, asid, scale, num, ttl) \
({ \
unsigned long __ta = 0; \
unsigned long __ttl = (ttl >= 1 && ttl <= 3) ? ttl : 0; \
__ta |= FIELD_PREP(TLBIR_BADDR_MASK, baddr); \
__ta |= FIELD_PREP(TLBIR_TTL_MASK, __ttl); \
__ta |= FIELD_PREP(TLBIR_NUM_MASK, num); \
__ta |= FIELD_PREP(TLBIR_SCALE_MASK, scale); \
__ta |= FIELD_PREP(TLBIR_TG_MASK, get_trans_granule()); \
__ta |= FIELD_PREP(TLBIR_ASID_MASK, asid); \
__ta; \
})
/* These macros are used by the TLBI RANGE feature. */
#define __TLBI_RANGE_PAGES(num, scale) \
((unsigned long)((num) + 1) << (5 * (scale) + 1))
@@ -167,11 +199,7 @@ static inline unsigned long get_trans_granule(void)
* range.
*/
#define __TLBI_RANGE_NUM(pages, scale) \
({ \
int __pages = min((pages), \
__TLBI_RANGE_PAGES(31, (scale))); \
(__pages >> (5 * (scale) + 1)) - 1; \
})
(((pages) >> (5 * (scale) + 1)) - 1)
#define __repeat_tlbi_sync(op, arg...) \
do { \
@@ -241,10 +269,7 @@ static inline void __tlbi_sync_s1ish_hyp(void)
* unmapping pages from vmalloc/io space.
*
* flush_tlb_page(vma, addr)
* Invalidate a single user mapping for address 'addr' in the
* address space corresponding to 'vma->mm'. Note that this
* operation only invalidates a single, last-level page-table
* entry and therefore does not affect any walk-caches.
* Equivalent to __flush_tlb_page(..., flags=TLBF_NONE)
*
*
* Next, we have some undocumented invalidation routines that you probably
@@ -258,30 +283,28 @@ static inline void __tlbi_sync_s1ish_hyp(void)
* CPUs, ensuring that any walk-cache entries associated with the
* translation are also invalidated.
*
* __flush_tlb_range(vma, start, end, stride, last_level, tlb_level)
* __flush_tlb_range(vma, start, end, stride, tlb_level, flags)
* Invalidate the virtual-address range '[start, end)' on all
* CPUs for the user address space corresponding to 'vma->mm'.
* The invalidation operations are issued at a granularity
* determined by 'stride' and only affect any walk-cache entries
* if 'last_level' is equal to false. tlb_level is the level at
* determined by 'stride'. tlb_level is the level at
* which the invalidation must take place. If the level is wrong,
* no invalidation may take place. In the case where the level
* cannot be easily determined, the value TLBI_TTL_UNKNOWN will
* perform a non-hinted invalidation.
* perform a non-hinted invalidation. flags may be TLBF_NONE (0) or
* any combination of TLBF_NOWALKCACHE (elide eviction of walk
* cache entries), TLBF_NONOTIFY (don't call mmu notifiers),
* TLBF_NOSYNC (don't issue trailing dsb) and TLBF_NOBROADCAST
* (only perform the invalidation for the local cpu).
*
* local_flush_tlb_page(vma, addr)
* Local variant of flush_tlb_page(). Stale TLB entries may
* remain in remote CPUs.
*
* local_flush_tlb_page_nonotify(vma, addr)
* Same as local_flush_tlb_page() except MMU notifier will not be
* called.
*
* local_flush_tlb_contpte(vma, addr)
* Invalidate the virtual-address range
* '[addr, addr+CONT_PTE_SIZE)' mapped with contpte on local CPU
* for the user address space corresponding to 'vma->mm'. Stale
* TLB entries may remain in remote CPUs.
* __flush_tlb_page(vma, addr, flags)
* Invalidate a single user mapping for address 'addr' in the
* address space corresponding to 'vma->mm'. Note that this
* operation only invalidates a single level 3 page-table entry
* and therefore does not affect any walk-caches. flags may contain
* any combination of TLBF_NONOTIFY (don't call mmu notifiers),
* TLBF_NOSYNC (don't issue trailing dsb) and TLBF_NOBROADCAST
* (only perform the invalidation for the local cpu).
*
* Finally, take a look at asm/tlb.h to see how tlb_flush() is implemented
* on top of these routines, since that is our interface to the mmu_gather
@@ -315,59 +338,6 @@ static inline void flush_tlb_mm(struct mm_struct *mm)
mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}
static inline void __local_flush_tlb_page_nonotify_nosync(struct mm_struct *mm,
unsigned long uaddr)
{
unsigned long addr;
dsb(nshst);
addr = __TLBI_VADDR(uaddr, ASID(mm));
__tlbi(vale1, addr);
__tlbi_user(vale1, addr);
}
static inline void local_flush_tlb_page_nonotify(struct vm_area_struct *vma,
unsigned long uaddr)
{
__local_flush_tlb_page_nonotify_nosync(vma->vm_mm, uaddr);
dsb(nsh);
}
static inline void local_flush_tlb_page(struct vm_area_struct *vma,
unsigned long uaddr)
{
__local_flush_tlb_page_nonotify_nosync(vma->vm_mm, uaddr);
mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, uaddr & PAGE_MASK,
(uaddr & PAGE_MASK) + PAGE_SIZE);
dsb(nsh);
}
static inline void __flush_tlb_page_nosync(struct mm_struct *mm,
unsigned long uaddr)
{
unsigned long addr;
dsb(ishst);
addr = __TLBI_VADDR(uaddr, ASID(mm));
__tlbi(vale1is, addr);
__tlbi_user(vale1is, addr);
mmu_notifier_arch_invalidate_secondary_tlbs(mm, uaddr & PAGE_MASK,
(uaddr & PAGE_MASK) + PAGE_SIZE);
}
static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
unsigned long uaddr)
{
return __flush_tlb_page_nosync(vma->vm_mm, uaddr);
}
static inline void flush_tlb_page(struct vm_area_struct *vma,
unsigned long uaddr)
{
flush_tlb_page_nosync(vma, uaddr);
__tlbi_sync_s1ish();
}
static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
{
return true;
@@ -397,14 +367,13 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
/*
* __flush_tlb_range_op - Perform TLBI operation upon a range
*
* @op: TLBI instruction that operates on a range (has 'r' prefix)
* @lop: TLBI level operation to perform
* @rop: TLBI range operation to perform
* @start: The start address of the range
* @pages: Range as the number of pages from 'start'
* @stride: Flush granularity
* @asid: The ASID of the task (0 for IPA instructions)
* @tlb_level: Translation Table level hint, if known
* @tlbi_user: If 'true', call an additional __tlbi_user()
* (typically for user ASIDs). 'false' for IPA instructions
* @level: Translation Table level hint, if known
* @lpa2: If 'true', the lpa2 scheme is used as set out below
*
* When the CPU does not support TLB range operations, flush the TLB
@@ -427,116 +396,181 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
* operations can only span an even number of pages. We save this for last to
* ensure 64KB start alignment is maintained for the LPA2 case.
*/
#define __flush_tlb_range_op(op, start, pages, stride, \
asid, tlb_level, tlbi_user, lpa2) \
do { \
typeof(start) __flush_start = start; \
typeof(pages) __flush_pages = pages; \
int num = 0; \
int scale = 3; \
int shift = lpa2 ? 16 : PAGE_SHIFT; \
unsigned long addr; \
\
while (__flush_pages > 0) { \
if (!system_supports_tlb_range() || \
__flush_pages == 1 || \
(lpa2 && __flush_start != ALIGN(__flush_start, SZ_64K))) { \
addr = __TLBI_VADDR(__flush_start, asid); \
__tlbi_level(op, addr, tlb_level); \
if (tlbi_user) \
__tlbi_user_level(op, addr, tlb_level); \
__flush_start += stride; \
__flush_pages -= stride >> PAGE_SHIFT; \
continue; \
} \
\
num = __TLBI_RANGE_NUM(__flush_pages, scale); \
if (num >= 0) { \
addr = __TLBI_VADDR_RANGE(__flush_start >> shift, asid, \
scale, num, tlb_level); \
__tlbi(r##op, addr); \
if (tlbi_user) \
__tlbi_user(r##op, addr); \
__flush_start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \
__flush_pages -= __TLBI_RANGE_PAGES(num, scale);\
} \
scale--; \
} \
} while (0)
#define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
__flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled());
static inline bool __flush_tlb_range_limit_excess(unsigned long start,
unsigned long end, unsigned long pages, unsigned long stride)
static __always_inline void rvae1is(u64 arg)
{
/*
* When the system does not support TLB range based flush
* operation, (MAX_DVM_OPS - 1) pages can be handled. But
* with TLB range based operation, MAX_TLBI_RANGE_PAGES
* pages can be handled.
*/
if ((!system_supports_tlb_range() &&
(end - start) >= (MAX_DVM_OPS * stride)) ||
pages > MAX_TLBI_RANGE_PAGES)
return true;
return false;
__tlbi(rvae1is, arg);
__tlbi_user(rvae1is, arg);
}
static inline void __flush_tlb_range_nosync(struct mm_struct *mm,
unsigned long start, unsigned long end,
unsigned long stride, bool last_level,
int tlb_level)
/*
 * 'tlbi_op' wrappers for the range ('r'-prefixed) TLBI instructions,
 * mirroring the single-address wrappers above. The EL1 VA variants also
 * issue the operation via __tlbi_user() for the user ASID.
 */
static __always_inline void rvale1(u64 arg)
{
	__tlbi(rvale1, arg);
	__tlbi_user(rvale1, arg);
}

static __always_inline void rvale1is(u64 arg)
{
	__tlbi(rvale1is, arg);
	__tlbi_user(rvale1is, arg);
}

/* All ASIDs; kernel mappings, so no user-ASID repeat. */
static __always_inline void rvaale1is(u64 arg)
{
	__tlbi(rvaale1is, arg);
}

/* IPA (stage-2) range operation; takes no ASID. */
static __always_inline void ripas2e1is(u64 arg)
{
	__tlbi(ripas2e1is, arg);
}
/*
 * Build the operand for a range TLBI and issue 'op' with it. The base
 * address is encoded in 64KB units when LPA2 is in use, otherwise in
 * pages. A level hint above 3 (e.g. TLBI_TTL_UNKNOWN) is encoded as 0,
 * i.e. "no hint".
 */
static __always_inline void __tlbi_range(tlbi_op op, u64 addr,
					 u16 asid, int scale, int num,
					 u32 level, bool lpa2)
{
	u64 arg = 0;

	arg |= FIELD_PREP(TLBIR_BADDR_MASK, addr >> (lpa2 ? 16 : PAGE_SHIFT));
	arg |= FIELD_PREP(TLBIR_TTL_MASK, level > 3 ? 0 : level);
	arg |= FIELD_PREP(TLBIR_NUM_MASK, num);
	arg |= FIELD_PREP(TLBIR_SCALE_MASK, scale);
	arg |= FIELD_PREP(TLBIR_TG_MASK, get_trans_granule());
	arg |= FIELD_PREP(TLBIR_ASID_MASK, asid);

	op(arg);
}
/*
 * Invalidate [start, start + pages * PAGE_SIZE) using range operations
 * ('rop') where possible and falling back to one level-hinted
 * single-address operation ('lop') per 'stride' otherwise. See the block
 * comment above for the overall scale/num strategy and the LPA2 64KB
 * start-alignment requirement.
 */
static __always_inline void __flush_tlb_range_op(tlbi_op lop, tlbi_op rop,
						 u64 start, size_t pages,
						 u64 stride, u16 asid,
						 u32 level, bool lpa2)
{
	u64 addr = start, end = start + pages * PAGE_SIZE;
	int scale = 3;

	while (addr != end) {
		int num;

		/* Remaining pages, recomputed each pass of the loop. */
		pages = (end - addr) >> PAGE_SHIFT;

		/*
		 * Use a single-address op when range ops are unavailable,
		 * for the final page, or until the LPA2 base-address
		 * encoding's 64KB alignment is reached.
		 */
		if (!system_supports_tlb_range() || pages == 1)
			goto invalidate_one;

		if (lpa2 && !IS_ALIGNED(addr, SZ_64K))
			goto invalidate_one;

		/* num < 0 means 'pages' is too small for this scale. */
		num = __TLBI_RANGE_NUM(pages, scale);
		if (num >= 0) {
			__tlbi_range(rop, addr, asid, scale, num, level, lpa2);
			addr += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT;
		}

		scale--;
		continue;

invalidate_one:
		__tlbi_level_asid(lop, addr, level, asid);
		addr += stride;
	}
}
#define __flush_s1_tlb_range_op(op, start, pages, stride, asid, tlb_level) \
__flush_tlb_range_op(op, r##op, start, pages, stride, asid, tlb_level, lpa2_is_enabled())
#define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
__flush_tlb_range_op(op, r##op, start, pages, stride, 0, tlb_level, kvm_lpa2_is_enabled())
/*
 * Return true when the range is too large to invalidate piecewise, in
 * which case the caller falls back to invalidating the whole mm (or the
 * entire TLB for kernel ranges).
 */
static inline bool __flush_tlb_range_limit_excess(unsigned long pages,
						  unsigned long stride)
{
	/*
	 * Assume that the worst case number of DVM ops required to flush a
	 * given range on a system that supports tlb-range is 20 (4 scales, 1
	 * final page, 15 for alignment on LPA2 systems), which is much smaller
	 * than MAX_DVM_OPS.
	 */
	if (system_supports_tlb_range())
		return pages > MAX_TLBI_RANGE_PAGES;

	/* Without range ops, each 'stride' costs one DVM operation. */
	return pages >= (MAX_DVM_OPS * stride) >> PAGE_SHIFT;
}
typedef unsigned __bitwise tlbf_t;
/* No special behaviour. */
#define TLBF_NONE ((__force tlbf_t)0)
/* Invalidate tlb entries only, leaving the page table walk cache intact. */
#define TLBF_NOWALKCACHE ((__force tlbf_t)BIT(0))
/* Skip the trailing dsb after issuing tlbi. */
#define TLBF_NOSYNC ((__force tlbf_t)BIT(1))
/* Suppress tlb notifier callbacks for this flush operation. */
#define TLBF_NONOTIFY ((__force tlbf_t)BIT(2))
/* Perform the tlbi locally without broadcasting to other CPUs. */
#define TLBF_NOBROADCAST ((__force tlbf_t)BIT(3))
static __always_inline void __do_flush_tlb_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long stride, int tlb_level,
tlbf_t flags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long asid, pages;
start = round_down(start, stride);
end = round_up(end, stride);
pages = (end - start) >> PAGE_SHIFT;
if (__flush_tlb_range_limit_excess(start, end, pages, stride)) {
if (__flush_tlb_range_limit_excess(pages, stride)) {
flush_tlb_mm(mm);
return;
}
dsb(ishst);
if (!(flags & TLBF_NOBROADCAST))
dsb(ishst);
else
dsb(nshst);
asid = ASID(mm);
if (last_level)
__flush_tlb_range_op(vale1is, start, pages, stride, asid,
tlb_level, true, lpa2_is_enabled());
else
__flush_tlb_range_op(vae1is, start, pages, stride, asid,
tlb_level, true, lpa2_is_enabled());
switch (flags & (TLBF_NOWALKCACHE | TLBF_NOBROADCAST)) {
case TLBF_NONE:
__flush_s1_tlb_range_op(vae1is, start, pages, stride,
asid, tlb_level);
break;
case TLBF_NOWALKCACHE:
__flush_s1_tlb_range_op(vale1is, start, pages, stride,
asid, tlb_level);
break;
case TLBF_NOBROADCAST:
/* Combination unused */
BUG();
break;
case TLBF_NOWALKCACHE | TLBF_NOBROADCAST:
__flush_s1_tlb_range_op(vale1, start, pages, stride,
asid, tlb_level);
break;
}
mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
if (!(flags & TLBF_NONOTIFY))
mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
if (!(flags & TLBF_NOSYNC)) {
if (!(flags & TLBF_NOBROADCAST))
__tlbi_sync_s1ish();
else
dsb(nsh);
}
}
static inline void __flush_tlb_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long stride, bool last_level,
int tlb_level)
unsigned long stride, int tlb_level,
tlbf_t flags)
{
__flush_tlb_range_nosync(vma->vm_mm, start, end, stride,
last_level, tlb_level);
__tlbi_sync_s1ish();
}
static inline void local_flush_tlb_contpte(struct vm_area_struct *vma,
unsigned long addr)
{
unsigned long asid;
addr = round_down(addr, CONT_PTE_SIZE);
dsb(nshst);
asid = ASID(vma->vm_mm);
__flush_tlb_range_op(vale1, addr, CONT_PTES, PAGE_SIZE, asid,
3, true, lpa2_is_enabled());
mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, addr,
addr + CONT_PTE_SIZE);
dsb(nsh);
start = round_down(start, stride);
end = round_up(end, stride);
__do_flush_tlb_range(vma, start, end, stride, tlb_level, flags);
}
static inline void flush_tlb_range(struct vm_area_struct *vma,
@@ -548,7 +582,23 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
* Set the tlb_level to TLBI_TTL_UNKNOWN because we can not get enough
* information here.
*/
__flush_tlb_range(vma, start, end, PAGE_SIZE, false, TLBI_TTL_UNKNOWN);
__flush_tlb_range(vma, start, end, PAGE_SIZE, TLBI_TTL_UNKNOWN, TLBF_NONE);
}
/*
 * Invalidate the single page mapping 'uaddr' in vma->vm_mm. Only a
 * last-level (level 3) entry is targeted, so TLBF_NOWALKCACHE is always
 * set; callers may add TLBF_NONOTIFY, TLBF_NOSYNC and/or
 * TLBF_NOBROADCAST via 'flags'.
 */
static inline void __flush_tlb_page(struct vm_area_struct *vma,
				    unsigned long uaddr, tlbf_t flags)
{
	unsigned long start = round_down(uaddr, PAGE_SIZE);
	unsigned long end = start + PAGE_SIZE;

	__do_flush_tlb_range(vma, start, end, PAGE_SIZE, 3,
			     TLBF_NOWALKCACHE | flags);
}

/* Broadcast, synchronous, notifying single-page invalidation. */
static inline void flush_tlb_page(struct vm_area_struct *vma,
				  unsigned long uaddr)
{
	__flush_tlb_page(vma, uaddr, TLBF_NONE);
}
static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
@@ -560,14 +610,14 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end
end = round_up(end, stride);
pages = (end - start) >> PAGE_SHIFT;
if (__flush_tlb_range_limit_excess(start, end, pages, stride)) {
if (__flush_tlb_range_limit_excess(pages, stride)) {
flush_tlb_all();
return;
}
dsb(ishst);
__flush_tlb_range_op(vaale1is, start, pages, stride, 0,
TLBI_TTL_UNKNOWN, false, lpa2_is_enabled());
__flush_s1_tlb_range_op(vaale1is, start, pages, stride, 0,
TLBI_TTL_UNKNOWN);
__tlbi_sync_s1ish();
isb();
}
@@ -589,7 +639,10 @@ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr)
static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
struct mm_struct *mm, unsigned long start, unsigned long end)
{
__flush_tlb_range_nosync(mm, start, end, PAGE_SIZE, true, 3);
struct vm_area_struct vma = { .vm_mm = mm, .vm_flags = 0 };
__flush_tlb_range(&vma, start, end, PAGE_SIZE, 3,
TLBF_NOWALKCACHE | TLBF_NOSYNC);
}
static inline bool __pte_flags_need_flush(ptdesc_t oldval, ptdesc_t newval)
@@ -618,6 +671,8 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
}
#define huge_pmd_needs_flush huge_pmd_needs_flush
#undef __tlbi_user
#undef __TLBI_VADDR
#endif
#endif

View File

@@ -62,7 +62,7 @@ static inline void __uaccess_ttbr0_disable(void)
local_irq_save(flags);
ttbr = read_sysreg(ttbr1_el1);
ttbr &= ~TTBR_ASID_MASK;
ttbr &= ~TTBRx_EL1_ASID_MASK;
/* reserved_pg_dir placed before swapper_pg_dir */
write_sysreg(ttbr - RESERVED_SWAPPER_OFFSET, ttbr0_el1);
/* Set reserved ASID */
@@ -85,8 +85,8 @@ static inline void __uaccess_ttbr0_enable(void)
/* Restore active ASID */
ttbr1 = read_sysreg(ttbr1_el1);
ttbr1 &= ~TTBR_ASID_MASK; /* safety measure */
ttbr1 |= ttbr0 & TTBR_ASID_MASK;
ttbr1 &= ~TTBRx_EL1_ASID_MASK; /* safety measure */
ttbr1 |= ttbr0 & TTBRx_EL1_ASID_MASK;
write_sysreg(ttbr1, ttbr1_el1);
/* Restore user page table */

View File

@@ -67,6 +67,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o
obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o
obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o
obj-$(CONFIG_ARM64_MPAM) += mpam.o
obj-$(CONFIG_ARM64_MTE) += mte.o
obj-y += vdso-wrap.o
obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o

View File

@@ -610,6 +610,20 @@ static int __init armv8_deprecated_init(void)
}
#endif
#ifdef CONFIG_SWP_EMULATION
/*
* The purpose of supporting LSUI is to eliminate PAN toggling. CPUs
* that support LSUI are unlikely to support a 32-bit runtime. Rather
* than emulating the SWP instruction using LSUI instructions, simply
* disable SWP emulation.
*/
if (cpus_have_final_cap(ARM64_HAS_LSUI)) {
insn_swp.status = INSN_UNAVAILABLE;
pr_info("swp/swpb instruction emulation is not supported on this system\n");
}
#endif
for (int i = 0; i < ARRAY_SIZE(insn_emulations); i++) {
struct insn_emulation *ie = insn_emulations[i];

View File

@@ -87,6 +87,7 @@
#include <asm/kvm_host.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/mpam.h>
#include <asm/mte.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
@@ -282,6 +283,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar2[] = {
static const struct arm64_ftr_bits ftr_id_aa64isar3[] = {
ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FPRCVT_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_LSUI_SHIFT, 4, ID_AA64ISAR3_EL1_LSUI_NI),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_LSFE_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FAMINMAX_SHIFT, 4, 0),
ARM64_FTR_END,
@@ -2484,13 +2486,19 @@ test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope)
/*
 * Program this CPU's MPAM partition/monitoring-group labels when the
 * capability is enabled (boot and CPU online paths).
 *
 * If the MPAM driver is live, restore the value it last programmed for
 * this CPU; otherwise program the default (0) labels. MPAMEN is always
 * set in MPAM1_EL1 so EL1/EL0 label generation is enabled.
 */
static void
cpu_enable_mpam(const struct arm64_cpu_capabilities *entry)
{
	int cpu = smp_processor_id();
	u64 regval = 0;

	if (IS_ENABLED(CONFIG_ARM64_MPAM) && static_branch_likely(&mpam_enabled))
		regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu));

	write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1);
	/* The streaming-mode register only takes the _D (data) fields. */
	if (cpus_have_cap(ARM64_SME))
		write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), SYS_MPAMSM_EL1);
	isb();

	/* Synchronising the EL0 write is left until the ERET to EL0 */
	write_sysreg_s(regval, SYS_MPAM0_EL1);
}
static bool
@@ -3161,6 +3169,15 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.cpu_enable = cpu_enable_ls64_v,
ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, LS64, LS64_V)
},
#ifdef CONFIG_ARM64_LSUI
{
.desc = "Unprivileged Load Store Instructions (LSUI)",
.capability = ARM64_HAS_LSUI,
.type = ARM64_CPUCAP_SYSTEM_FEATURE,
.matches = has_cpuid_feature,
ARM64_CPUID_FIELDS(ID_AA64ISAR3_EL1, LSUI, IMP)
},
#endif
{},
};

View File

@@ -35,11 +35,11 @@
* Before this function is called it is not safe to call regular kernel code,
* instrumentable code, or any code which may trigger an exception.
*/
static noinstr irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs)
static noinstr irqentry_state_t arm64_enter_from_kernel_mode(struct pt_regs *regs)
{
irqentry_state_t state;
state = irqentry_enter(regs);
state = irqentry_enter_from_kernel_mode(regs);
mte_check_tfsr_entry();
mte_disable_tco_entry(current);
@@ -51,11 +51,14 @@ static noinstr irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs)
* After this function returns it is not safe to call regular kernel code,
* instrumentable code, or any code which may trigger an exception.
*/
static void noinstr exit_to_kernel_mode(struct pt_regs *regs,
irqentry_state_t state)
static void noinstr arm64_exit_to_kernel_mode(struct pt_regs *regs,
irqentry_state_t state)
{
local_irq_disable();
irqentry_exit_to_kernel_mode_preempt(regs, state);
local_daif_mask();
mte_check_tfsr_exit();
irqentry_exit(regs, state);
irqentry_exit_to_kernel_mode_after_preempt(regs, state);
}
/*
@@ -298,11 +301,10 @@ static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr)
unsigned long far = read_sysreg(far_el1);
irqentry_state_t state;
state = enter_from_kernel_mode(regs);
state = arm64_enter_from_kernel_mode(regs);
local_daif_inherit(regs);
do_mem_abort(far, esr, regs);
local_daif_mask();
exit_to_kernel_mode(regs, state);
arm64_exit_to_kernel_mode(regs, state);
}
static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr)
@@ -310,55 +312,50 @@ static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr)
unsigned long far = read_sysreg(far_el1);
irqentry_state_t state;
state = enter_from_kernel_mode(regs);
state = arm64_enter_from_kernel_mode(regs);
local_daif_inherit(regs);
do_sp_pc_abort(far, esr, regs);
local_daif_mask();
exit_to_kernel_mode(regs, state);
arm64_exit_to_kernel_mode(regs, state);
}
/* Handle an undefined-instruction exception taken from EL1. */
static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr)
{
	irqentry_state_t state;

	state = arm64_enter_from_kernel_mode(regs);
	local_daif_inherit(regs);
	do_el1_undef(regs, esr);
	/* DAIF re-masking now happens inside arm64_exit_to_kernel_mode(). */
	arm64_exit_to_kernel_mode(regs, state);
}
/* Handle a Branch Target Identification (BTI) exception taken from EL1. */
static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr)
{
	irqentry_state_t state;

	state = arm64_enter_from_kernel_mode(regs);
	local_daif_inherit(regs);
	do_el1_bti(regs, esr);
	/* DAIF re-masking now happens inside arm64_exit_to_kernel_mode(). */
	arm64_exit_to_kernel_mode(regs, state);
}
/* Handle a Guarded Control Stack (GCS) exception taken from EL1. */
static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr)
{
	irqentry_state_t state;

	state = arm64_enter_from_kernel_mode(regs);
	local_daif_inherit(regs);
	do_el1_gcs(regs, esr);
	/* DAIF re-masking now happens inside arm64_exit_to_kernel_mode(). */
	arm64_exit_to_kernel_mode(regs, state);
}
/* Handle a Memory Copy/Set (MOPS) exception taken from EL1. */
static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr)
{
	irqentry_state_t state;

	state = arm64_enter_from_kernel_mode(regs);
	local_daif_inherit(regs);
	do_el1_mops(regs, esr);
	/* DAIF re-masking now happens inside arm64_exit_to_kernel_mode(). */
	arm64_exit_to_kernel_mode(regs, state);
}
static void noinstr el1_breakpt(struct pt_regs *regs, unsigned long esr)
@@ -420,11 +417,10 @@ static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr)
{
irqentry_state_t state;
state = enter_from_kernel_mode(regs);
state = arm64_enter_from_kernel_mode(regs);
local_daif_inherit(regs);
do_el1_fpac(regs, esr);
local_daif_mask();
exit_to_kernel_mode(regs, state);
arm64_exit_to_kernel_mode(regs, state);
}
asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs)
@@ -491,13 +487,13 @@ static __always_inline void __el1_irq(struct pt_regs *regs,
{
irqentry_state_t state;
state = enter_from_kernel_mode(regs);
state = arm64_enter_from_kernel_mode(regs);
irq_enter_rcu();
do_interrupt_handler(regs, handler);
irq_exit_rcu();
exit_to_kernel_mode(regs, state);
arm64_exit_to_kernel_mode(regs, state);
}
static void noinstr el1_interrupt(struct pt_regs *regs,
void (*handler)(struct pt_regs *))

View File

@@ -273,7 +273,7 @@ alternative_if ARM64_HAS_ADDRESS_AUTH
alternative_else_nop_endif
1:
scs_load_current
scs_load_current_base
.else
add x21, sp, #PT_REGS_SIZE
get_current_task tsk
@@ -378,8 +378,6 @@ alternative_if ARM64_WORKAROUND_845719
alternative_else_nop_endif
#endif
3:
scs_save tsk
/* Ignore asynchronous tag check faults in the uaccess routines */
ldr x0, [tsk, THREAD_SCTLR_USER]
clear_mte_async_tcf x0
@@ -473,7 +471,7 @@ alternative_else_nop_endif
*/
SYM_CODE_START_LOCAL(__swpan_entry_el1)
mrs x21, ttbr0_el1
tst x21, #TTBR_ASID_MASK // Check for the reserved ASID
tst x21, #TTBRx_EL1_ASID_MASK // Check for the reserved ASID
orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR
b.eq 1f // TTBR0 access already disabled
and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR

View File

@@ -129,9 +129,6 @@ int machine_kexec_post_load(struct kimage *kimage)
}
/* Create a copy of the linear map */
trans_pgd = kexec_page_alloc(kimage);
if (!trans_pgd)
return -ENOMEM;
rc = trans_pgd_create_copy(&info, &trans_pgd, PAGE_OFFSET, PAGE_END);
if (rc)
return rc;

62
arch/arm64/kernel/mpam.c Normal file
View File

@@ -0,0 +1,62 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2025 Arm Ltd. */
#include <asm/mpam.h>
#include <linux/arm_mpam.h>
#include <linux/cpu_pm.h>
#include <linux/jump_label.h>
#include <linux/percpu.h>
/* Flipped on elsewhere once MPAM is usable — TODO confirm against the MPAM driver. */
DEFINE_STATIC_KEY_FALSE(mpam_enabled);
/* Per-CPU default label value; not referenced by the code visible in this file. */
DEFINE_PER_CPU(u64, arm64_mpam_default);
/* Value most recently programmed into this CPU's MPAM label registers. */
DEFINE_PER_CPU(u64, arm64_mpam_current);
/* System-wide default label value; not referenced by the code visible in this file. */
u64 arm64_mpam_global_default;
/*
 * CPU PM notifier: re-program this CPU's MPAM system registers on exit
 * from a low-power state, using the value recorded in arm64_mpam_current.
 * Returns NOTIFY_OK when handled, NOTIFY_DONE for events we ignore.
 */
static int mpam_pm_notifier(struct notifier_block *self,
			    unsigned long cmd, void *v)
{
	u64 regval;
	int cpu = smp_processor_id();

	switch (cmd) {
	case CPU_PM_EXIT:
		/*
		 * Don't use mpam_thread_switch() as the system register
		 * value has changed under our feet.
		 */
		regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu));
		/* Restore the EL1 labels and keep MPAM enabled. */
		write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1);
		if (system_supports_sme()) {
			/* The streaming-mode register only takes the _D (data) fields. */
			write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D),
				       SYS_MPAMSM_EL1);
		}
		/*
		 * NOTE(review): the ISB appears to order the EL1/SM updates
		 * before the EL0 value is installed — confirm the intended
		 * synchronisation requirement.
		 */
		isb();
		write_sysreg_s(regval, SYS_MPAM0_EL1);
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}
/* Restores the MPAM registers on CPU_PM_EXIT; see mpam_pm_notifier() above. */
static struct notifier_block mpam_pm_nb = {
	.notifier_call = mpam_pm_notifier,
};
/*
 * Advertise this platform's PARTID/PMG space to the MPAM driver and hook
 * CPU PM transitions so the registers survive low-power states.
 *
 * Note: MPAMIDR_EL1 is read (via the sanitised feature-register copy)
 * before the support check; presumably this is safe even on !MPAM systems
 * because it never touches hardware — TODO confirm.
 */
static int __init arm64_mpam_register_cpus(void)
{
	u64 mpamidr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1);
	u16 partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, mpamidr);
	u8 pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, mpamidr);

	if (!system_supports_mpam())
		return 0;

	cpu_pm_register_notifier(&mpam_pm_nb);

	return mpam_register_requestor(partid_max, pmg_max);
}
/* Must occur before mpam_msc_driver_init() from subsys_initcall() */
arch_initcall(arm64_mpam_register_cpus)

View File

@@ -291,6 +291,9 @@ void mte_thread_switch(struct task_struct *next)
/* TCO may not have been disabled on exception entry for the current task. */
mte_disable_tco_entry(next);
if (!system_uses_mte_async_or_asymm_mode())
return;
/*
* Check if an async tag exception occurred at EL1.
*
@@ -315,8 +318,8 @@ void mte_cpu_setup(void)
* CnP is not a boot feature so MTE gets enabled before CnP, but let's
* make sure that is the case.
*/
BUG_ON(read_sysreg(ttbr0_el1) & TTBR_CNP_BIT);
BUG_ON(read_sysreg(ttbr1_el1) & TTBR_CNP_BIT);
BUG_ON(read_sysreg(ttbr0_el1) & TTBRx_EL1_CnP);
BUG_ON(read_sysreg(ttbr1_el1) & TTBRx_EL1_CnP);
/* Normal Tagged memory type at the corresponding MAIR index */
sysreg_clear_set(mair_el1,
@@ -350,6 +353,9 @@ void mte_suspend_enter(void)
if (!system_supports_mte())
return;
if (!system_uses_mte_async_or_asymm_mode())
return;
/*
* The barriers are required to guarantee that the indirect writes
* to TFSR_EL1 are synchronized before we report the state.

View File

@@ -51,6 +51,7 @@
#include <asm/fpsimd.h>
#include <asm/gcs.h>
#include <asm/mmu_context.h>
#include <asm/mpam.h>
#include <asm/mte.h>
#include <asm/processor.h>
#include <asm/pointer_auth.h>
@@ -699,6 +700,29 @@ void update_sctlr_el1(u64 sctlr)
isb();
}
/*
 * Sanity-check the interrupt-masking state at context switch: DAIF (and,
 * with pseudo-NMI priority masking, PMR) must hold the values expected
 * while switching. Warns once per call site on a mismatch.
 */
static inline void debug_switch_state(void)
{
	unsigned long expected_daif, actual_daif;

	actual_daif = read_sysreg(daif);

	if (!system_uses_irq_prio_masking()) {
		/* IRQs are masked via DAIF alone. */
		expected_daif = DAIF_PROCCTX_NOIRQ;
		WARN_ONCE(actual_daif != expected_daif,
			  "Unexpected DAIF value: 0x%lx (expected 0x%lx)\n",
			  actual_daif, expected_daif);
	} else {
		/* pseudo-NMI: DAIF stays clear and PMR carries the masking. */
		unsigned long expected_pmr = GIC_PRIO_IRQOFF;
		unsigned long actual_pmr = read_sysreg_s(SYS_ICC_PMR_EL1);

		expected_daif = 0;
		WARN_ONCE(actual_daif != expected_daif ||
			  actual_pmr != expected_pmr,
			  "Unexpected DAIF + PMR: 0x%lx + 0x%lx (expected 0x%lx + 0x%lx)\n",
			  actual_daif, actual_pmr,
			  expected_daif, expected_pmr);
	}
}
/*
* Thread switching.
*/
@@ -708,6 +732,8 @@ struct task_struct *__switch_to(struct task_struct *prev,
{
struct task_struct *last;
debug_switch_state();
fpsimd_thread_switch(next);
tls_thread_switch(next);
hw_breakpoint_thread_switch(next);
@@ -738,6 +764,12 @@ struct task_struct *__switch_to(struct task_struct *prev,
if (prev->thread.sctlr_user != next->thread.sctlr_user)
update_sctlr_el1(next->thread.sctlr_user);
/*
* MPAM thread switch happens after the DSB to ensure prev's accesses
* use prev's MPAM settings.
*/
mpam_thread_switch(next);
/* the actual thread switch */
last = cpu_switch_to(prev, next);

View File

@@ -144,7 +144,7 @@ void __init arm64_rsi_init(void)
return;
if (!rsi_version_matches())
return;
if (WARN_ON(rsi_get_realm_config(&config)))
if (WARN_ON(rsi_get_realm_config(lm_alias(&config))))
return;
prot_ns_shared = BIT(config.ipa_bits - 1);

View File

@@ -36,7 +36,7 @@ __do_compat_cache_op(unsigned long start, unsigned long end)
* The workaround requires an inner-shareable tlbi.
* We pick the reserved-ASID to minimise the impact.
*/
__tlbi(aside1is, __TLBI_VADDR(0, 0));
__tlbi(aside1is, 0UL);
__tlbi_sync_s1ish();
}

View File

@@ -9,6 +9,7 @@
#include <asm/esr.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/lsui.h>
static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw)
{
@@ -1681,6 +1682,35 @@ int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level)
}
}
/*
 * Compare-and-swap the 64-bit descriptor at user address @ptep from @old
 * to @new using the LSUI compare-and-swap instruction ("cast").
 *
 * Returns 0 on success, -EAGAIN if the descriptor no longer held @old,
 * or a fault error delivered through the extable entry.
 */
static int __lsui_swap_desc(u64 __user *ptep, u64 old, u64 new)
{
	/* Keep the expected value: the asm overwrites @old with the observed one. */
	u64 tmp = old;
	int ret = 0;

	/*
	 * Wrap LSUI instructions with uaccess_ttbr0_enable()/disable(),
	 * as PAN toggling is not required.
	 */
	uaccess_ttbr0_enable();
	/* On a fault, control resumes at 2: with the error code placed in ret. */
	asm volatile(__LSUI_PREAMBLE
	"1:	cast	%[old], %[new], %[addr]\n"
	"2:\n"
	_ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret])
	: [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret)
	: [new] "r" (new)
	: "memory");
	uaccess_ttbr0_disable();

	if (ret)
		return ret;

	/* Observed value differed from the expected one: CAS lost the race. */
	if (tmp != old)
		return -EAGAIN;

	return ret;
}
static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new)
{
u64 tmp = old;
@@ -1756,7 +1786,9 @@ int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new)
return -EPERM;
ptep = (u64 __user *)hva + offset;
if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS))
if (cpus_have_final_cap(ARM64_HAS_LSUI))
r = __lsui_swap_desc(ptep, old, new);
else if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS))
r = __lse_swap_desc(ptep, old, new);
else
r = __llsc_swap_desc(ptep, old, new);

View File

@@ -267,7 +267,8 @@ static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu)
static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu)
{
u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1;
u64 clr = MPAM2_EL2_EnMPAMSM;
u64 set = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1;
if (!system_supports_mpam())
return;
@@ -277,18 +278,21 @@ static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu)
write_sysreg_s(MPAMHCR_EL2_TRAP_MPAMIDR_EL1, SYS_MPAMHCR_EL2);
} else {
/* From v1.1 TIDR can trap MPAMIDR, set it unconditionally */
r |= MPAM2_EL2_TIDR;
set |= MPAM2_EL2_TIDR;
}
write_sysreg_s(r, SYS_MPAM2_EL2);
sysreg_clear_set_s(SYS_MPAM2_EL2, clr, set);
}
static inline void __deactivate_traps_mpam(void)
{
u64 clr = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1 | MPAM2_EL2_TIDR;
u64 set = MPAM2_EL2_EnMPAMSM;
if (!system_supports_mpam())
return;
write_sysreg_s(0, SYS_MPAM2_EL2);
sysreg_clear_set_s(SYS_MPAM2_EL2, clr, set);
if (system_supports_mpam_hcr())
write_sysreg_s(MPAMHCR_HOST_FLAGS, SYS_MPAMHCR_EL2);

View File

@@ -130,7 +130,7 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init)
ldr x1, [x0, #NVHE_INIT_PGD_PA]
phys_to_ttbr x2, x1
alternative_if ARM64_HAS_CNP
orr x2, x2, #TTBR_CNP_BIT
orr x2, x2, #TTBRx_EL1_CnP
alternative_else_nop_endif
msr ttbr0_el2, x2
@@ -291,7 +291,7 @@ SYM_TYPED_FUNC_START(__pkvm_init_switch_pgd)
/* Install the new pgtables */
phys_to_ttbr x5, x0
alternative_if ARM64_HAS_CNP
orr x5, x5, #TTBR_CNP_BIT
orr x5, x5, #TTBRx_EL1_CnP
alternative_else_nop_endif
msr ttbr0_el2, x5

View File

@@ -270,7 +270,7 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
* https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03
*/
dsb(ishst);
__tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level);
__tlbi_level(vale2is, addr, level);
__tlbi_sync_s1ish_hyp();
isb();
}

View File

@@ -158,7 +158,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
* Instead, we invalidate Stage-2 for this IPA, and the
* whole of Stage-1. Weep...
*/
ipa >>= 12;
__tlbi_level(ipas2e1is, ipa, level);
/*
@@ -188,7 +187,6 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
* Instead, we invalidate Stage-2 for this IPA, and the
* whole of Stage-1. Weep...
*/
ipa >>= 12;
__tlbi_level(ipas2e1, ipa, level);
/*

View File

@@ -490,14 +490,14 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
kvm_clear_pte(ctx->ptep);
dsb(ishst);
__tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN);
__tlbi_level(vae2is, ctx->addr, TLBI_TTL_UNKNOWN);
} else {
if (ctx->end - ctx->addr < granule)
return -EINVAL;
kvm_clear_pte(ctx->ptep);
dsb(ishst);
__tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
__tlbi_level(vale2is, ctx->addr, ctx->level);
*unmapped += granule;
}

View File

@@ -183,6 +183,21 @@ void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt)
}
NOKPROBE_SYMBOL(sysreg_restore_guest_state_vhe);
/*
* The _EL0 value was written by the host's context switch and belongs to the
* VMM. Copy this into the guest's _EL1 register.
*/
/*
 * Copy the host-context MPAM0_EL1 label fields into the guest's MPAM1
 * register, keeping MPAM enabled (MPAMEN). No-op on !MPAM systems.
 */
static inline void __mpam_guest_load(void)
{
	/* Only the PARTID/PMG label fields are carried over. */
	u64 mask = MPAM0_EL1_PARTID_D | MPAM0_EL1_PARTID_I | MPAM0_EL1_PMG_D | MPAM0_EL1_PMG_I;

	if (system_supports_mpam()) {
		u64 val = (read_sysreg_s(SYS_MPAM0_EL1) & mask) | MPAM1_EL1_MPAMEN;

		write_sysreg_el1(val, SYS_MPAM1);
	}
}
/**
* __vcpu_load_switch_sysregs - Load guest system registers to the physical CPU
*
@@ -222,6 +237,7 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu)
*/
__sysreg32_restore_state(vcpu);
__sysreg_restore_user_state(guest_ctxt);
__mpam_guest_load();
if (unlikely(is_hyp_ctxt(vcpu))) {
__sysreg_restore_vel2_state(vcpu);

View File

@@ -104,7 +104,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
* Instead, we invalidate Stage-2 for this IPA, and the
* whole of Stage-1. Weep...
*/
ipa >>= 12;
__tlbi_level(ipas2e1is, ipa, level);
/*
@@ -136,7 +135,6 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
* Instead, we invalidate Stage-2 for this IPA, and the
* whole of Stage-1. Weep...
*/
ipa >>= 12;
__tlbi_level(ipas2e1, ipa, level);
/*

View File

@@ -1805,7 +1805,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
break;
case SYS_ID_AA64ISAR3_EL1:
val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_LSFE |
ID_AA64ISAR3_EL1_FAMINMAX;
ID_AA64ISAR3_EL1_FAMINMAX | ID_AA64ISAR3_EL1_LSUI;
break;
case SYS_ID_AA64MMFR2_EL1:
val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK;
@@ -3252,6 +3252,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_AA64ISAR2_EL1_GPA3)),
ID_WRITABLE(ID_AA64ISAR3_EL1, (ID_AA64ISAR3_EL1_FPRCVT |
ID_AA64ISAR3_EL1_LSFE |
ID_AA64ISAR3_EL1_LSUI |
ID_AA64ISAR3_EL1_FAMINMAX)),
ID_UNALLOCATED(6,4),
ID_UNALLOCATED(6,5),
@@ -3376,6 +3377,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_MPAM1_EL1), undef_access },
{ SYS_DESC(SYS_MPAM0_EL1), undef_access },
{ SYS_DESC(SYS_MPAMSM_EL1), undef_access },
{ SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 },
{ SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 },

View File

@@ -354,15 +354,15 @@ void cpu_do_switch_mm(phys_addr_t pgd_phys, struct mm_struct *mm)
/* Skip CNP for the reserved ASID */
if (system_supports_cnp() && asid)
ttbr0 |= TTBR_CNP_BIT;
ttbr0 |= TTBRx_EL1_CnP;
/* SW PAN needs a copy of the ASID in TTBR0 for entry */
if (IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN))
ttbr0 |= FIELD_PREP(TTBR_ASID_MASK, asid);
ttbr0 |= FIELD_PREP(TTBRx_EL1_ASID_MASK, asid);
/* Set ASID in TTBR1 since TCR.A1 is set */
ttbr1 &= ~TTBR_ASID_MASK;
ttbr1 |= FIELD_PREP(TTBR_ASID_MASK, asid);
ttbr1 &= ~TTBRx_EL1_ASID_MASK;
ttbr1 |= FIELD_PREP(TTBRx_EL1_ASID_MASK, asid);
cpu_set_reserved_ttbr0_nosync();
write_sysreg(ttbr1, ttbr1_el1);

View File

@@ -225,7 +225,8 @@ static void contpte_convert(struct mm_struct *mm, unsigned long addr,
*/
if (!system_supports_bbml2_noabort())
__flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3);
__flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, 3,
TLBF_NOWALKCACHE);
__set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES);
}
@@ -551,8 +552,8 @@ int contpte_clear_flush_young_ptes(struct vm_area_struct *vma,
* See comment in __ptep_clear_flush_young(); same rationale for
* eliding the trailing DSB applies here.
*/
__flush_tlb_range_nosync(vma->vm_mm, addr, end,
PAGE_SIZE, true, 3);
__flush_tlb_range(vma, addr, end, PAGE_SIZE, 3,
TLBF_NOWALKCACHE | TLBF_NOSYNC);
}
return young;
@@ -685,7 +686,10 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
__ptep_set_access_flags(vma, addr, ptep, entry, 0);
if (dirty)
local_flush_tlb_contpte(vma, start_addr);
__flush_tlb_range(vma, start_addr,
start_addr + CONT_PTE_SIZE,
PAGE_SIZE, 3,
TLBF_NOWALKCACHE | TLBF_NOBROADCAST);
} else {
__contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte);
__ptep_set_access_flags(vma, addr, ptep, entry, dirty);

View File

@@ -204,12 +204,13 @@ static void show_pte(unsigned long addr)
*
* Returns whether or not the PTE actually changed.
*/
int __ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty)
int __ptep_set_access_flags_anysz(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty, unsigned long pgsize)
{
pteval_t old_pteval, pteval;
pte_t pte = __ptep_get(ptep);
int level;
if (pte_same(pte, entry))
return 0;
@@ -238,8 +239,27 @@ int __ptep_set_access_flags(struct vm_area_struct *vma,
* may still cause page faults and be invalidated via
* flush_tlb_fix_spurious_fault().
*/
if (dirty)
local_flush_tlb_page(vma, address);
if (dirty) {
switch (pgsize) {
case PAGE_SIZE:
level = 3;
break;
case PMD_SIZE:
level = 2;
break;
#ifndef __PAGETABLE_PMD_FOLDED
case PUD_SIZE:
level = 1;
break;
#endif
default:
level = TLBI_TTL_UNKNOWN;
WARN_ON(1);
}
__flush_tlb_range(vma, address, address + pgsize, pgsize, level,
TLBF_NOWALKCACHE | TLBF_NOBROADCAST);
}
return 1;
}

View File

@@ -181,7 +181,7 @@ static pte_t get_clear_contig_flush(struct mm_struct *mm,
struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
unsigned long end = addr + (pgsize * ncontig);
__flush_hugetlb_tlb_range(&vma, addr, end, pgsize, true);
__flush_hugetlb_tlb_range(&vma, addr, end, pgsize, TLBF_NOWALKCACHE);
return orig_pte;
}
@@ -209,7 +209,7 @@ static void clear_flush(struct mm_struct *mm,
if (mm == &init_mm)
flush_tlb_kernel_range(saddr, addr);
else
__flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, true);
__flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, TLBF_NOWALKCACHE);
}
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -427,11 +427,11 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
pte_t orig_pte;
VM_WARN_ON(!pte_present(pte));
ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize);
if (!pte_cont(pte))
return __ptep_set_access_flags(vma, addr, ptep, pte, dirty);
ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize);
return __ptep_set_access_flags_anysz(vma, addr, ptep, pte,
dirty, pgsize);
if (!__cont_access_flags_changed(ptep, pte, ncontig))
return 0;

View File

@@ -350,7 +350,6 @@ void __init arch_mm_preinit(void)
}
swiotlb_init(swiotlb, flags);
swiotlb_update_mem_attributes();
/*
* Check boundaries twice: Some fundamental inconsistencies can be
@@ -377,6 +376,14 @@ void __init arch_mm_preinit(void)
}
}
/* True once the page allocator is usable; gates callers that must not allocate earlier. */
bool page_alloc_available __ro_after_init;

void __init mem_init(void)
{
	page_alloc_available = true;
	/* Runs after the allocator flag so the swiotlb update may allocate. */
	swiotlb_update_mem_attributes();
}
void free_initmem(void)
{
void *lm_init_begin = lm_alias(__init_begin);

View File

@@ -112,7 +112,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
}
EXPORT_SYMBOL(phys_mem_access_prot);
static phys_addr_t __init early_pgtable_alloc(enum pgtable_type pgtable_type)
static phys_addr_t __init early_pgtable_alloc(enum pgtable_level pgtable_level)
{
phys_addr_t phys;
@@ -197,14 +197,14 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
unsigned long end, phys_addr_t phys,
pgprot_t prot,
phys_addr_t (*pgtable_alloc)(enum pgtable_type),
phys_addr_t (*pgtable_alloc)(enum pgtable_level),
int flags)
{
unsigned long next;
pmd_t pmd = READ_ONCE(*pmdp);
pte_t *ptep;
BUG_ON(pmd_sect(pmd));
BUG_ON(pmd_leaf(pmd));
if (pmd_none(pmd)) {
pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
phys_addr_t pte_phys;
@@ -212,7 +212,7 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
if (flags & NO_EXEC_MAPPINGS)
pmdval |= PMD_TABLE_PXN;
BUG_ON(!pgtable_alloc);
pte_phys = pgtable_alloc(TABLE_PTE);
pte_phys = pgtable_alloc(PGTABLE_LEVEL_PTE);
if (pte_phys == INVALID_PHYS_ADDR)
return -ENOMEM;
ptep = pte_set_fixmap(pte_phys);
@@ -252,7 +252,7 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags)
phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags)
{
unsigned long next;
@@ -292,7 +292,7 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
unsigned long end, phys_addr_t phys,
pgprot_t prot,
phys_addr_t (*pgtable_alloc)(enum pgtable_type),
phys_addr_t (*pgtable_alloc)(enum pgtable_level),
int flags)
{
int ret;
@@ -303,7 +303,7 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
/*
* Check for initial section mappings in the pgd/pud.
*/
BUG_ON(pud_sect(pud));
BUG_ON(pud_leaf(pud));
if (pud_none(pud)) {
pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
phys_addr_t pmd_phys;
@@ -311,7 +311,7 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
if (flags & NO_EXEC_MAPPINGS)
pudval |= PUD_TABLE_PXN;
BUG_ON(!pgtable_alloc);
pmd_phys = pgtable_alloc(TABLE_PMD);
pmd_phys = pgtable_alloc(PGTABLE_LEVEL_PMD);
if (pmd_phys == INVALID_PHYS_ADDR)
return -ENOMEM;
pmdp = pmd_set_fixmap(pmd_phys);
@@ -349,7 +349,7 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
phys_addr_t (*pgtable_alloc)(enum pgtable_type),
phys_addr_t (*pgtable_alloc)(enum pgtable_level),
int flags)
{
int ret = 0;
@@ -364,7 +364,7 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
if (flags & NO_EXEC_MAPPINGS)
p4dval |= P4D_TABLE_PXN;
BUG_ON(!pgtable_alloc);
pud_phys = pgtable_alloc(TABLE_PUD);
pud_phys = pgtable_alloc(PGTABLE_LEVEL_PUD);
if (pud_phys == INVALID_PHYS_ADDR)
return -ENOMEM;
pudp = pud_set_fixmap(pud_phys);
@@ -415,7 +415,7 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
phys_addr_t (*pgtable_alloc)(enum pgtable_type),
phys_addr_t (*pgtable_alloc)(enum pgtable_level),
int flags)
{
int ret;
@@ -430,7 +430,7 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
if (flags & NO_EXEC_MAPPINGS)
pgdval |= PGD_TABLE_PXN;
BUG_ON(!pgtable_alloc);
p4d_phys = pgtable_alloc(TABLE_P4D);
p4d_phys = pgtable_alloc(PGTABLE_LEVEL_P4D);
if (p4d_phys == INVALID_PHYS_ADDR)
return -ENOMEM;
p4dp = p4d_set_fixmap(p4d_phys);
@@ -467,7 +467,7 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
unsigned long virt, phys_addr_t size,
pgprot_t prot,
phys_addr_t (*pgtable_alloc)(enum pgtable_type),
phys_addr_t (*pgtable_alloc)(enum pgtable_level),
int flags)
{
int ret;
@@ -500,7 +500,7 @@ static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
unsigned long virt, phys_addr_t size,
pgprot_t prot,
phys_addr_t (*pgtable_alloc)(enum pgtable_type),
phys_addr_t (*pgtable_alloc)(enum pgtable_level),
int flags)
{
int ret;
@@ -516,7 +516,7 @@ static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
static void early_create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
unsigned long virt, phys_addr_t size,
pgprot_t prot,
phys_addr_t (*pgtable_alloc)(enum pgtable_type),
phys_addr_t (*pgtable_alloc)(enum pgtable_level),
int flags)
{
int ret;
@@ -528,7 +528,7 @@ static void early_create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
}
static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp,
enum pgtable_type pgtable_type)
enum pgtable_level pgtable_level)
{
/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
struct ptdesc *ptdesc = pagetable_alloc(gfp & ~__GFP_ZERO, 0);
@@ -539,40 +539,43 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp,
pa = page_to_phys(ptdesc_page(ptdesc));
switch (pgtable_type) {
case TABLE_PTE:
switch (pgtable_level) {
case PGTABLE_LEVEL_PTE:
BUG_ON(!pagetable_pte_ctor(mm, ptdesc));
break;
case TABLE_PMD:
case PGTABLE_LEVEL_PMD:
BUG_ON(!pagetable_pmd_ctor(mm, ptdesc));
break;
case TABLE_PUD:
case PGTABLE_LEVEL_PUD:
pagetable_pud_ctor(ptdesc);
break;
case TABLE_P4D:
case PGTABLE_LEVEL_P4D:
pagetable_p4d_ctor(ptdesc);
break;
case PGTABLE_LEVEL_PGD:
VM_WARN_ON(1);
break;
}
return pa;
}
/*
 * Allocate a pagetable page for init_mm at the given level with @gfp.
 * Returns the physical address, or INVALID_PHYS_ADDR on failure
 * (propagated from __pgd_pgtable_alloc()).
 */
static phys_addr_t
pgd_pgtable_alloc_init_mm_gfp(enum pgtable_level pgtable_level, gfp_t gfp)
{
	return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_level);
}
/* Allocate a pagetable page for init_mm with the default pgtable GFP flags. */
static phys_addr_t __maybe_unused
pgd_pgtable_alloc_init_mm(enum pgtable_level pgtable_level)
{
	return pgd_pgtable_alloc_init_mm_gfp(pgtable_level, GFP_PGTABLE_KERNEL);
}
static phys_addr_t
pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
pgd_pgtable_alloc_special_mm(enum pgtable_level pgtable_level)
{
return __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type);
return __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_level);
}
static void split_contpte(pte_t *ptep)
@@ -593,7 +596,7 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
pte_t *ptep;
int i;
pte_phys = pgd_pgtable_alloc_init_mm_gfp(TABLE_PTE, gfp);
pte_phys = pgd_pgtable_alloc_init_mm_gfp(PGTABLE_LEVEL_PTE, gfp);
if (pte_phys == INVALID_PHYS_ADDR)
return -ENOMEM;
ptep = (pte_t *)phys_to_virt(pte_phys);
@@ -602,6 +605,8 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
tableprot |= PMD_TABLE_PXN;
prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) | PTE_TYPE_PAGE);
if (!pmd_valid(pmd))
prot = pte_pgprot(pte_mkinvalid(pfn_pte(0, prot)));
prot = __pgprot(pgprot_val(prot) & ~PTE_CONT);
if (to_cont)
prot = __pgprot(pgprot_val(prot) | PTE_CONT);
@@ -638,7 +643,7 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
pmd_t *pmdp;
int i;
pmd_phys = pgd_pgtable_alloc_init_mm_gfp(TABLE_PMD, gfp);
pmd_phys = pgd_pgtable_alloc_init_mm_gfp(PGTABLE_LEVEL_PMD, gfp);
if (pmd_phys == INVALID_PHYS_ADDR)
return -ENOMEM;
pmdp = (pmd_t *)phys_to_virt(pmd_phys);
@@ -647,6 +652,8 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
tableprot |= PUD_TABLE_PXN;
prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT);
if (!pud_valid(pud))
prot = pmd_pgprot(pmd_mkinvalid(pfn_pmd(0, prot)));
prot = __pgprot(pgprot_val(prot) & ~PTE_CONT);
if (to_cont)
prot = __pgprot(pgprot_val(prot) | PTE_CONT);
@@ -768,30 +775,51 @@ static inline bool force_pte_mapping(void)
}
static DEFINE_MUTEX(pgtable_split_lock);
static bool linear_map_requires_bbml2;
int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
{
int ret;
/*
* !BBML2_NOABORT systems should not be trying to change permissions on
* anything that is not pte-mapped in the first place. Just return early
* and let the permission change code raise a warning if not already
* pte-mapped.
*/
if (!system_supports_bbml2_noabort())
return 0;
/*
* If the region is within a pte-mapped area, there is no need to try to
* split. Additionally, CONFIG_DEBUG_PAGEALLOC and CONFIG_KFENCE may
* change permissions from atomic context so for those cases (which are
* always pte-mapped), we must not go any further because taking the
* mutex below may sleep.
* mutex below may sleep. Do not call force_pte_mapping() here because
* it could return a confusing result if called from a secondary cpu
* prior to finalizing caps. Instead, linear_map_requires_bbml2 gives us
* what we need.
*/
if (force_pte_mapping() || is_kfence_address((void *)start))
if (!linear_map_requires_bbml2 || is_kfence_address((void *)start))
return 0;
if (!system_supports_bbml2_noabort()) {
/*
* !BBML2_NOABORT systems should not be trying to change
* permissions on anything that is not pte-mapped in the first
* place. Just return early and let the permission change code
* raise a warning if not already pte-mapped.
*/
if (system_capabilities_finalized())
return 0;
/*
* Boot-time: split_kernel_leaf_mapping_locked() allocates from
* page allocator. Can't split until it's available.
*/
if (WARN_ON(!page_alloc_available))
return -EBUSY;
/*
* Boot-time: Started secondary cpus but don't know if they
* support BBML2_NOABORT yet. Can't allow splitting in this
* window in case they don't.
*/
if (WARN_ON(num_online_cpus() > 1))
return -EBUSY;
}
/*
* Ensure start and end are at least page-aligned since this is the
* finest granularity we can split to.
@@ -891,8 +919,6 @@ static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp
return ret;
}
static bool linear_map_requires_bbml2 __initdata;
u32 idmap_kpti_bbml2_flag;
static void __init init_idmap_kpti_bbml2_flag(void)
@@ -1226,7 +1252,7 @@ static void __init declare_vma(struct vm_struct *vma,
static phys_addr_t kpti_ng_temp_alloc __initdata;
static phys_addr_t __init kpti_ng_pgd_alloc(enum pgtable_type type)
static phys_addr_t __init kpti_ng_pgd_alloc(enum pgtable_level pgtable_level)
{
kpti_ng_temp_alloc -= PAGE_SIZE;
return kpti_ng_temp_alloc;
@@ -1458,10 +1484,14 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
WARN_ON(!pte_present(pte));
__pte_clear(&init_mm, addr, ptep);
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
if (free_mapped)
if (free_mapped) {
/* CONT blocks are not supported in the vmemmap */
WARN_ON(pte_cont(pte));
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
free_hotplug_page_range(pte_page(pte),
PAGE_SIZE, altmap);
}
/* unmap_hotplug_range() flushes TLB for !free_mapped */
} while (addr += PAGE_SIZE, addr < end);
}
@@ -1480,17 +1510,16 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
continue;
WARN_ON(!pmd_present(pmd));
if (pmd_sect(pmd)) {
if (pmd_leaf(pmd)) {
pmd_clear(pmdp);
/*
* One TLBI should be sufficient here as the PMD_SIZE
* range is mapped with a single block entry.
*/
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
if (free_mapped)
if (free_mapped) {
/* CONT blocks are not supported in the vmemmap */
WARN_ON(pmd_cont(pmd));
flush_tlb_kernel_range(addr, addr + PMD_SIZE);
free_hotplug_page_range(pmd_page(pmd),
PMD_SIZE, altmap);
}
/* unmap_hotplug_range() flushes TLB for !free_mapped */
continue;
}
WARN_ON(!pmd_table(pmd));
@@ -1513,17 +1542,14 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
continue;
WARN_ON(!pud_present(pud));
if (pud_sect(pud)) {
if (pud_leaf(pud)) {
pud_clear(pudp);
/*
* One TLBI should be sufficient here as the PUD_SIZE
* range is mapped with a single block entry.
*/
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
if (free_mapped)
if (free_mapped) {
flush_tlb_kernel_range(addr, addr + PUD_SIZE);
free_hotplug_page_range(pud_page(pud),
PUD_SIZE, altmap);
}
/* unmap_hotplug_range() flushes TLB for !free_mapped */
continue;
}
WARN_ON(!pud_table(pud));
@@ -1553,6 +1579,7 @@ static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
static void unmap_hotplug_range(unsigned long addr, unsigned long end,
bool free_mapped, struct vmem_altmap *altmap)
{
unsigned long start = addr;
unsigned long next;
pgd_t *pgdp, pgd;
@@ -1574,6 +1601,9 @@ static void unmap_hotplug_range(unsigned long addr, unsigned long end,
WARN_ON(!pgd_present(pgd));
unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
} while (addr = next, addr < end);
if (!free_mapped)
flush_tlb_kernel_range(start, end);
}
static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
@@ -1627,7 +1657,7 @@ static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
if (pmd_none(pmd))
continue;
WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
WARN_ON(!pmd_present(pmd) || !pmd_table(pmd));
free_empty_pte_table(pmdp, addr, next, floor, ceiling);
} while (addr = next, addr < end);
@@ -1667,7 +1697,7 @@ static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
if (pud_none(pud))
continue;
WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
WARN_ON(!pud_present(pud) || !pud_table(pud));
free_empty_pmd_table(pudp, addr, next, floor, ceiling);
} while (addr = next, addr < end);
@@ -1763,7 +1793,7 @@ int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
{
vmemmap_verify((pte_t *)pmdp, node, addr, next);
return pmd_sect(READ_ONCE(*pmdp));
return pmd_leaf(READ_ONCE(*pmdp));
}
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
@@ -1827,7 +1857,7 @@ void p4d_clear_huge(p4d_t *p4dp)
int pud_clear_huge(pud_t *pudp)
{
if (!pud_sect(READ_ONCE(*pudp)))
if (!pud_leaf(READ_ONCE(*pudp)))
return 0;
pud_clear(pudp);
return 1;
@@ -1835,7 +1865,7 @@ int pud_clear_huge(pud_t *pudp)
int pmd_clear_huge(pmd_t *pmdp)
{
if (!pmd_sect(READ_ONCE(*pmdp)))
if (!pmd_leaf(READ_ONCE(*pmdp)))
return 0;
pmd_clear(pmdp);
return 1;
@@ -2010,6 +2040,107 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
}
static bool addr_splits_kernel_leaf(unsigned long addr)
{
pgd_t *pgdp, pgd;
p4d_t *p4dp, p4d;
pud_t *pudp, pud;
pmd_t *pmdp, pmd;
pte_t *ptep, pte;
/*
* If the given address points at a the start address of
* a possible leaf, we certainly won't split. Otherwise,
* check if we would actually split a leaf by traversing
* the page tables further.
*/
if (IS_ALIGNED(addr, PGDIR_SIZE))
return false;
pgdp = pgd_offset_k(addr);
pgd = pgdp_get(pgdp);
if (!pgd_present(pgd))
return false;
if (IS_ALIGNED(addr, P4D_SIZE))
return false;
p4dp = p4d_offset(pgdp, addr);
p4d = p4dp_get(p4dp);
if (!p4d_present(p4d))
return false;
if (IS_ALIGNED(addr, PUD_SIZE))
return false;
pudp = pud_offset(p4dp, addr);
pud = pudp_get(pudp);
if (!pud_present(pud))
return false;
if (pud_leaf(pud))
return true;
if (IS_ALIGNED(addr, CONT_PMD_SIZE))
return false;
pmdp = pmd_offset(pudp, addr);
pmd = pmdp_get(pmdp);
if (!pmd_present(pmd))
return false;
if (pmd_cont(pmd))
return true;
if (IS_ALIGNED(addr, PMD_SIZE))
return false;
if (pmd_leaf(pmd))
return true;
if (IS_ALIGNED(addr, CONT_PTE_SIZE))
return false;
ptep = pte_offset_kernel(pmdp, addr);
pte = __ptep_get(ptep);
if (!pte_present(pte))
return false;
if (pte_cont(pte))
return true;
return !IS_ALIGNED(addr, PAGE_SIZE);
}
static bool can_unmap_without_split(unsigned long pfn, unsigned long nr_pages)
{
unsigned long phys_start, phys_end, start, end;
phys_start = PFN_PHYS(pfn);
phys_end = phys_start + nr_pages * PAGE_SIZE;
/* PFN range's linear map edges are leaf entry aligned */
start = __phys_to_virt(phys_start);
end = __phys_to_virt(phys_end);
if (addr_splits_kernel_leaf(start) || addr_splits_kernel_leaf(end)) {
pr_warn("[%lx %lx] splits a leaf entry in linear map\n",
phys_start, phys_end);
return false;
}
/* PFN range's vmemmap edges are leaf entry aligned */
BUILD_BUG_ON(!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP));
start = (unsigned long)pfn_to_page(pfn);
end = (unsigned long)pfn_to_page(pfn + nr_pages);
if (addr_splits_kernel_leaf(start) || addr_splits_kernel_leaf(end)) {
pr_warn("[%lx %lx] splits a leaf entry in vmemmap\n",
phys_start, phys_end);
return false;
}
return true;
}
/*
* This memory hotplug notifier helps prevent boot memory from being
* inadvertently removed as it blocks pfn range offlining process in
@@ -2018,8 +2149,11 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
* In future if and when boot memory could be removed, this notifier
* should be dropped and free_hotplug_page_range() should handle any
* reserved pages allocated during boot.
*
* This also blocks any memory remove that would have caused a split
* in leaf entry in kernel linear or vmemmap mapping.
*/
static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
static int prevent_memory_remove_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
struct mem_section *ms;
@@ -2065,11 +2199,15 @@ static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
return NOTIFY_DONE;
}
}
if (!can_unmap_without_split(pfn, arg->nr_pages))
return NOTIFY_BAD;
return NOTIFY_OK;
}
static struct notifier_block prevent_bootmem_remove_nb = {
.notifier_call = prevent_bootmem_remove_notifier,
static struct notifier_block prevent_memory_remove_nb = {
.notifier_call = prevent_memory_remove_notifier,
};
/*
@@ -2119,7 +2257,7 @@ static void validate_bootmem_online(void)
}
}
static int __init prevent_bootmem_remove_init(void)
static int __init prevent_memory_remove_init(void)
{
int ret = 0;
@@ -2127,13 +2265,13 @@ static int __init prevent_bootmem_remove_init(void)
return ret;
validate_bootmem_online();
ret = register_memory_notifier(&prevent_bootmem_remove_nb);
ret = register_memory_notifier(&prevent_memory_remove_nb);
if (ret)
pr_err("%s: Notifier registration failed %d\n", __func__, ret);
return ret;
}
early_initcall(prevent_bootmem_remove_init);
early_initcall(prevent_memory_remove_init);
#endif
pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
@@ -2149,7 +2287,7 @@ pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
*/
if (pte_accessible(vma->vm_mm, pte) && pte_user_exec(pte))
__flush_tlb_range(vma, addr, nr * PAGE_SIZE,
PAGE_SIZE, true, 3);
PAGE_SIZE, 3, TLBF_NOWALKCACHE);
}
return pte;
@@ -2188,7 +2326,7 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)
phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp));
if (cnp)
ttbr1 |= TTBR_CNP_BIT;
ttbr1 |= TTBRx_EL1_CnP;
replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1);

View File

@@ -25,6 +25,11 @@ static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk)
{
struct page_change_data *masks = walk->private;
/*
* Some users clear and set bits which alias each other (e.g. PTE_NG and
* PTE_PRESENT_INVALID). It is therefore important that we always clear
* first then set.
*/
val &= ~(pgprot_val(masks->clear_mask));
val |= (pgprot_val(masks->set_mask));
@@ -36,7 +41,7 @@ static int pageattr_pud_entry(pud_t *pud, unsigned long addr,
{
pud_t val = pudp_get(pud);
if (pud_sect(val)) {
if (pud_leaf(val)) {
if (WARN_ON_ONCE((next - addr) != PUD_SIZE))
return -EINVAL;
val = __pud(set_pageattr_masks(pud_val(val), walk));
@@ -52,7 +57,7 @@ static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr,
{
pmd_t val = pmdp_get(pmd);
if (pmd_sect(val)) {
if (pmd_leaf(val)) {
if (WARN_ON_ONCE((next - addr) != PMD_SIZE))
return -EINVAL;
val = __pmd(set_pageattr_masks(pmd_val(val), walk));
@@ -132,11 +137,12 @@ static int __change_memory_common(unsigned long start, unsigned long size,
ret = update_range_prot(start, size, set_mask, clear_mask);
/*
* If the memory is being made valid without changing any other bits
* then a TLBI isn't required as a non-valid entry cannot be cached in
* the TLB.
* If the memory is being switched from present-invalid to valid without
* changing any other bits then a TLBI isn't required as a non-valid
* entry cannot be cached in the TLB.
*/
if (pgprot_val(set_mask) != PTE_VALID || pgprot_val(clear_mask))
if (pgprot_val(set_mask) != PTE_PRESENT_VALID_KERNEL ||
pgprot_val(clear_mask) != PTE_PRESENT_INVALID)
flush_tlb_kernel_range(start, start + size);
return ret;
}
@@ -237,18 +243,18 @@ int set_memory_valid(unsigned long addr, int numpages, int enable)
{
if (enable)
return __change_memory_common(addr, PAGE_SIZE * numpages,
__pgprot(PTE_VALID),
__pgprot(0));
__pgprot(PTE_PRESENT_VALID_KERNEL),
__pgprot(PTE_PRESENT_INVALID));
else
return __change_memory_common(addr, PAGE_SIZE * numpages,
__pgprot(0),
__pgprot(PTE_VALID));
__pgprot(PTE_PRESENT_INVALID),
__pgprot(PTE_PRESENT_VALID_KERNEL));
}
int set_direct_map_invalid_noflush(struct page *page)
{
pgprot_t clear_mask = __pgprot(PTE_VALID);
pgprot_t set_mask = __pgprot(0);
pgprot_t clear_mask = __pgprot(PTE_PRESENT_VALID_KERNEL);
pgprot_t set_mask = __pgprot(PTE_PRESENT_INVALID);
if (!can_set_direct_map())
return 0;
@@ -259,8 +265,8 @@ int set_direct_map_invalid_noflush(struct page *page)
int set_direct_map_default_noflush(struct page *page)
{
pgprot_t set_mask = __pgprot(PTE_VALID | PTE_WRITE);
pgprot_t clear_mask = __pgprot(PTE_RDONLY);
pgprot_t set_mask = __pgprot(PTE_PRESENT_VALID_KERNEL | PTE_WRITE);
pgprot_t clear_mask = __pgprot(PTE_PRESENT_INVALID | PTE_RDONLY);
if (!can_set_direct_map())
return 0;
@@ -296,8 +302,8 @@ static int __set_memory_enc_dec(unsigned long addr,
* entries or Synchronous External Aborts caused by RIPAS_EMPTY
*/
ret = __change_memory_common(addr, PAGE_SIZE * numpages,
__pgprot(set_prot),
__pgprot(clear_prot | PTE_VALID));
__pgprot(set_prot | PTE_PRESENT_INVALID),
__pgprot(clear_prot | PTE_PRESENT_VALID_KERNEL));
if (ret)
return ret;
@@ -311,8 +317,8 @@ static int __set_memory_enc_dec(unsigned long addr,
return ret;
return __change_memory_common(addr, PAGE_SIZE * numpages,
__pgprot(PTE_VALID),
__pgprot(0));
__pgprot(PTE_PRESENT_VALID_KERNEL),
__pgprot(PTE_PRESENT_INVALID));
}
static int realm_set_memory_encrypted(unsigned long addr, int numpages)
@@ -404,15 +410,15 @@ bool kernel_page_present(struct page *page)
pud = READ_ONCE(*pudp);
if (pud_none(pud))
return false;
if (pud_sect(pud))
return true;
if (pud_leaf(pud))
return pud_valid(pud);
pmdp = pmd_offset(pudp, addr);
pmd = READ_ONCE(*pmdp);
if (pmd_none(pmd))
return false;
if (pmd_sect(pmd))
return true;
if (pmd_leaf(pmd))
return pmd_valid(pmd);
ptep = pte_offset_kernel(pmdp, addr);
return pte_valid(__ptep_get(ptep));

View File

@@ -31,36 +31,6 @@ static void *trans_alloc(struct trans_pgd_info *info)
return info->trans_alloc_page(info->trans_alloc_arg);
}
static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
{
pte_t pte = __ptep_get(src_ptep);
if (pte_valid(pte)) {
/*
* Resume will overwrite areas that may be marked
* read only (code, rodata). Clear the RDONLY bit from
* the temporary mappings we use during restore.
*/
__set_pte(dst_ptep, pte_mkwrite_novma(pte));
} else if (!pte_none(pte)) {
/*
* debug_pagealloc will removed the PTE_VALID bit if
* the page isn't in use by the resume kernel. It may have
* been in use by the original kernel, in which case we need
* to put it back in our copy to do the restore.
*
* Other cases include kfence / vmalloc / memfd_secret which
* may call `set_direct_map_invalid_noflush()`.
*
* Before marking this entry valid, check the pfn should
* be mapped.
*/
BUG_ON(!pfn_valid(pte_pfn(pte)));
__set_pte(dst_ptep, pte_mkvalid(pte_mkwrite_novma(pte)));
}
}
static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp,
pmd_t *src_pmdp, unsigned long start, unsigned long end)
{
@@ -76,7 +46,11 @@ static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp,
src_ptep = pte_offset_kernel(src_pmdp, start);
do {
_copy_pte(dst_ptep, src_ptep, addr);
pte_t pte = __ptep_get(src_ptep);
if (pte_none(pte))
continue;
__set_pte(dst_ptep, pte_mkvalid_k(pte_mkwrite_novma(pte)));
} while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end);
return 0;
@@ -109,8 +83,7 @@ static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp,
if (copy_pte(info, dst_pmdp, src_pmdp, addr, next))
return -ENOMEM;
} else {
set_pmd(dst_pmdp,
__pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY));
set_pmd(dst_pmdp, pmd_mkvalid_k(pmd_mkwrite_novma(pmd)));
}
} while (dst_pmdp++, src_pmdp++, addr = next, addr != end);
@@ -145,8 +118,7 @@ static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp,
if (copy_pmd(info, dst_pudp, src_pudp, addr, next))
return -ENOMEM;
} else {
set_pud(dst_pudp,
__pud(pud_val(pud) & ~PUD_SECT_RDONLY));
set_pud(dst_pudp, pud_mkvalid_k(pud_mkwrite_novma(pud)));
}
} while (dst_pudp++, src_pudp++, addr = next, addr != end);

View File

@@ -3,7 +3,7 @@
gen := arch/$(ARCH)/include/generated
kapi := $(gen)/asm
kapisyshdr-y := cpucap-defs.h sysreg-defs.h
kapisyshdr-y := cpucap-defs.h kernel-hwcap.h sysreg-defs.h
kapi-hdrs-y := $(addprefix $(kapi)/, $(kapisyshdr-y))
@@ -18,11 +18,17 @@ kapi: $(kapi-hdrs-y)
quiet_cmd_gen_cpucaps = GEN $@
cmd_gen_cpucaps = mkdir -p $(dir $@); $(AWK) -f $(real-prereqs) > $@
quiet_cmd_gen_kernel_hwcap = GEN $@
cmd_gen_kernel_hwcap = mkdir -p $(dir $@); /bin/sh -e $(real-prereqs) > $@
quiet_cmd_gen_sysreg = GEN $@
cmd_gen_sysreg = mkdir -p $(dir $@); $(AWK) -f $(real-prereqs) > $@
$(kapi)/cpucap-defs.h: $(src)/gen-cpucaps.awk $(src)/cpucaps FORCE
$(call if_changed,gen_cpucaps)
$(kapi)/kernel-hwcap.h: $(src)/gen-kernel-hwcaps.sh $(srctree)/arch/arm64/include/uapi/asm/hwcap.h FORCE
$(call if_changed,gen_kernel_hwcap)
$(kapi)/sysreg-defs.h: $(src)/gen-sysreg.awk $(src)/sysreg FORCE
$(call if_changed,gen_sysreg)

View File

@@ -48,6 +48,7 @@ HAS_LPA2
HAS_LSE_ATOMICS
HAS_LS64
HAS_LS64_V
HAS_LSUI
HAS_MOPS
HAS_NESTED_VIRT
HAS_BBML2_NOABORT

View File

@@ -0,0 +1,23 @@
#!/bin/sh -e
# SPDX-License-Identifier: GPL-2.0
#
# gen-kernel-hwcap.sh - Generate kernel internal hwcap.h definitions
#
# Copyright 2026 Arm, Ltd.
if [ "$1" = "" ]; then
echo "$0: no filename specified"
exit 1
fi
echo "#ifndef __ASM_KERNEL_HWCAPS_H"
echo "#define __ASM_KERNEL_HWCAPS_H"
echo ""
echo "/* Generated file - do not edit */"
echo ""
grep -E '^#define HWCAP[0-9]*_[A-Z0-9_]+' $1 | \
sed 's/.*HWCAP\([0-9]*\)_\([A-Z0-9_]\+\).*/#define KERNEL_HWCAP_\2\t__khwcap\1_feature(\2)/'
echo ""
echo "#endif /* __ASM_KERNEL_HWCAPS_H */"

View File

@@ -1496,6 +1496,7 @@ UnsignedEnum 27:24 B16B16
0b0000 NI
0b0001 IMP
0b0010 BFSCALE
0b0011 B16MM
EndEnum
UnsignedEnum 23:20 BF16
0b0000 NI
@@ -1522,6 +1523,7 @@ UnsignedEnum 3:0 SVEver
0b0001 SVE2
0b0010 SVE2p1
0b0011 SVE2p2
0b0100 SVE2p3
EndEnum
EndSysreg
@@ -1530,7 +1532,11 @@ UnsignedEnum 63 FA64
0b0 NI
0b1 IMP
EndEnum
Res0 62:61
Res0 62
UnsignedEnum 61 LUT6
0b0 NI
0b1 IMP
EndEnum
UnsignedEnum 60 LUTv2
0b0 NI
0b1 IMP
@@ -1540,6 +1546,7 @@ UnsignedEnum 59:56 SMEver
0b0001 SME2
0b0010 SME2p1
0b0011 SME2p2
0b0100 SME2p3
EndEnum
UnsignedEnum 55:52 I16I64
0b0000 NI
@@ -1654,7 +1661,13 @@ UnsignedEnum 26 F8MM4
0b0 NI
0b1 IMP
EndEnum
Res0 25:2
Res0 25:16
UnsignedEnum 15 F16MM2
0b0 NI
0b1 IMP
EndEnum
Res0 14:8
Raz 7:2
UnsignedEnum 1 F8E4M3
0b0 NI
0b1 IMP
@@ -1835,6 +1848,8 @@ EndEnum
UnsignedEnum 51:48 FHM
0b0000 NI
0b0001 IMP
0b0010 F16F32DOT
0b0011 F16F32MM
EndEnum
UnsignedEnum 47:44 DP
0b0000 NI
@@ -1976,6 +1991,7 @@ EndEnum
UnsignedEnum 59:56 LUT
0b0000 NI
0b0001 IMP
0b0010 LUT6
EndEnum
UnsignedEnum 55:52 CSSC
0b0000 NI
@@ -3655,11 +3671,15 @@ Field 3:0 BS
EndSysreg
Sysreg SMIDR_EL1 3 1 0 0 6
Res0 63:32
Res0 63:60
Field 59:56 NSMC
Field 55:52 HIP
Field 51:32 AFFINITY2
Field 31:24 IMPLEMENTER
Field 23:16 REVISION
Field 15 SMPS
Res0 14:12
Field 14:13 SH
Res0 12
Field 11:0 AFFINITY
EndSysreg
@@ -5172,6 +5192,14 @@ Field 31:16 PARTID_D
Field 15:0 PARTID_I
EndSysreg
Sysreg MPAMSM_EL1 3 0 10 5 3
Res0 63:48
Field 47:40 PMG_D
Res0 39:32
Field 31:16 PARTID_D
Res0 15:0
EndSysreg
Sysreg ISR_EL1 3 0 12 1 0
Res0 63:11
Field 10 IS

View File

@@ -36,7 +36,7 @@ static int agdi_sdei_probe(struct platform_device *pdev,
err = sdei_event_register(adata->sdei_event, agdi_sdei_handler, pdev);
if (err) {
dev_err(&pdev->dev, "Failed to register for SDEI event %d",
dev_err(&pdev->dev, "Failed to register for SDEI event %d\n",
adata->sdei_event);
return err;
}

View File

@@ -1,6 +1,7 @@
menuconfig ARM64_MPAM_DRIVER
bool "MPAM driver"
depends on ARM64 && ARM64_MPAM && EXPERT
depends on ARM64 && ARM64_MPAM
select ACPI_MPAM if ACPI
help
Memory System Resource Partitioning and Monitoring (MPAM) driver for
System IP, e.g. caches and memory controllers.
@@ -22,3 +23,9 @@ config MPAM_KUNIT_TEST
If unsure, say N.
endif
config ARM64_MPAM_RESCTRL_FS
bool
default y if ARM64_MPAM_DRIVER && RESCTRL_FS
select RESCTRL_RMID_DEPENDS_ON_CLOSID
select RESCTRL_ASSIGN_FIXED

View File

@@ -1,4 +1,5 @@
obj-$(CONFIG_ARM64_MPAM_DRIVER) += mpam.o
mpam-y += mpam_devices.o
mpam-$(CONFIG_ARM64_MPAM_RESCTRL_FS) += mpam_resctrl.o
ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG

View File

@@ -29,7 +29,15 @@
#include "mpam_internal.h"
DEFINE_STATIC_KEY_FALSE(mpam_enabled); /* This moves to arch code */
/* Values for the T241 errata workaround */
#define T241_CHIPS_MAX 4
#define T241_CHIP_NSLICES 12
#define T241_SPARE_REG0_OFF 0x1b0000
#define T241_SPARE_REG1_OFF 0x1c0000
#define T241_CHIP_ID(phys) FIELD_GET(GENMASK_ULL(44, 43), phys)
#define T241_SHADOW_REG_OFF(sidx, pid) (0x360048 + (sidx) * 0x10000 + (pid) * 8)
#define SMCCC_SOC_ID_T241 0x036b0241
static void __iomem *t241_scratch_regs[T241_CHIPS_MAX];
/*
* mpam_list_lock protects the SRCU lists when writing. Once the
@@ -75,6 +83,14 @@ static DECLARE_WORK(mpam_broken_work, &mpam_disable);
/* When mpam is disabled, the printed reason to aid debugging */
static char *mpam_disable_reason;
/*
* Whether resctrl has been setup. Used by cpuhp in preference to
* mpam_is_enabled(). The disable call after an error interrupt makes
* mpam_is_enabled() false before the cpuhp callbacks are made.
* Reads/writes should hold mpam_cpuhp_state_lock, (or be cpuhp callbacks).
*/
static bool mpam_resctrl_enabled;
/*
* An MSC is a physical container for controls and monitors, each identified by
* their RIS index. These share a base-address, interrupts and some MMIO
@@ -624,6 +640,86 @@ static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc,
return ERR_PTR(-ENOENT);
}
static int mpam_enable_quirk_nvidia_t241_1(struct mpam_msc *msc,
const struct mpam_quirk *quirk)
{
s32 soc_id = arm_smccc_get_soc_id_version();
struct resource *r;
phys_addr_t phys;
/*
* A mapping to a device other than the MSC is needed, check
* SOC_ID is NVIDIA T241 chip (036b:0241)
*/
if (soc_id < 0 || soc_id != SMCCC_SOC_ID_T241)
return -EINVAL;
r = platform_get_resource(msc->pdev, IORESOURCE_MEM, 0);
if (!r)
return -EINVAL;
/* Find the internal registers base addr from the CHIP ID */
msc->t241_id = T241_CHIP_ID(r->start);
phys = FIELD_PREP(GENMASK_ULL(45, 44), msc->t241_id) | 0x19000000ULL;
t241_scratch_regs[msc->t241_id] = ioremap(phys, SZ_8M);
if (WARN_ON_ONCE(!t241_scratch_regs[msc->t241_id]))
return -EINVAL;
pr_info_once("Enabled workaround for NVIDIA T241 erratum T241-MPAM-1\n");
return 0;
}
static const struct mpam_quirk mpam_quirks[] = {
{
/* NVIDIA t241 erratum T241-MPAM-1 */
.init = mpam_enable_quirk_nvidia_t241_1,
.iidr = MPAM_IIDR_NVIDIA_T241,
.iidr_mask = MPAM_IIDR_MATCH_ONE,
.workaround = T241_SCRUB_SHADOW_REGS,
},
{
/* NVIDIA t241 erratum T241-MPAM-4 */
.iidr = MPAM_IIDR_NVIDIA_T241,
.iidr_mask = MPAM_IIDR_MATCH_ONE,
.workaround = T241_FORCE_MBW_MIN_TO_ONE,
},
{
/* NVIDIA t241 erratum T241-MPAM-6 */
.iidr = MPAM_IIDR_NVIDIA_T241,
.iidr_mask = MPAM_IIDR_MATCH_ONE,
.workaround = T241_MBW_COUNTER_SCALE_64,
},
{
/* ARM CMN-650 CSU erratum 3642720 */
.iidr = MPAM_IIDR_ARM_CMN_650,
.iidr_mask = MPAM_IIDR_MATCH_ONE,
.workaround = IGNORE_CSU_NRDY,
},
{ NULL } /* Sentinel */
};
static void mpam_enable_quirks(struct mpam_msc *msc)
{
const struct mpam_quirk *quirk;
for (quirk = &mpam_quirks[0]; quirk->iidr_mask; quirk++) {
int err = 0;
if (quirk->iidr != (msc->iidr & quirk->iidr_mask))
continue;
if (quirk->init)
err = quirk->init(msc, quirk);
if (err)
continue;
mpam_set_quirk(quirk->workaround, msc);
}
}
/*
* IHI009A.a has this nugget: "If a monitor does not support automatic behaviour
* of NRDY, software can use this bit for any purpose" - so hardware might not
@@ -715,6 +811,13 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris)
mpam_set_feature(mpam_feat_mbw_part, props);
props->bwa_wd = FIELD_GET(MPAMF_MBW_IDR_BWA_WD, mbw_features);
/*
* The BWA_WD field can represent 0-63, but the control fields it
* describes have a maximum of 16 bits.
*/
props->bwa_wd = min(props->bwa_wd, 16);
if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features))
mpam_set_feature(mpam_feat_mbw_max, props);
@@ -851,8 +954,11 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc)
/* Grab an IDR value to find out how many RIS there are */
mutex_lock(&msc->part_sel_lock);
idr = mpam_msc_read_idr(msc);
msc->iidr = mpam_read_partsel_reg(msc, IIDR);
mutex_unlock(&msc->part_sel_lock);
mpam_enable_quirks(msc);
msc->ris_max = FIELD_GET(MPAMF_IDR_RIS_MAX, idr);
/* Use these values so partid/pmg always starts with a valid value */
@@ -903,6 +1009,7 @@ struct mon_read {
enum mpam_device_features type;
u64 *val;
int err;
bool waited_timeout;
};
static bool mpam_ris_has_mbwu_long_counter(struct mpam_msc_ris *ris)
@@ -1052,7 +1159,7 @@ static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val,
}
}
static u64 mpam_msmon_overflow_val(enum mpam_device_features type)
static u64 __mpam_msmon_overflow_val(enum mpam_device_features type)
{
/* TODO: implement scaling counters */
switch (type) {
@@ -1067,6 +1174,18 @@ static u64 mpam_msmon_overflow_val(enum mpam_device_features type)
}
}
static u64 mpam_msmon_overflow_val(enum mpam_device_features type,
struct mpam_msc *msc)
{
u64 overflow_val = __mpam_msmon_overflow_val(type);
if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc) &&
type != mpam_feat_msmon_mbwu_63counter)
overflow_val *= 64;
return overflow_val;
}
static void __ris_msmon_read(void *arg)
{
u64 now;
@@ -1137,6 +1256,10 @@ static void __ris_msmon_read(void *arg)
if (mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, rprops))
nrdy = now & MSMON___NRDY;
now = FIELD_GET(MSMON___VALUE, now);
if (mpam_has_quirk(IGNORE_CSU_NRDY, msc) && m->waited_timeout)
nrdy = false;
break;
case mpam_feat_msmon_mbwu_31counter:
case mpam_feat_msmon_mbwu_44counter:
@@ -1157,13 +1280,17 @@ static void __ris_msmon_read(void *arg)
now = FIELD_GET(MSMON___VALUE, now);
}
if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc) &&
m->type != mpam_feat_msmon_mbwu_63counter)
now *= 64;
if (nrdy)
break;
mbwu_state = &ris->mbwu_state[ctx->mon];
if (overflow)
mbwu_state->correction += mpam_msmon_overflow_val(m->type);
mbwu_state->correction += mpam_msmon_overflow_val(m->type, msc);
/*
* Include bandwidth consumed before the last hardware reset and
@@ -1270,6 +1397,7 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx,
.ctx = ctx,
.type = type,
.val = val,
.waited_timeout = true,
};
*val = 0;
@@ -1338,6 +1466,75 @@ static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd)
__mpam_write_reg(msc, reg, bm);
}
static void mpam_apply_t241_erratum(struct mpam_msc_ris *ris, u16 partid)
{
int sidx, i, lcount = 1000;
void __iomem *regs;
u64 val0, val;
regs = t241_scratch_regs[ris->vmsc->msc->t241_id];
for (i = 0; i < lcount; i++) {
/* Read the shadow register at index 0 */
val0 = readq_relaxed(regs + T241_SHADOW_REG_OFF(0, partid));
/* Check if all the shadow registers have the same value */
for (sidx = 1; sidx < T241_CHIP_NSLICES; sidx++) {
val = readq_relaxed(regs +
T241_SHADOW_REG_OFF(sidx, partid));
if (val != val0)
break;
}
if (sidx == T241_CHIP_NSLICES)
break;
}
if (i == lcount)
pr_warn_once("t241: inconsistent values in shadow regs");
/* Write a value zero to spare registers to take effect of MBW conf */
writeq_relaxed(0, regs + T241_SPARE_REG0_OFF);
writeq_relaxed(0, regs + T241_SPARE_REG1_OFF);
}
static void mpam_quirk_post_config_change(struct mpam_msc_ris *ris, u16 partid,
struct mpam_config *cfg)
{
if (mpam_has_quirk(T241_SCRUB_SHADOW_REGS, ris->vmsc->msc))
mpam_apply_t241_erratum(ris, partid);
}
static u16 mpam_wa_t241_force_mbw_min_to_one(struct mpam_props *props)
{
u16 max_hw_value, min_hw_granule, res0_bits;
res0_bits = 16 - props->bwa_wd;
max_hw_value = ((1 << props->bwa_wd) - 1) << res0_bits;
min_hw_granule = ~max_hw_value;
return min_hw_granule + 1;
}
static u16 mpam_wa_t241_calc_min_from_max(struct mpam_props *props,
struct mpam_config *cfg)
{
u16 val = 0;
u16 max;
u16 delta = ((5 * MPAMCFG_MBW_MAX_MAX) / 100) - 1;
if (mpam_has_feature(mpam_feat_mbw_max, cfg)) {
max = cfg->mbw_max;
} else {
/* Resetting. Hence, use the ris specific default. */
max = GENMASK(15, 16 - props->bwa_wd);
}
if (max > delta)
val = max - delta;
return val;
}
/* Called via IPI. Call while holding an SRCU reference */
static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid,
struct mpam_config *cfg)
@@ -1364,36 +1561,41 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid,
__mpam_intpart_sel(ris->ris_idx, partid, msc);
}
if (mpam_has_feature(mpam_feat_cpor_part, rprops) &&
mpam_has_feature(mpam_feat_cpor_part, cfg)) {
if (cfg->reset_cpbm)
mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd);
else
if (mpam_has_feature(mpam_feat_cpor_part, rprops)) {
if (mpam_has_feature(mpam_feat_cpor_part, cfg))
mpam_write_partsel_reg(msc, CPBM, cfg->cpbm);
else
mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd);
}
if (mpam_has_feature(mpam_feat_mbw_part, rprops) &&
mpam_has_feature(mpam_feat_mbw_part, cfg)) {
if (cfg->reset_mbw_pbm)
if (mpam_has_feature(mpam_feat_mbw_part, rprops)) {
if (mpam_has_feature(mpam_feat_mbw_part, cfg))
mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, rprops->mbw_pbm_bits);
else
mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm);
}
if (mpam_has_feature(mpam_feat_mbw_min, rprops) &&
mpam_has_feature(mpam_feat_mbw_min, cfg))
mpam_write_partsel_reg(msc, MBW_MIN, 0);
if (mpam_has_feature(mpam_feat_mbw_min, rprops)) {
u16 val = 0;
if (mpam_has_feature(mpam_feat_mbw_max, rprops) &&
mpam_has_feature(mpam_feat_mbw_max, cfg)) {
if (cfg->reset_mbw_max)
mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX);
else
mpam_write_partsel_reg(msc, MBW_MAX, cfg->mbw_max);
if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, msc)) {
u16 min = mpam_wa_t241_force_mbw_min_to_one(rprops);
val = mpam_wa_t241_calc_min_from_max(rprops, cfg);
val = max(val, min);
}
mpam_write_partsel_reg(msc, MBW_MIN, val);
}
if (mpam_has_feature(mpam_feat_mbw_prop, rprops) &&
mpam_has_feature(mpam_feat_mbw_prop, cfg))
if (mpam_has_feature(mpam_feat_mbw_max, rprops)) {
if (mpam_has_feature(mpam_feat_mbw_max, cfg))
mpam_write_partsel_reg(msc, MBW_MAX, cfg->mbw_max);
else
mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX);
}
if (mpam_has_feature(mpam_feat_mbw_prop, rprops))
mpam_write_partsel_reg(msc, MBW_PROP, 0);
if (mpam_has_feature(mpam_feat_cmax_cmax, rprops))
@@ -1421,6 +1623,8 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid,
mpam_write_partsel_reg(msc, PRI, pri_val);
}
mpam_quirk_post_config_change(ris, partid, cfg);
mutex_unlock(&msc->part_sel_lock);
}
@@ -1491,16 +1695,6 @@ static int mpam_save_mbwu_state(void *arg)
return 0;
}
static void mpam_init_reset_cfg(struct mpam_config *reset_cfg)
{
*reset_cfg = (struct mpam_config) {
.reset_cpbm = true,
.reset_mbw_pbm = true,
.reset_mbw_max = true,
};
bitmap_fill(reset_cfg->features, MPAM_FEATURE_LAST);
}
/*
* Called via smp_call_on_cpu() to prevent migration, while still being
* pre-emptible. Caller must hold mpam_srcu.
@@ -1508,14 +1702,12 @@ static void mpam_init_reset_cfg(struct mpam_config *reset_cfg)
static int mpam_reset_ris(void *arg)
{
u16 partid, partid_max;
struct mpam_config reset_cfg;
struct mpam_config reset_cfg = {};
struct mpam_msc_ris *ris = arg;
if (ris->in_reset_state)
return 0;
mpam_init_reset_cfg(&reset_cfg);
spin_lock(&partid_max_lock);
partid_max = mpam_partid_max;
spin_unlock(&partid_max_lock);
@@ -1630,6 +1822,9 @@ static int mpam_cpu_online(unsigned int cpu)
mpam_reprogram_msc(msc);
}
if (mpam_resctrl_enabled)
return mpam_resctrl_online_cpu(cpu);
return 0;
}
@@ -1673,6 +1868,9 @@ static int mpam_cpu_offline(unsigned int cpu)
{
struct mpam_msc *msc;
if (mpam_resctrl_enabled)
mpam_resctrl_offline_cpu(cpu);
guard(srcu)(&mpam_srcu);
list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list,
srcu_read_lock_held(&mpam_srcu)) {
@@ -1969,6 +2167,7 @@ static bool mpam_has_cmax_wd_feature(struct mpam_props *props)
* resulting safe value must be compatible with both. When merging values in
* the tree, all the aliasing resources must be handled first.
* On mismatch, parent is modified.
* Quirks on an MSC will apply to all MSC in that class.
*/
static void __props_mismatch(struct mpam_props *parent,
struct mpam_props *child, bool alias)
@@ -2088,6 +2287,7 @@ static void __props_mismatch(struct mpam_props *parent,
* nobble the class feature, as we can't configure all the resources.
* e.g. The L3 cache is composed of two resources with 13 and 17 portion
* bitmaps respectively.
* Quirks on an MSC will apply to all MSC in that class.
*/
static void
__class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc)
@@ -2101,6 +2301,9 @@ __class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc)
dev_dbg(dev, "Merging features for class:0x%lx &= vmsc:0x%lx\n",
(long)cprops->features, (long)vprops->features);
/* Merge quirks */
class->quirks |= vmsc->msc->quirks;
/* Take the safe value for any common features */
__props_mismatch(cprops, vprops, false);
}
@@ -2165,6 +2368,9 @@ static void mpam_enable_merge_class_features(struct mpam_component *comp)
list_for_each_entry(vmsc, &comp->vmsc, comp_list)
__class_props_mismatch(class, vmsc);
if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, class))
mpam_clear_feature(mpam_feat_mbw_min, &class->props);
}
/*
@@ -2518,6 +2724,12 @@ static void mpam_enable_once(void)
mutex_unlock(&mpam_list_lock);
cpus_read_unlock();
if (!err) {
err = mpam_resctrl_setup();
if (err)
pr_err("Failed to initialise resctrl: %d\n", err);
}
if (err) {
mpam_disable_reason = "Failed to enable.";
schedule_work(&mpam_broken_work);
@@ -2525,6 +2737,7 @@ static void mpam_enable_once(void)
}
static_branch_enable(&mpam_enabled);
mpam_resctrl_enabled = true;
mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline,
"mpam:online");
@@ -2557,7 +2770,7 @@ static void mpam_reset_component_locked(struct mpam_component *comp)
}
}
static void mpam_reset_class_locked(struct mpam_class *class)
void mpam_reset_class_locked(struct mpam_class *class)
{
struct mpam_component *comp;
@@ -2584,24 +2797,39 @@ static void mpam_reset_class(struct mpam_class *class)
void mpam_disable(struct work_struct *ignored)
{
int idx;
bool do_resctrl_exit;
struct mpam_class *class;
struct mpam_msc *msc, *tmp;
if (mpam_is_enabled())
static_branch_disable(&mpam_enabled);
mutex_lock(&mpam_cpuhp_state_lock);
if (mpam_cpuhp_state) {
cpuhp_remove_state(mpam_cpuhp_state);
mpam_cpuhp_state = 0;
}
/*
* Removing the cpuhp state called mpam_cpu_offline() and told resctrl
* all the CPUs are offline.
*/
do_resctrl_exit = mpam_resctrl_enabled;
mpam_resctrl_enabled = false;
mutex_unlock(&mpam_cpuhp_state_lock);
static_branch_disable(&mpam_enabled);
if (do_resctrl_exit)
mpam_resctrl_exit();
mpam_unregister_irqs();
idx = srcu_read_lock(&mpam_srcu);
list_for_each_entry_srcu(class, &mpam_classes, classes_list,
srcu_read_lock_held(&mpam_srcu))
srcu_read_lock_held(&mpam_srcu)) {
mpam_reset_class(class);
if (do_resctrl_exit)
mpam_resctrl_teardown_class(class);
}
srcu_read_unlock(&mpam_srcu, idx);
mutex_lock(&mpam_list_lock);
@@ -2692,6 +2920,7 @@ int mpam_apply_config(struct mpam_component *comp, u16 partid,
srcu_read_lock_held(&mpam_srcu)) {
arg.ris = ris;
mpam_touch_msc(msc, __write_config, &arg);
ris->in_reset_state = false;
}
mutex_unlock(&msc->cfg_lock);
}

View File

@@ -12,22 +12,31 @@
#include <linux/jump_label.h>
#include <linux/llist.h>
#include <linux/mutex.h>
#include <linux/resctrl.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>
#include <linux/types.h>
#include <asm/mpam.h>
#define MPAM_MSC_MAX_NUM_RIS 16
struct platform_device;
DECLARE_STATIC_KEY_FALSE(mpam_enabled);
#ifdef CONFIG_MPAM_KUNIT_TEST
#define PACKED_FOR_KUNIT __packed
#else
#define PACKED_FOR_KUNIT
#endif
/*
* This 'mon' values must not alias an actual monitor, so must be larger than
* U16_MAX, but not be confused with an errno value, so smaller than
* (u32)-SZ_4K.
* USE_PRE_ALLOCATED is used to avoid confusion with an actual monitor.
*/
#define USE_PRE_ALLOCATED (U16_MAX + 1)
static inline bool mpam_is_enabled(void)
{
return static_branch_likely(&mpam_enabled);
@@ -76,6 +85,8 @@ struct mpam_msc {
u8 pmg_max;
unsigned long ris_idxs;
u32 ris_max;
u32 iidr;
u16 quirks;
/*
* error_irq_lock is taken when registering/unregistering the error
@@ -119,6 +130,9 @@ struct mpam_msc {
void __iomem *mapped_hwpage;
size_t mapped_hwpage_sz;
/* Values only used on some platforms for quirks */
u32 t241_id;
struct mpam_garbage garbage;
};
@@ -207,6 +221,42 @@ struct mpam_props {
#define mpam_set_feature(_feat, x) __set_bit(_feat, (x)->features)
#define mpam_clear_feature(_feat, x) __clear_bit(_feat, (x)->features)
/* Workaround bits for msc->quirks */
enum mpam_device_quirks {
T241_SCRUB_SHADOW_REGS,
T241_FORCE_MBW_MIN_TO_ONE,
T241_MBW_COUNTER_SCALE_64,
IGNORE_CSU_NRDY,
MPAM_QUIRK_LAST
};
#define mpam_has_quirk(_quirk, x) ((1 << (_quirk) & (x)->quirks))
#define mpam_set_quirk(_quirk, x) ((x)->quirks |= (1 << (_quirk)))
struct mpam_quirk {
int (*init)(struct mpam_msc *msc, const struct mpam_quirk *quirk);
u32 iidr;
u32 iidr_mask;
enum mpam_device_quirks workaround;
};
#define MPAM_IIDR_MATCH_ONE (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0xfff) | \
FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0xf) | \
FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0xf) | \
FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0xfff))
#define MPAM_IIDR_NVIDIA_T241 (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0x241) | \
FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0) | \
FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \
FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x36b))
#define MPAM_IIDR_ARM_CMN_650 (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0) | \
FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0) | \
FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \
FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x43b))
/* The values for MSMON_CFG_MBWU_FLT.RWBW */
enum mon_filter_options {
COUNT_BOTH = 0,
@@ -215,7 +265,11 @@ enum mon_filter_options {
};
struct mon_cfg {
u16 mon;
/*
* mon must be large enough to hold out of range values like
* USE_PRE_ALLOCATED
*/
u32 mon;
u8 pmg;
bool match_pmg;
bool csu_exclude_clean;
@@ -246,6 +300,7 @@ struct mpam_class {
struct mpam_props props;
u32 nrdy_usec;
u16 quirks;
u8 level;
enum mpam_class_types type;
@@ -266,10 +321,6 @@ struct mpam_config {
u32 mbw_pbm;
u16 mbw_max;
bool reset_cpbm;
bool reset_mbw_pbm;
bool reset_mbw_max;
struct mpam_garbage garbage;
};
@@ -337,6 +388,32 @@ struct mpam_msc_ris {
struct mpam_garbage garbage;
};
struct mpam_resctrl_dom {
struct mpam_component *ctrl_comp;
/*
* There is no single mon_comp because different events may be backed
* by different class/components. mon_comp is indexed by the event
* number.
*/
struct mpam_component *mon_comp[QOS_NUM_EVENTS];
struct rdt_ctrl_domain resctrl_ctrl_dom;
struct rdt_l3_mon_domain resctrl_mon_dom;
};
struct mpam_resctrl_res {
struct mpam_class *class;
struct rdt_resource resctrl_res;
bool cdp_enabled;
};
struct mpam_resctrl_mon {
struct mpam_class *class;
/* per-class data that resctrl needs will live here */
};
static inline int mpam_alloc_csu_mon(struct mpam_class *class)
{
struct mpam_props *cprops = &class->props;
@@ -381,6 +458,9 @@ extern u8 mpam_pmg_max;
void mpam_enable(struct work_struct *work);
void mpam_disable(struct work_struct *work);
/* Reset all the RIS in a class under cpus_read_lock() */
void mpam_reset_class_locked(struct mpam_class *class);
int mpam_apply_config(struct mpam_component *comp, u16 partid,
struct mpam_config *cfg);
@@ -391,6 +471,20 @@ void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx);
int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level,
cpumask_t *affinity);
#ifdef CONFIG_RESCTRL_FS
int mpam_resctrl_setup(void);
void mpam_resctrl_exit(void);
int mpam_resctrl_online_cpu(unsigned int cpu);
void mpam_resctrl_offline_cpu(unsigned int cpu);
void mpam_resctrl_teardown_class(struct mpam_class *class);
#else
static inline int mpam_resctrl_setup(void) { return 0; }
static inline void mpam_resctrl_exit(void) { }
static inline int mpam_resctrl_online_cpu(unsigned int cpu) { return 0; }
static inline void mpam_resctrl_offline_cpu(unsigned int cpu) { }
static inline void mpam_resctrl_teardown_class(struct mpam_class *class) { }
#endif /* CONFIG_RESCTRL_FS */
/*
* MPAM MSCs have the following register layout. See:
* Arm Memory System Resource Partitioning and Monitoring (MPAM) System

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,315 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2025 Arm Ltd.
/* This file is intended to be included into mpam_resctrl.c */
#include <kunit/test.h>
#include <linux/array_size.h>
#include <linux/bits.h>
#include <linux/math.h>
#include <linux/sprintf.h>
struct percent_value_case {
u8 pc;
u8 width;
u16 value;
};
/*
* Mysterious inscriptions taken from the union of ARM DDI 0598D.b,
* "Arm Architecture Reference Manual Supplement - Memory System
* Resource Partitioning and Monitoring (MPAM), for A-profile
* architecture", Section 9.8, "About the fixed-point fractional
* format" (exact percentage entries only) and ARM IHI0099B.a
* "MPAM system component specification", Section 9.3,
* "The fixed-point fractional format":
*/
static const struct percent_value_case percent_value_cases[] = {
/* Architectural cases: */
{ 1, 8, 1 }, { 1, 12, 0x27 }, { 1, 16, 0x28e },
{ 25, 8, 0x3f }, { 25, 12, 0x3ff }, { 25, 16, 0x3fff },
{ 33, 8, 0x53 }, { 33, 12, 0x546 }, { 33, 16, 0x5479 },
{ 35, 8, 0x58 }, { 35, 12, 0x598 }, { 35, 16, 0x5998 },
{ 45, 8, 0x72 }, { 45, 12, 0x732 }, { 45, 16, 0x7332 },
{ 50, 8, 0x7f }, { 50, 12, 0x7ff }, { 50, 16, 0x7fff },
{ 52, 8, 0x84 }, { 52, 12, 0x850 }, { 52, 16, 0x851d },
{ 55, 8, 0x8b }, { 55, 12, 0x8cb }, { 55, 16, 0x8ccb },
{ 58, 8, 0x93 }, { 58, 12, 0x946 }, { 58, 16, 0x9479 },
{ 75, 8, 0xbf }, { 75, 12, 0xbff }, { 75, 16, 0xbfff },
{ 80, 8, 0xcb }, { 80, 12, 0xccb }, { 80, 16, 0xcccb },
{ 88, 8, 0xe0 }, { 88, 12, 0xe13 }, { 88, 16, 0xe146 },
{ 95, 8, 0xf2 }, { 95, 12, 0xf32 }, { 95, 16, 0xf332 },
{ 100, 8, 0xff }, { 100, 12, 0xfff }, { 100, 16, 0xffff },
};
static void test_percent_value_desc(const struct percent_value_case *param,
char *desc)
{
snprintf(desc, KUNIT_PARAM_DESC_SIZE,
"pc=%d, width=%d, value=0x%.*x\n",
param->pc, param->width,
DIV_ROUND_UP(param->width, 4), param->value);
}
KUNIT_ARRAY_PARAM(test_percent_value, percent_value_cases,
test_percent_value_desc);
struct percent_value_test_info {
u32 pc; /* result of value-to-percent conversion */
u32 value; /* result of percent-to-value conversion */
u32 max_value; /* maximum raw value allowed by test params */
unsigned int shift; /* promotes raw testcase value to 16 bits */
};
/*
* Convert a reference percentage to a fixed-point MAX value and
* vice-versa, based on param (not test->param_value!)
*/
static void __prepare_percent_value_test(struct kunit *test,
struct percent_value_test_info *res,
const struct percent_value_case *param)
{
struct mpam_props fake_props = { };
/* Reject bogus test parameters that would break the tests: */
KUNIT_ASSERT_GE(test, param->width, 1);
KUNIT_ASSERT_LE(test, param->width, 16);
KUNIT_ASSERT_LT(test, param->value, 1 << param->width);
mpam_set_feature(mpam_feat_mbw_max, &fake_props);
fake_props.bwa_wd = param->width;
res->shift = 16 - param->width;
res->max_value = GENMASK_U32(param->width - 1, 0);
res->value = percent_to_mbw_max(param->pc, &fake_props);
res->pc = mbw_max_to_percent(param->value << res->shift, &fake_props);
}
static void test_get_mba_granularity(struct kunit *test)
{
int ret;
struct mpam_props fake_props = { };
/* Use MBW_MAX */
mpam_set_feature(mpam_feat_mbw_max, &fake_props);
fake_props.bwa_wd = 0;
KUNIT_EXPECT_FALSE(test, mba_class_use_mbw_max(&fake_props));
fake_props.bwa_wd = 1;
KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props));
/* Architectural maximum: */
fake_props.bwa_wd = 16;
KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props));
/* No usable control... */
fake_props.bwa_wd = 0;
ret = get_mba_granularity(&fake_props);
KUNIT_EXPECT_EQ(test, ret, 0);
fake_props.bwa_wd = 1;
ret = get_mba_granularity(&fake_props);
KUNIT_EXPECT_EQ(test, ret, 50); /* DIV_ROUND_UP(100, 1 << 1)% = 50% */
fake_props.bwa_wd = 2;
ret = get_mba_granularity(&fake_props);
KUNIT_EXPECT_EQ(test, ret, 25); /* DIV_ROUND_UP(100, 1 << 2)% = 25% */
fake_props.bwa_wd = 3;
ret = get_mba_granularity(&fake_props);
KUNIT_EXPECT_EQ(test, ret, 13); /* DIV_ROUND_UP(100, 1 << 3)% = 13% */
fake_props.bwa_wd = 6;
ret = get_mba_granularity(&fake_props);
KUNIT_EXPECT_EQ(test, ret, 2); /* DIV_ROUND_UP(100, 1 << 6)% = 2% */
fake_props.bwa_wd = 7;
ret = get_mba_granularity(&fake_props);
KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 7)% = 1% */
/* Granularity saturates at 1% */
fake_props.bwa_wd = 16; /* architectural maximum */
ret = get_mba_granularity(&fake_props);
KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 16)% = 1% */
}
static void test_mbw_max_to_percent(struct kunit *test)
{
const struct percent_value_case *param = test->param_value;
struct percent_value_test_info res;
/*
* Since the reference values in percent_value_cases[] all
* correspond to exact percentages, round-to-nearest will
* always give the exact percentage back when the MPAM max
* value has precision of 0.5% or finer. (Always true for the
* reference data, since they all specify 8 bits or more of
* precision.
*
* So, keep it simple and demand an exact match:
*/
__prepare_percent_value_test(test, &res, param);
KUNIT_EXPECT_EQ(test, res.pc, param->pc);
}
static void test_percent_to_mbw_max(struct kunit *test)
{
const struct percent_value_case *param = test->param_value;
struct percent_value_test_info res;
__prepare_percent_value_test(test, &res, param);
KUNIT_EXPECT_GE(test, res.value, param->value << res.shift);
KUNIT_EXPECT_LE(test, res.value, (param->value + 1) << res.shift);
KUNIT_EXPECT_LE(test, res.value, res.max_value << res.shift);
/* No flexibility allowed for 0% and 100%! */
if (param->pc == 0)
KUNIT_EXPECT_EQ(test, res.value, 0);
if (param->pc == 100)
KUNIT_EXPECT_EQ(test, res.value, res.max_value << res.shift);
}
static const void *test_all_bwa_wd_gen_params(struct kunit *test, const void *prev,
char *desc)
{
uintptr_t param = (uintptr_t)prev;
if (param > 15)
return NULL;
param++;
snprintf(desc, KUNIT_PARAM_DESC_SIZE, "wd=%u\n", (unsigned int)param);
return (void *)param;
}
static unsigned int test_get_bwa_wd(struct kunit *test)
{
uintptr_t param = (uintptr_t)test->param_value;
KUNIT_ASSERT_GE(test, param, 1);
KUNIT_ASSERT_LE(test, param, 16);
return param;
}
static void test_mbw_max_to_percent_limits(struct kunit *test)
{
struct mpam_props fake_props = {0};
u32 max_value;
mpam_set_feature(mpam_feat_mbw_max, &fake_props);
fake_props.bwa_wd = test_get_bwa_wd(test);
max_value = GENMASK(15, 16 - fake_props.bwa_wd);
KUNIT_EXPECT_EQ(test, mbw_max_to_percent(max_value, &fake_props),
MAX_MBA_BW);
KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props),
get_mba_min(&fake_props));
/*
* Rounding policy dependent 0% sanity-check:
* With round-to-nearest, the minimum mbw_max value really
* should map to 0% if there are at least 200 steps.
* (100 steps may be enough for some other rounding policies.)
*/
if (fake_props.bwa_wd >= 8)
KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props), 0);
if (fake_props.bwa_wd < 8 &&
mbw_max_to_percent(0, &fake_props) == 0)
kunit_warn(test, "wd=%d: Testsuite/driver Rounding policy mismatch?",
fake_props.bwa_wd);
}
/*
* Check that converting a percentage to mbw_max and back again (or, as
* appropriate, vice-versa) always restores the original value:
*/
static void test_percent_max_roundtrip_stability(struct kunit *test)
{
struct mpam_props fake_props = {0};
unsigned int shift;
u32 pc, max, pc2, max2;
mpam_set_feature(mpam_feat_mbw_max, &fake_props);
fake_props.bwa_wd = test_get_bwa_wd(test);
shift = 16 - fake_props.bwa_wd;
/*
* Converting a valid value from the coarser scale to the finer
* scale and back again must yield the original value:
*/
if (fake_props.bwa_wd >= 7) {
/* More than 100 steps: only test exact pc values: */
for (pc = get_mba_min(&fake_props); pc <= MAX_MBA_BW; pc++) {
max = percent_to_mbw_max(pc, &fake_props);
pc2 = mbw_max_to_percent(max, &fake_props);
KUNIT_EXPECT_EQ(test, pc2, pc);
}
} else {
/* Fewer than 100 steps: only test exact mbw_max values: */
for (max = 0; max < 1 << 16; max += 1 << shift) {
pc = mbw_max_to_percent(max, &fake_props);
max2 = percent_to_mbw_max(pc, &fake_props);
KUNIT_EXPECT_EQ(test, max2, max);
}
}
}
static void test_percent_to_max_rounding(struct kunit *test)
{
const struct percent_value_case *param = test->param_value;
unsigned int num_rounded_up = 0, total = 0;
struct percent_value_test_info res;
for (param = percent_value_cases, total = 0;
param < &percent_value_cases[ARRAY_SIZE(percent_value_cases)];
param++, total++) {
__prepare_percent_value_test(test, &res, param);
if (res.value > param->value << res.shift)
num_rounded_up++;
}
/*
* The MPAM driver applies a round-to-nearest policy, whereas a
* round-down policy seems to have been applied in the
* reference table from which the test vectors were selected.
*
* For a large and well-distributed suite of test vectors,
* about half should be rounded up and half down compared with
* the reference table. The actual test vectors are few in
* number and probably not very well distributed however, so
* tolerate a round-up rate of between 1/4 and 3/4 before
* crying foul:
*/
kunit_info(test, "Round-up rate: %u%% (%u/%u)\n",
DIV_ROUND_CLOSEST(num_rounded_up * 100, total),
num_rounded_up, total);
KUNIT_EXPECT_GE(test, 4 * num_rounded_up, 1 * total);
KUNIT_EXPECT_LE(test, 4 * num_rounded_up, 3 * total);
}
static struct kunit_case mpam_resctrl_test_cases[] = {
KUNIT_CASE(test_get_mba_granularity),
KUNIT_CASE_PARAM(test_mbw_max_to_percent, test_percent_value_gen_params),
KUNIT_CASE_PARAM(test_percent_to_mbw_max, test_percent_value_gen_params),
KUNIT_CASE_PARAM(test_mbw_max_to_percent_limits, test_all_bwa_wd_gen_params),
KUNIT_CASE(test_percent_to_max_rounding),
KUNIT_CASE_PARAM(test_percent_max_roundtrip_stability,
test_all_bwa_wd_gen_params),
{}
};
static struct kunit_suite mpam_resctrl_test_suite = {
.name = "mpam_resctrl_test_suite",
.test_cases = mpam_resctrl_test_cases,
};
kunit_test_suites(&mpam_resctrl_test_suite);

View File

@@ -5,6 +5,7 @@
#define __LINUX_ARM_MPAM_H
#include <linux/acpi.h>
#include <linux/resctrl_types.h>
#include <linux/types.h>
struct mpam_msc;
@@ -49,6 +50,37 @@ static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,
}
#endif
bool resctrl_arch_alloc_capable(void);
bool resctrl_arch_mon_capable(void);
void resctrl_arch_set_cpu_default_closid(int cpu, u32 closid);
void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid);
void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid);
void resctrl_arch_sched_in(struct task_struct *tsk);
bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid);
bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid);
u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid);
void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid);
u32 resctrl_arch_system_num_rmid_idx(void);
struct rdt_resource;
void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid);
void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx);
/*
* The CPU configuration for MPAM is cheap to write, and is only written if it
* has changed. No need for fine grained enables.
*/
static inline void resctrl_arch_enable_mon(void) { }
static inline void resctrl_arch_disable_mon(void) { }
static inline void resctrl_arch_enable_alloc(void) { }
static inline void resctrl_arch_disable_alloc(void) { }
static inline unsigned int resctrl_arch_round_mon_val(unsigned int val)
{
return val;
}
/**
* mpam_register_requestor() - Register a requestor with the MPAM driver
* @partid_max: The maximum PARTID value the requestor can generate.

View File

@@ -321,7 +321,7 @@ static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
{
instrumentation_begin();
syscall_exit_to_user_mode_work(regs);
local_irq_disable_exit_to_user();
local_irq_disable();
syscall_exit_to_user_mode_prepare(regs);
instrumentation_end();
exit_to_user_mode();

View File

@@ -100,37 +100,6 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
instrumentation_end();
}
/**
* local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable()
* @ti_work: Cached TIF flags gathered with interrupts disabled
*
* Defaults to local_irq_enable(). Can be supplied by architecture specific
* code.
*/
static inline void local_irq_enable_exit_to_user(unsigned long ti_work);
#ifndef local_irq_enable_exit_to_user
static __always_inline void local_irq_enable_exit_to_user(unsigned long ti_work)
{
local_irq_enable();
}
#endif
/**
* local_irq_disable_exit_to_user - Exit to user variant of local_irq_disable()
*
* Defaults to local_irq_disable(). Can be supplied by architecture specific
* code.
*/
static inline void local_irq_disable_exit_to_user(void);
#ifndef local_irq_disable_exit_to_user
static __always_inline void local_irq_disable_exit_to_user(void)
{
local_irq_disable();
}
#endif
/**
* arch_exit_to_user_mode_work - Architecture specific TIF work for exit
* to user mode.
@@ -335,6 +304,8 @@ static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs)
*/
static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
lockdep_assert_irqs_disabled();
instrumentation_begin();
irqentry_exit_to_user_mode_prepare(regs);
instrumentation_end();
@@ -365,6 +336,205 @@ typedef struct irqentry_state {
} irqentry_state_t;
#endif
/**
* irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt
*
* Conditional reschedule with additional sanity checks.
*/
void raw_irqentry_exit_cond_resched(void);
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched
#define irqentry_exit_cond_resched_dynamic_disabled NULL
DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
#define irqentry_exit_cond_resched() static_call(irqentry_exit_cond_resched)()
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_irqentry_exit_cond_resched(void);
#define irqentry_exit_cond_resched() dynamic_irqentry_exit_cond_resched()
#endif
#else /* CONFIG_PREEMPT_DYNAMIC */
#define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched()
#endif /* CONFIG_PREEMPT_DYNAMIC */
/**
* irqentry_enter_from_kernel_mode - Establish state before invoking the irq handler
* @regs: Pointer to currents pt_regs
*
* Invoked from architecture specific entry code with interrupts disabled.
* Can only be called when the interrupt entry came from kernel mode. The
* calling code must be non-instrumentable. When the function returns all
* state is correct and the subsequent functions can be instrumented.
*
* The function establishes state (lockdep, RCU (context tracking), tracing) and
* is provided for architectures which require a strict split between entry from
* kernel and user mode and therefore cannot use irqentry_enter() which handles
* both entry modes.
*
* Returns: An opaque object that must be passed to irqentry_exit_to_kernel_mode().
*/
static __always_inline irqentry_state_t irqentry_enter_from_kernel_mode(struct pt_regs *regs)
{
irqentry_state_t ret = {
.exit_rcu = false,
};
/*
* If this entry hit the idle task invoke ct_irq_enter() whether
* RCU is watching or not.
*
* Interrupts can nest when the first interrupt invokes softirq
* processing on return which enables interrupts.
*
* Scheduler ticks in the idle task can mark quiescent state and
* terminate a grace period, if and only if the timer interrupt is
* not nested into another interrupt.
*
* Checking for rcu_is_watching() here would prevent the nesting
* interrupt to invoke ct_irq_enter(). If that nested interrupt is
* the tick then rcu_flavor_sched_clock_irq() would wrongfully
* assume that it is the first interrupt and eventually claim
* quiescent state and end grace periods prematurely.
*
* Unconditionally invoke ct_irq_enter() so RCU state stays
* consistent.
*
* TINY_RCU does not support EQS, so let the compiler eliminate
* this part when enabled.
*/
if (!IS_ENABLED(CONFIG_TINY_RCU) &&
(is_idle_task(current) || arch_in_rcu_eqs())) {
/*
* If RCU is not watching then the same careful
* sequence vs. lockdep and tracing is required
* as in irqentry_enter_from_user_mode().
*/
lockdep_hardirqs_off(CALLER_ADDR0);
ct_irq_enter();
instrumentation_begin();
kmsan_unpoison_entry_regs(regs);
trace_hardirqs_off_finish();
instrumentation_end();
ret.exit_rcu = true;
return ret;
}
/*
* If RCU is watching then RCU only wants to check whether it needs
* to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
* already contains a warning when RCU is not watching, so no point
* in having another one here.
*/
lockdep_hardirqs_off(CALLER_ADDR0);
instrumentation_begin();
kmsan_unpoison_entry_regs(regs);
rcu_irq_enter_check_tick();
trace_hardirqs_off_finish();
instrumentation_end();
return ret;
}
/**
* irqentry_exit_to_kernel_mode_preempt - Run preempt checks on return to kernel mode
* @regs: Pointer to current's pt_regs
* @state: Return value from matching call to irqentry_enter_from_kernel_mode()
*
* This is to be invoked before irqentry_exit_to_kernel_mode_after_preempt() to
* allow kernel preemption on return from interrupt.
*
* Must be invoked with interrupts disabled and CPU state which allows kernel
* preemption.
*
* After returning from this function, the caller can modify CPU state before
* invoking irqentry_exit_to_kernel_mode_after_preempt(), which is required to
* re-establish the tracing, lockdep and RCU state for returning to the
* interrupted context.
*/
static inline void irqentry_exit_to_kernel_mode_preempt(struct pt_regs *regs,
irqentry_state_t state)
{
if (regs_irqs_disabled(regs) || state.exit_rcu)
return;
if (IS_ENABLED(CONFIG_PREEMPTION))
irqentry_exit_cond_resched();
}
/**
* irqentry_exit_to_kernel_mode_after_preempt - Establish trace, lockdep and RCU state
* @regs: Pointer to current's pt_regs
* @state: Return value from matching call to irqentry_enter_from_kernel_mode()
*
* This is to be invoked after irqentry_exit_to_kernel_mode_preempt() and before
* actually returning to the interrupted context.
*
* There are no requirements for the CPU state other than being able to complete
* the tracing, lockdep and RCU state transitions. After this function returns
* the caller must return directly to the interrupted context.
*/
static __always_inline void
irqentry_exit_to_kernel_mode_after_preempt(struct pt_regs *regs, irqentry_state_t state)
{
if (!regs_irqs_disabled(regs)) {
/*
* If RCU was not watching on entry this needs to be done
* carefully and needs the same ordering of lockdep/tracing
* and RCU as the return to user mode path.
*/
if (state.exit_rcu) {
instrumentation_begin();
/* Tell the tracer that IRET will enable interrupts */
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare();
instrumentation_end();
ct_irq_exit();
lockdep_hardirqs_on(CALLER_ADDR0);
return;
}
instrumentation_begin();
/* Covers both tracing and lockdep */
trace_hardirqs_on();
instrumentation_end();
} else {
/*
* IRQ flags state is correct already. Just tell RCU if it
* was not watching on entry.
*/
if (state.exit_rcu)
ct_irq_exit();
}
}
/**
* irqentry_exit_to_kernel_mode - Run preempt checks and establish state after
* invoking the interrupt handler
* @regs: Pointer to current's pt_regs
* @state: Return value from matching call to irqentry_enter_from_kernel_mode()
*
* This is the counterpart of irqentry_enter_from_kernel_mode() and combines
* the calls to irqentry_exit_to_kernel_mode_preempt() and
* irqentry_exit_to_kernel_mode_after_preempt().
*
* The requirement for the CPU state is that it can schedule. After the function
* returns the tracing, lockdep and RCU state transitions are completed and the
* caller must return directly to the interrupted context.
*/
static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs,
irqentry_state_t state)
{
lockdep_assert_irqs_disabled();
instrumentation_begin();
irqentry_exit_to_kernel_mode_preempt(regs, state);
instrumentation_end();
irqentry_exit_to_kernel_mode_after_preempt(regs, state);
}
/**
* irqentry_enter - Handle state tracking on ordinary interrupt entries
* @regs: Pointer to pt_regs of interrupted context
@@ -394,32 +564,10 @@ typedef struct irqentry_state {
* establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
* would not be possible.
*
* Returns: An opaque object that must be passed to idtentry_exit()
* Returns: An opaque object that must be passed to irqentry_exit()
*/
irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs);
/**
* irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt
*
* Conditional reschedule with additional sanity checks.
*/
void raw_irqentry_exit_cond_resched(void);
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched
#define irqentry_exit_cond_resched_dynamic_disabled NULL
DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
#define irqentry_exit_cond_resched() static_call(irqentry_exit_cond_resched)()
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_irqentry_exit_cond_resched(void);
#define irqentry_exit_cond_resched() dynamic_irqentry_exit_cond_resched()
#endif
#else /* CONFIG_PREEMPT_DYNAMIC */
#define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched()
#endif /* CONFIG_PREEMPT_DYNAMIC */
/**
* irqentry_exit - Handle return from exception that used irqentry_enter()
* @regs: Pointer to pt_regs (exception entry regs)

View File

@@ -47,7 +47,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re
*/
while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) {
local_irq_enable_exit_to_user(ti_work);
local_irq_enable();
if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY))
@@ -74,7 +74,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re
* might have changed while interrupts and preemption was
* enabled above.
*/
local_irq_disable_exit_to_user();
local_irq_disable();
/* Check if any of the above work has queued a deferred wakeup */
tick_nohz_user_enter_prepare();
@@ -105,70 +105,16 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
/*
 * irqentry_enter - Handle state tracking on ordinary interrupt entries.
 * @regs:	Pointer to pt_regs of interrupted context
 *
 * The user-mode path performs the lockdep/RCU/tracing entry work via
 * irqentry_enter_from_user_mode() and returns a state with .exit_rcu
 * cleared (nothing for irqentry_exit() to undo).
 *
 * Kernel-mode entries are handled by irqentry_enter_from_kernel_mode(),
 * which contains the careful RCU/lockdep ordering (including the
 * idle-task / RCU-EQS case) and computes the state — notably .exit_rcu —
 * that must be handed back to irqentry_exit().
 *
 * Returns: An opaque object that must be passed to irqentry_exit().
 */
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
	if (user_mode(regs)) {
		irqentry_state_t ret = {
			.exit_rcu = false,
		};

		irqentry_enter_from_user_mode(regs);
		return ret;
	}

	return irqentry_enter_from_kernel_mode(regs);
}
/**
@@ -212,43 +158,10 @@ void dynamic_irqentry_exit_cond_resched(void)
/*
 * irqentry_exit - Handle return from exception that used irqentry_enter().
 * @regs:	Pointer to pt_regs (exception entry regs)
 * @state:	Return value from matching call to irqentry_enter()
 *
 * Must be invoked with interrupts disabled. The user-mode path runs the
 * exit-to-user work loop; the kernel-mode path is handled by
 * irqentry_exit_to_kernel_mode(), which consumes @state (in particular
 * .exit_rcu) to undo what irqentry_enter() set up.
 */
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
	lockdep_assert_irqs_disabled();

	/* Check whether this returns to user mode */
	if (user_mode(regs))
		irqentry_exit_to_user_mode(regs);
	else
		irqentry_exit_to_kernel_mode(regs, state);
}
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)

View File

@@ -56,7 +56,8 @@ static void atomics_sigill(void)
/*
 * SIGILL probe for FEAT_CMPBR (compare-and-branch instructions).
 *
 * On a CPU without CMPBR the CBEQ encoding itself is UNDEFINED and raises
 * SIGILL. On a CPU with CMPBR, CBEQ w0, w0 always compares equal, so the
 * +8 branch presumably skips the following UDF and no signal is raised —
 * NOTE(review): confirm the branch offset lands past the UDF.
 */
static void cmpbr_sigill(void)
{
/* Not implemented, too complicated and unreliable anyway */
asm volatile(".inst 0x74C00040\n" /* CBEQ w0, w0, +8 */
"udf #0" : : : "cc"); /* UDF #0 */
}
static void crc32_sigill(void)

View File

@@ -124,6 +124,7 @@ static const struct reg_ftr_bits ftr_id_aa64isar2_el1[] = {
static const struct reg_ftr_bits ftr_id_aa64isar3_el1[] = {
REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, FPRCVT, 0),
REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, LSUI, 0),
REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, LSFE, 0),
REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, FAMINMAX, 0),
REG_FTR_END,