diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst
index 47d9a3df6329..aa12708ddb96 100644
--- a/Documentation/admin-guide/perf/index.rst
+++ b/Documentation/admin-guide/perf/index.rst
@@ -24,7 +24,8 @@ Performance monitor support
    thunderx2-pmu
    alibaba_pmu
    dwc_pcie_pmu
-   nvidia-pmu
+   nvidia-tegra241-pmu
+   nvidia-tegra410-pmu
    meson-ddr-pmu
    cxl
    ampere_cspmu
diff --git a/Documentation/admin-guide/perf/nvidia-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra241-pmu.rst
similarity index 98%
rename from Documentation/admin-guide/perf/nvidia-pmu.rst
rename to Documentation/admin-guide/perf/nvidia-tegra241-pmu.rst
index f538ef67e0e8..fad5bc4cee6c 100644
--- a/Documentation/admin-guide/perf/nvidia-pmu.rst
+++ b/Documentation/admin-guide/perf/nvidia-tegra241-pmu.rst
@@ -1,8 +1,8 @@
-=========================================================
-NVIDIA Tegra SoC Uncore Performance Monitoring Unit (PMU)
-=========================================================
+============================================================
+NVIDIA Tegra241 SoC Uncore Performance Monitoring Unit (PMU)
+============================================================
 
-The NVIDIA Tegra SoC includes various system PMUs to measure key performance
+The NVIDIA Tegra241 SoC includes various system PMUs to measure key performance
 metrics like memory bandwidth, latency, and utilization:
 
 * Scalable Coherency Fabric (SCF)
diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst
new file mode 100644
index 000000000000..0656223b61d4
--- /dev/null
+++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst
@@ -0,0 +1,522 @@
+=====================================================================
+NVIDIA Tegra410 SoC Uncore Performance Monitoring Unit (PMU)
+=====================================================================
+
+The NVIDIA Tegra410 SoC includes various system PMUs to measure key performance
+metrics like memory bandwidth, latency, and utilization:
+
+* Unified Coherence Fabric (UCF)
+* PCIE
+* PCIE-TGT
+* CPU Memory (CMEM) Latency
+* NVLink-C2C
+* NV-CLink
+* NV-DLink
+
+PMU Driver
+----------
+
+The PMU driver describes the available events and configuration of each PMU in
+sysfs. Please see the sections below to get the sysfs path of each PMU. Like
+other uncore PMU drivers, the driver provides a "cpumask" sysfs attribute to
+show the CPU id used to handle the PMU event. There is also an "associated_cpus"
+sysfs attribute, which contains a list of CPUs associated with the PMU instance.
+
+UCF PMU
+-------
+
+The Unified Coherence Fabric (UCF) in the NVIDIA Tegra410 SoC serves as a
+distributed last-level cache for CPU memory and CXL memory, and as a
+cache-coherent interconnect that supports hardware coherence across multiple
+coherently caching agents, including:
+
+  * CPU clusters
+  * GPU
+  * PCIe Ordering Controller Unit (OCU)
+  * Other IO-coherent requesters
+
+The events and configuration options of this PMU device are described in sysfs,
+see /sys/bus/event_source/devices/nvidia_ucf_pmu_<socket-id>.
+
+Some of the events available in this PMU can be used to measure bandwidth and
+utilization:
+
+  * slc_access_rd: count the number of read requests to SLC.
+  * slc_access_wr: count the number of write requests to SLC.
+  * slc_bytes_rd: count the number of bytes transferred by slc_access_rd.
+  * slc_bytes_wr: count the number of bytes transferred by slc_access_wr.
+  * mem_access_rd: count the number of read requests to local or remote memory.
+  * mem_access_wr: count the number of write requests to local or remote memory.
+  * mem_bytes_rd: count the number of bytes transferred by mem_access_rd.
+  * mem_bytes_wr: count the number of bytes transferred by mem_access_wr.
+  * cycles: counts the UCF cycles.
+
+The average bandwidth is calculated as::
+
+   AVG_SLC_READ_BANDWIDTH_IN_GBPS = SLC_BYTES_RD / ELAPSED_TIME_IN_NS
+   AVG_SLC_WRITE_BANDWIDTH_IN_GBPS = SLC_BYTES_WR / ELAPSED_TIME_IN_NS
+   AVG_MEM_READ_BANDWIDTH_IN_GBPS = MEM_BYTES_RD / ELAPSED_TIME_IN_NS
+   AVG_MEM_WRITE_BANDWIDTH_IN_GBPS = MEM_BYTES_WR / ELAPSED_TIME_IN_NS
+
+The average request rate is calculated as::
+
+   AVG_SLC_READ_REQUEST_RATE = SLC_ACCESS_RD / CYCLES
+   AVG_SLC_WRITE_REQUEST_RATE = SLC_ACCESS_WR / CYCLES
+   AVG_MEM_READ_REQUEST_RATE = MEM_ACCESS_RD / CYCLES
+   AVG_MEM_WRITE_REQUEST_RATE = MEM_ACCESS_WR / CYCLES
+
+More details about what other events are available can be found in the Tegra410
+SoC technical reference manual.
+
+The events can be filtered based on source or destination. The source filter
+indicates the traffic initiator to the SLC, e.g. local CPU, non-CPU device, or
+remote socket. The destination filter specifies the destination memory type,
+e.g. local system memory (CMEM), local GPU memory (GMEM), or remote memory. The
+local/remote classification of the destination filter is based on the home
+socket of the address, not where the data actually resides. The available
+filters are described in
+/sys/bus/event_source/devices/nvidia_ucf_pmu_<socket-id>/format/.
+
+The list of UCF PMU event filters:
+
+* Source filter:
+
+  * src_loc_cpu: if set, count events from local CPU
+  * src_loc_noncpu: if set, count events from local non-CPU device
+  * src_rem: if set, count events from CPU, GPU, PCIE devices of remote socket
+
+* Destination filter:
+
+  * dst_loc_cmem: if set, count events to local system memory (CMEM) address
+  * dst_loc_gmem: if set, count events to local GPU memory (GMEM) address
+  * dst_loc_other: if set, count events to local CXL memory address
+  * dst_rem: if set, count events to CPU, GPU, and CXL memory address of remote socket
+
+If the source is not specified, the PMU will count events from all sources. If
+the destination is not specified, the PMU will count events to all destinations.
+
+Example usage:
+
+* Count event id 0x0 in socket 0 from all sources and to all destinations::
+
+    perf stat -a -e nvidia_ucf_pmu_0/event=0x0/
+
+* Count event id 0x0 in socket 0 with source filter = local CPU and destination
+  filter = local system memory (CMEM)::
+
+    perf stat -a -e nvidia_ucf_pmu_0/event=0x0,src_loc_cpu=0x1,dst_loc_cmem=0x1/
+
+* Count event id 0x0 in socket 1 with source filter = local non-CPU device and
+  destination filter = remote memory::
+
+    perf stat -a -e nvidia_ucf_pmu_1/event=0x0,src_loc_noncpu=0x1,dst_rem=0x1/
+
+PCIE PMU
+--------
+
+This PMU is located in the SOC fabric connecting the PCIE root complex (RC) and
+the memory subsystem. It monitors all read/write traffic from the root port(s)
+or a particular BDF in a PCIE RC to local or remote memory. There is one PMU per
+PCIE RC in the SoC. Each RC can have up to 16 lanes that can be bifurcated into
+up to 8 root ports. The traffic from each root port can be filtered using RP or
+BDF filter. For example, specifying "src_rp_mask=0xFF" means the PMU counter will
+capture traffic from all RPs. Please see below for more details.
+
+The events and configuration options of this PMU device are described in sysfs,
+see /sys/bus/event_source/devices/nvidia_pcie_pmu_<socket-id>_rc_<pcie-rc-id>.
+
+The events in this PMU can be used to measure bandwidth, utilization, and
+latency:
+
+  * rd_req: count the number of read requests by PCIE device.
+  * wr_req: count the number of write requests by PCIE device.
+  * rd_bytes: count the number of bytes transferred by rd_req.
+  * wr_bytes: count the number of bytes transferred by wr_req.
+  * rd_cum_outs: count outstanding rd_req each cycle.
+  * cycles: count the clock cycles of SOC fabric connected to the PCIE interface.
+
+The average bandwidth is calculated as::
+
+   AVG_RD_BANDWIDTH_IN_GBPS = RD_BYTES / ELAPSED_TIME_IN_NS
+   AVG_WR_BANDWIDTH_IN_GBPS = WR_BYTES / ELAPSED_TIME_IN_NS
+
+The average request rate is calculated as::
+
+   AVG_RD_REQUEST_RATE = RD_REQ / CYCLES
+   AVG_WR_REQUEST_RATE = WR_REQ / CYCLES
+
+
+The average latency is calculated as::
+
+   FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS
+   AVG_LATENCY_IN_CYCLES = RD_CUM_OUTS / RD_REQ
+   AVERAGE_LATENCY_IN_NS = AVG_LATENCY_IN_CYCLES / FREQ_IN_GHZ
+
+The PMU events can be filtered based on the traffic source and destination.
+The source filter indicates the PCIE devices that will be monitored. The
+destination filter specifies the destination memory type, e.g. local system
+memory (CMEM), local GPU memory (GMEM), or remote memory. The local/remote
+classification of the destination filter is based on the home socket of the
+address, not where the data actually resides. These filters can be found in
+/sys/bus/event_source/devices/nvidia_pcie_pmu_<socket-id>_rc_<pcie-rc-id>/format/.
+
+The list of event filters:
+
+* Source filter:
+
+  * src_rp_mask: bitmask of root ports that will be monitored. Each bit in this
+    bitmask represents the RP index in the RC. If the bit is set, all devices under
+    the associated RP will be monitored. E.g. "src_rp_mask=0xF" will monitor
+    devices in root port 0 to 3.
+  * src_bdf: the BDF that will be monitored. This is a 16-bit value that
+    follows the formula: (bus << 8) + (device << 3) + (function). For example,
+    the value of BDF 27:01.1 is 0x2709.
+  * src_bdf_en: enable the BDF filter. If this is set, the BDF filter value in
+    "src_bdf" is used to filter the traffic.
+
+  Note that Root-Port and BDF filters are mutually exclusive and the PMU in
+  each RC can only have one BDF filter shared by all counters. If the BDF
+  filter is enabled, the BDF filter value will be applied to all events.
+
+* Destination filter:
+
+  * dst_loc_cmem: if set, count events to local system memory (CMEM) address
+  * dst_loc_gmem: if set, count events to local GPU memory (GMEM) address
+  * dst_loc_pcie_p2p: if set, count events to local PCIE peer address
+  * dst_loc_pcie_cxl: if set, count events to local CXL memory address
+  * dst_rem: if set, count events to remote memory address
+
+If the source filter is not specified, the PMU will count events from all root
+ports. If the destination filter is not specified, the PMU will count events
+to all destinations.
+
+Example usage:
+
+* Count event id 0x0 from root port 0 of PCIE RC-0 on socket 0 targeting all
+  destinations::
+
+    perf stat -a -e nvidia_pcie_pmu_0_rc_0/event=0x0,src_rp_mask=0x1/
+
+* Count event id 0x1 from root port 0 and 1 of PCIE RC-1 on socket 0 and
+  targeting just local CMEM of socket 0::
+
+    perf stat -a -e nvidia_pcie_pmu_0_rc_1/event=0x1,src_rp_mask=0x3,dst_loc_cmem=0x1/
+
+* Count event id 0x2 from root port 0 of PCIE RC-2 on socket 1 targeting all
+  destinations::
+
+    perf stat -a -e nvidia_pcie_pmu_1_rc_2/event=0x2,src_rp_mask=0x1/
+
+* Count event id 0x3 from root port 0 and 1 of PCIE RC-3 on socket 1 and
+  targeting just local CMEM of socket 1::
+
+    perf stat -a -e nvidia_pcie_pmu_1_rc_3/event=0x3,src_rp_mask=0x3,dst_loc_cmem=0x1/
+
+* Count event id 0x4 from BDF 01:01.0 of PCIE RC-4 on socket 0 targeting all
+  destinations::
+
+    perf stat -a -e nvidia_pcie_pmu_0_rc_4/event=0x4,src_bdf=0x0108,src_bdf_en=0x1/
+
+.. _NVIDIA_T410_PCIE_PMU_RC_Mapping_Section:
+
+Mapping the RC# to lspci segment number
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Mapping the RC# to lspci segment number can be non-trivial; hence a new NVIDIA
+Designated Vendor Specific Capability (DVSEC) register is added into the PCIE config space
+for each RP. This DVSEC has vendor id "10de" and DVSEC id of "0x4". The DVSEC register
+contains the following information to map PCIE devices under the RP back to its RC# :
+
+  - Bus# (byte 0xc) : bus number as reported by the lspci output
+  - Segment# (byte 0xd) : segment number as reported by the lspci output
+  - RP# (byte 0xe) : port number as reported by LnkCap attribute from lspci for a device with Root Port capability
+  - RC# (byte 0xf): root complex number associated with the RP
+  - Socket# (byte 0x10): socket number associated with the RP
+
+Example script for mapping lspci BDF to RC# and socket#::
+
+  #!/bin/bash
+  while read bdf rest; do
+    dvsec4_reg=$(lspci -vv -s $bdf | awk '
+      /Designated Vendor-Specific: Vendor=10de ID=0004/ {
+        match($0, /\[([0-9a-fA-F]+)/, arr);
+        print "0x" arr[1];
+        exit
+      }
+    ')
+    if [ -n "$dvsec4_reg" ]; then
+      bus=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xc))).b)
+      segment=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xd))).b)
+      rp=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xe))).b)
+      rc=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xf))).b)
+      socket=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0x10))).b)
+      echo "$bdf: Bus=$bus, Segment=$segment, RP=$rp, RC=$rc, Socket=$socket"
+    fi
+  done < <(lspci -d 10de:)
+
+Example output::
+
+  0001:00:00.0: Bus=00, Segment=01, RP=00, RC=00, Socket=00
+  0002:80:00.0: Bus=80, Segment=02, RP=01, RC=01, Socket=00
+  0002:a0:00.0: Bus=a0, Segment=02, RP=02, RC=01, Socket=00
+  0002:c0:00.0: Bus=c0, Segment=02, RP=03, RC=01, Socket=00
+  0002:e0:00.0: Bus=e0, Segment=02, RP=04, RC=01, Socket=00
+  0003:00:00.0: Bus=00, Segment=03, RP=00, RC=02, Socket=00
+  0004:00:00.0: Bus=00, Segment=04, RP=00, RC=03, Socket=00
+  0005:00:00.0: Bus=00, Segment=05, RP=00, RC=04, Socket=00
+  0005:40:00.0: Bus=40, Segment=05, RP=01, RC=04, Socket=00
+  0005:c0:00.0: Bus=c0, Segment=05, RP=02, RC=04, Socket=00
+  0006:00:00.0: Bus=00, Segment=06, RP=00, RC=05, Socket=00
+  0009:00:00.0: Bus=00, Segment=09, RP=00, RC=00, Socket=01
+  000a:80:00.0: Bus=80, Segment=0a, RP=01, RC=01, Socket=01
+  000a:a0:00.0: Bus=a0, Segment=0a, RP=02, RC=01, Socket=01
+  000a:e0:00.0: Bus=e0, Segment=0a, RP=03, RC=01, Socket=01
+  000b:00:00.0: Bus=00, Segment=0b, RP=00, RC=02, Socket=01
+  000c:00:00.0: Bus=00, Segment=0c, RP=00, RC=03, Socket=01
+  000d:00:00.0: Bus=00, Segment=0d, RP=00, RC=04, Socket=01
+  000d:40:00.0: Bus=40, Segment=0d, RP=01, RC=04, Socket=01
+  000d:c0:00.0: Bus=c0, Segment=0d, RP=02, RC=04, Socket=01
+  000e:00:00.0: Bus=00, Segment=0e, RP=00, RC=05, Socket=01
+
+PCIE-TGT PMU
+------------
+
+This PMU is located in the SOC fabric connecting the PCIE root complex (RC) and
+the memory subsystem. It monitors traffic targeting PCIE BAR and CXL HDM ranges.
+There is one PCIE-TGT PMU per PCIE RC in the SoC. Each RC in Tegra410 SoC can
+have up to 16 lanes that can be bifurcated into up to 8 root ports (RP). The PMU
+provides RP filter to count PCIE BAR traffic to each RP and address filter to
+count access to PCIE BAR or CXL HDM ranges. The details of the filters are
+described in the following sections.
+
+Mapping the RC# to lspci segment number is similar to the PCIE PMU. Please see
+:ref:`NVIDIA_T410_PCIE_PMU_RC_Mapping_Section` for more info.
+
+The events and configuration options of this PMU device are available in sysfs,
+see /sys/bus/event_source/devices/nvidia_pcie_tgt_pmu_<socket-id>_rc_<pcie-rc-id>.
+
+The events in this PMU can be used to measure bandwidth and utilization:
+
+  * rd_req: count the number of read requests to PCIE.
+  * wr_req: count the number of write requests to PCIE.
+  * rd_bytes: count the number of bytes transferred by rd_req.
+  * wr_bytes: count the number of bytes transferred by wr_req.
+  * cycles: count the clock cycles of SOC fabric connected to the PCIE interface.
+
+The average bandwidth is calculated as::
+
+   AVG_RD_BANDWIDTH_IN_GBPS = RD_BYTES / ELAPSED_TIME_IN_NS
+   AVG_WR_BANDWIDTH_IN_GBPS = WR_BYTES / ELAPSED_TIME_IN_NS
+
+The average request rate is calculated as::
+
+   AVG_RD_REQUEST_RATE = RD_REQ / CYCLES
+   AVG_WR_REQUEST_RATE = WR_REQ / CYCLES
+
+The PMU events can be filtered based on the destination root port or target
+address range. Filtering based on RP is only available for PCIE BAR traffic.
+Address filter works for both PCIE BAR and CXL HDM ranges. These filters can be
+found in sysfs, see
+/sys/bus/event_source/devices/nvidia_pcie_tgt_pmu_<socket-id>_rc_<pcie-rc-id>/format/.
+
+Destination filter settings:
+
+* dst_rp_mask: bitmask to select the root port(s) to monitor. E.g. "dst_rp_mask=0xFF"
+  corresponds to all root ports (from 0 to 7) in the PCIE RC. Note that this filter is
+  only available for PCIE BAR traffic.
+* dst_addr_base: BAR or CXL HDM filter base address.
+* dst_addr_mask: BAR or CXL HDM filter address mask.
+* dst_addr_en: enable BAR or CXL HDM address range filter. If this is set, the
+  address range specified by "dst_addr_base" and "dst_addr_mask" will be used to filter
+  the PCIE BAR and CXL HDM traffic address. The PMU uses the following comparison
+  to determine if the traffic destination address falls within the filter range::
+
+    (txn's addr & dst_addr_mask) == (dst_addr_base & dst_addr_mask)
+
+  If the comparison succeeds, then the event will be counted.
+
+If the destination filter is not specified, the RP filter will be configured by default
+to count PCIE BAR traffic to all root ports.
+
+Example usage:
+
+* Count event id 0x0 to root port 0 and 1 of PCIE RC-0 on socket 0::
+
+    perf stat -a -e nvidia_pcie_tgt_pmu_0_rc_0/event=0x0,dst_rp_mask=0x3/
+
+* Count event id 0x1 for accesses to PCIE BAR or CXL HDM address range
+  0x10000 to 0x100FF on socket 0's PCIE RC-1::
+
+    perf stat -a -e nvidia_pcie_tgt_pmu_0_rc_1/event=0x1,dst_addr_base=0x10000,dst_addr_mask=0xFFF00,dst_addr_en=0x1/
+
+CPU Memory (CMEM) Latency PMU
+-----------------------------
+
+This PMU monitors latency events of memory read requests from the edge of the
+Unified Coherence Fabric (UCF) to local CPU DRAM:
+
+  * RD_REQ counters: count read requests (32B per request).
+  * RD_CUM_OUTS counters: accumulated outstanding request counter, which tracks
+    how many cycles the read requests are in flight.
+  * CYCLES counter: counts the number of elapsed cycles.
+
+The average latency is calculated as::
+
+   FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS
+   AVG_LATENCY_IN_CYCLES = RD_CUM_OUTS / RD_REQ
+   AVERAGE_LATENCY_IN_NS = AVG_LATENCY_IN_CYCLES / FREQ_IN_GHZ
+
+The events and configuration options of this PMU device are described in sysfs,
+see /sys/bus/event_source/devices/nvidia_cmem_latency_pmu_<socket-id>.
+
+Example usage::
+
+  perf stat -a -e '{nvidia_cmem_latency_pmu_0/rd_req/,nvidia_cmem_latency_pmu_0/rd_cum_outs/,nvidia_cmem_latency_pmu_0/cycles/}'
+
+NVLink-C2C PMU
+--------------
+
+This PMU monitors latency events of memory read/write requests that pass through
+the NVIDIA Chip-to-Chip (C2C) interface. Bandwidth events are not available
+in this PMU, unlike the C2C PMU in Grace (Tegra241 SoC).
+
+The events and configuration options of this PMU device are available in sysfs,
+see /sys/bus/event_source/devices/nvidia_nvlink_c2c_pmu_<socket-id>.
+
+The list of events:
+
+  * IN_RD_CUM_OUTS: accumulated outstanding request (in cycles) of incoming read requests.
+  * IN_RD_REQ: the number of incoming read requests.
+  * IN_WR_CUM_OUTS: accumulated outstanding request (in cycles) of incoming write requests.
+  * IN_WR_REQ: the number of incoming write requests.
+  * OUT_RD_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing read requests.
+  * OUT_RD_REQ: the number of outgoing read requests.
+  * OUT_WR_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing write requests.
+  * OUT_WR_REQ: the number of outgoing write requests.
+  * CYCLES: NVLink-C2C interface cycle counts.
+
+The incoming events count the reads/writes from remote device to the SoC.
+The outgoing events count the reads/writes from the SoC to remote device.
+
+The sysfs /sys/bus/event_source/devices/nvidia_nvlink_c2c_pmu_<socket-id>/peer
+contains the information about the connected device.
+
+When the C2C interface is connected to GPU(s), the user can use the
+"gpu_mask" parameter to filter traffic to/from specific GPU(s). Each bit represents the GPU
+index, e.g. "gpu_mask=0x1" corresponds to GPU 0 and "gpu_mask=0x3" is for GPU 0 and 1.
+The PMU will monitor all GPUs by default if not specified.
+
+When connected to another SoC, only the read events are available.
+
+The events can be used to calculate the average latency of the read/write requests::
+
+   C2C_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS
+
+   IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ
+   IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ
+
+   IN_WR_AVG_LATENCY_IN_CYCLES = IN_WR_CUM_OUTS / IN_WR_REQ
+   IN_WR_AVG_LATENCY_IN_NS = IN_WR_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ
+
+   OUT_RD_AVG_LATENCY_IN_CYCLES = OUT_RD_CUM_OUTS / OUT_RD_REQ
+   OUT_RD_AVG_LATENCY_IN_NS = OUT_RD_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ
+
+   OUT_WR_AVG_LATENCY_IN_CYCLES = OUT_WR_CUM_OUTS / OUT_WR_REQ
+   OUT_WR_AVG_LATENCY_IN_NS = OUT_WR_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ
+
+Example usage:
+
+  * Count incoming traffic from all GPUs connected via NVLink-C2C::
+
+      perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_req/
+
+  * Count incoming traffic from GPU 0 connected via NVLink-C2C::
+
+      perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_cum_outs,gpu_mask=0x1/
+
+  * Count incoming traffic from GPU 1 connected via NVLink-C2C::
+
+      perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_cum_outs,gpu_mask=0x2/
+
+  * Count outgoing traffic to all GPUs connected via NVLink-C2C::
+
+      perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_req/
+
+  * Count outgoing traffic to GPU 0 connected via NVLink-C2C::
+
+      perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_cum_outs,gpu_mask=0x1/
+
+  * Count outgoing traffic to GPU 1 connected via NVLink-C2C::
+
+      perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_cum_outs,gpu_mask=0x2/
+
+NV-CLink PMU
+------------
+
+This PMU monitors latency events of memory read requests that pass through
+the NV-CLINK interface. Bandwidth events are not available in this PMU.
+In Tegra410 SoC, the NV-CLink interface is used to connect to another Tegra410
+SoC and this PMU only counts read traffic.
+
+The events and configuration options of this PMU device are available in sysfs,
+see /sys/bus/event_source/devices/nvidia_nvclink_pmu_<socket-id>.
+
+The list of events:
+
+  * IN_RD_CUM_OUTS: accumulated outstanding request (in cycles) of incoming read requests.
+  * IN_RD_REQ: the number of incoming read requests.
+  * OUT_RD_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing read requests.
+  * OUT_RD_REQ: the number of outgoing read requests.
+  * CYCLES: NV-CLINK interface cycle counts.
+
+The incoming events count the reads from remote device to the SoC.
+The outgoing events count the reads from the SoC to remote device.
+
+The events can be used to calculate the average latency of the read requests::
+
+   CLINK_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS
+
+   IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ
+   IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / CLINK_FREQ_IN_GHZ
+
+   OUT_RD_AVG_LATENCY_IN_CYCLES = OUT_RD_CUM_OUTS / OUT_RD_REQ
+   OUT_RD_AVG_LATENCY_IN_NS = OUT_RD_AVG_LATENCY_IN_CYCLES / CLINK_FREQ_IN_GHZ
+
+Example usage:
+
+  * Count incoming read traffic from remote SoC connected via NV-CLINK::
+
+      perf stat -a -e nvidia_nvclink_pmu_0/in_rd_req/
+
+  * Count outgoing read traffic to remote SoC connected via NV-CLINK::
+
+      perf stat -a -e nvidia_nvclink_pmu_0/out_rd_req/
+
+NV-DLink PMU
+------------
+
+This PMU monitors latency events of memory read requests that pass through
+the NV-DLINK interface.  Bandwidth events are not available in this PMU.
+In Tegra410 SoC, this PMU only counts CXL memory read traffic.
+
+The events and configuration options of this PMU device are available in sysfs,
+see /sys/bus/event_source/devices/nvidia_nvdlink_pmu_<socket-id>.
+
+The list of events:
+
+  * IN_RD_CUM_OUTS: accumulated outstanding read requests (in cycles) to CXL memory.
+  * IN_RD_REQ: the number of read requests to CXL memory.
+  * CYCLES: NV-DLINK interface cycle counts.
+
+The events can be used to calculate the average latency of the read requests::
+
+   DLINK_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS
+
+   IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ
+   IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / DLINK_FREQ_IN_GHZ
+
+Example usage:
+
+  * Count read events to CXL memory::
+
+      perf stat -a -e '{nvidia_nvdlink_pmu_0/in_rd_req/,nvidia_nvdlink_pmu_0/in_rd_cum_outs/}'
diff --git a/Documentation/arch/arm64/index.rst b/Documentation/arch/arm64/index.rst
index af52edc8c0ac..98052b4ef4a1 100644
--- a/Documentation/arch/arm64/index.rst
+++ b/Documentation/arch/arm64/index.rst
@@ -23,6 +23,7 @@ ARM64 Architecture
     memory
     memory-tagging-extension
     mops
+    mpam
     perf
     pointer-authentication
     ptdump
diff --git a/Documentation/arch/arm64/mpam.rst b/Documentation/arch/arm64/mpam.rst
new file mode 100644
index 000000000000..570f51a8d4eb
--- /dev/null
+++ b/Documentation/arch/arm64/mpam.rst
@@ -0,0 +1,72 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====
+MPAM
+====
+
+What is MPAM
+============
+MPAM (Memory Partitioning and Monitoring) is a feature in the CPUs and memory
+system components such as the caches or memory controllers that allows memory
+traffic to be labelled, partitioned and monitored.
+
+Traffic is labelled by the CPU, based on the control or monitor group the
+current task is assigned to using resctrl.  Partitioning policy can be set
+using the schemata file in resctrl, and monitor values read via resctrl.
+See Documentation/filesystems/resctrl.rst for more details.
+
+This allows tasks that share memory system resources, such as caches, to be
+isolated from each other according to the partitioning policy, limiting the
+effect of so-called noisy neighbours.
+
+Supported Platforms
+===================
+Use of this feature requires CPU support, support in the memory system
+components, and a description from firmware of where the MPAM device controls
+are in the MMIO address space (e.g. the 'MPAM' ACPI table).
+
+The MMIO device that provides MPAM controls/monitors for a memory system
+component is called a memory system component (MSC).
+
+Because the user interface to MPAM is via resctrl, only MPAM features that are
+compatible with resctrl can be exposed to user-space.
+
+MSC are considered as a group based on the topology. MSC that correspond with
+the L3 cache are considered together; it is not possible to mix MSC between L2
+and L3 to 'cover' a resctrl schema.
+
+The supported features are:
+
+* Cache portion bitmap controls (CPOR) on the L2 or L3 caches.  To expose
+  CPOR at L2 or L3, every CPU must have a corresponding CPU cache at this
+  level that also supports the feature.  Mismatched big/little platforms are
+  not supported as resctrl's controls would then also depend on task
+  placement.
+
+* Memory bandwidth maximum controls (MBW_MAX) on or after the L3 cache.
+  resctrl uses the L3 cache-id to identify where the memory bandwidth
+  control is applied. For this reason the platform must have an L3 cache
+  with cache-id's supplied by firmware. (It doesn't need to support MPAM.)
+
+  To be exported as the 'MB' schema, the topology of the group of MSC chosen
+  must match the topology of the L3 cache so that the cache-id's can be
+  repainted. For example: Platforms with Memory bandwidth maximum controls
+  on CPU-less NUMA nodes cannot expose the 'MB' schema to resctrl as these
+  nodes do not have a corresponding L3 cache. If the memory bandwidth
+  control is on the memory rather than the L3 then there must be a single
+  global L3 as otherwise it is unknown which L3 the traffic came from. There
+  must be no caches between the L3 and the memory so that the two ends of
+  the path have equivalent traffic.
+
+  When the MPAM driver finds multiple groups of MSC it can use for the 'MB'
+  schema, it prefers the group closest to the L3 cache.
+
+* Cache Storage Usage (CSU) counters can expose the 'llc_occupancy' provided
+  there is at least one CSU monitor on each MSC that makes up the L3 group.
+  Exposing CSU counters from other caches or devices is not supported.
+
+Reporting Bugs
+==============
+If you are not seeing the counters or controls you expect please share the
+debug messages produced when enabling dynamic debug and booting with:
+dyndbg="file mpam_resctrl.c +pl"
diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst
index 4c300caad901..65ed6ea33751 100644
--- a/Documentation/arch/arm64/silicon-errata.rst
+++ b/Documentation/arch/arm64/silicon-errata.rst
@@ -214,6 +214,9 @@ stable kernels.
 +----------------+-----------------+-----------------+-----------------------------+
 | ARM            | SI L1           | #4311569        | ARM64_ERRATUM_4311569       |
 +----------------+-----------------+-----------------+-----------------------------+
+| ARM            | CMN-650         | #3642720        | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
 | Broadcom       | Brahma-B53      | N/A             | ARM64_ERRATUM_845719        |
 +----------------+-----------------+-----------------+-----------------------------+
 | Broadcom       | Brahma-B53      | N/A             | ARM64_ERRATUM_843419        |
@@ -247,6 +250,12 @@ stable kernels.
 +----------------+-----------------+-----------------+-----------------------------+
 | NVIDIA         | T241 GICv3/4.x  | T241-FABRIC-4   | N/A                         |
 +----------------+-----------------+-----------------+-----------------------------+
+| NVIDIA         | T241 MPAM       | T241-MPAM-1     | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
+| NVIDIA         | T241 MPAM       | T241-MPAM-4     | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
+| NVIDIA         | T241 MPAM       | T241-MPAM-6     | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
 +----------------+-----------------+-----------------+-----------------------------+
 | Freescale/NXP  | LS2080A/LS1043A | A-008585        | FSL_ERRATUM_A008585         |
 +----------------+-----------------+-----------------+-----------------------------+
diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h
index 2ec0e5e83fc9..ecfede0c0348 100644
--- a/arch/arm/include/asm/arm_pmuv3.h
+++ b/arch/arm/include/asm/arm_pmuv3.h
@@ -238,6 +238,13 @@ static inline void kvm_vcpu_pmu_resync_el0(void) {}
 
 static inline bool pmuv3_implemented(int pmuver)
 {
+	/*
+	 * PMUVer follows the standard ID scheme for an unsigned field with the
+	 * exception of 0xF (IMP_DEF) which is treated specially and implies
+	 * FEAT_PMUv3 is not implemented.
+	 *
+	 * See DDI0487L.a D24.1.3.2 for more details.
+	 */
 	return !(pmuver == ARMV8_PMU_DFR_VER_IMP_DEF ||
 		 pmuver == ARMV8_PMU_DFR_VER_NI);
 }
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 4de630e398ca..1cf37b30b861 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -61,32 +61,6 @@ config ARM64
 	select ARCH_HAVE_ELF_PROT
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_HAVE_TRACE_MMIO_ACCESS
-	select ARCH_INLINE_READ_LOCK if !PREEMPTION
-	select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION
-	select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPTION
-	select ARCH_INLINE_READ_LOCK_IRQSAVE if !PREEMPTION
-	select ARCH_INLINE_READ_UNLOCK if !PREEMPTION
-	select ARCH_INLINE_READ_UNLOCK_BH if !PREEMPTION
-	select ARCH_INLINE_READ_UNLOCK_IRQ if !PREEMPTION
-	select ARCH_INLINE_READ_UNLOCK_IRQRESTORE if !PREEMPTION
-	select ARCH_INLINE_WRITE_LOCK if !PREEMPTION
-	select ARCH_INLINE_WRITE_LOCK_BH if !PREEMPTION
-	select ARCH_INLINE_WRITE_LOCK_IRQ if !PREEMPTION
-	select ARCH_INLINE_WRITE_LOCK_IRQSAVE if !PREEMPTION
-	select ARCH_INLINE_WRITE_UNLOCK if !PREEMPTION
-	select ARCH_INLINE_WRITE_UNLOCK_BH if !PREEMPTION
-	select ARCH_INLINE_WRITE_UNLOCK_IRQ if !PREEMPTION
-	select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE if !PREEMPTION
-	select ARCH_INLINE_SPIN_TRYLOCK if !PREEMPTION
-	select ARCH_INLINE_SPIN_TRYLOCK_BH if !PREEMPTION
-	select ARCH_INLINE_SPIN_LOCK if !PREEMPTION
-	select ARCH_INLINE_SPIN_LOCK_BH if !PREEMPTION
-	select ARCH_INLINE_SPIN_LOCK_IRQ if !PREEMPTION
-	select ARCH_INLINE_SPIN_LOCK_IRQSAVE if !PREEMPTION
-	select ARCH_INLINE_SPIN_UNLOCK if !PREEMPTION
-	select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPTION
-	select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION
-	select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION
 	select ARCH_KEEP_MEMBLOCK
 	select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
 	select ARCH_USE_CMPXCHG_LOCKREF
@@ -2009,8 +1983,8 @@ config ARM64_TLB_RANGE
 
 config ARM64_MPAM
 	bool "Enable support for MPAM"
-	select ARM64_MPAM_DRIVER if EXPERT	# does nothing yet
-	select ACPI_MPAM if ACPI
+	select ARM64_MPAM_DRIVER
+	select ARCH_HAS_CPU_RESCTRL
 	help
 	  Memory System Resource Partitioning and Monitoring (MPAM) is an
 	  optional extension to the Arm architecture that allows each
@@ -2032,6 +2006,8 @@ config ARM64_MPAM
 
 	  MPAM is exposed to user-space via the resctrl pseudo filesystem.
 
+	  This option enables the extra context switch code.
+
 endmenu # "ARMv8.4 architectural features"
 
 menu "ARMv8.5 architectural features"
@@ -2208,6 +2184,26 @@ config ARM64_GCS
 
 endmenu # "ARMv9.4 architectural features"
 
+config AS_HAS_LSUI
+	def_bool $(as-instr,.arch_extension lsui)
+	help
+	  Supported by LLVM 20+ and binutils 2.45+.
+
+menu "ARMv9.6 architectural features"
+
+config ARM64_LSUI
+	bool "Support Unprivileged Load Store Instructions (LSUI)"
+	default y
+	depends on AS_HAS_LSUI && !CPU_BIG_ENDIAN
+	help
+	  The Unprivileged Load Store Instructions (LSUI) provide
+	  variants of load/store instructions that access user-space memory
+	  from the kernel without clearing the PSTATE.PAN bit.
+
+	  This feature is supported by LLVM 20+ and binutils 2.45+.
+
+endmenu # "ARMv9.6 architectural features"
+
 config ARM64_SVE
 	bool "ARM Scalable Vector Extension support"
 	default y
@@ -2365,7 +2361,7 @@ config CMDLINE
 	default ""
 	help
 	  Provide a set of default command-line options at build time by
-	  entering them here. As a minimum, you should specify the the
+	  entering them here. As a minimum, you should specify the
 	  root device (e.g. root=/dev/nfs).
 
 choice
diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 9148f5a31968..12aa6a283249 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -15,7 +15,7 @@
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 	.macro	__uaccess_ttbr0_disable, tmp1
 	mrs	\tmp1, ttbr1_el1			// swapper_pg_dir
-	bic	\tmp1, \tmp1, #TTBR_ASID_MASK
+	bic	\tmp1, \tmp1, #TTBRx_EL1_ASID_MASK
 	sub	\tmp1, \tmp1, #RESERVED_SWAPPER_OFFSET	// reserved_pg_dir
 	msr	ttbr0_el1, \tmp1			// set reserved TTBR0_EL1
 	add	\tmp1, \tmp1, #RESERVED_SWAPPER_OFFSET
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 177c691914f8..6e3da333442e 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -71,6 +71,8 @@ cpucap_is_possible(const unsigned int cap)
 		return true;
 	case ARM64_HAS_PMUV3:
 		return IS_ENABLED(CONFIG_HW_PERF_EVENTS);
+	case ARM64_HAS_LSUI:
+		return IS_ENABLED(CONFIG_ARM64_LSUI);
 	}
 
 	return true;
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index 85f4c1615472..4d15071a4f3f 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -513,7 +513,8 @@
 	check_override id_aa64pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT, .Linit_mpam_\@, .Lskip_mpam_\@, x1, x2
 
 .Linit_mpam_\@:
-	msr_s	SYS_MPAM2_EL2, xzr		// use the default partition
+	mov	x0, #MPAM2_EL2_EnMPAMSM_MASK
+	msr_s	SYS_MPAM2_EL2, x0		// use the default partition,
 						// and disable lower traps
 	mrs_s	x0, SYS_MPAMIDR_EL1
 	tbz	x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@  // skip if no MPAMHCR reg
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index bc06691d2062..d1d2ff9d323a 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -9,71 +9,292 @@
 #include <linux/uaccess.h>
 
 #include <asm/errno.h>
+#include <asm/lsui.h>
 
 #define FUTEX_MAX_LOOPS	128 /* What's the largest number you can think of? */
 
-#define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg)		\
-do {									\
+#define LLSC_FUTEX_ATOMIC_OP(op, insn)					\
+static __always_inline int						\
+__llsc_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)	\
+{									\
 	unsigned int loops = FUTEX_MAX_LOOPS;				\
+	int ret, oldval, newval;					\
 									\
 	uaccess_enable_privileged();					\
-	asm volatile(							\
-"	prfm	pstl1strm, %2\n"					\
-"1:	ldxr	%w1, %2\n"						\
+	asm volatile("// __llsc_futex_atomic_" #op "\n"			\
+"	prfm	pstl1strm, %[uaddr]\n"					\
+"1:	ldxr	%w[oldval], %[uaddr]\n"					\
 	insn "\n"							\
-"2:	stlxr	%w0, %w3, %2\n"						\
-"	cbz	%w0, 3f\n"						\
-"	sub	%w4, %w4, %w0\n"					\
-"	cbnz	%w4, 1b\n"						\
-"	mov	%w0, %w6\n"						\
+"2:	stlxr	%w[ret], %w[newval], %[uaddr]\n"			\
+"	cbz	%w[ret], 3f\n"						\
+"	sub	%w[loops], %w[loops], %w[ret]\n"			\
+"	cbnz	%w[loops], 1b\n"					\
+"	mov	%w[ret], %w[err]\n"					\
 "3:\n"									\
 "	dmb	ish\n"							\
-	_ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0)				\
-	_ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w0)				\
-	: "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp),	\
-	  "+r" (loops)							\
-	: "r" (oparg), "Ir" (-EAGAIN)					\
+	_ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret])			\
+	_ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret])			\
+	: [ret] "=&r" (ret), [oldval] "=&r" (oldval),			\
+	  [uaddr] "+Q" (*uaddr), [newval] "=&r" (newval),		\
+	  [loops] "+r" (loops)						\
+	: [oparg] "r" (oparg), [err] "Ir" (-EAGAIN)			\
 	: "memory");							\
 	uaccess_disable_privileged();					\
-} while (0)
+									\
+	if (!ret)							\
+		*oval = oldval;						\
+									\
+	return ret;							\
+}
+
+LLSC_FUTEX_ATOMIC_OP(add, "add	%w[newval], %w[oldval], %w[oparg]")
+LLSC_FUTEX_ATOMIC_OP(or,  "orr	%w[newval], %w[oldval], %w[oparg]")
+LLSC_FUTEX_ATOMIC_OP(and, "and	%w[newval], %w[oldval], %w[oparg]")
+LLSC_FUTEX_ATOMIC_OP(eor, "eor	%w[newval], %w[oldval], %w[oparg]")
+LLSC_FUTEX_ATOMIC_OP(set, "mov	%w[newval], %w[oparg]")
+
+static __always_inline int
+__llsc_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
+{
+	int ret = 0;
+	unsigned int loops = FUTEX_MAX_LOOPS;
+	u32 val, tmp;
+
+	uaccess_enable_privileged();
+	asm volatile("//__llsc_futex_cmpxchg\n"
+"	prfm	pstl1strm, %[uaddr]\n"
+"1:	ldxr	%w[curval], %[uaddr]\n"
+"	eor	%w[tmp], %w[curval], %w[oldval]\n"
+"	cbnz	%w[tmp], 4f\n"
+"2:	stlxr	%w[tmp], %w[newval], %[uaddr]\n"
+"	cbz	%w[tmp], 3f\n"
+"	sub	%w[loops], %w[loops], %w[tmp]\n"
+"	cbnz	%w[loops], 1b\n"
+"	mov	%w[ret], %w[err]\n"
+"3:\n"
+"	dmb	ish\n"
+"4:\n"
+	_ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w[ret])
+	_ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w[ret])
+	: [ret] "+r" (ret), [curval] "=&r" (val),
+	  [uaddr] "+Q" (*uaddr), [tmp] "=&r" (tmp),
+	  [loops] "+r" (loops)
+	: [oldval] "r" (oldval), [newval] "r" (newval),
+	  [err] "Ir" (-EAGAIN)
+	: "memory");
+	uaccess_disable_privileged();
+
+	if (!ret)
+		*oval = val;
+
+	return ret;
+}
+
+#ifdef CONFIG_ARM64_LSUI
+
+/*
+ * Wrap LSUI instructions with uaccess_ttbr0_enable()/disable(), as
+ * PAN toggling is not required.
+ */
+
+#define LSUI_FUTEX_ATOMIC_OP(op, asm_op)				\
+static __always_inline int						\
+__lsui_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)	\
+{									\
+	int ret = 0;							\
+	int oldval;							\
+									\
+	uaccess_ttbr0_enable();						\
+									\
+	asm volatile("// __lsui_futex_atomic_" #op "\n"			\
+	__LSUI_PREAMBLE							\
+"1:	" #asm_op "al	%w[oparg], %w[oldval], %[uaddr]\n"		\
+"2:\n"									\
+	_ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret])			\
+	: [ret] "+r" (ret), [uaddr] "+Q" (*uaddr),			\
+	  [oldval] "=r" (oldval)					\
+	: [oparg] "r" (oparg)						\
+	: "memory");							\
+									\
+	uaccess_ttbr0_disable();					\
+									\
+	if (!ret)							\
+		*oval = oldval;						\
+	return ret;							\
+}
+
+LSUI_FUTEX_ATOMIC_OP(add, ldtadd)
+LSUI_FUTEX_ATOMIC_OP(or, ldtset)
+LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr)
+LSUI_FUTEX_ATOMIC_OP(set, swpt)
+
+static __always_inline int
+__lsui_cmpxchg64(u64 __user *uaddr, u64 *oldval, u64 newval)
+{
+	int ret = 0;
+
+	uaccess_ttbr0_enable();
+
+	asm volatile("// __lsui_cmpxchg64\n"
+	__LSUI_PREAMBLE
+"1:	casalt	%[oldval], %[newval], %[uaddr]\n"
+"2:\n"
+	_ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret])
+	: [ret] "+r" (ret), [uaddr] "+Q" (*uaddr),
+	  [oldval] "+r" (*oldval)
+	: [newval] "r" (newval)
+	: "memory");
+
+	uaccess_ttbr0_disable();
+
+	return ret;
+}
+
+static __always_inline int
+__lsui_cmpxchg32(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
+{
+	u64 __user *uaddr64;
+	bool futex_pos, other_pos;
+	u32 other, orig_other;
+	union {
+		u32 futex[2];
+		u64 raw;
+	} oval64, orig64, nval64;
+
+	uaddr64 = (u64 __user *)PTR_ALIGN_DOWN(uaddr, sizeof(u64));
+	futex_pos = !IS_ALIGNED((unsigned long)uaddr, sizeof(u64));
+	other_pos = !futex_pos;
+
+	oval64.futex[futex_pos] = oldval;
+	if (get_user(oval64.futex[other_pos], (u32 __user *)uaddr64 + other_pos))
+		return -EFAULT;
+
+	orig64.raw = oval64.raw;
+
+	nval64.futex[futex_pos] = newval;
+	nval64.futex[other_pos] = oval64.futex[other_pos];
+
+	if (__lsui_cmpxchg64(uaddr64, &oval64.raw, nval64.raw))
+		return -EFAULT;
+
+	oldval = oval64.futex[futex_pos];
+	other = oval64.futex[other_pos];
+	orig_other = orig64.futex[other_pos];
+
+	if (other != orig_other)
+		return -EAGAIN;
+
+	*oval = oldval;
+
+	return 0;
+}
+
+static __always_inline int
+__lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval)
+{
+	/*
+	 * Undo the bitwise negation applied to the oparg passed from
+	 * arch_futex_atomic_op_inuser() with FUTEX_OP_ANDN.
+	 */
+	return __lsui_futex_atomic_andnot(~oparg, uaddr, oval);
+}
+
+static __always_inline int
+__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval)
+{
+	u32 oldval, newval, val;
+	int ret, i;
+
+	if (get_user(oldval, uaddr))
+		return -EFAULT;
+
+	/*
+	 * No ldteor/stteor instructions exist; emulate XOR with a cmpxchg loop.
+	 */
+	for (i = 0; i < FUTEX_MAX_LOOPS; i++) {
+		newval = oldval ^ oparg;
+
+		ret = __lsui_cmpxchg32(uaddr, oldval, newval, &val);
+		switch (ret) {
+		case -EFAULT:
+			return ret;
+		case -EAGAIN:
+			continue;
+		}
+
+		if (val == oldval) {
+			*oval = val;
+			return 0;
+		}
+
+		oldval = val;
+	}
+
+	return -EAGAIN;
+}
+
+static __always_inline int
+__lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
+{
+	/*
+	 * Callers of futex_atomic_cmpxchg_inatomic() already retry on
+	 * -EAGAIN, no need for another loop of max retries.
+	 */
+	return __lsui_cmpxchg32(uaddr, oldval, newval, oval);
+}
+#endif	/* CONFIG_ARM64_LSUI */
+
+
+#define FUTEX_ATOMIC_OP(op)						\
+static __always_inline int						\
+__futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval)		\
+{									\
+	return __lsui_llsc_body(futex_atomic_##op, oparg, uaddr, oval);	\
+}
+
+FUTEX_ATOMIC_OP(add)
+FUTEX_ATOMIC_OP(or)
+FUTEX_ATOMIC_OP(and)
+FUTEX_ATOMIC_OP(eor)
+FUTEX_ATOMIC_OP(set)
+
+static __always_inline int
+__futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
+{
+	return __lsui_llsc_body(futex_cmpxchg, uaddr, oldval, newval, oval);
+}
 
 static inline int
 arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *_uaddr)
 {
-	int oldval = 0, ret, tmp;
-	u32 __user *uaddr = __uaccess_mask_ptr(_uaddr);
+	int ret;
+	u32 __user *uaddr;
 
 	if (!access_ok(_uaddr, sizeof(u32)))
 		return -EFAULT;
 
+	uaddr = __uaccess_mask_ptr(_uaddr);
+
 	switch (op) {
 	case FUTEX_OP_SET:
-		__futex_atomic_op("mov	%w3, %w5",
-				  ret, oldval, uaddr, tmp, oparg);
+		ret = __futex_atomic_set(oparg, uaddr, oval);
 		break;
 	case FUTEX_OP_ADD:
-		__futex_atomic_op("add	%w3, %w1, %w5",
-				  ret, oldval, uaddr, tmp, oparg);
+		ret = __futex_atomic_add(oparg, uaddr, oval);
 		break;
 	case FUTEX_OP_OR:
-		__futex_atomic_op("orr	%w3, %w1, %w5",
-				  ret, oldval, uaddr, tmp, oparg);
+		ret = __futex_atomic_or(oparg, uaddr, oval);
 		break;
 	case FUTEX_OP_ANDN:
-		__futex_atomic_op("and	%w3, %w1, %w5",
-				  ret, oldval, uaddr, tmp, ~oparg);
+		ret = __futex_atomic_and(~oparg, uaddr, oval);
 		break;
 	case FUTEX_OP_XOR:
-		__futex_atomic_op("eor	%w3, %w1, %w5",
-				  ret, oldval, uaddr, tmp, oparg);
+		ret = __futex_atomic_eor(oparg, uaddr, oval);
 		break;
 	default:
 		ret = -ENOSYS;
 	}
 
-	if (!ret)
-		*oval = oldval;
-
 	return ret;
 }
 
@@ -81,40 +302,14 @@ static inline int
 futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr,
 			      u32 oldval, u32 newval)
 {
-	int ret = 0;
-	unsigned int loops = FUTEX_MAX_LOOPS;
-	u32 val, tmp;
 	u32 __user *uaddr;
 
 	if (!access_ok(_uaddr, sizeof(u32)))
 		return -EFAULT;
 
 	uaddr = __uaccess_mask_ptr(_uaddr);
-	uaccess_enable_privileged();
-	asm volatile("// futex_atomic_cmpxchg_inatomic\n"
-"	prfm	pstl1strm, %2\n"
-"1:	ldxr	%w1, %2\n"
-"	sub	%w3, %w1, %w5\n"
-"	cbnz	%w3, 4f\n"
-"2:	stlxr	%w3, %w6, %2\n"
-"	cbz	%w3, 3f\n"
-"	sub	%w4, %w4, %w3\n"
-"	cbnz	%w4, 1b\n"
-"	mov	%w0, %w7\n"
-"3:\n"
-"	dmb	ish\n"
-"4:\n"
-	_ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)
-	_ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)
-	: "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops)
-	: "r" (oldval), "r" (newval), "Ir" (-EAGAIN)
-	: "memory");
-	uaccess_disable_privileged();
 
-	if (!ret)
-		*uval = val;
-
-	return ret;
+	return __futex_cmpxchg(uaddr, oldval, newval, uval);
 }
 
 #endif /* __ASM_FUTEX_H */
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index e6f8ff3cc630..d038ff14d16c 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -71,23 +71,23 @@ static inline void __flush_hugetlb_tlb_range(struct vm_area_struct *vma,
 					     unsigned long start,
 					     unsigned long end,
 					     unsigned long stride,
-					     bool last_level)
+					     tlbf_t flags)
 {
 	switch (stride) {
 #ifndef __PAGETABLE_PMD_FOLDED
 	case PUD_SIZE:
-		__flush_tlb_range(vma, start, end, PUD_SIZE, last_level, 1);
+		__flush_tlb_range(vma, start, end, PUD_SIZE, 1, flags);
 		break;
 #endif
 	case CONT_PMD_SIZE:
 	case PMD_SIZE:
-		__flush_tlb_range(vma, start, end, PMD_SIZE, last_level, 2);
+		__flush_tlb_range(vma, start, end, PMD_SIZE, 2, flags);
 		break;
 	case CONT_PTE_SIZE:
-		__flush_tlb_range(vma, start, end, PAGE_SIZE, last_level, 3);
+		__flush_tlb_range(vma, start, end, PAGE_SIZE, 3, flags);
 		break;
 	default:
-		__flush_tlb_range(vma, start, end, PAGE_SIZE, last_level, TLBI_TTL_UNKNOWN);
+		__flush_tlb_range(vma, start, end, PAGE_SIZE, TLBI_TTL_UNKNOWN, flags);
 	}
 }
 
@@ -98,7 +98,7 @@ static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma,
 {
 	unsigned long stride = huge_page_size(hstate_vma(vma));
 
-	__flush_hugetlb_tlb_range(vma, start, end, stride, false);
+	__flush_hugetlb_tlb_range(vma, start, end, stride, TLBF_NONE);
 }
 
 #endif /* __ASM_HUGETLB_H */
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index 72ea4bda79f3..abe8218b2325 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -60,126 +60,10 @@
  * of KERNEL_HWCAP_{feature}.
  */
 #define __khwcap_feature(x)		const_ilog2(HWCAP_ ## x)
-#define KERNEL_HWCAP_FP			__khwcap_feature(FP)
-#define KERNEL_HWCAP_ASIMD		__khwcap_feature(ASIMD)
-#define KERNEL_HWCAP_EVTSTRM		__khwcap_feature(EVTSTRM)
-#define KERNEL_HWCAP_AES		__khwcap_feature(AES)
-#define KERNEL_HWCAP_PMULL		__khwcap_feature(PMULL)
-#define KERNEL_HWCAP_SHA1		__khwcap_feature(SHA1)
-#define KERNEL_HWCAP_SHA2		__khwcap_feature(SHA2)
-#define KERNEL_HWCAP_CRC32		__khwcap_feature(CRC32)
-#define KERNEL_HWCAP_ATOMICS		__khwcap_feature(ATOMICS)
-#define KERNEL_HWCAP_FPHP		__khwcap_feature(FPHP)
-#define KERNEL_HWCAP_ASIMDHP		__khwcap_feature(ASIMDHP)
-#define KERNEL_HWCAP_CPUID		__khwcap_feature(CPUID)
-#define KERNEL_HWCAP_ASIMDRDM		__khwcap_feature(ASIMDRDM)
-#define KERNEL_HWCAP_JSCVT		__khwcap_feature(JSCVT)
-#define KERNEL_HWCAP_FCMA		__khwcap_feature(FCMA)
-#define KERNEL_HWCAP_LRCPC		__khwcap_feature(LRCPC)
-#define KERNEL_HWCAP_DCPOP		__khwcap_feature(DCPOP)
-#define KERNEL_HWCAP_SHA3		__khwcap_feature(SHA3)
-#define KERNEL_HWCAP_SM3		__khwcap_feature(SM3)
-#define KERNEL_HWCAP_SM4		__khwcap_feature(SM4)
-#define KERNEL_HWCAP_ASIMDDP		__khwcap_feature(ASIMDDP)
-#define KERNEL_HWCAP_SHA512		__khwcap_feature(SHA512)
-#define KERNEL_HWCAP_SVE		__khwcap_feature(SVE)
-#define KERNEL_HWCAP_ASIMDFHM		__khwcap_feature(ASIMDFHM)
-#define KERNEL_HWCAP_DIT		__khwcap_feature(DIT)
-#define KERNEL_HWCAP_USCAT		__khwcap_feature(USCAT)
-#define KERNEL_HWCAP_ILRCPC		__khwcap_feature(ILRCPC)
-#define KERNEL_HWCAP_FLAGM		__khwcap_feature(FLAGM)
-#define KERNEL_HWCAP_SSBS		__khwcap_feature(SSBS)
-#define KERNEL_HWCAP_SB			__khwcap_feature(SB)
-#define KERNEL_HWCAP_PACA		__khwcap_feature(PACA)
-#define KERNEL_HWCAP_PACG		__khwcap_feature(PACG)
-#define KERNEL_HWCAP_GCS		__khwcap_feature(GCS)
-#define KERNEL_HWCAP_CMPBR		__khwcap_feature(CMPBR)
-#define KERNEL_HWCAP_FPRCVT		__khwcap_feature(FPRCVT)
-#define KERNEL_HWCAP_F8MM8		__khwcap_feature(F8MM8)
-#define KERNEL_HWCAP_F8MM4		__khwcap_feature(F8MM4)
-#define KERNEL_HWCAP_SVE_F16MM		__khwcap_feature(SVE_F16MM)
-#define KERNEL_HWCAP_SVE_ELTPERM	__khwcap_feature(SVE_ELTPERM)
-#define KERNEL_HWCAP_SVE_AES2		__khwcap_feature(SVE_AES2)
-#define KERNEL_HWCAP_SVE_BFSCALE	__khwcap_feature(SVE_BFSCALE)
-#define KERNEL_HWCAP_SVE2P2		__khwcap_feature(SVE2P2)
-#define KERNEL_HWCAP_SME2P2		__khwcap_feature(SME2P2)
-#define KERNEL_HWCAP_SME_SBITPERM	__khwcap_feature(SME_SBITPERM)
-#define KERNEL_HWCAP_SME_AES		__khwcap_feature(SME_AES)
-#define KERNEL_HWCAP_SME_SFEXPA		__khwcap_feature(SME_SFEXPA)
-#define KERNEL_HWCAP_SME_STMOP		__khwcap_feature(SME_STMOP)
-#define KERNEL_HWCAP_SME_SMOP4		__khwcap_feature(SME_SMOP4)
-
 #define __khwcap2_feature(x)		(const_ilog2(HWCAP2_ ## x) + 64)
-#define KERNEL_HWCAP_DCPODP		__khwcap2_feature(DCPODP)
-#define KERNEL_HWCAP_SVE2		__khwcap2_feature(SVE2)
-#define KERNEL_HWCAP_SVEAES		__khwcap2_feature(SVEAES)
-#define KERNEL_HWCAP_SVEPMULL		__khwcap2_feature(SVEPMULL)
-#define KERNEL_HWCAP_SVEBITPERM		__khwcap2_feature(SVEBITPERM)
-#define KERNEL_HWCAP_SVESHA3		__khwcap2_feature(SVESHA3)
-#define KERNEL_HWCAP_SVESM4		__khwcap2_feature(SVESM4)
-#define KERNEL_HWCAP_FLAGM2		__khwcap2_feature(FLAGM2)
-#define KERNEL_HWCAP_FRINT		__khwcap2_feature(FRINT)
-#define KERNEL_HWCAP_SVEI8MM		__khwcap2_feature(SVEI8MM)
-#define KERNEL_HWCAP_SVEF32MM		__khwcap2_feature(SVEF32MM)
-#define KERNEL_HWCAP_SVEF64MM		__khwcap2_feature(SVEF64MM)
-#define KERNEL_HWCAP_SVEBF16		__khwcap2_feature(SVEBF16)
-#define KERNEL_HWCAP_I8MM		__khwcap2_feature(I8MM)
-#define KERNEL_HWCAP_BF16		__khwcap2_feature(BF16)
-#define KERNEL_HWCAP_DGH		__khwcap2_feature(DGH)
-#define KERNEL_HWCAP_RNG		__khwcap2_feature(RNG)
-#define KERNEL_HWCAP_BTI		__khwcap2_feature(BTI)
-#define KERNEL_HWCAP_MTE		__khwcap2_feature(MTE)
-#define KERNEL_HWCAP_ECV		__khwcap2_feature(ECV)
-#define KERNEL_HWCAP_AFP		__khwcap2_feature(AFP)
-#define KERNEL_HWCAP_RPRES		__khwcap2_feature(RPRES)
-#define KERNEL_HWCAP_MTE3		__khwcap2_feature(MTE3)
-#define KERNEL_HWCAP_SME		__khwcap2_feature(SME)
-#define KERNEL_HWCAP_SME_I16I64		__khwcap2_feature(SME_I16I64)
-#define KERNEL_HWCAP_SME_F64F64		__khwcap2_feature(SME_F64F64)
-#define KERNEL_HWCAP_SME_I8I32		__khwcap2_feature(SME_I8I32)
-#define KERNEL_HWCAP_SME_F16F32		__khwcap2_feature(SME_F16F32)
-#define KERNEL_HWCAP_SME_B16F32		__khwcap2_feature(SME_B16F32)
-#define KERNEL_HWCAP_SME_F32F32		__khwcap2_feature(SME_F32F32)
-#define KERNEL_HWCAP_SME_FA64		__khwcap2_feature(SME_FA64)
-#define KERNEL_HWCAP_WFXT		__khwcap2_feature(WFXT)
-#define KERNEL_HWCAP_EBF16		__khwcap2_feature(EBF16)
-#define KERNEL_HWCAP_SVE_EBF16		__khwcap2_feature(SVE_EBF16)
-#define KERNEL_HWCAP_CSSC		__khwcap2_feature(CSSC)
-#define KERNEL_HWCAP_RPRFM		__khwcap2_feature(RPRFM)
-#define KERNEL_HWCAP_SVE2P1		__khwcap2_feature(SVE2P1)
-#define KERNEL_HWCAP_SME2		__khwcap2_feature(SME2)
-#define KERNEL_HWCAP_SME2P1		__khwcap2_feature(SME2P1)
-#define KERNEL_HWCAP_SME_I16I32		__khwcap2_feature(SME_I16I32)
-#define KERNEL_HWCAP_SME_BI32I32	__khwcap2_feature(SME_BI32I32)
-#define KERNEL_HWCAP_SME_B16B16		__khwcap2_feature(SME_B16B16)
-#define KERNEL_HWCAP_SME_F16F16		__khwcap2_feature(SME_F16F16)
-#define KERNEL_HWCAP_MOPS		__khwcap2_feature(MOPS)
-#define KERNEL_HWCAP_HBC		__khwcap2_feature(HBC)
-#define KERNEL_HWCAP_SVE_B16B16		__khwcap2_feature(SVE_B16B16)
-#define KERNEL_HWCAP_LRCPC3		__khwcap2_feature(LRCPC3)
-#define KERNEL_HWCAP_LSE128		__khwcap2_feature(LSE128)
-#define KERNEL_HWCAP_FPMR		__khwcap2_feature(FPMR)
-#define KERNEL_HWCAP_LUT		__khwcap2_feature(LUT)
-#define KERNEL_HWCAP_FAMINMAX		__khwcap2_feature(FAMINMAX)
-#define KERNEL_HWCAP_F8CVT		__khwcap2_feature(F8CVT)
-#define KERNEL_HWCAP_F8FMA		__khwcap2_feature(F8FMA)
-#define KERNEL_HWCAP_F8DP4		__khwcap2_feature(F8DP4)
-#define KERNEL_HWCAP_F8DP2		__khwcap2_feature(F8DP2)
-#define KERNEL_HWCAP_F8E4M3		__khwcap2_feature(F8E4M3)
-#define KERNEL_HWCAP_F8E5M2		__khwcap2_feature(F8E5M2)
-#define KERNEL_HWCAP_SME_LUTV2		__khwcap2_feature(SME_LUTV2)
-#define KERNEL_HWCAP_SME_F8F16		__khwcap2_feature(SME_F8F16)
-#define KERNEL_HWCAP_SME_F8F32		__khwcap2_feature(SME_F8F32)
-#define KERNEL_HWCAP_SME_SF8FMA		__khwcap2_feature(SME_SF8FMA)
-#define KERNEL_HWCAP_SME_SF8DP4		__khwcap2_feature(SME_SF8DP4)
-#define KERNEL_HWCAP_SME_SF8DP2		__khwcap2_feature(SME_SF8DP2)
-#define KERNEL_HWCAP_POE		__khwcap2_feature(POE)
-
 #define __khwcap3_feature(x)		(const_ilog2(HWCAP3_ ## x) + 128)
-#define KERNEL_HWCAP_MTE_FAR		__khwcap3_feature(MTE_FAR)
-#define KERNEL_HWCAP_MTE_STORE_ONLY	__khwcap3_feature(MTE_STORE_ONLY)
-#define KERNEL_HWCAP_LSFE		__khwcap3_feature(LSFE)
-#define KERNEL_HWCAP_LS64		__khwcap3_feature(LS64)
+
+#include "asm/kernel-hwcap.h"
 
 /*
  * This yields a mask that user programs can use to figure out what
diff --git a/arch/arm64/include/asm/lsui.h b/arch/arm64/include/asm/lsui.h
new file mode 100644
index 000000000000..8f0d81953eb6
--- /dev/null
+++ b/arch/arm64/include/asm/lsui.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_LSUI_H
+#define __ASM_LSUI_H
+
+#include <linux/compiler_types.h>
+#include <linux/stringify.h>
+#include <asm/alternative.h>
+#include <asm/alternative-macros.h>
+#include <asm/cpucaps.h>
+
+#define __LSUI_PREAMBLE	".arch_extension lsui\n"
+
+#ifdef CONFIG_ARM64_LSUI
+
+#define __lsui_llsc_body(op, ...)					\
+({									\
+	alternative_has_cap_unlikely(ARM64_HAS_LSUI) ?			\
+		__lsui_##op(__VA_ARGS__) : __llsc_##op(__VA_ARGS__);	\
+})
+
+#else	/* CONFIG_ARM64_LSUI */
+
+#define __lsui_llsc_body(op, ...)	__llsc_##op(__VA_ARGS__)
+
+#endif	/* CONFIG_ARM64_LSUI */
+
+#endif	/* __ASM_LSUI_H */
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 137a173df1ff..5e1211c540ab 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -10,20 +10,12 @@
 #define MMCF_AARCH32	0x1	/* mm context flag for AArch32 executables */
 #define USER_ASID_BIT	48
 #define USER_ASID_FLAG	(UL(1) << USER_ASID_BIT)
-#define TTBR_ASID_MASK	(UL(0xffff) << 48)
 
 #ifndef __ASSEMBLER__
 
 #include <linux/refcount.h>
 #include <asm/cpufeature.h>
 
-enum pgtable_type {
-	TABLE_PTE,
-	TABLE_PMD,
-	TABLE_PUD,
-	TABLE_P4D,
-};
-
 typedef struct {
 	atomic64_t	id;
 #ifdef CONFIG_COMPAT
@@ -112,5 +104,7 @@ void kpti_install_ng_mappings(void);
 static inline void kpti_install_ng_mappings(void) {}
 #endif
 
+extern bool page_alloc_available;
+
 #endif	/* !__ASSEMBLER__ */
 #endif
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index cc80af59c69e..803b68758152 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -210,7 +210,8 @@ static inline void update_saved_ttbr0(struct task_struct *tsk,
 	if (mm == &init_mm)
 		ttbr = phys_to_ttbr(__pa_symbol(reserved_pg_dir));
 	else
-		ttbr = phys_to_ttbr(virt_to_phys(mm->pgd)) | ASID(mm) << 48;
+		ttbr = phys_to_ttbr(virt_to_phys(mm->pgd)) |
+		       FIELD_PREP(TTBRx_EL1_ASID_MASK, ASID(mm));
 
 	WRITE_ONCE(task_thread_info(tsk)->ttbr0, ttbr);
 }
diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h
new file mode 100644
index 000000000000..70d396e7b6da
--- /dev/null
+++ b/arch/arm64/include/asm/mpam.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2025 Arm Ltd. */
+
+#ifndef __ASM__MPAM_H
+#define __ASM__MPAM_H
+
+#include <linux/arm_mpam.h>
+#include <linux/bitfield.h>
+#include <linux/jump_label.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+
+#include <asm/sysreg.h>
+
+DECLARE_STATIC_KEY_FALSE(mpam_enabled);
+DECLARE_PER_CPU(u64, arm64_mpam_default);
+DECLARE_PER_CPU(u64, arm64_mpam_current);
+
+/*
+ * The value of the MPAM0_EL1 sysreg when a task is in resctrl's default group.
+ * This is used by the context switch code to use the resctrl CPU property
+ * instead. The value is modified when CDP is enabled/disabled by mounting
+ * the resctrl filesystem.
+ */
+extern u64 arm64_mpam_global_default;
+
+#ifdef CONFIG_ARM64_MPAM
+static inline u64 __mpam_regval(u16 partid_d, u16 partid_i, u8 pmg_d, u8 pmg_i)
+{
+	return FIELD_PREP(MPAM0_EL1_PARTID_D, partid_d) |
+	       FIELD_PREP(MPAM0_EL1_PARTID_I, partid_i) |
+	       FIELD_PREP(MPAM0_EL1_PMG_D, pmg_d) |
+	       FIELD_PREP(MPAM0_EL1_PMG_I, pmg_i);
+}
+
+static inline void mpam_set_cpu_defaults(int cpu, u16 partid_d, u16 partid_i,
+					 u8 pmg_d, u8 pmg_i)
+{
+	u64 default_val = __mpam_regval(partid_d, partid_i, pmg_d, pmg_i);
+
+	WRITE_ONCE(per_cpu(arm64_mpam_default, cpu), default_val);
+}
+
+/*
+ * The resctrl filesystem writes to the partid/pmg values for threads and CPUs,
+ * which may race with reads in mpam_thread_switch(). Ensure only one of the old
+ * or new values is used. Particular care should be taken with the pmg field as
+ * mpam_thread_switch() may read a partid and pmg that don't match, causing this
+ * value to be stored with cache allocations, despite being considered 'free' by
+ * resctrl.
+ */
+static inline u64 mpam_get_regval(struct task_struct *tsk)
+{
+	return READ_ONCE(task_thread_info(tsk)->mpam_partid_pmg);
+}
+
+static inline void mpam_set_task_partid_pmg(struct task_struct *tsk,
+					    u16 partid_d, u16 partid_i,
+					    u8 pmg_d, u8 pmg_i)
+{
+	u64 regval = __mpam_regval(partid_d, partid_i, pmg_d, pmg_i);
+
+	WRITE_ONCE(task_thread_info(tsk)->mpam_partid_pmg, regval);
+}
+
+static inline void mpam_thread_switch(struct task_struct *tsk)
+{
+	u64 oldregval;
+	int cpu = smp_processor_id();
+	u64 regval = mpam_get_regval(tsk);
+
+	if (!static_branch_likely(&mpam_enabled))
+		return;
+
+	if (regval == READ_ONCE(arm64_mpam_global_default))
+		regval = READ_ONCE(per_cpu(arm64_mpam_default, cpu));
+
+	oldregval = READ_ONCE(per_cpu(arm64_mpam_current, cpu));
+	if (oldregval == regval)
+		return;
+
+	write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1);
+	if (system_supports_sme())
+		write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), SYS_MPAMSM_EL1);
+	isb();
+
+	/* Synchronising the EL0 write is left until the ERET to EL0 */
+	write_sysreg_s(regval, SYS_MPAM0_EL1);
+
+	WRITE_ONCE(per_cpu(arm64_mpam_current, cpu), regval);
+}
+#else
+static inline void mpam_thread_switch(struct task_struct *tsk) {}
+#endif /* CONFIG_ARM64_MPAM */
+
+#endif /* __ASM__MPAM_H */
diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index 6d4a78b9dc3e..7f7b97e09996 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -252,6 +252,9 @@ static inline void mte_check_tfsr_entry(void)
 	if (!kasan_hw_tags_enabled())
 		return;
 
+	if (!system_uses_mte_async_or_asymm_mode())
+		return;
+
 	mte_check_tfsr_el1();
 }
 
@@ -260,6 +263,9 @@ static inline void mte_check_tfsr_exit(void)
 	if (!kasan_hw_tags_enabled())
 		return;
 
+	if (!system_uses_mte_async_or_asymm_mode())
+		return;
+
 	/*
 	 * The asynchronous faults are sync'ed automatically with
 	 * TFSR_EL1 on kernel entry but for exit an explicit dsb()
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index d49180bb7cb3..72f31800c703 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -223,8 +223,6 @@
  */
 #define S1_TABLE_AP		(_AT(pmdval_t, 3) << 61)
 
-#define TTBR_CNP_BIT		(UL(1) << 0)
-
 /*
  * TCR flags.
  */
@@ -287,9 +285,12 @@
 #endif
 
 #ifdef CONFIG_ARM64_VA_BITS_52
+#define PTRS_PER_PGD_52_VA (UL(1) << (52 - PGDIR_SHIFT))
+#define PTRS_PER_PGD_48_VA (UL(1) << (48 - PGDIR_SHIFT))
+#define PTRS_PER_PGD_EXTRA (PTRS_PER_PGD_52_VA - PTRS_PER_PGD_48_VA)
+
 /* Must be at least 64-byte aligned to prevent corruption of the TTBR */
-#define TTBR1_BADDR_4852_OFFSET	(((UL(1) << (52 - PGDIR_SHIFT)) - \
-				 (UL(1) << (48 - PGDIR_SHIFT))) * 8)
+#define TTBR1_BADDR_4852_OFFSET (PTRS_PER_PGD_EXTRA << PTDESC_ORDER)
 #endif
 
 #endif
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index f560e6420267..212ce1b02e15 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -25,6 +25,8 @@
  */
 #define PTE_PRESENT_INVALID	(PTE_NG)		 /* only when !PTE_VALID */
 
+#define PTE_PRESENT_VALID_KERNEL (PTE_VALID | PTE_MAYBE_NG)
+
 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
 #define PTE_UFFD_WP		(_AT(pteval_t, 1) << 58) /* uffd-wp tracking */
 #define PTE_SWP_UFFD_WP		(_AT(pteval_t, 1) << 3)	 /* only for swp ptes */
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index b3e58735c49b..308e29e829b8 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -89,9 +89,9 @@ static inline void arch_leave_lazy_mmu_mode(void)
 
 /* Set stride and tlb_level in flush_*_tlb_range */
 #define flush_pmd_tlb_range(vma, addr, end)	\
-	__flush_tlb_range(vma, addr, end, PMD_SIZE, false, 2)
+	__flush_tlb_range(vma, addr, end, PMD_SIZE, 2, TLBF_NONE)
 #define flush_pud_tlb_range(vma, addr, end)	\
-	__flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1)
+	__flush_tlb_range(vma, addr, end, PUD_SIZE, 1, TLBF_NONE)
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 /*
@@ -101,10 +101,11 @@ static inline void arch_leave_lazy_mmu_mode(void)
  * entries exist.
  */
 #define flush_tlb_fix_spurious_fault(vma, address, ptep)	\
-	local_flush_tlb_page_nonotify(vma, address)
+	__flush_tlb_page(vma, address, TLBF_NOBROADCAST | TLBF_NONOTIFY)
 
-#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp)	\
-	local_flush_tlb_page_nonotify(vma, address)
+#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp)			\
+	__flush_tlb_range(vma, address, address + PMD_SIZE, PMD_SIZE, 2,	\
+			  TLBF_NOBROADCAST | TLBF_NONOTIFY | TLBF_NOWALKCACHE)
 
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
@@ -322,9 +323,11 @@ static inline pte_t pte_mknoncont(pte_t pte)
 	return clear_pte_bit(pte, __pgprot(PTE_CONT));
 }
 
-static inline pte_t pte_mkvalid(pte_t pte)
+static inline pte_t pte_mkvalid_k(pte_t pte)
 {
-	return set_pte_bit(pte, __pgprot(PTE_VALID));
+	pte = clear_pte_bit(pte, __pgprot(PTE_PRESENT_INVALID));
+	pte = set_pte_bit(pte, __pgprot(PTE_PRESENT_VALID_KERNEL));
+	return pte;
 }
 
 static inline pte_t pte_mkinvalid(pte_t pte)
@@ -594,6 +597,7 @@ static inline int pmd_protnone(pmd_t pmd)
 #define pmd_mkclean(pmd)	pte_pmd(pte_mkclean(pmd_pte(pmd)))
 #define pmd_mkdirty(pmd)	pte_pmd(pte_mkdirty(pmd_pte(pmd)))
 #define pmd_mkyoung(pmd)	pte_pmd(pte_mkyoung(pmd_pte(pmd)))
+#define pmd_mkvalid_k(pmd)	pte_pmd(pte_mkvalid_k(pmd_pte(pmd)))
 #define pmd_mkinvalid(pmd)	pte_pmd(pte_mkinvalid(pmd_pte(pmd)))
 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
 #define pmd_uffd_wp(pmd)	pte_uffd_wp(pmd_pte(pmd))
@@ -635,6 +639,8 @@ static inline pmd_t pmd_mkspecial(pmd_t pmd)
 
 #define pud_young(pud)		pte_young(pud_pte(pud))
 #define pud_mkyoung(pud)	pte_pud(pte_mkyoung(pud_pte(pud)))
+#define pud_mkwrite_novma(pud)	pte_pud(pte_mkwrite_novma(pud_pte(pud)))
+#define pud_mkvalid_k(pud)	pte_pud(pte_mkvalid_k(pud_pte(pud)))
 #define pud_write(pud)		pte_write(pud_pte(pud))
 
 static inline pud_t pud_mkhuge(pud_t pud)
@@ -779,9 +785,13 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 
 #define pmd_table(pmd)		((pmd_val(pmd) & PMD_TYPE_MASK) == \
 				 PMD_TYPE_TABLE)
-#define pmd_sect(pmd)		((pmd_val(pmd) & PMD_TYPE_MASK) == \
-				 PMD_TYPE_SECT)
-#define pmd_leaf(pmd)		(pmd_present(pmd) && !pmd_table(pmd))
+
+#define pmd_leaf pmd_leaf
+static inline bool pmd_leaf(pmd_t pmd)
+{
+	return pmd_present(pmd) && !pmd_table(pmd);
+}
+
 #define pmd_bad(pmd)		(!pmd_table(pmd))
 
 #define pmd_leaf_size(pmd)	(pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE)
@@ -799,11 +809,8 @@ static inline int pmd_trans_huge(pmd_t pmd)
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
-static inline bool pud_sect(pud_t pud) { return false; }
 static inline bool pud_table(pud_t pud) { return true; }
 #else
-#define pud_sect(pud)		((pud_val(pud) & PUD_TYPE_MASK) == \
-				 PUD_TYPE_SECT)
 #define pud_table(pud)		((pud_val(pud) & PUD_TYPE_MASK) == \
 				 PUD_TYPE_TABLE)
 #endif
@@ -873,7 +880,11 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
 				 PUD_TYPE_TABLE)
 #define pud_present(pud)	pte_present(pud_pte(pud))
 #ifndef __PAGETABLE_PMD_FOLDED
-#define pud_leaf(pud)		(pud_present(pud) && !pud_table(pud))
+#define pud_leaf pud_leaf
+static inline bool pud_leaf(pud_t pud)
+{
+	return pud_present(pud) && !pud_table(pud);
+}
 #else
 #define pud_leaf(pud)		false
 #endif
@@ -1247,9 +1258,18 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
 	return pte_pmd(pte_modify(pmd_pte(pmd), newprot));
 }
 
-extern int __ptep_set_access_flags(struct vm_area_struct *vma,
-				 unsigned long address, pte_t *ptep,
-				 pte_t entry, int dirty);
+extern int __ptep_set_access_flags_anysz(struct vm_area_struct *vma,
+					 unsigned long address, pte_t *ptep,
+					 pte_t entry, int dirty,
+					 unsigned long pgsize);
+
+static inline int __ptep_set_access_flags(struct vm_area_struct *vma,
+					  unsigned long address, pte_t *ptep,
+					  pte_t entry, int dirty)
+{
+	return __ptep_set_access_flags_anysz(vma, address, ptep, entry, dirty,
+					     PAGE_SIZE);
+}
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
@@ -1257,8 +1277,8 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
 					unsigned long address, pmd_t *pmdp,
 					pmd_t entry, int dirty)
 {
-	return __ptep_set_access_flags(vma, address, (pte_t *)pmdp,
-							pmd_pte(entry), dirty);
+	return __ptep_set_access_flags_anysz(vma, address, (pte_t *)pmdp,
+					     pmd_pte(entry), dirty, PMD_SIZE);
 }
 #endif
 
@@ -1320,7 +1340,7 @@ static inline int __ptep_clear_flush_young(struct vm_area_struct *vma,
 		 * context-switch, which provides a DSB to complete the TLB
 		 * invalidation.
 		 */
-		flush_tlb_page_nosync(vma, address);
+		__flush_tlb_page(vma, address, TLBF_NOSYNC);
 	}
 
 	return young;
diff --git a/arch/arm64/include/asm/resctrl.h b/arch/arm64/include/asm/resctrl.h
new file mode 100644
index 000000000000..b506e95cf6e3
--- /dev/null
+++ b/arch/arm64/include/asm/resctrl.h
@@ -0,0 +1,2 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/arm_mpam.h>
diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h
index 0fbc2e7867d3..a15a2968e7b6 100644
--- a/arch/arm64/include/asm/scs.h
+++ b/arch/arm64/include/asm/scs.h
@@ -10,6 +10,11 @@
 #ifdef CONFIG_SHADOW_CALL_STACK
 	scs_sp	.req	x18
 
+	.macro scs_load_current_base
+	get_current_task scs_sp
+	ldr	scs_sp, [scs_sp, #TSK_TI_SCS_BASE]
+	.endm
+
 	.macro scs_load_current
 	get_current_task scs_sp
 	ldr	scs_sp, [scs_sp, #TSK_TI_SCS_SP]
@@ -19,6 +24,9 @@
 	str	scs_sp, [\tsk, #TSK_TI_SCS_SP]
 	.endm
 #else
+	.macro scs_load_current_base
+	.endm
+
 	.macro scs_load_current
 	.endm
 
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 7942478e4065..5d7fe3e153c8 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -41,6 +41,9 @@ struct thread_info {
 #ifdef CONFIG_SHADOW_CALL_STACK
 	void			*scs_base;
 	void			*scs_sp;
+#endif
+#ifdef CONFIG_ARM64_MPAM
+	u64			mpam_partid_pmg;
 #endif
 	u32			cpu;
 };
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index 8d762607285c..10869d7731b8 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -53,7 +53,7 @@ static inline int tlb_get_level(struct mmu_gather *tlb)
 static inline void tlb_flush(struct mmu_gather *tlb)
 {
 	struct vm_area_struct vma = TLB_FLUSH_VMA(tlb->mm, 0);
-	bool last_level = !tlb->freed_tables;
+	tlbf_t flags = tlb->freed_tables ? TLBF_NONE : TLBF_NOWALKCACHE;
 	unsigned long stride = tlb_get_unmap_size(tlb);
 	int tlb_level = tlb_get_level(tlb);
 
@@ -63,13 +63,13 @@ static inline void tlb_flush(struct mmu_gather *tlb)
 	 * reallocate our ASID without invalidating the entire TLB.
 	 */
 	if (tlb->fullmm) {
-		if (!last_level)
+		if (tlb->freed_tables)
 			flush_tlb_mm(tlb->mm);
 		return;
 	}
 
 	__flush_tlb_range(&vma, tlb->start, tlb->end, stride,
-			  last_level, tlb_level);
+			  tlb_level, flags);
 }
 
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 1416e652612b..47fa4d39a461 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -97,24 +97,69 @@ static inline unsigned long get_trans_granule(void)
 
 #define TLBI_TTL_UNKNOWN	INT_MAX
 
-#define __tlbi_level(op, addr, level) do {				\
-	u64 arg = addr;							\
-									\
-	if (alternative_has_cap_unlikely(ARM64_HAS_ARMv8_4_TTL) &&	\
-	    level >= 0 && level <= 3) {					\
-		u64 ttl = level & 3;					\
-		ttl |= get_trans_granule() << 2;			\
-		arg &= ~TLBI_TTL_MASK;					\
-		arg |= FIELD_PREP(TLBI_TTL_MASK, ttl);			\
-	}								\
-									\
-	__tlbi(op, arg);						\
-} while(0)
+typedef void (*tlbi_op)(u64 arg);
 
-#define __tlbi_user_level(op, arg, level) do {				\
-	if (arm64_kernel_unmapped_at_el0())				\
-		__tlbi_level(op, (arg | USER_ASID_FLAG), level);	\
-} while (0)
+static __always_inline void vae1is(u64 arg)
+{
+	__tlbi(vae1is, arg);
+	__tlbi_user(vae1is, arg);
+}
+
+static __always_inline void vae2is(u64 arg)
+{
+	__tlbi(vae2is, arg);
+}
+
+static __always_inline void vale1(u64 arg)
+{
+	__tlbi(vale1, arg);
+	__tlbi_user(vale1, arg);
+}
+
+static __always_inline void vale1is(u64 arg)
+{
+	__tlbi(vale1is, arg);
+	__tlbi_user(vale1is, arg);
+}
+
+static __always_inline void vale2is(u64 arg)
+{
+	__tlbi(vale2is, arg);
+}
+
+static __always_inline void vaale1is(u64 arg)
+{
+	__tlbi(vaale1is, arg);
+}
+
+static __always_inline void ipas2e1(u64 arg)
+{
+	__tlbi(ipas2e1, arg);
+}
+
+static __always_inline void ipas2e1is(u64 arg)
+{
+	__tlbi(ipas2e1is, arg);
+}
+
+static __always_inline void __tlbi_level_asid(tlbi_op op, u64 addr, u32 level,
+					      u16 asid)
+{
+	u64 arg = __TLBI_VADDR(addr, asid);
+
+	if (alternative_has_cap_unlikely(ARM64_HAS_ARMv8_4_TTL) && level <= 3) {
+		u64 ttl = level | (get_trans_granule() << 2);
+
+		FIELD_MODIFY(TLBI_TTL_MASK, &arg, ttl);
+	}
+
+	op(arg);
+}
+
+static __always_inline void __tlbi_level(tlbi_op op, u64 addr, u32 level)
+{
+	__tlbi_level_asid(op, addr, level, 0);	/* ASID operand is 0 here */
+}
 
 /*
  * This macro creates a properly formatted VA operand for the TLB RANGE. The
@@ -141,19 +186,6 @@ static inline unsigned long get_trans_granule(void)
 #define TLBIR_TTL_MASK		GENMASK_ULL(38, 37)
 #define TLBIR_BADDR_MASK	GENMASK_ULL(36,  0)
 
-#define __TLBI_VADDR_RANGE(baddr, asid, scale, num, ttl)		\
-	({								\
-		unsigned long __ta = 0;					\
-		unsigned long __ttl = (ttl >= 1 && ttl <= 3) ? ttl : 0;	\
-		__ta |= FIELD_PREP(TLBIR_BADDR_MASK, baddr);		\
-		__ta |= FIELD_PREP(TLBIR_TTL_MASK, __ttl);		\
-		__ta |= FIELD_PREP(TLBIR_NUM_MASK, num);		\
-		__ta |= FIELD_PREP(TLBIR_SCALE_MASK, scale);		\
-		__ta |= FIELD_PREP(TLBIR_TG_MASK, get_trans_granule());	\
-		__ta |= FIELD_PREP(TLBIR_ASID_MASK, asid);		\
-		__ta;							\
-	})
-
 /* These macros are used by the TLBI RANGE feature. */
 #define __TLBI_RANGE_PAGES(num, scale)	\
 	((unsigned long)((num) + 1) << (5 * (scale) + 1))
@@ -167,11 +199,7 @@ static inline unsigned long get_trans_granule(void)
  * range.
  */
 #define __TLBI_RANGE_NUM(pages, scale)					\
-	({								\
-		int __pages = min((pages),				\
-				  __TLBI_RANGE_PAGES(31, (scale)));	\
-		(__pages >> (5 * (scale) + 1)) - 1;			\
-	})
+	(((pages) >> (5 * (scale) + 1)) - 1)
 
 #define __repeat_tlbi_sync(op, arg...)						\
 do {										\
@@ -241,10 +269,7 @@ static inline void __tlbi_sync_s1ish_hyp(void)
  *		unmapping pages from vmalloc/io space.
  *
  *	flush_tlb_page(vma, addr)
- *		Invalidate a single user mapping for address 'addr' in the
- *		address space corresponding to 'vma->mm'.  Note that this
- *		operation only invalidates a single, last-level page-table
- *		entry and therefore does not affect any walk-caches.
+ *		Equivalent to __flush_tlb_page(..., flags=TLBF_NONE)
  *
  *
  *	Next, we have some undocumented invalidation routines that you probably
@@ -258,30 +283,28 @@ static inline void __tlbi_sync_s1ish_hyp(void)
  *		CPUs, ensuring that any walk-cache entries associated with the
  *		translation are also invalidated.
  *
- *	__flush_tlb_range(vma, start, end, stride, last_level, tlb_level)
+ *	__flush_tlb_range(vma, start, end, stride, tlb_level, flags)
  *		Invalidate the virtual-address range '[start, end)' on all
  *		CPUs for the user address space corresponding to 'vma->mm'.
  *		The invalidation operations are issued at a granularity
- *		determined by 'stride' and only affect any walk-cache entries
- *		if 'last_level' is equal to false. tlb_level is the level at
+ *		determined by 'stride'. tlb_level is the level at
  *		which the invalidation must take place. If the level is wrong,
  *		no invalidation may take place. In the case where the level
  *		cannot be easily determined, the value TLBI_TTL_UNKNOWN will
- *		perform a non-hinted invalidation.
+ *		perform a non-hinted invalidation. flags may be TLBF_NONE (0) or
+ *		any combination of TLBF_NOWALKCACHE (elide eviction of walk
+ *		cache entries), TLBF_NONOTIFY (don't call mmu notifiers),
+ *		TLBF_NOSYNC (don't issue trailing dsb) and TLBF_NOBROADCAST
+ *		(only perform the invalidation for the local cpu).
  *
- *	local_flush_tlb_page(vma, addr)
- *		Local variant of flush_tlb_page().  Stale TLB entries may
- *		remain in remote CPUs.
- *
- *	local_flush_tlb_page_nonotify(vma, addr)
- *		Same as local_flush_tlb_page() except MMU notifier will not be
- *		called.
- *
- *	local_flush_tlb_contpte(vma, addr)
- *		Invalidate the virtual-address range
- *		'[addr, addr+CONT_PTE_SIZE)' mapped with contpte on local CPU
- *		for the user address space corresponding to 'vma->mm'.  Stale
- *		TLB entries may remain in remote CPUs.
+ *	__flush_tlb_page(vma, addr, flags)
+ *		Invalidate a single user mapping for address 'addr' in the
+ *		address space corresponding to 'vma->mm'.  Note that this
+ *		operation only invalidates a single level 3 page-table entry
+ *		and therefore does not affect any walk-caches. flags may contain
+ *		any combination of TLBF_NONOTIFY (don't call mmu notifiers),
+ *		TLBF_NOSYNC (don't issue trailing dsb) and TLBF_NOBROADCAST
+ *		(only perform the invalidation for the local cpu).
  *
  *	Finally, take a look at asm/tlb.h to see how tlb_flush() is implemented
  *	on top of these routines, since that is our interface to the mmu_gather
@@ -315,59 +338,6 @@ static inline void flush_tlb_mm(struct mm_struct *mm)
 	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
 }
 
-static inline void __local_flush_tlb_page_nonotify_nosync(struct mm_struct *mm,
-							  unsigned long uaddr)
-{
-	unsigned long addr;
-
-	dsb(nshst);
-	addr = __TLBI_VADDR(uaddr, ASID(mm));
-	__tlbi(vale1, addr);
-	__tlbi_user(vale1, addr);
-}
-
-static inline void local_flush_tlb_page_nonotify(struct vm_area_struct *vma,
-						 unsigned long uaddr)
-{
-	__local_flush_tlb_page_nonotify_nosync(vma->vm_mm, uaddr);
-	dsb(nsh);
-}
-
-static inline void local_flush_tlb_page(struct vm_area_struct *vma,
-					unsigned long uaddr)
-{
-	__local_flush_tlb_page_nonotify_nosync(vma->vm_mm, uaddr);
-	mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, uaddr & PAGE_MASK,
-						(uaddr & PAGE_MASK) + PAGE_SIZE);
-	dsb(nsh);
-}
-
-static inline void __flush_tlb_page_nosync(struct mm_struct *mm,
-					   unsigned long uaddr)
-{
-	unsigned long addr;
-
-	dsb(ishst);
-	addr = __TLBI_VADDR(uaddr, ASID(mm));
-	__tlbi(vale1is, addr);
-	__tlbi_user(vale1is, addr);
-	mmu_notifier_arch_invalidate_secondary_tlbs(mm, uaddr & PAGE_MASK,
-						(uaddr & PAGE_MASK) + PAGE_SIZE);
-}
-
-static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
-					 unsigned long uaddr)
-{
-	return __flush_tlb_page_nosync(vma->vm_mm, uaddr);
-}
-
-static inline void flush_tlb_page(struct vm_area_struct *vma,
-				  unsigned long uaddr)
-{
-	flush_tlb_page_nosync(vma, uaddr);
-	__tlbi_sync_s1ish();
-}
-
 static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
 {
 	return true;
@@ -397,14 +367,13 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 /*
  * __flush_tlb_range_op - Perform TLBI operation upon a range
  *
- * @op:	TLBI instruction that operates on a range (has 'r' prefix)
+ * @lop:	TLBI level operation to perform
+ * @rop:	TLBI range operation to perform
  * @start:	The start address of the range
  * @pages:	Range as the number of pages from 'start'
  * @stride:	Flush granularity
  * @asid:	The ASID of the task (0 for IPA instructions)
- * @tlb_level:	Translation Table level hint, if known
- * @tlbi_user:	If 'true', call an additional __tlbi_user()
- *              (typically for user ASIDs). 'flase' for IPA instructions
+ * @level:	Translation Table level hint, if known
  * @lpa2:	If 'true', the lpa2 scheme is used as set out below
  *
  * When the CPU does not support TLB range operations, flush the TLB
@@ -427,116 +396,181 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
  *    operations can only span an even number of pages. We save this for last to
  *    ensure 64KB start alignment is maintained for the LPA2 case.
  */
-#define __flush_tlb_range_op(op, start, pages, stride,			\
-				asid, tlb_level, tlbi_user, lpa2)	\
-do {									\
-	typeof(start) __flush_start = start;				\
-	typeof(pages) __flush_pages = pages;				\
-	int num = 0;							\
-	int scale = 3;							\
-	int shift = lpa2 ? 16 : PAGE_SHIFT;				\
-	unsigned long addr;						\
-									\
-	while (__flush_pages > 0) {					\
-		if (!system_supports_tlb_range() ||			\
-		    __flush_pages == 1 ||				\
-		    (lpa2 && __flush_start != ALIGN(__flush_start, SZ_64K))) {	\
-			addr = __TLBI_VADDR(__flush_start, asid);	\
-			__tlbi_level(op, addr, tlb_level);		\
-			if (tlbi_user)					\
-				__tlbi_user_level(op, addr, tlb_level);	\
-			__flush_start += stride;			\
-			__flush_pages -= stride >> PAGE_SHIFT;		\
-			continue;					\
-		}							\
-									\
-		num = __TLBI_RANGE_NUM(__flush_pages, scale);		\
-		if (num >= 0) {						\
-			addr = __TLBI_VADDR_RANGE(__flush_start >> shift, asid, \
-						scale, num, tlb_level);	\
-			__tlbi(r##op, addr);				\
-			if (tlbi_user)					\
-				__tlbi_user(r##op, addr);		\
-			__flush_start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \
-			__flush_pages -= __TLBI_RANGE_PAGES(num, scale);\
-		}							\
-		scale--;						\
-	}								\
-} while (0)
-
-#define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
-	__flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled());
-
-static inline bool __flush_tlb_range_limit_excess(unsigned long start,
-		unsigned long end, unsigned long pages, unsigned long stride)
+static __always_inline void rvae1is(u64 arg)
 {
-	/*
-	 * When the system does not support TLB range based flush
-	 * operation, (MAX_DVM_OPS - 1) pages can be handled. But
-	 * with TLB range based operation, MAX_TLBI_RANGE_PAGES
-	 * pages can be handled.
-	 */
-	if ((!system_supports_tlb_range() &&
-	     (end - start) >= (MAX_DVM_OPS * stride)) ||
-	    pages > MAX_TLBI_RANGE_PAGES)
-		return true;
-
-	return false;
+	__tlbi(rvae1is, arg);
+	__tlbi_user(rvae1is, arg);
 }
 
-static inline void __flush_tlb_range_nosync(struct mm_struct *mm,
-				     unsigned long start, unsigned long end,
-				     unsigned long stride, bool last_level,
-				     int tlb_level)
+static __always_inline void rvale1(u64 arg)
 {
+	__tlbi(rvale1, arg);
+	__tlbi_user(rvale1, arg);
+}
+
+static __always_inline void rvale1is(u64 arg)
+{
+	__tlbi(rvale1is, arg);
+	__tlbi_user(rvale1is, arg);
+}
+
+static __always_inline void rvaale1is(u64 arg)
+{
+	__tlbi(rvaale1is, arg);
+}
+
+static __always_inline void ripas2e1is(u64 arg)
+{
+	__tlbi(ripas2e1is, arg);
+}
+
+static __always_inline void __tlbi_range(tlbi_op op, u64 addr,
+					 u16 asid, int scale, int num,
+					 u32 level, bool lpa2)
+{
+	/* LPA2: base is in 64K units. Level above 3: TTL field is 0. */
+	u64 baddr = addr >> (lpa2 ? 16 : PAGE_SHIFT);
+	u32 ttl = level > 3 ? 0 : level;
+
+	op(FIELD_PREP(TLBIR_BADDR_MASK, baddr) |
+	   FIELD_PREP(TLBIR_TTL_MASK, ttl) |
+	   FIELD_PREP(TLBIR_NUM_MASK, num) |
+	   FIELD_PREP(TLBIR_SCALE_MASK, scale) |
+	   FIELD_PREP(TLBIR_TG_MASK, get_trans_granule()) |
+	   FIELD_PREP(TLBIR_ASID_MASK, asid));
+}
+
+static __always_inline void __flush_tlb_range_op(tlbi_op lop, tlbi_op rop,
+						 u64 start, size_t pages,
+						 u64 stride, u16 asid,
+						 u32 level, bool lpa2)
+{
+	const u64 end = start + pages * PAGE_SIZE;
+	u64 addr = start;
+	int scale = 3;
+
+	while (addr != end) {
+		int num;
+
+		pages = (end - addr) >> PAGE_SHIFT;
+		/*
+		 * Single-entry op: no range support, one page left, or an
+		 * LPA2 address that is not 64K aligned. Advance one stride.
+		 */
+		if (!system_supports_tlb_range() || pages == 1 ||
+		    (lpa2 && !IS_ALIGNED(addr, SZ_64K))) {
+			__tlbi_level_asid(lop, addr, level, asid);
+			addr += stride;
+			continue;
+		}
+
+		num = __TLBI_RANGE_NUM(pages, scale);
+		if (num >= 0) {
+			__tlbi_range(rop, addr, asid, scale, num, level, lpa2);
+			addr += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT;
+		}
+		scale--;
+	}
+}
+
+#define __flush_s1_tlb_range_op(op, start, pages, stride, asid, tlb_level) \
+	__flush_tlb_range_op(op, r##op, start, pages, stride, asid, tlb_level, lpa2_is_enabled())
+
+#define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
+	__flush_tlb_range_op(op, r##op, start, pages, stride, 0, tlb_level, kvm_lpa2_is_enabled())
+
+static inline bool __flush_tlb_range_limit_excess(unsigned long pages,
+						  unsigned long stride)
+{
+	/*
+	 * Without tlb-range support, bound the per-stride invalidations by
+	 * MAX_DVM_OPS. With it, assume a worst case of 20 DVM ops (4 scales,
+	 * 1 final page, 15 for alignment on LPA2 systems), which is much
+	 * smaller than MAX_DVM_OPS, so only MAX_TLBI_RANGE_PAGES applies.
+	 */
+	if (!system_supports_tlb_range())
+		return pages >= (MAX_DVM_OPS * stride) >> PAGE_SHIFT;
+
+	return pages > MAX_TLBI_RANGE_PAGES;
+}
+
+typedef unsigned __bitwise tlbf_t;
+
+/* Default: broadcast, evict walk-cache entries, notify and sync. */
+#define TLBF_NONE		((__force tlbf_t)0)
+
+/* Use the last-level (va*l*e1*) tlbi ops; walk-cache entries are kept. */
+#define TLBF_NOWALKCACHE	((__force tlbf_t)BIT(0))
+
+/* Skip the trailing __tlbi_sync_s1ish() / dsb(nsh) after issuing tlbi. */
+#define TLBF_NOSYNC		((__force tlbf_t)BIT(1))
+
+/* Don't call mmu_notifier_arch_invalidate_secondary_tlbs(). */
+#define TLBF_NONOTIFY		((__force tlbf_t)BIT(2))
+
+/* Use the local, non-broadcast tlbi op with nsh barriers (this CPU only). */
+#define TLBF_NOBROADCAST	((__force tlbf_t)BIT(3))
+
+static __always_inline void __do_flush_tlb_range(struct vm_area_struct *vma,
+					unsigned long start, unsigned long end,
+					unsigned long stride, int tlb_level,
+					tlbf_t flags)
+{
+	struct mm_struct *mm = vma->vm_mm;
 	unsigned long asid, pages;
 
-	start = round_down(start, stride);
-	end = round_up(end, stride);
 	pages = (end - start) >> PAGE_SHIFT;
 
-	if (__flush_tlb_range_limit_excess(start, end, pages, stride)) {
+	if (__flush_tlb_range_limit_excess(pages, stride)) {
 		flush_tlb_mm(mm);
 		return;
 	}
 
-	dsb(ishst);
+	if (!(flags & TLBF_NOBROADCAST))
+		dsb(ishst);
+	else
+		dsb(nshst);
+
 	asid = ASID(mm);
 
-	if (last_level)
-		__flush_tlb_range_op(vale1is, start, pages, stride, asid,
-				     tlb_level, true, lpa2_is_enabled());
-	else
-		__flush_tlb_range_op(vae1is, start, pages, stride, asid,
-				     tlb_level, true, lpa2_is_enabled());
+	switch (flags & (TLBF_NOWALKCACHE | TLBF_NOBROADCAST)) {
+	case TLBF_NONE:
+		__flush_s1_tlb_range_op(vae1is, start, pages, stride,
+					asid, tlb_level);
+		break;
+	case TLBF_NOWALKCACHE:
+		__flush_s1_tlb_range_op(vale1is, start, pages, stride,
+					asid, tlb_level);
+		break;
+	case TLBF_NOBROADCAST:
+		/* Combination unused */
+		BUG();
+		break;
+	case TLBF_NOWALKCACHE | TLBF_NOBROADCAST:
+		__flush_s1_tlb_range_op(vale1, start, pages, stride,
+					asid, tlb_level);
+		break;
+	}
 
-	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
+	if (!(flags & TLBF_NONOTIFY))
+		mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
+
+	if (!(flags & TLBF_NOSYNC)) {
+		if (!(flags & TLBF_NOBROADCAST))
+			__tlbi_sync_s1ish();
+		else
+			dsb(nsh);
+	}
 }
 
 static inline void __flush_tlb_range(struct vm_area_struct *vma,
 				     unsigned long start, unsigned long end,
-				     unsigned long stride, bool last_level,
-				     int tlb_level)
+				     unsigned long stride, int tlb_level,
+				     tlbf_t flags)
 {
-	__flush_tlb_range_nosync(vma->vm_mm, start, end, stride,
-				 last_level, tlb_level);
-	__tlbi_sync_s1ish();
-}
-
-static inline void local_flush_tlb_contpte(struct vm_area_struct *vma,
-					   unsigned long addr)
-{
-	unsigned long asid;
-
-	addr = round_down(addr, CONT_PTE_SIZE);
-
-	dsb(nshst);
-	asid = ASID(vma->vm_mm);
-	__flush_tlb_range_op(vale1, addr, CONT_PTES, PAGE_SIZE, asid,
-			     3, true, lpa2_is_enabled());
-	mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, addr,
-						    addr + CONT_PTE_SIZE);
-	dsb(nsh);
+	start = round_down(start, stride);
+	end = round_up(end, stride);
+	__do_flush_tlb_range(vma, start, end, stride, tlb_level, flags);
 }
 
 static inline void flush_tlb_range(struct vm_area_struct *vma,
@@ -548,7 +582,23 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
 	 * Set the tlb_level to TLBI_TTL_UNKNOWN because we can not get enough
 	 * information here.
 	 */
-	__flush_tlb_range(vma, start, end, PAGE_SIZE, false, TLBI_TTL_UNKNOWN);
+	__flush_tlb_range(vma, start, end, PAGE_SIZE, TLBI_TTL_UNKNOWN, TLBF_NONE);
+}
+
+static inline void __flush_tlb_page(struct vm_area_struct *vma,
+				    unsigned long uaddr, tlbf_t flags)
+{
+	/* One level 3 entry: always flushed with TLBF_NOWALKCACHE set. */
+	unsigned long base = uaddr & PAGE_MASK;
+
+	__do_flush_tlb_range(vma, base, base + PAGE_SIZE, PAGE_SIZE, 3,
+			     flags | TLBF_NOWALKCACHE);
+}
+
+static inline void flush_tlb_page(struct vm_area_struct *vma,
+				  unsigned long uaddr)
+{
+	__flush_tlb_page(vma, uaddr, TLBF_NONE);
 }
 
 static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
@@ -560,14 +610,14 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end
 	end = round_up(end, stride);
 	pages = (end - start) >> PAGE_SHIFT;
 
-	if (__flush_tlb_range_limit_excess(start, end, pages, stride)) {
+	if (__flush_tlb_range_limit_excess(pages, stride)) {
 		flush_tlb_all();
 		return;
 	}
 
 	dsb(ishst);
-	__flush_tlb_range_op(vaale1is, start, pages, stride, 0,
-			     TLBI_TTL_UNKNOWN, false, lpa2_is_enabled());
+	__flush_s1_tlb_range_op(vaale1is, start, pages, stride, 0,
+				TLBI_TTL_UNKNOWN);
 	__tlbi_sync_s1ish();
 	isb();
 }
@@ -589,7 +639,10 @@ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr)
 static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
 		struct mm_struct *mm, unsigned long start, unsigned long end)
 {
-	__flush_tlb_range_nosync(mm, start, end, PAGE_SIZE, true, 3);
+	struct vm_area_struct vma = { .vm_mm = mm, .vm_flags = 0 };
+
+	__flush_tlb_range(&vma, start, end, PAGE_SIZE, 3,
+			  TLBF_NOWALKCACHE | TLBF_NOSYNC);
 }
 
 static inline bool __pte_flags_need_flush(ptdesc_t oldval, ptdesc_t newval)
@@ -618,6 +671,8 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
 }
 #define huge_pmd_needs_flush huge_pmd_needs_flush
 
+#undef __tlbi_user
+#undef __TLBI_VADDR
 #endif
 
 #endif
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index eafc83d255d8..b0c83a08dda9 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -62,7 +62,7 @@ static inline void __uaccess_ttbr0_disable(void)
 
 	local_irq_save(flags);
 	ttbr = read_sysreg(ttbr1_el1);
-	ttbr &= ~TTBR_ASID_MASK;
+	ttbr &= ~TTBRx_EL1_ASID_MASK;
 	/* reserved_pg_dir placed before swapper_pg_dir */
 	write_sysreg(ttbr - RESERVED_SWAPPER_OFFSET, ttbr0_el1);
 	/* Set reserved ASID */
@@ -85,8 +85,8 @@ static inline void __uaccess_ttbr0_enable(void)
 
 	/* Restore active ASID */
 	ttbr1 = read_sysreg(ttbr1_el1);
-	ttbr1 &= ~TTBR_ASID_MASK;		/* safety measure */
-	ttbr1 |= ttbr0 & TTBR_ASID_MASK;
+	ttbr1 &= ~TTBRx_EL1_ASID_MASK;		/* safety measure */
+	ttbr1 |= ttbr0 & TTBRx_EL1_ASID_MASK;
 	write_sysreg(ttbr1, ttbr1_el1);
 
 	/* Restore user page table */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index fe627100d199..74b76bb70452 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -68,6 +68,7 @@ obj-$(CONFIG_CRASH_DUMP)		+= crash_dump.o
 obj-$(CONFIG_VMCORE_INFO)		+= vmcore_info.o
 obj-$(CONFIG_ARM_SDE_INTERFACE)		+= sdei.o
 obj-$(CONFIG_ARM64_PTR_AUTH)		+= pointer_auth.o
+obj-$(CONFIG_ARM64_MPAM)		+= mpam.o
 obj-$(CONFIG_ARM64_MTE)			+= mte.o
 obj-y					+= vdso-wrap.o
 obj-$(CONFIG_COMPAT_VDSO)		+= vdso32-wrap.o
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c
index e737c6295ec7..b7a1f8b788bb 100644
--- a/arch/arm64/kernel/armv8_deprecated.c
+++ b/arch/arm64/kernel/armv8_deprecated.c
@@ -610,6 +610,20 @@ static int __init armv8_deprecated_init(void)
 	}
 
 #endif
+
+#ifdef CONFIG_SWP_EMULATION
+	/*
+	 * The purpose of supporting LSUI is to eliminate PAN toggling. CPUs
+	 * that support LSUI are unlikely to support a 32-bit runtime. Rather
+	 * than emulating the SWP instruction using LSUI instructions, simply
+	 * disable SWP emulation.
+	 */
+	if (cpus_have_final_cap(ARM64_HAS_LSUI)) {
+		insn_swp.status = INSN_UNAVAILABLE;
+		pr_info("swp/swpb instruction emulation is not supported on this system\n");
+	}
+#endif
+
 	for (int i = 0; i < ARRAY_SIZE(insn_emulations); i++) {
 		struct insn_emulation *ie = insn_emulations[i];
 
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 32c2dbcc0c64..29312081d2e3 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -77,6 +77,7 @@
 #include <linux/percpu.h>
 #include <linux/sched/isolation.h>
 
+#include <asm/arm_pmuv3.h>
 #include <asm/cpu.h>
 #include <asm/cpufeature.h>
 #include <asm/cpu_ops.h>
@@ -86,6 +87,7 @@
 #include <asm/kvm_host.h>
 #include <asm/mmu.h>
 #include <asm/mmu_context.h>
+#include <asm/mpam.h>
 #include <asm/mte.h>
 #include <asm/hypervisor.h>
 #include <asm/processor.h>
@@ -281,6 +283,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar2[] = {
 
 static const struct arm64_ftr_bits ftr_id_aa64isar3[] = {
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FPRCVT_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_LSUI_SHIFT, 4, ID_AA64ISAR3_EL1_LSUI_NI),
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_LSFE_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FAMINMAX_SHIFT, 4, 0),
 	ARM64_FTR_END,
@@ -565,7 +568,7 @@ static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = {
 	 * We can instantiate multiple PMU instances with different levels
 	 * of support.
 	 */
-	S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_EL1_PMUVer_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_EL1_PMUVer_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_EL1_DebugVer_SHIFT, 4, 0x6),
 	ARM64_FTR_END,
 };
@@ -709,7 +712,7 @@ static const struct arm64_ftr_bits ftr_id_pfr2[] = {
 
 static const struct arm64_ftr_bits ftr_id_dfr0[] = {
 	/* [31:28] TraceFilt */
-	S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_DFR0_EL1_PerfMon_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_DFR0_EL1_PerfMon_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MProfDbg_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MMapTrc_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_CopTrc_SHIFT, 4, 0),
@@ -1927,19 +1930,10 @@ static bool has_pmuv3(const struct arm64_cpu_capabilities *entry, int scope)
 	u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1);
 	unsigned int pmuver;
 
-	/*
-	 * PMUVer follows the standard ID scheme for an unsigned field with the
-	 * exception of 0xF (IMP_DEF) which is treated specially and implies
-	 * FEAT_PMUv3 is not implemented.
-	 *
-	 * See DDI0487L.a D24.1.3.2 for more details.
-	 */
 	pmuver = cpuid_feature_extract_unsigned_field(dfr0,
 						      ID_AA64DFR0_EL1_PMUVer_SHIFT);
-	if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
-		return false;
 
-	return pmuver >= ID_AA64DFR0_EL1_PMUVer_IMP;
+	return pmuv3_implemented(pmuver);
 }
 #endif
 
@@ -2501,13 +2495,19 @@ test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope)
 static void
 cpu_enable_mpam(const struct arm64_cpu_capabilities *entry)
 {
-	/*
-	 * Access by the kernel (at EL1) should use the reserved PARTID
-	 * which is configured unrestricted. This avoids priority-inversion
-	 * where latency sensitive tasks have to wait for a task that has
-	 * been throttled to release the lock.
-	 */
-	write_sysreg_s(0, SYS_MPAM1_EL1);
+	int cpu = smp_processor_id();
+	u64 regval = 0;
+
+	if (IS_ENABLED(CONFIG_ARM64_MPAM) && static_branch_likely(&mpam_enabled))
+		regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu));
+
+	write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1);
+	if (cpus_have_cap(ARM64_SME))
+		write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), SYS_MPAMSM_EL1);
+	isb();
+
+	/* Synchronising the EL0 write is left until the ERET to EL0 */
+	write_sysreg_s(regval, SYS_MPAM0_EL1);
 }
 
 static bool
@@ -3178,6 +3178,15 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.cpu_enable = cpu_enable_ls64_v,
 		ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, LS64, LS64_V)
 	},
+#ifdef CONFIG_ARM64_LSUI
+	{
+		.desc = "Unprivileged Load Store Instructions (LSUI)",
+		.capability = ARM64_HAS_LSUI,
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.matches = has_cpuid_feature,
+		ARM64_CPUID_FIELDS(ID_AA64ISAR3_EL1, LSUI, IMP)
+	},
+#endif
 	{},
 };
 
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 3625797e9ee8..f42ce7b5c67f 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -35,11 +35,11 @@
  * Before this function is called it is not safe to call regular kernel code,
  * instrumentable code, or any code which may trigger an exception.
  */
-static noinstr irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs)
+static noinstr irqentry_state_t arm64_enter_from_kernel_mode(struct pt_regs *regs)
 {
 	irqentry_state_t state;
 
-	state = irqentry_enter(regs);
+	state = irqentry_enter_from_kernel_mode(regs);
 	mte_check_tfsr_entry();
 	mte_disable_tco_entry(current);
 
@@ -51,11 +51,14 @@ static noinstr irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs)
  * After this function returns it is not safe to call regular kernel code,
  * instrumentable code, or any code which may trigger an exception.
  */
-static void noinstr exit_to_kernel_mode(struct pt_regs *regs,
-					irqentry_state_t state)
+static void noinstr arm64_exit_to_kernel_mode(struct pt_regs *regs,
+					      irqentry_state_t state)
 {
+	local_irq_disable();
+	irqentry_exit_to_kernel_mode_preempt(regs, state);
+	local_daif_mask();
 	mte_check_tfsr_exit();
-	irqentry_exit(regs, state);
+	irqentry_exit_to_kernel_mode_after_preempt(regs, state);
 }
 
 /*
@@ -298,11 +301,10 @@ static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr)
 	unsigned long far = read_sysreg(far_el1);
 	irqentry_state_t state;
 
-	state = enter_from_kernel_mode(regs);
+	state = arm64_enter_from_kernel_mode(regs);
 	local_daif_inherit(regs);
 	do_mem_abort(far, esr, regs);
-	local_daif_mask();
-	exit_to_kernel_mode(regs, state);
+	arm64_exit_to_kernel_mode(regs, state);
 }
 
 static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr)
@@ -310,55 +312,50 @@ static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr)
 	unsigned long far = read_sysreg(far_el1);
 	irqentry_state_t state;
 
-	state = enter_from_kernel_mode(regs);
+	state = arm64_enter_from_kernel_mode(regs);
 	local_daif_inherit(regs);
 	do_sp_pc_abort(far, esr, regs);
-	local_daif_mask();
-	exit_to_kernel_mode(regs, state);
+	arm64_exit_to_kernel_mode(regs, state);
 }
 
 static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr)
 {
 	irqentry_state_t state;
 
-	state = enter_from_kernel_mode(regs);
+	state = arm64_enter_from_kernel_mode(regs);
 	local_daif_inherit(regs);
 	do_el1_undef(regs, esr);
-	local_daif_mask();
-	exit_to_kernel_mode(regs, state);
+	arm64_exit_to_kernel_mode(regs, state);
 }
 
 static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr)
 {
 	irqentry_state_t state;
 
-	state = enter_from_kernel_mode(regs);
+	state = arm64_enter_from_kernel_mode(regs);
 	local_daif_inherit(regs);
 	do_el1_bti(regs, esr);
-	local_daif_mask();
-	exit_to_kernel_mode(regs, state);
+	arm64_exit_to_kernel_mode(regs, state);
 }
 
 static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr)
 {
 	irqentry_state_t state;
 
-	state = enter_from_kernel_mode(regs);
+	state = arm64_enter_from_kernel_mode(regs);
 	local_daif_inherit(regs);
 	do_el1_gcs(regs, esr);
-	local_daif_mask();
-	exit_to_kernel_mode(regs, state);
+	arm64_exit_to_kernel_mode(regs, state);
 }
 
 static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr)
 {
 	irqentry_state_t state;
 
-	state = enter_from_kernel_mode(regs);
+	state = arm64_enter_from_kernel_mode(regs);
 	local_daif_inherit(regs);
 	do_el1_mops(regs, esr);
-	local_daif_mask();
-	exit_to_kernel_mode(regs, state);
+	arm64_exit_to_kernel_mode(regs, state);
 }
 
 static void noinstr el1_breakpt(struct pt_regs *regs, unsigned long esr)
@@ -420,11 +417,10 @@ static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr)
 {
 	irqentry_state_t state;
 
-	state = enter_from_kernel_mode(regs);
+	state = arm64_enter_from_kernel_mode(regs);
 	local_daif_inherit(regs);
 	do_el1_fpac(regs, esr);
-	local_daif_mask();
-	exit_to_kernel_mode(regs, state);
+	arm64_exit_to_kernel_mode(regs, state);
 }
 
 asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs)
@@ -491,13 +487,13 @@ static __always_inline void __el1_irq(struct pt_regs *regs,
 {
 	irqentry_state_t state;
 
-	state = enter_from_kernel_mode(regs);
+	state = arm64_enter_from_kernel_mode(regs);
 
 	irq_enter_rcu();
 	do_interrupt_handler(regs, handler);
 	irq_exit_rcu();
 
-	exit_to_kernel_mode(regs, state);
+	arm64_exit_to_kernel_mode(regs, state);
 }
 static void noinstr el1_interrupt(struct pt_regs *regs,
 				  void (*handler)(struct pt_regs *))
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index f8018b5c1f9a..e0db14e9c843 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -273,7 +273,7 @@ alternative_if ARM64_HAS_ADDRESS_AUTH
 alternative_else_nop_endif
 1:
 
-	scs_load_current
+	scs_load_current_base
 	.else
 	add	x21, sp, #PT_REGS_SIZE
 	get_current_task tsk
@@ -378,8 +378,6 @@ alternative_if ARM64_WORKAROUND_845719
 alternative_else_nop_endif
 #endif
 3:
-	scs_save tsk
-
 	/* Ignore asynchronous tag check faults in the uaccess routines */
 	ldr	x0, [tsk, THREAD_SCTLR_USER]
 	clear_mte_async_tcf x0
@@ -473,7 +471,7 @@ alternative_else_nop_endif
 	 */
 SYM_CODE_START_LOCAL(__swpan_entry_el1)
 	mrs	x21, ttbr0_el1
-	tst	x21, #TTBR_ASID_MASK		// Check for the reserved ASID
+	tst	x21, #TTBRx_EL1_ASID_MASK	// Check for the reserved ASID
 	orr	x23, x23, #PSR_PAN_BIT		// Set the emulated PAN in the saved SPSR
 	b.eq	1f				// TTBR0 access already disabled
 	and	x23, x23, #~PSR_PAN_BIT		// Clear the emulated PAN in the saved SPSR
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index 239c16e3d02f..c5693a32e49b 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -129,9 +129,6 @@ int machine_kexec_post_load(struct kimage *kimage)
 	}
 
 	/* Create a copy of the linear map */
-	trans_pgd = kexec_page_alloc(kimage);
-	if (!trans_pgd)
-		return -ENOMEM;
 	rc = trans_pgd_create_copy(&info, &trans_pgd, PAGE_OFFSET, PAGE_END);
 	if (rc)
 		return rc;
diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c
new file mode 100644
index 000000000000..3a490de4fa12
--- /dev/null
+++ b/arch/arm64/kernel/mpam.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2025 Arm Ltd. */
+
+#include <asm/mpam.h>
+
+#include <linux/arm_mpam.h>
+#include <linux/cpu_pm.h>
+#include <linux/jump_label.h>
+#include <linux/percpu.h>
+
+DEFINE_STATIC_KEY_FALSE(mpam_enabled);
+DEFINE_PER_CPU(u64, arm64_mpam_default);
+/* Per-CPU value the PM notifier below restores into the MPAM registers */
+DEFINE_PER_CPU(u64, arm64_mpam_current);
+
+u64 arm64_mpam_global_default;
+
+/*
+ * CPU PM notifier: on CPU_PM_EXIT, rewrite this CPU's MPAM1_EL1, MPAMSM_EL1
+ * (only when SME is supported) and MPAM0_EL1 from arm64_mpam_current.
+ * Every other event is ignored with NOTIFY_DONE.
+ */
+static int mpam_pm_notifier(struct notifier_block *self,
+			    unsigned long cmd, void *v)
+{
+	u64 regval;
+	int cpu = smp_processor_id();
+
+	switch (cmd) {
+	case CPU_PM_EXIT:
+		/*
+		 * Don't use mpam_thread_switch() as the system register
+		 * value has changed under our feet.
+		 */
+		regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu));
+		write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1);
+		if (system_supports_sme()) {
+			write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D),
+				       SYS_MPAMSM_EL1);
+		}
+		isb();
+
+		/* Synchronising the EL0 write is left until the ERET to EL0 */
+		write_sysreg_s(regval, SYS_MPAM0_EL1);
+
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static struct notifier_block mpam_pm_nb = {
+	.notifier_call = mpam_pm_notifier,
+};
+
+/*
+ * Hand the PARTID_MAX and PMG_MAX fields of the sanitised MPAMIDR_EL1 to
+ * mpam_register_requestor() and install the CPU PM notifier.
+ *
+ * Returns 0 without registering anything when system_supports_mpam() is
+ * false, otherwise the return value of mpam_register_requestor().
+ */
+static int __init arm64_mpam_register_cpus(void)
+{
+	u64 mpamidr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1);
+	u16 partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, mpamidr);
+	u8 pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, mpamidr);
+
+	if (!system_supports_mpam())
+		return 0;
+
+	cpu_pm_register_notifier(&mpam_pm_nb);
+	return mpam_register_requestor(partid_max, pmg_max);
+}
+/* Must occur before mpam_msc_driver_init() from subsys_initcall() */
+arch_initcall(arm64_mpam_register_cpus)
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 32148bf09c1d..6874b16d0657 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -291,6 +291,9 @@ void mte_thread_switch(struct task_struct *next)
 	/* TCO may not have been disabled on exception entry for the current task. */
 	mte_disable_tco_entry(next);
 
+	if (!system_uses_mte_async_or_asymm_mode())
+		return;
+
 	/*
 	 * Check if an async tag exception occurred at EL1.
 	 *
@@ -315,8 +318,8 @@ void mte_cpu_setup(void)
 	 * CnP is not a boot feature so MTE gets enabled before CnP, but let's
 	 * make sure that is the case.
 	 */
-	BUG_ON(read_sysreg(ttbr0_el1) & TTBR_CNP_BIT);
-	BUG_ON(read_sysreg(ttbr1_el1) & TTBR_CNP_BIT);
+	BUG_ON(read_sysreg(ttbr0_el1) & TTBRx_EL1_CnP);
+	BUG_ON(read_sysreg(ttbr1_el1) & TTBRx_EL1_CnP);
 
 	/* Normal Tagged memory type at the corresponding MAIR index */
 	sysreg_clear_set(mair_el1,
@@ -350,6 +353,9 @@ void mte_suspend_enter(void)
 	if (!system_supports_mte())
 		return;
 
+	if (!system_uses_mte_async_or_asymm_mode())
+		return;
+
 	/*
 	 * The barriers are required to guarantee that the indirect writes
 	 * to TFSR_EL1 are synchronized before we report the state.
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 489554931231..c0bf1f46cdc6 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -51,6 +51,7 @@
 #include <asm/fpsimd.h>
 #include <asm/gcs.h>
 #include <asm/mmu_context.h>
+#include <asm/mpam.h>
 #include <asm/mte.h>
 #include <asm/processor.h>
 #include <asm/pointer_auth.h>
@@ -699,6 +700,29 @@ void update_sctlr_el1(u64 sctlr)
 	isb();
 }
 
+static inline void debug_switch_state(void)
+{
+	if (system_uses_irq_prio_masking()) {
+		unsigned long daif_expected = 0;
+		unsigned long daif_actual = read_sysreg(daif);
+		unsigned long pmr_expected = GIC_PRIO_IRQOFF;
+		unsigned long pmr_actual = read_sysreg_s(SYS_ICC_PMR_EL1);
+
+		WARN_ONCE(daif_actual != daif_expected ||
+			  pmr_actual != pmr_expected,
+			  "Unexpected DAIF + PMR: 0x%lx + 0x%lx (expected 0x%lx + 0x%lx)\n",
+			  daif_actual, pmr_actual,
+			  daif_expected, pmr_expected);
+	} else {
+		unsigned long daif_expected = DAIF_PROCCTX_NOIRQ;
+		unsigned long daif_actual = read_sysreg(daif);
+
+		WARN_ONCE(daif_actual != daif_expected,
+			  "Unexpected DAIF value: 0x%lx (expected 0x%lx)\n",
+			  daif_actual, daif_expected);
+	}
+}
+
 /*
  * Thread switching.
  */
@@ -708,6 +732,8 @@ struct task_struct *__switch_to(struct task_struct *prev,
 {
 	struct task_struct *last;
 
+	debug_switch_state();
+
 	fpsimd_thread_switch(next);
 	tls_thread_switch(next);
 	hw_breakpoint_thread_switch(next);
@@ -738,6 +764,12 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	if (prev->thread.sctlr_user != next->thread.sctlr_user)
 		update_sctlr_el1(next->thread.sctlr_user);
 
+	/*
+	 * MPAM thread switch happens after the DSB to ensure prev's accesses
+	 * use prev's MPAM settings.
+	 */
+	mpam_thread_switch(next);
+
 	/* the actual thread switch */
 	last = cpu_switch_to(prev, next);
 
diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c
index 9e846ce4ef9c..92160f2e57ff 100644
--- a/arch/arm64/kernel/rsi.c
+++ b/arch/arm64/kernel/rsi.c
@@ -145,7 +145,7 @@ void __init arm64_rsi_init(void)
 		return;
 	if (!rsi_version_matches())
 		return;
-	if (WARN_ON(rsi_get_realm_config(&config)))
+	if (WARN_ON(rsi_get_realm_config(lm_alias(&config))))
 		return;
 	prot_ns_shared = __phys_to_pte_val(BIT(config.ipa_bits - 1));
 
diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c
index b9d4998c97ef..7e9860143add 100644
--- a/arch/arm64/kernel/sys_compat.c
+++ b/arch/arm64/kernel/sys_compat.c
@@ -36,7 +36,7 @@ __do_compat_cache_op(unsigned long start, unsigned long end)
 			 * The workaround requires an inner-shareable tlbi.
 			 * We pick the reserved-ASID to minimise the impact.
 			 */
-			__tlbi(aside1is, __TLBI_VADDR(0, 0));
+			__tlbi(aside1is, 0UL);
 			__tlbi_sync_s1ish();
 		}
 
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index a024d9a770dc..9f8f0ae8e86e 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -9,6 +9,7 @@
 #include <asm/esr.h>
 #include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
+#include <asm/lsui.h>
 
 static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw)
 {
@@ -1679,6 +1680,35 @@ int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level)
 	}
 }
 
+static int __lsui_swap_desc(u64 __user *ptep, u64 old, u64 new)
+{
+	u64 tmp = old;
+	int ret = 0;
+
+	/*
+	 * Wrap LSUI instructions with uaccess_ttbr0_enable()/disable(),
+	 * as PAN toggling is not required.
+	 */
+	uaccess_ttbr0_enable();
+
+	asm volatile(__LSUI_PREAMBLE
+		     "1: cast	%[old], %[new], %[addr]\n"
+		     "2:\n"
+		     _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret])
+		     : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret)
+		     : [new] "r" (new)
+		     : "memory");
+
+	uaccess_ttbr0_disable();
+
+	if (ret)
+		return ret;
+	if (tmp != old)
+		return -EAGAIN;
+
+	return ret;
+}
+
 static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new)
 {
 	u64 tmp = old;
@@ -1754,7 +1784,9 @@ int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new)
 		return -EPERM;
 
 	ptep = (void __user *)hva + offset;
-	if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS))
+	if (cpus_have_final_cap(ARM64_HAS_LSUI))
+		r = __lsui_swap_desc(ptep, old, new);
+	else if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS))
 		r = __lse_swap_desc(ptep, old, new);
 	else
 		r = __llsc_swap_desc(ptep, old, new);
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index 3ad6b7c6e4ba..f4d7b12045e8 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -10,6 +10,7 @@
 #include <linux/kvm_host.h>
 #include <linux/hw_breakpoint.h>
 
+#include <asm/arm_pmuv3.h>
 #include <asm/debug-monitors.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_arm.h>
@@ -75,8 +76,10 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
 void kvm_init_host_debug_data(void)
 {
 	u64 dfr0 = read_sysreg(id_aa64dfr0_el1);
+	unsigned int pmuver = cpuid_feature_extract_unsigned_field(dfr0,
+								   ID_AA64DFR0_EL1_PMUVer_SHIFT);
 
-	if (cpuid_feature_extract_signed_field(dfr0, ID_AA64DFR0_EL1_PMUVer_SHIFT) > 0)
+	if (pmuv3_implemented(pmuver))
 		*host_data_ptr(nr_event_counters) = FIELD_GET(ARMV8_PMU_PMCR_N,
 							      read_sysreg(pmcr_el0));
 
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 2597e8bda867..0b50ddd530f3 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -267,7 +267,8 @@ static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu)
 
 static inline void  __activate_traps_mpam(struct kvm_vcpu *vcpu)
 {
-	u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1;
+	u64 clr = MPAM2_EL2_EnMPAMSM;
+	u64 set = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1;
 
 	if (!system_supports_mpam())
 		return;
@@ -277,18 +278,21 @@ static inline void  __activate_traps_mpam(struct kvm_vcpu *vcpu)
 		write_sysreg_s(MPAMHCR_EL2_TRAP_MPAMIDR_EL1, SYS_MPAMHCR_EL2);
 	} else {
 		/* From v1.1 TIDR can trap MPAMIDR, set it unconditionally */
-		r |= MPAM2_EL2_TIDR;
+		set |= MPAM2_EL2_TIDR;
 	}
 
-	write_sysreg_s(r, SYS_MPAM2_EL2);
+	sysreg_clear_set_s(SYS_MPAM2_EL2, clr, set);
 }
 
 static inline void __deactivate_traps_mpam(void)
 {
+	u64 clr = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1 | MPAM2_EL2_TIDR;
+	u64 set = MPAM2_EL2_EnMPAMSM;
+
 	if (!system_supports_mpam())
 		return;
 
-	write_sysreg_s(0, SYS_MPAM2_EL2);
+	sysreg_clear_set_s(SYS_MPAM2_EL2, clr, set);
 
 	if (system_supports_mpam_hcr())
 		write_sysreg_s(MPAMHCR_HOST_FLAGS, SYS_MPAMHCR_EL2);
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index 0d42eedc7167..445eb0743af2 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -130,7 +130,7 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init)
 	ldr	x1, [x0, #NVHE_INIT_PGD_PA]
 	phys_to_ttbr x2, x1
 alternative_if ARM64_HAS_CNP
-	orr	x2, x2, #TTBR_CNP_BIT
+	orr	x2, x2, #TTBRx_EL1_CnP
 alternative_else_nop_endif
 	msr	ttbr0_el2, x2
 
@@ -291,7 +291,7 @@ SYM_TYPED_FUNC_START(__pkvm_init_switch_pgd)
 	/* Install the new pgtables */
 	phys_to_ttbr x5, x0
 alternative_if ARM64_HAS_CNP
-	orr	x5, x5, #TTBR_CNP_BIT
+	orr	x5, x5, #TTBRx_EL1_CnP
 alternative_else_nop_endif
 	msr	ttbr0_el2, x5
 
diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
index 218976287d3f..4d8fcc7a3a41 100644
--- a/arch/arm64/kvm/hyp/nvhe/mm.c
+++ b/arch/arm64/kvm/hyp/nvhe/mm.c
@@ -270,7 +270,7 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
 	 * https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03
 	 */
 	dsb(ishst);
-	__tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level);
+	__tlbi_level(vale2is, addr, level);
 	__tlbi_sync_s1ish_hyp();
 	isb();
 }
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index 3dc1ce0d27fe..b29140995d48 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -158,7 +158,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
 	 * Instead, we invalidate Stage-2 for this IPA, and the
 	 * whole of Stage-1. Weep...
 	 */
-	ipa >>= 12;
 	__tlbi_level(ipas2e1is, ipa, level);
 
 	/*
@@ -188,7 +187,6 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
 	 * Instead, we invalidate Stage-2 for this IPA, and the
 	 * whole of Stage-1. Weep...
 	 */
-	ipa >>= 12;
 	__tlbi_level(ipas2e1, ipa, level);
 
 	/*
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 9b480f947da2..30226f2d5564 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -490,14 +490,14 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
 
 		kvm_clear_pte(ctx->ptep);
 		dsb(ishst);
-		__tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN);
+		__tlbi_level(vae2is, ctx->addr, TLBI_TTL_UNKNOWN);
 	} else {
 		if (ctx->end - ctx->addr < granule)
 			return -EINVAL;
 
 		kvm_clear_pte(ctx->ptep);
 		dsb(ishst);
-		__tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
+		__tlbi_level(vale2is, ctx->addr, ctx->level);
 		*unmapped += granule;
 	}
 
diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
index b254d442e54e..be685b63e8cf 100644
--- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
@@ -183,6 +183,21 @@ void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt)
 }
 NOKPROBE_SYMBOL(sysreg_restore_guest_state_vhe);
 
+/*
+ * MPAM0_EL1 was written by the host's context switch and belongs to the VMM.
+ * Copy its PARTID/PMG fields, with MPAMEN set, into the guest's MPAM1_EL1.
+ */
+static inline void __mpam_guest_load(void)
+{
+	u64 mask = MPAM0_EL1_PARTID_D | MPAM0_EL1_PARTID_I | MPAM0_EL1_PMG_D | MPAM0_EL1_PMG_I;
+
+	if (system_supports_mpam()) {
+		u64 val = (read_sysreg_s(SYS_MPAM0_EL1) & mask) | MPAM1_EL1_MPAMEN;
+
+		write_sysreg_el1(val, SYS_MPAM1);
+	}
+}
+
 /**
  * __vcpu_load_switch_sysregs - Load guest system registers to the physical CPU
  *
@@ -222,6 +237,7 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu)
 	 */
 	__sysreg32_restore_state(vcpu);
 	__sysreg_restore_user_state(guest_ctxt);
+	__mpam_guest_load();
 
 	if (unlikely(is_hyp_ctxt(vcpu))) {
 		__sysreg_restore_vel2_state(vcpu);
diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c
index 35855dadfb1b..f7b9dfe3f3a5 100644
--- a/arch/arm64/kvm/hyp/vhe/tlb.c
+++ b/arch/arm64/kvm/hyp/vhe/tlb.c
@@ -104,7 +104,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
 	 * Instead, we invalidate Stage-2 for this IPA, and the
 	 * whole of Stage-1. Weep...
 	 */
-	ipa >>= 12;
 	__tlbi_level(ipas2e1is, ipa, level);
 
 	/*
@@ -136,7 +135,6 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
 	 * Instead, we invalidate Stage-2 for this IPA, and the
 	 * whole of Stage-1. Weep...
 	 */
-	ipa >>= 12;
 	__tlbi_level(ipas2e1, ipa, level);
 
 	/*
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 1b4cacb6e918..c1e0dea903a1 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1805,7 +1805,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
 		break;
 	case SYS_ID_AA64ISAR3_EL1:
 		val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_LSFE |
-			ID_AA64ISAR3_EL1_FAMINMAX;
+			ID_AA64ISAR3_EL1_FAMINMAX | ID_AA64ISAR3_EL1_LSUI;
 		break;
 	case SYS_ID_AA64MMFR2_EL1:
 		val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK;
@@ -3252,6 +3252,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 					ID_AA64ISAR2_EL1_GPA3)),
 	ID_WRITABLE(ID_AA64ISAR3_EL1, (ID_AA64ISAR3_EL1_FPRCVT |
 				       ID_AA64ISAR3_EL1_LSFE |
+				       ID_AA64ISAR3_EL1_LSUI |
 				       ID_AA64ISAR3_EL1_FAMINMAX)),
 	ID_UNALLOCATED(6,4),
 	ID_UNALLOCATED(6,5),
@@ -3376,6 +3377,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 
 	{ SYS_DESC(SYS_MPAM1_EL1), undef_access },
 	{ SYS_DESC(SYS_MPAM0_EL1), undef_access },
+	{ SYS_DESC(SYS_MPAMSM_EL1), undef_access },
+
 	{ SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 },
 	{ SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 },
 
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index b2ac06246327..0f4a28b87469 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -354,15 +354,15 @@ void cpu_do_switch_mm(phys_addr_t pgd_phys, struct mm_struct *mm)
 
 	/* Skip CNP for the reserved ASID */
 	if (system_supports_cnp() && asid)
-		ttbr0 |= TTBR_CNP_BIT;
+		ttbr0 |= TTBRx_EL1_CnP;
 
 	/* SW PAN needs a copy of the ASID in TTBR0 for entry */
 	if (IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN))
-		ttbr0 |= FIELD_PREP(TTBR_ASID_MASK, asid);
+		ttbr0 |= FIELD_PREP(TTBRx_EL1_ASID_MASK, asid);
 
 	/* Set ASID in TTBR1 since TCR.A1 is set */
-	ttbr1 &= ~TTBR_ASID_MASK;
-	ttbr1 |= FIELD_PREP(TTBR_ASID_MASK, asid);
+	ttbr1 &= ~TTBRx_EL1_ASID_MASK;
+	ttbr1 |= FIELD_PREP(TTBRx_EL1_ASID_MASK, asid);
 
 	cpu_set_reserved_ttbr0_nosync();
 	write_sysreg(ttbr1, ttbr1_el1);
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index 1519d090d5ea..3970392c4326 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -225,7 +225,8 @@ static void contpte_convert(struct mm_struct *mm, unsigned long addr,
 	 */
 
 	if (!system_supports_bbml2_noabort())
-		__flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3);
+		__flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, 3,
+				  TLBF_NOWALKCACHE);
 
 	__set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES);
 }
@@ -551,8 +552,8 @@ int contpte_clear_flush_young_ptes(struct vm_area_struct *vma,
 		 * See comment in __ptep_clear_flush_young(); same rationale for
 		 * eliding the trailing DSB applies here.
 		 */
-		__flush_tlb_range_nosync(vma->vm_mm, addr, end,
-					 PAGE_SIZE, true, 3);
+		__flush_tlb_range(vma, addr, end, PAGE_SIZE, 3,
+				  TLBF_NOWALKCACHE | TLBF_NOSYNC);
 	}
 
 	return young;
@@ -685,7 +686,10 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
 			__ptep_set_access_flags(vma, addr, ptep, entry, 0);
 
 		if (dirty)
-			local_flush_tlb_contpte(vma, start_addr);
+			__flush_tlb_range(vma, start_addr,
+					  start_addr + CONT_PTE_SIZE,
+					  PAGE_SIZE, 3,
+					  TLBF_NOWALKCACHE | TLBF_NOBROADCAST);
 	} else {
 		__contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte);
 		__ptep_set_access_flags(vma, addr, ptep, entry, dirty);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index be9dab2c7d6a..920a8b244d59 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -204,12 +204,13 @@ static void show_pte(unsigned long addr)
  *
  * Returns whether or not the PTE actually changed.
  */
-int __ptep_set_access_flags(struct vm_area_struct *vma,
-			    unsigned long address, pte_t *ptep,
-			    pte_t entry, int dirty)
+int __ptep_set_access_flags_anysz(struct vm_area_struct *vma,
+				  unsigned long address, pte_t *ptep,
+				  pte_t entry, int dirty, unsigned long pgsize)
 {
 	pteval_t old_pteval, pteval;
 	pte_t pte = __ptep_get(ptep);
+	int level;
 
 	if (pte_same(pte, entry))
 		return 0;
@@ -238,8 +239,27 @@ int __ptep_set_access_flags(struct vm_area_struct *vma,
 	 * may still cause page faults and be invalidated via
 	 * flush_tlb_fix_spurious_fault().
 	 */
-	if (dirty)
-		local_flush_tlb_page(vma, address);
+	if (dirty) {
+		switch (pgsize) {
+		case PAGE_SIZE:
+			level = 3;
+			break;
+		case PMD_SIZE:
+			level = 2;
+			break;
+#ifndef __PAGETABLE_PMD_FOLDED
+		case PUD_SIZE:
+			level = 1;
+			break;
+#endif
+		default:
+			level = TLBI_TTL_UNKNOWN;
+			WARN_ON(1);
+		}
+
+		__flush_tlb_range(vma, address, address + pgsize, pgsize, level,
+				  TLBF_NOWALKCACHE | TLBF_NOBROADCAST);
+	}
 	return 1;
 }
 
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index a42c05cf5640..30772a909aea 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -181,7 +181,7 @@ static pte_t get_clear_contig_flush(struct mm_struct *mm,
 	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
 	unsigned long end = addr + (pgsize * ncontig);
 
-	__flush_hugetlb_tlb_range(&vma, addr, end, pgsize, true);
+	__flush_hugetlb_tlb_range(&vma, addr, end, pgsize, TLBF_NOWALKCACHE);
 	return orig_pte;
 }
 
@@ -209,7 +209,7 @@ static void clear_flush(struct mm_struct *mm,
 	if (mm == &init_mm)
 		flush_tlb_kernel_range(saddr, addr);
 	else
-		__flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, true);
+		__flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, TLBF_NOWALKCACHE);
 }
 
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -427,11 +427,11 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 	pte_t orig_pte;
 
 	VM_WARN_ON(!pte_present(pte));
+	ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize);
 
 	if (!pte_cont(pte))
-		return __ptep_set_access_flags(vma, addr, ptep, pte, dirty);
-
-	ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize);
+		return __ptep_set_access_flags_anysz(vma, addr, ptep, pte,
+						     dirty, pgsize);
 
 	if (!__cont_access_flags_changed(ptep, pte, ncontig))
 		return 0;
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 96711b8578fd..b9b248d24fd1 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -350,7 +350,6 @@ void __init arch_mm_preinit(void)
 	}
 
 	swiotlb_init(swiotlb, flags);
-	swiotlb_update_mem_attributes();
 
 	/*
 	 * Check boundaries twice: Some fundamental inconsistencies can be
@@ -377,6 +376,14 @@ void __init arch_mm_preinit(void)
 	}
 }
 
+bool page_alloc_available __ro_after_init;
+
+void __init mem_init(void)
+{
+	page_alloc_available = true;
+	swiotlb_update_mem_attributes();
+}
+
 void free_initmem(void)
 {
 	void *lm_init_begin = lm_alias(__init_begin);
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index a6a00accf4f9..7ea743996a61 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -112,7 +112,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 }
 EXPORT_SYMBOL(phys_mem_access_prot);
 
-static phys_addr_t __init early_pgtable_alloc(enum pgtable_type pgtable_type)
+static phys_addr_t __init early_pgtable_alloc(enum pgtable_level pgtable_level)
 {
 	phys_addr_t phys;
 
@@ -197,14 +197,14 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
 static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
 			       unsigned long end, phys_addr_t phys,
 			       pgprot_t prot,
-			       phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+			       phys_addr_t (*pgtable_alloc)(enum pgtable_level),
 			       int flags)
 {
 	unsigned long next;
 	pmd_t pmd = READ_ONCE(*pmdp);
 	pte_t *ptep;
 
-	BUG_ON(pmd_sect(pmd));
+	BUG_ON(pmd_leaf(pmd));
 	if (pmd_none(pmd)) {
 		pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
 		phys_addr_t pte_phys;
@@ -212,7 +212,7 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
 		if (flags & NO_EXEC_MAPPINGS)
 			pmdval |= PMD_TABLE_PXN;
 		BUG_ON(!pgtable_alloc);
-		pte_phys = pgtable_alloc(TABLE_PTE);
+		pte_phys = pgtable_alloc(PGTABLE_LEVEL_PTE);
 		if (pte_phys == INVALID_PHYS_ADDR)
 			return -ENOMEM;
 		ptep = pte_set_fixmap(pte_phys);
@@ -252,7 +252,7 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
 
 static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
 		    phys_addr_t phys, pgprot_t prot,
-		    phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags)
+		    phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags)
 {
 	unsigned long next;
 
@@ -292,7 +292,7 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
 static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
 			       unsigned long end, phys_addr_t phys,
 			       pgprot_t prot,
-			       phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+			       phys_addr_t (*pgtable_alloc)(enum pgtable_level),
 			       int flags)
 {
 	int ret;
@@ -303,7 +303,7 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
 	/*
 	 * Check for initial section mappings in the pgd/pud.
 	 */
-	BUG_ON(pud_sect(pud));
+	BUG_ON(pud_leaf(pud));
 	if (pud_none(pud)) {
 		pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
 		phys_addr_t pmd_phys;
@@ -311,7 +311,7 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
 		if (flags & NO_EXEC_MAPPINGS)
 			pudval |= PUD_TABLE_PXN;
 		BUG_ON(!pgtable_alloc);
-		pmd_phys = pgtable_alloc(TABLE_PMD);
+		pmd_phys = pgtable_alloc(PGTABLE_LEVEL_PMD);
 		if (pmd_phys == INVALID_PHYS_ADDR)
 			return -ENOMEM;
 		pmdp = pmd_set_fixmap(pmd_phys);
@@ -349,7 +349,7 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
 
 static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
 			  phys_addr_t phys, pgprot_t prot,
-			  phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+			  phys_addr_t (*pgtable_alloc)(enum pgtable_level),
 			  int flags)
 {
 	int ret = 0;
@@ -364,7 +364,7 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
 		if (flags & NO_EXEC_MAPPINGS)
 			p4dval |= P4D_TABLE_PXN;
 		BUG_ON(!pgtable_alloc);
-		pud_phys = pgtable_alloc(TABLE_PUD);
+		pud_phys = pgtable_alloc(PGTABLE_LEVEL_PUD);
 		if (pud_phys == INVALID_PHYS_ADDR)
 			return -ENOMEM;
 		pudp = pud_set_fixmap(pud_phys);
@@ -415,7 +415,7 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
 
 static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
 			  phys_addr_t phys, pgprot_t prot,
-			  phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+			  phys_addr_t (*pgtable_alloc)(enum pgtable_level),
 			  int flags)
 {
 	int ret;
@@ -430,7 +430,7 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
 		if (flags & NO_EXEC_MAPPINGS)
 			pgdval |= PGD_TABLE_PXN;
 		BUG_ON(!pgtable_alloc);
-		p4d_phys = pgtable_alloc(TABLE_P4D);
+		p4d_phys = pgtable_alloc(PGTABLE_LEVEL_P4D);
 		if (p4d_phys == INVALID_PHYS_ADDR)
 			return -ENOMEM;
 		p4dp = p4d_set_fixmap(p4d_phys);
@@ -467,7 +467,7 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
 static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
 				       unsigned long virt, phys_addr_t size,
 				       pgprot_t prot,
-				       phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+				       phys_addr_t (*pgtable_alloc)(enum pgtable_level),
 				       int flags)
 {
 	int ret;
@@ -500,7 +500,7 @@ static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
 static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
 				unsigned long virt, phys_addr_t size,
 				pgprot_t prot,
-				phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+				phys_addr_t (*pgtable_alloc)(enum pgtable_level),
 				int flags)
 {
 	int ret;
@@ -516,7 +516,7 @@ static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
 static void early_create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
 				     unsigned long virt, phys_addr_t size,
 				     pgprot_t prot,
-				     phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+				     phys_addr_t (*pgtable_alloc)(enum pgtable_level),
 				     int flags)
 {
 	int ret;
@@ -528,7 +528,7 @@ static void early_create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
 }
 
 static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp,
-				       enum pgtable_type pgtable_type)
+				       enum pgtable_level pgtable_level)
 {
 	/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
 	struct ptdesc *ptdesc = pagetable_alloc(gfp & ~__GFP_ZERO, 0);
@@ -539,40 +539,43 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp,
 
 	pa = page_to_phys(ptdesc_page(ptdesc));
 
-	switch (pgtable_type) {
-	case TABLE_PTE:
+	switch (pgtable_level) {
+	case PGTABLE_LEVEL_PTE:
 		BUG_ON(!pagetable_pte_ctor(mm, ptdesc));
 		break;
-	case TABLE_PMD:
+	case PGTABLE_LEVEL_PMD:
 		BUG_ON(!pagetable_pmd_ctor(mm, ptdesc));
 		break;
-	case TABLE_PUD:
+	case PGTABLE_LEVEL_PUD:
 		pagetable_pud_ctor(ptdesc);
 		break;
-	case TABLE_P4D:
+	case PGTABLE_LEVEL_P4D:
 		pagetable_p4d_ctor(ptdesc);
 		break;
+	case PGTABLE_LEVEL_PGD:
+		VM_WARN_ON(1);
+		break;
 	}
 
 	return pa;
 }
 
 static phys_addr_t
-pgd_pgtable_alloc_init_mm_gfp(enum pgtable_type pgtable_type, gfp_t gfp)
+pgd_pgtable_alloc_init_mm_gfp(enum pgtable_level pgtable_level, gfp_t gfp)
 {
-	return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_type);
+	return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_level);
 }
 
 static phys_addr_t __maybe_unused
-pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
+pgd_pgtable_alloc_init_mm(enum pgtable_level pgtable_level)
 {
-	return pgd_pgtable_alloc_init_mm_gfp(pgtable_type, GFP_PGTABLE_KERNEL);
+	return pgd_pgtable_alloc_init_mm_gfp(pgtable_level, GFP_PGTABLE_KERNEL);
 }
 
 static phys_addr_t
-pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
+pgd_pgtable_alloc_special_mm(enum pgtable_level pgtable_level)
 {
-	return  __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type);
+	return  __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_level);
 }
 
 static void split_contpte(pte_t *ptep)
@@ -593,7 +596,7 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
 	pte_t *ptep;
 	int i;
 
-	pte_phys = pgd_pgtable_alloc_init_mm_gfp(TABLE_PTE, gfp);
+	pte_phys = pgd_pgtable_alloc_init_mm_gfp(PGTABLE_LEVEL_PTE, gfp);
 	if (pte_phys == INVALID_PHYS_ADDR)
 		return -ENOMEM;
 	ptep = (pte_t *)phys_to_virt(pte_phys);
@@ -602,6 +605,8 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
 		tableprot |= PMD_TABLE_PXN;
 
 	prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) | PTE_TYPE_PAGE);
+	if (!pmd_valid(pmd))
+		prot = pte_pgprot(pte_mkinvalid(pfn_pte(0, prot)));
 	prot = __pgprot(pgprot_val(prot) & ~PTE_CONT);
 	if (to_cont)
 		prot = __pgprot(pgprot_val(prot) | PTE_CONT);
@@ -638,7 +643,7 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
 	pmd_t *pmdp;
 	int i;
 
-	pmd_phys = pgd_pgtable_alloc_init_mm_gfp(TABLE_PMD, gfp);
+	pmd_phys = pgd_pgtable_alloc_init_mm_gfp(PGTABLE_LEVEL_PMD, gfp);
 	if (pmd_phys == INVALID_PHYS_ADDR)
 		return -ENOMEM;
 	pmdp = (pmd_t *)phys_to_virt(pmd_phys);
@@ -647,6 +652,8 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
 		tableprot |= PUD_TABLE_PXN;
 
 	prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT);
+	if (!pud_valid(pud))
+		prot = pmd_pgprot(pmd_mkinvalid(pfn_pmd(0, prot)));
 	prot = __pgprot(pgprot_val(prot) & ~PTE_CONT);
 	if (to_cont)
 		prot = __pgprot(pgprot_val(prot) | PTE_CONT);
@@ -768,30 +775,51 @@ static inline bool force_pte_mapping(void)
 }
 
 static DEFINE_MUTEX(pgtable_split_lock);
+static bool linear_map_requires_bbml2;
 
 int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
 {
 	int ret;
 
-	/*
-	 * !BBML2_NOABORT systems should not be trying to change permissions on
-	 * anything that is not pte-mapped in the first place. Just return early
-	 * and let the permission change code raise a warning if not already
-	 * pte-mapped.
-	 */
-	if (!system_supports_bbml2_noabort())
-		return 0;
-
 	/*
 	 * If the region is within a pte-mapped area, there is no need to try to
 	 * split. Additionally, CONFIG_DEBUG_PAGEALLOC and CONFIG_KFENCE may
 	 * change permissions from atomic context so for those cases (which are
 	 * always pte-mapped), we must not go any further because taking the
-	 * mutex below may sleep.
+	 * mutex below may sleep. Do not call force_pte_mapping() here because
+	 * it could return a confusing result if called from a secondary cpu
+	 * prior to finalizing caps. Instead, linear_map_requires_bbml2 gives us
+	 * what we need.
 	 */
-	if (force_pte_mapping() || is_kfence_address((void *)start))
+	if (!linear_map_requires_bbml2 || is_kfence_address((void *)start))
 		return 0;
 
+	if (!system_supports_bbml2_noabort()) {
+		/*
+		 * !BBML2_NOABORT systems should not be trying to change
+		 * permissions on anything that is not pte-mapped in the first
+		 * place. Just return early and let the permission change code
+		 * raise a warning if not already pte-mapped.
+		 */
+		if (system_capabilities_finalized())
+			return 0;
+
+		/*
+		 * Boot-time: split_kernel_leaf_mapping_locked() allocates from
+		 * page allocator. Can't split until it's available.
+		 */
+		if (WARN_ON(!page_alloc_available))
+			return -EBUSY;
+
+		/*
+		 * Boot-time: Started secondary cpus but don't know if they
+		 * support BBML2_NOABORT yet. Can't allow splitting in this
+		 * window in case they don't.
+		 */
+		if (WARN_ON(num_online_cpus() > 1))
+			return -EBUSY;
+	}
+
 	/*
 	 * Ensure start and end are at least page-aligned since this is the
 	 * finest granularity we can split to.
@@ -891,8 +919,6 @@ static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp
 	return ret;
 }
 
-static bool linear_map_requires_bbml2 __initdata;
-
 u32 idmap_kpti_bbml2_flag;
 
 static void __init init_idmap_kpti_bbml2_flag(void)
@@ -1226,7 +1252,7 @@ static void __init declare_vma(struct vm_struct *vma,
 
 static phys_addr_t kpti_ng_temp_alloc __initdata;
 
-static phys_addr_t __init kpti_ng_pgd_alloc(enum pgtable_type type)
+static phys_addr_t __init kpti_ng_pgd_alloc(enum pgtable_level pgtable_level)
 {
 	kpti_ng_temp_alloc -= PAGE_SIZE;
 	return kpti_ng_temp_alloc;
@@ -1458,10 +1484,14 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
 
 		WARN_ON(!pte_present(pte));
 		__pte_clear(&init_mm, addr, ptep);
-		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
-		if (free_mapped)
+		if (free_mapped) {
+			/* CONT blocks are not supported in the vmemmap */
+			WARN_ON(pte_cont(pte));
+			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
 			free_hotplug_page_range(pte_page(pte),
 						PAGE_SIZE, altmap);
+		}
+		/* unmap_hotplug_range() flushes TLB for !free_mapped */
 	} while (addr += PAGE_SIZE, addr < end);
 }
 
@@ -1480,17 +1510,16 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
 			continue;
 
 		WARN_ON(!pmd_present(pmd));
-		if (pmd_sect(pmd)) {
+		if (pmd_leaf(pmd)) {
 			pmd_clear(pmdp);
-
-			/*
-			 * One TLBI should be sufficient here as the PMD_SIZE
-			 * range is mapped with a single block entry.
-			 */
-			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
-			if (free_mapped)
+			if (free_mapped) {
+				/* CONT blocks are not supported in the vmemmap */
+				WARN_ON(pmd_cont(pmd));
+				flush_tlb_kernel_range(addr, addr + PMD_SIZE);
 				free_hotplug_page_range(pmd_page(pmd),
 							PMD_SIZE, altmap);
+			}
+			/* unmap_hotplug_range() flushes TLB for !free_mapped */
 			continue;
 		}
 		WARN_ON(!pmd_table(pmd));
@@ -1513,17 +1542,14 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
 			continue;
 
 		WARN_ON(!pud_present(pud));
-		if (pud_sect(pud)) {
+		if (pud_leaf(pud)) {
 			pud_clear(pudp);
-
-			/*
-			 * One TLBI should be sufficient here as the PUD_SIZE
-			 * range is mapped with a single block entry.
-			 */
-			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
-			if (free_mapped)
+			if (free_mapped) {
+				flush_tlb_kernel_range(addr, addr + PUD_SIZE);
 				free_hotplug_page_range(pud_page(pud),
 							PUD_SIZE, altmap);
+			}
+			/* unmap_hotplug_range() flushes TLB for !free_mapped */
 			continue;
 		}
 		WARN_ON(!pud_table(pud));
@@ -1553,6 +1579,7 @@ static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
 static void unmap_hotplug_range(unsigned long addr, unsigned long end,
 				bool free_mapped, struct vmem_altmap *altmap)
 {
+	unsigned long start = addr;
 	unsigned long next;
 	pgd_t *pgdp, pgd;
 
@@ -1574,6 +1601,9 @@ static void unmap_hotplug_range(unsigned long addr, unsigned long end,
 		WARN_ON(!pgd_present(pgd));
 		unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
 	} while (addr = next, addr < end);
+
+	if (!free_mapped)
+		flush_tlb_kernel_range(start, end);
 }
 
 static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
@@ -1627,7 +1657,7 @@ static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
 		if (pmd_none(pmd))
 			continue;
 
-		WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
+		WARN_ON(!pmd_present(pmd) || !pmd_table(pmd));
 		free_empty_pte_table(pmdp, addr, next, floor, ceiling);
 	} while (addr = next, addr < end);
 
@@ -1667,7 +1697,7 @@ static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
 		if (pud_none(pud))
 			continue;
 
-		WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
+		WARN_ON(!pud_present(pud) || !pud_table(pud));
 		free_empty_pmd_table(pudp, addr, next, floor, ceiling);
 	} while (addr = next, addr < end);
 
@@ -1763,7 +1793,7 @@ int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
 {
 	vmemmap_verify((pte_t *)pmdp, node, addr, next);
 
-	return pmd_sect(READ_ONCE(*pmdp));
+	return pmd_leaf(READ_ONCE(*pmdp));
 }
 
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
@@ -1827,7 +1857,7 @@ void p4d_clear_huge(p4d_t *p4dp)
 
 int pud_clear_huge(pud_t *pudp)
 {
-	if (!pud_sect(READ_ONCE(*pudp)))
+	if (!pud_leaf(READ_ONCE(*pudp)))
 		return 0;
 	pud_clear(pudp);
 	return 1;
@@ -1835,7 +1865,7 @@ int pud_clear_huge(pud_t *pudp)
 
 int pmd_clear_huge(pmd_t *pmdp)
 {
-	if (!pmd_sect(READ_ONCE(*pmdp)))
+	if (!pmd_leaf(READ_ONCE(*pmdp)))
 		return 0;
 	pmd_clear(pmdp);
 	return 1;
@@ -2010,6 +2040,107 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
 }
 
+
+static bool addr_splits_kernel_leaf(unsigned long addr)
+{
+	pgd_t *pgdp, pgd;
+	p4d_t *p4dp, p4d;
+	pud_t *pudp, pud;
+	pmd_t *pmdp, pmd;
+	pte_t *ptep, pte;
+
+	/*
+	 * An address at the start of a possible leaf at some level can
+	 * never split a leaf there, and a non-present entry has nothing
+	 * to split. Otherwise keep walking down the page tables to see
+	 * whether addr lands inside a block, page or contiguous range.
+	 */
+	if (IS_ALIGNED(addr, PGDIR_SIZE))
+		return false;
+
+	pgdp = pgd_offset_k(addr);
+	pgd = pgdp_get(pgdp);
+	if (!pgd_present(pgd))
+		return false;
+
+	if (IS_ALIGNED(addr, P4D_SIZE))
+		return false;
+
+	p4dp = p4d_offset(pgdp, addr);
+	p4d = p4dp_get(p4dp);
+	if (!p4d_present(p4d))
+		return false;
+
+	if (IS_ALIGNED(addr, PUD_SIZE))
+		return false;
+
+	pudp = pud_offset(p4dp, addr);
+	pud = pudp_get(pudp);
+	if (!pud_present(pud))
+		return false;
+
+	if (pud_leaf(pud))
+		return true;
+
+	if (IS_ALIGNED(addr, CONT_PMD_SIZE))
+		return false;
+
+	pmdp = pmd_offset(pudp, addr);
+	pmd = pmdp_get(pmdp);
+	if (!pmd_present(pmd))
+		return false;
+
+	if (pmd_cont(pmd))
+		return true;
+
+	if (IS_ALIGNED(addr, PMD_SIZE))
+		return false;
+
+	if (pmd_leaf(pmd))
+		return true;
+
+	if (IS_ALIGNED(addr, CONT_PTE_SIZE))
+		return false;
+
+	ptep = pte_offset_kernel(pmdp, addr);
+	pte = __ptep_get(ptep);
+	if (!pte_present(pte))
+		return false;
+
+	if (pte_cont(pte))
+		return true;
+
+	return !IS_ALIGNED(addr, PAGE_SIZE);
+}
+
+static bool can_unmap_without_split(unsigned long pfn, unsigned long nr_pages)
+{
+	unsigned long phys_start, phys_end, start, end;
+
+	phys_start = PFN_PHYS(pfn);
+	phys_end = phys_start + nr_pages * PAGE_SIZE;
+
+	/* Refuse if either edge of the range would split a linear map leaf */
+	start = __phys_to_virt(phys_start);
+	end =  __phys_to_virt(phys_end);
+	if (addr_splits_kernel_leaf(start) || addr_splits_kernel_leaf(end)) {
+		pr_warn("[%lx %lx] splits a leaf entry in linear map\n",
+			phys_start, phys_end);
+		return false;
+	}
+
+	/* Likewise for the struct page range backing it in the vmemmap */
+	BUILD_BUG_ON(!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP));
+	start = (unsigned long)pfn_to_page(pfn);
+	end = (unsigned long)pfn_to_page(pfn + nr_pages);
+	if (addr_splits_kernel_leaf(start) || addr_splits_kernel_leaf(end)) {
+		pr_warn("[%lx %lx] splits a leaf entry in vmemmap\n",
+			phys_start, phys_end);
+		return false;
+	}
+	return true;
+}
+
 /*
  * This memory hotplug notifier helps prevent boot memory from being
  * inadvertently removed as it blocks pfn range offlining process in
@@ -2018,8 +2149,11 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
  * In future if and when boot memory could be removed, this notifier
  * should be dropped and free_hotplug_page_range() should handle any
  * reserved pages allocated during boot.
+ *
+ * This also blocks any memory removal that would split a leaf entry
+ * in the kernel linear map or the vmemmap.
  */
-static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
+static int prevent_memory_remove_notifier(struct notifier_block *nb,
 					   unsigned long action, void *data)
 {
 	struct mem_section *ms;
@@ -2065,11 +2199,15 @@ static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
 			return NOTIFY_DONE;
 		}
 	}
+
+	if (!can_unmap_without_split(pfn, arg->nr_pages))
+		return NOTIFY_BAD;
+
 	return NOTIFY_OK;
 }
 
-static struct notifier_block prevent_bootmem_remove_nb = {
-	.notifier_call = prevent_bootmem_remove_notifier,
+static struct notifier_block prevent_memory_remove_nb = {
+	.notifier_call = prevent_memory_remove_notifier,
 };
 
 /*
@@ -2119,7 +2257,7 @@ static void validate_bootmem_online(void)
 	}
 }
 
-static int __init prevent_bootmem_remove_init(void)
+static int __init prevent_memory_remove_init(void)
 {
 	int ret = 0;
 
@@ -2127,13 +2265,13 @@ static int __init prevent_bootmem_remove_init(void)
 		return ret;
 
 	validate_bootmem_online();
-	ret = register_memory_notifier(&prevent_bootmem_remove_nb);
+	ret = register_memory_notifier(&prevent_memory_remove_nb);
 	if (ret)
 		pr_err("%s: Notifier registration failed %d\n", __func__, ret);
 
 	return ret;
 }
-early_initcall(prevent_bootmem_remove_init);
+early_initcall(prevent_memory_remove_init);
 #endif
 
 pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
@@ -2149,7 +2287,7 @@ pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
 		 */
 		if (pte_accessible(vma->vm_mm, pte) && pte_user_exec(pte))
 			__flush_tlb_range(vma, addr, nr * PAGE_SIZE,
-					  PAGE_SIZE, true, 3);
+					  PAGE_SIZE, 3, TLBF_NOWALKCACHE);
 	}
 
 	return pte;
@@ -2188,7 +2326,7 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)
 	phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp));
 
 	if (cnp)
-		ttbr1 |= TTBR_CNP_BIT;
+		ttbr1 |= TTBRx_EL1_CnP;
 
 	replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1);
 
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 358d1dc9a576..ce035e1b4eaf 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -25,6 +25,11 @@ static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk)
 {
 	struct page_change_data *masks = walk->private;
 
+	/*
+	 * Some users clear and set bits which alias each other (e.g. PTE_NG and
+	 * PTE_PRESENT_INVALID). It is therefore important that we always clear
+	 * first then set.
+	 */
 	val &= ~(pgprot_val(masks->clear_mask));
 	val |= (pgprot_val(masks->set_mask));
 
@@ -36,7 +41,7 @@ static int pageattr_pud_entry(pud_t *pud, unsigned long addr,
 {
 	pud_t val = pudp_get(pud);
 
-	if (pud_sect(val)) {
+	if (pud_leaf(val)) {
 		if (WARN_ON_ONCE((next - addr) != PUD_SIZE))
 			return -EINVAL;
 		val = __pud(set_pageattr_masks(pud_val(val), walk));
@@ -52,7 +57,7 @@ static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr,
 {
 	pmd_t val = pmdp_get(pmd);
 
-	if (pmd_sect(val)) {
+	if (pmd_leaf(val)) {
 		if (WARN_ON_ONCE((next - addr) != PMD_SIZE))
 			return -EINVAL;
 		val = __pmd(set_pageattr_masks(pmd_val(val), walk));
@@ -132,11 +137,12 @@ static int __change_memory_common(unsigned long start, unsigned long size,
 	ret = update_range_prot(start, size, set_mask, clear_mask);
 
 	/*
-	 * If the memory is being made valid without changing any other bits
-	 * then a TLBI isn't required as a non-valid entry cannot be cached in
-	 * the TLB.
+	 * If the memory is being switched from present-invalid to valid without
+	 * changing any other bits then a TLBI isn't required as a non-valid
+	 * entry cannot be cached in the TLB.
 	 */
-	if (pgprot_val(set_mask) != PTE_VALID || pgprot_val(clear_mask))
+	if (pgprot_val(set_mask) != PTE_PRESENT_VALID_KERNEL ||
+	    pgprot_val(clear_mask) != PTE_PRESENT_INVALID)
 		flush_tlb_kernel_range(start, start + size);
 	return ret;
 }
@@ -237,18 +243,18 @@ int set_memory_valid(unsigned long addr, int numpages, int enable)
 {
 	if (enable)
 		return __change_memory_common(addr, PAGE_SIZE * numpages,
-					__pgprot(PTE_VALID),
-					__pgprot(0));
+					__pgprot(PTE_PRESENT_VALID_KERNEL),
+					__pgprot(PTE_PRESENT_INVALID));
 	else
 		return __change_memory_common(addr, PAGE_SIZE * numpages,
-					__pgprot(0),
-					__pgprot(PTE_VALID));
+					__pgprot(PTE_PRESENT_INVALID),
+					__pgprot(PTE_PRESENT_VALID_KERNEL));
 }
 
 int set_direct_map_invalid_noflush(struct page *page)
 {
-	pgprot_t clear_mask = __pgprot(PTE_VALID);
-	pgprot_t set_mask = __pgprot(0);
+	pgprot_t clear_mask = __pgprot(PTE_PRESENT_VALID_KERNEL);
+	pgprot_t set_mask = __pgprot(PTE_PRESENT_INVALID);
 
 	if (!can_set_direct_map())
 		return 0;
@@ -259,8 +265,8 @@ int set_direct_map_invalid_noflush(struct page *page)
 
 int set_direct_map_default_noflush(struct page *page)
 {
-	pgprot_t set_mask = __pgprot(PTE_VALID | PTE_WRITE);
-	pgprot_t clear_mask = __pgprot(PTE_RDONLY);
+	pgprot_t set_mask = __pgprot(PTE_PRESENT_VALID_KERNEL | PTE_WRITE);
+	pgprot_t clear_mask = __pgprot(PTE_PRESENT_INVALID | PTE_RDONLY);
 
 	if (!can_set_direct_map())
 		return 0;
@@ -296,8 +302,8 @@ static int __set_memory_enc_dec(unsigned long addr,
 	 * entries or Synchronous External Aborts caused by RIPAS_EMPTY
 	 */
 	ret = __change_memory_common(addr, PAGE_SIZE * numpages,
-				     __pgprot(set_prot),
-				     __pgprot(clear_prot | PTE_VALID));
+				     __pgprot(set_prot | PTE_PRESENT_INVALID),
+				     __pgprot(clear_prot | PTE_PRESENT_VALID_KERNEL));
 
 	if (ret)
 		return ret;
@@ -311,8 +317,8 @@ static int __set_memory_enc_dec(unsigned long addr,
 		return ret;
 
 	return __change_memory_common(addr, PAGE_SIZE * numpages,
-				      __pgprot(PTE_VALID),
-				      __pgprot(0));
+				      __pgprot(PTE_PRESENT_VALID_KERNEL),
+				      __pgprot(PTE_PRESENT_INVALID));
 }
 
 static int realm_set_memory_encrypted(unsigned long addr, int numpages)
@@ -404,15 +410,15 @@ bool kernel_page_present(struct page *page)
 	pud = READ_ONCE(*pudp);
 	if (pud_none(pud))
 		return false;
-	if (pud_sect(pud))
-		return true;
+	if (pud_leaf(pud))
+		return pud_valid(pud);
 
 	pmdp = pmd_offset(pudp, addr);
 	pmd = READ_ONCE(*pmdp);
 	if (pmd_none(pmd))
 		return false;
-	if (pmd_sect(pmd))
-		return true;
+	if (pmd_leaf(pmd))
+		return pmd_valid(pmd);
 
 	ptep = pte_offset_kernel(pmdp, addr);
 	return pte_valid(__ptep_get(ptep));
diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c
index 18543b603c77..cca9706a875c 100644
--- a/arch/arm64/mm/trans_pgd.c
+++ b/arch/arm64/mm/trans_pgd.c
@@ -31,36 +31,6 @@ static void *trans_alloc(struct trans_pgd_info *info)
 	return info->trans_alloc_page(info->trans_alloc_arg);
 }
 
-static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
-{
-	pte_t pte = __ptep_get(src_ptep);
-
-	if (pte_valid(pte)) {
-		/*
-		 * Resume will overwrite areas that may be marked
-		 * read only (code, rodata). Clear the RDONLY bit from
-		 * the temporary mappings we use during restore.
-		 */
-		__set_pte(dst_ptep, pte_mkwrite_novma(pte));
-	} else if (!pte_none(pte)) {
-		/*
-		 * debug_pagealloc will removed the PTE_VALID bit if
-		 * the page isn't in use by the resume kernel. It may have
-		 * been in use by the original kernel, in which case we need
-		 * to put it back in our copy to do the restore.
-		 *
-		 * Other cases include kfence / vmalloc / memfd_secret which
-		 * may call `set_direct_map_invalid_noflush()`.
-		 *
-		 * Before marking this entry valid, check the pfn should
-		 * be mapped.
-		 */
-		BUG_ON(!pfn_valid(pte_pfn(pte)));
-
-		__set_pte(dst_ptep, pte_mkvalid(pte_mkwrite_novma(pte)));
-	}
-}
-
 static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp,
 		    pmd_t *src_pmdp, unsigned long start, unsigned long end)
 {
@@ -76,7 +46,11 @@ static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp,
 
 	src_ptep = pte_offset_kernel(src_pmdp, start);
 	do {
-		_copy_pte(dst_ptep, src_ptep, addr);
+		pte_t pte = __ptep_get(src_ptep);
+
+		if (pte_none(pte))
+			continue;
+		__set_pte(dst_ptep, pte_mkvalid_k(pte_mkwrite_novma(pte)));
 	} while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end);
 
 	return 0;
@@ -109,8 +83,7 @@ static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp,
 			if (copy_pte(info, dst_pmdp, src_pmdp, addr, next))
 				return -ENOMEM;
 		} else {
-			set_pmd(dst_pmdp,
-				__pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY));
+			set_pmd(dst_pmdp, pmd_mkvalid_k(pmd_mkwrite_novma(pmd)));
 		}
 	} while (dst_pmdp++, src_pmdp++, addr = next, addr != end);
 
@@ -145,8 +118,7 @@ static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp,
 			if (copy_pmd(info, dst_pudp, src_pudp, addr, next))
 				return -ENOMEM;
 		} else {
-			set_pud(dst_pudp,
-				__pud(pud_val(pud) & ~PUD_SECT_RDONLY));
+			set_pud(dst_pudp, pud_mkvalid_k(pud_mkwrite_novma(pud)));
 		}
 	} while (dst_pudp++, src_pudp++, addr = next, addr != end);
 
diff --git a/arch/arm64/tools/Makefile b/arch/arm64/tools/Makefile
index c2b34e761006..a94b3d9caad6 100644
--- a/arch/arm64/tools/Makefile
+++ b/arch/arm64/tools/Makefile
@@ -3,7 +3,7 @@
 gen := arch/$(ARCH)/include/generated
 kapi := $(gen)/asm
 
-kapisyshdr-y := cpucap-defs.h sysreg-defs.h
+kapisyshdr-y := cpucap-defs.h kernel-hwcap.h sysreg-defs.h
 
 kapi-hdrs-y := $(addprefix $(kapi)/, $(kapisyshdr-y))
 
@@ -18,11 +18,17 @@ kapi:   $(kapi-hdrs-y)
 quiet_cmd_gen_cpucaps = GEN     $@
       cmd_gen_cpucaps = mkdir -p $(dir $@); $(AWK) -f $(real-prereqs) > $@
 
+quiet_cmd_gen_kernel_hwcap = GEN     $@
+      cmd_gen_kernel_hwcap = mkdir -p $(dir $@); /bin/sh -e $(real-prereqs) > $@
+
 quiet_cmd_gen_sysreg = GEN     $@
       cmd_gen_sysreg = mkdir -p $(dir $@); $(AWK) -f $(real-prereqs) > $@
 
 $(kapi)/cpucap-defs.h: $(src)/gen-cpucaps.awk $(src)/cpucaps FORCE
 	$(call if_changed,gen_cpucaps)
 
+$(kapi)/kernel-hwcap.h: $(src)/gen-kernel-hwcaps.sh $(srctree)/arch/arm64/include/uapi/asm/hwcap.h FORCE
+	$(call if_changed,gen_kernel_hwcap)
+
 $(kapi)/sysreg-defs.h: $(src)/gen-sysreg.awk $(src)/sysreg FORCE
 	$(call if_changed,gen_sysreg)
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 7261553b644b..b7286d977788 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -48,6 +48,7 @@ HAS_LPA2
 HAS_LSE_ATOMICS
 HAS_LS64
 HAS_LS64_V
+HAS_LSUI
 HAS_MOPS
 HAS_NESTED_VIRT
 HAS_BBML2_NOABORT
diff --git a/arch/arm64/tools/gen-kernel-hwcaps.sh b/arch/arm64/tools/gen-kernel-hwcaps.sh
new file mode 100644
index 000000000000..e7cdcf428d91
--- /dev/null
+++ b/arch/arm64/tools/gen-kernel-hwcaps.sh
@@ -0,0 +1,23 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+#
+# gen-kernel-hwcaps.sh - Generate kernel internal hwcap.h definitions
+#
+# Copyright 2026 Arm, Ltd.
+
+if [ "$1" = "" ]; then
+	echo "$0: no filename specified" >&2
+	exit 1
+fi
+
+echo "#ifndef __ASM_KERNEL_HWCAPS_H"
+echo "#define __ASM_KERNEL_HWCAPS_H"
+echo ""
+echo "/* Generated file - do not edit */"
+echo ""
+
+grep -E '^#define HWCAP[0-9]*_[A-Z0-9_]+' "$1" | \
+	sed 's/.*HWCAP\([0-9]*\)_\([A-Z0-9_]\+\).*/#define KERNEL_HWCAP_\2\t__khwcap\1_feature(\2)/'
+
+echo ""
+echo "#endif /* __ASM_KERNEL_HWCAPS_H */"
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index 9d1c21108057..9d20ec6816d4 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -1496,6 +1496,7 @@ UnsignedEnum	27:24	B16B16
 	0b0000	NI
 	0b0001	IMP
 	0b0010	BFSCALE
+	0b0011	B16MM
 EndEnum
 UnsignedEnum	23:20	BF16
 	0b0000	NI
@@ -1522,6 +1523,7 @@ UnsignedEnum	3:0	SVEver
 	0b0001	SVE2
 	0b0010	SVE2p1
 	0b0011	SVE2p2
+	0b0100	SVE2p3
 EndEnum
 EndSysreg
 
@@ -1530,7 +1532,11 @@ UnsignedEnum	63	FA64
 	0b0	NI
 	0b1	IMP
 EndEnum
-Res0	62:61
+Res0	62
+UnsignedEnum	61	LUT6
+	0b0	NI
+	0b1	IMP
+EndEnum
 UnsignedEnum	60	LUTv2
 	0b0	NI
 	0b1	IMP
@@ -1540,6 +1546,7 @@ UnsignedEnum	59:56	SMEver
 	0b0001	SME2
 	0b0010	SME2p1
 	0b0011	SME2p2
+	0b0100	SME2p3
 EndEnum
 UnsignedEnum	55:52	I16I64
 	0b0000	NI
@@ -1654,7 +1661,13 @@ UnsignedEnum	26	F8MM4
 	0b0	NI
 	0b1	IMP
 EndEnum
-Res0	25:2
+Res0	25:16
+UnsignedEnum	15	F16MM2
+	0b0	NI
+	0b1	IMP
+EndEnum
+Res0	14:8
+Raz	7:2
 UnsignedEnum	1	F8E4M3
 	0b0	NI
 	0b1	IMP
@@ -1835,6 +1848,8 @@ EndEnum
 UnsignedEnum	51:48	FHM
 	0b0000	NI
 	0b0001	IMP
+	0b0010	F16F32DOT
+	0b0011	F16F32MM
 EndEnum
 UnsignedEnum	47:44	DP
 	0b0000	NI
@@ -1976,6 +1991,7 @@ EndEnum
 UnsignedEnum	59:56	LUT
 	0b0000	NI
 	0b0001	IMP
+	0b0010	LUT6
 EndEnum
 UnsignedEnum	55:52	CSSC
 	0b0000	NI
@@ -3655,11 +3671,15 @@ Field	3:0	BS
 EndSysreg
 
 Sysreg	SMIDR_EL1	3	1	0	0	6
-Res0	63:32
+Res0	63:60
+Field	59:56	NSMC
+Field	55:52	HIP
+Field	51:32	AFFINITY2
 Field	31:24	IMPLEMENTER
 Field	23:16	REVISION
 Field	15	SMPS
-Res0	14:12
+Field	14:13	SH
+Res0	12
 Field	11:0	AFFINITY
 EndSysreg
 
@@ -5172,6 +5192,14 @@ Field	31:16	PARTID_D
 Field	15:0	PARTID_I
 EndSysreg
 
+Sysreg	MPAMSM_EL1	3	0	10	5	3
+Res0	63:48
+Field	47:40	PMG_D
+Res0	39:32
+Field	31:16	PARTID_D
+Res0	15:0
+EndSysreg
+
 Sysreg	ISR_EL1	3	0	12	1	0
 Res0	63:11
 Field	10	IS
diff --git a/drivers/acpi/arm64/agdi.c b/drivers/acpi/arm64/agdi.c
index feb4b2cb4618..0c2d9d6c160b 100644
--- a/drivers/acpi/arm64/agdi.c
+++ b/drivers/acpi/arm64/agdi.c
@@ -36,7 +36,7 @@ static int agdi_sdei_probe(struct platform_device *pdev,
 
 	err = sdei_event_register(adata->sdei_event, agdi_sdei_handler, pdev);
 	if (err) {
-		dev_err(&pdev->dev, "Failed to register for SDEI event %d",
+		dev_err(&pdev->dev, "Failed to register for SDEI event %d\n",
 			adata->sdei_event);
 		return err;
 	}
diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 638321fc9800..ab90932fc2d0 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -311,4 +311,18 @@ config MARVELL_PEM_PMU
 	  Enable support for PCIe Interface performance monitoring
 	  on Marvell platform.
 
+config NVIDIA_TEGRA410_CMEM_LATENCY_PMU
+	tristate "NVIDIA Tegra410 CPU Memory Latency PMU"
+	depends on ARM64 && ACPI
+	help
+	  Enable perf support for CPU memory latency counters monitoring on
+	  NVIDIA Tegra410 SoC.
+
+config NVIDIA_TEGRA410_C2C_PMU
+	tristate "NVIDIA Tegra410 C2C PMU"
+	depends on ARM64 && ACPI
+	help
+	  Enable perf support for counters in NVIDIA C2C interface of NVIDIA
+	  Tegra410 SoC.
+
 endmenu
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index ea52711a87e3..eb8a022dad9a 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -35,3 +35,5 @@ obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o
 obj-$(CONFIG_ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += arm_cspmu/
 obj-$(CONFIG_MESON_DDR_PMU) += amlogic/
 obj-$(CONFIG_CXL_PMU) += cxl_pmu.o
+obj-$(CONFIG_NVIDIA_TEGRA410_CMEM_LATENCY_PMU) += nvidia_t410_cmem_latency_pmu.o
+obj-$(CONFIG_NVIDIA_TEGRA410_C2C_PMU) += nvidia_t410_c2c_pmu.o
diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index 40c05c519a1d..f5305c8fdca4 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -2132,6 +2132,8 @@ static void arm_cmn_init_dtm(struct arm_cmn_dtm *dtm, struct arm_cmn_node *xp, i
 static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int idx)
 {
 	struct arm_cmn_dtc *dtc = cmn->dtc + idx;
+	const struct resource *cfg;
+	resource_size_t base, size;
 
 	dtc->pmu_base = dn->pmu_base;
 	dtc->base = dtc->pmu_base - arm_cmn_pmu_offset(cmn, dn);
@@ -2139,6 +2141,13 @@ static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int id
 	if (dtc->irq < 0)
 		return dtc->irq;
 
+	cfg = platform_get_resource(to_platform_device(cmn->dev), IORESOURCE_MEM, 0);
+	base = dtc->base - cmn->base + cfg->start;
+	size = cmn->part == PART_CMN600 ? SZ_16K : SZ_64K;
+	if (!devm_request_mem_region(cmn->dev, base, size, dev_name(cmn->dev)))
+		return dev_err_probe(cmn->dev, -EBUSY,
+				     "Failed to request DTC region %pa\n", &base);
+
 	writel_relaxed(CMN_DT_DTC_CTL_DT_EN, dtc->base + CMN_DT_DTC_CTL);
 	writel_relaxed(CMN_DT_PMCR_PMU_EN | CMN_DT_PMCR_OVFL_INTR_EN, CMN_DT_PMCR(dtc));
 	writeq_relaxed(0, CMN_DT_PMCCNTR(dtc));
@@ -2525,43 +2534,26 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 	return 0;
 }
 
-static int arm_cmn600_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn)
-{
-	struct resource *cfg, *root;
-
-	cfg = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!cfg)
-		return -EINVAL;
-
-	root = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-	if (!root)
-		return -EINVAL;
-
-	if (!resource_contains(cfg, root))
-		swap(cfg, root);
-	/*
-	 * Note that devm_ioremap_resource() is dumb and won't let the platform
-	 * device claim cfg when the ACPI companion device has already claimed
-	 * root within it. But since they *are* already both claimed in the
-	 * appropriate name, we don't really need to do it again here anyway.
-	 */
-	cmn->base = devm_ioremap(cmn->dev, cfg->start, resource_size(cfg));
-	if (!cmn->base)
-		return -ENOMEM;
-
-	return root->start - cfg->start;
-}
-
-static int arm_cmn600_of_probe(struct device_node *np)
+static int arm_cmn_get_root(struct arm_cmn *cmn, const struct resource *cfg)
 {
+	const struct device_node *np = cmn->dev->of_node;
+	const struct resource *root;
 	u32 rootnode;
 
-	return of_property_read_u32(np, "arm,root-node", &rootnode) ?: rootnode;
+	if (cmn->part != PART_CMN600)
+		return 0;
+
+	if (np)
+		return of_property_read_u32(np, "arm,root-node", &rootnode) ?: rootnode;
+
+	root = platform_get_resource(to_platform_device(cmn->dev), IORESOURCE_MEM, 1);
+	return root ? root->start - cfg->start : -EINVAL;
 }
 
 static int arm_cmn_probe(struct platform_device *pdev)
 {
 	struct arm_cmn *cmn;
+	const struct resource *cfg;
 	const char *name;
 	static atomic_t id;
 	int err, rootnode, this_id;
@@ -2575,16 +2567,16 @@ static int arm_cmn_probe(struct platform_device *pdev)
 	cmn->cpu = cpumask_local_spread(0, dev_to_node(cmn->dev));
 	platform_set_drvdata(pdev, cmn);
 
-	if (cmn->part == PART_CMN600 && has_acpi_companion(cmn->dev)) {
-		rootnode = arm_cmn600_acpi_probe(pdev, cmn);
-	} else {
-		rootnode = 0;
-		cmn->base = devm_platform_ioremap_resource(pdev, 0);
-		if (IS_ERR(cmn->base))
-			return PTR_ERR(cmn->base);
-		if (cmn->part == PART_CMN600)
-			rootnode = arm_cmn600_of_probe(pdev->dev.of_node);
-	}
+	cfg = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!cfg)
+		return -EINVAL;
+
+	/* Map the whole region now, claim the DTCs once we've found them */
+	cmn->base = devm_ioremap(cmn->dev, cfg->start, resource_size(cfg));
+	if (!cmn->base)
+		return -ENOMEM;
+
+	rootnode = arm_cmn_get_root(cmn, cfg);
 	if (rootnode < 0)
 		return rootnode;
 
diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c
index ed72c3d1f796..80fb314d5135 100644
--- a/drivers/perf/arm_cspmu/arm_cspmu.c
+++ b/drivers/perf/arm_cspmu/arm_cspmu.c
@@ -16,7 +16,7 @@
  * The user should refer to the vendor technical documentation to get details
  * about the supported events.
  *
- * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  */
 
@@ -1134,6 +1134,23 @@ static int arm_cspmu_acpi_get_cpus(struct arm_cspmu *cspmu)
 
 	return 0;
 }
+
+struct acpi_device *arm_cspmu_acpi_dev_get(const struct arm_cspmu *cspmu)
+{
+	const struct acpi_apmt_node *node = arm_cspmu_apmt_node(cspmu->dev);
+	char hid[16] = {};
+	char uid[16] = {};
+
+	/* Only APMT nodes of ACPI device type are matched by HID/UID. */
+	if (!node || node->type != ACPI_APMT_NODE_TYPE_ACPI)
+		return NULL;
+
+	/* HID: raw inst_primary bytes; UID: inst_secondary in decimal. */
+	memcpy(hid, &node->inst_primary, sizeof(node->inst_primary));
+	snprintf(uid, sizeof(uid), "%u", node->inst_secondary);
+	return acpi_dev_get_first_match_dev(hid, uid, -1);
+}
+EXPORT_SYMBOL_GPL(arm_cspmu_acpi_dev_get);
 #else
 static int arm_cspmu_acpi_get_cpus(struct arm_cspmu *cspmu)
 {
diff --git a/drivers/perf/arm_cspmu/arm_cspmu.h b/drivers/perf/arm_cspmu/arm_cspmu.h
index cd65a58dbd88..3fc5c8d77266 100644
--- a/drivers/perf/arm_cspmu/arm_cspmu.h
+++ b/drivers/perf/arm_cspmu/arm_cspmu.h
@@ -1,13 +1,14 @@
 /* SPDX-License-Identifier: GPL-2.0
  *
  * ARM CoreSight Architecture PMU driver.
- * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  */
 
 #ifndef __ARM_CSPMU_H__
 #define __ARM_CSPMU_H__
 
+#include <linux/acpi.h>
 #include <linux/bitfield.h>
 #include <linux/cpumask.h>
 #include <linux/device.h>
@@ -255,4 +256,18 @@ int arm_cspmu_impl_register(const struct arm_cspmu_impl_match *impl_match);
 /* Unregister vendor backend. */
 void arm_cspmu_impl_unregister(const struct arm_cspmu_impl_match *impl_match);
 
+#if defined(CONFIG_ACPI) && defined(CONFIG_ARM64)
+/*
+ * Get the ACPI device associated with the PMU.
+ * The caller is responsible for calling acpi_dev_put() on the returned device.
+ */
+struct acpi_device *arm_cspmu_acpi_dev_get(const struct arm_cspmu *cspmu);
+#else
+static inline struct acpi_device *
+arm_cspmu_acpi_dev_get(const struct arm_cspmu *cspmu)
+{
+	return NULL;
+}
+#endif
+
 #endif /* __ARM_CSPMU_H__ */
diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c
index e06a06d3407b..bac83e424d6d 100644
--- a/drivers/perf/arm_cspmu/nvidia_cspmu.c
+++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  */
 
@@ -8,6 +8,7 @@
 
 #include <linux/io.h>
 #include <linux/module.h>
+#include <linux/property.h>
 #include <linux/topology.h>
 
 #include "arm_cspmu.h"
@@ -21,6 +22,44 @@
 #define NV_CNVL_PORT_COUNT           4ULL
 #define NV_CNVL_FILTER_ID_MASK       GENMASK_ULL(NV_CNVL_PORT_COUNT - 1, 0)
 
+#define NV_UCF_SRC_COUNT             3ULL
+#define NV_UCF_DST_COUNT             4ULL
+#define NV_UCF_FILTER_ID_MASK        GENMASK_ULL(11, 0)
+#define NV_UCF_FILTER_SRC            GENMASK_ULL(2, 0)
+#define NV_UCF_FILTER_DST            GENMASK_ULL(11, 8)
+#define NV_UCF_FILTER_DEFAULT        (NV_UCF_FILTER_SRC | NV_UCF_FILTER_DST)
+
+#define NV_PCIE_V2_PORT_COUNT        8ULL
+#define NV_PCIE_V2_FILTER_ID_MASK    GENMASK_ULL(24, 0)
+#define NV_PCIE_V2_FILTER_PORT       GENMASK_ULL(NV_PCIE_V2_PORT_COUNT - 1, 0)
+#define NV_PCIE_V2_FILTER_BDF_VAL    GENMASK_ULL(23, NV_PCIE_V2_PORT_COUNT)
+#define NV_PCIE_V2_FILTER_BDF_EN     BIT(24)
+#define NV_PCIE_V2_FILTER_BDF_VAL_EN GENMASK_ULL(24, NV_PCIE_V2_PORT_COUNT)
+#define NV_PCIE_V2_FILTER_DEFAULT    NV_PCIE_V2_FILTER_PORT
+
+#define NV_PCIE_V2_DST_COUNT         5ULL
+#define NV_PCIE_V2_FILTER2_ID_MASK   GENMASK_ULL(4, 0)
+#define NV_PCIE_V2_FILTER2_DST       GENMASK_ULL(NV_PCIE_V2_DST_COUNT - 1, 0)
+#define NV_PCIE_V2_FILTER2_DEFAULT   NV_PCIE_V2_FILTER2_DST
+
+#define NV_PCIE_TGT_PORT_COUNT       8ULL
+#define NV_PCIE_TGT_EV_TYPE_CC       0x4
+#define NV_PCIE_TGT_EV_TYPE_COUNT    3ULL
+#define NV_PCIE_TGT_EV_TYPE_MASK     GENMASK_ULL(NV_PCIE_TGT_EV_TYPE_COUNT - 1, 0)
+#define NV_PCIE_TGT_FILTER2_MASK     GENMASK_ULL(NV_PCIE_TGT_PORT_COUNT, 0)
+#define NV_PCIE_TGT_FILTER2_PORT     GENMASK_ULL(NV_PCIE_TGT_PORT_COUNT - 1, 0)
+#define NV_PCIE_TGT_FILTER2_ADDR_EN  BIT(NV_PCIE_TGT_PORT_COUNT)
+#define NV_PCIE_TGT_FILTER2_ADDR     GENMASK_ULL(15, NV_PCIE_TGT_PORT_COUNT)
+#define NV_PCIE_TGT_FILTER2_DEFAULT  NV_PCIE_TGT_FILTER2_PORT
+
+#define NV_PCIE_TGT_ADDR_COUNT       8ULL
+#define NV_PCIE_TGT_ADDR_STRIDE      20
+#define NV_PCIE_TGT_ADDR_CTRL        0xD38
+#define NV_PCIE_TGT_ADDR_BASE_LO     0xD3C
+#define NV_PCIE_TGT_ADDR_BASE_HI     0xD40
+#define NV_PCIE_TGT_ADDR_MASK_LO     0xD44
+#define NV_PCIE_TGT_ADDR_MASK_HI     0xD48
+
 #define NV_GENERIC_FILTER_ID_MASK    GENMASK_ULL(31, 0)
 
 #define NV_PRODID_MASK	(PMIIDR_PRODUCTID | PMIIDR_VARIANT | PMIIDR_REVISION)
@@ -124,6 +163,55 @@ static struct attribute *mcf_pmu_event_attrs[] = {
 	NULL,
 };
 
+static struct attribute *ucf_pmu_event_attrs[] = {
+	ARM_CSPMU_EVENT_ATTR(bus_cycles,            0x1D),
+
+	ARM_CSPMU_EVENT_ATTR(slc_allocate,          0xF0),
+	ARM_CSPMU_EVENT_ATTR(slc_wb,                0xF3),
+	ARM_CSPMU_EVENT_ATTR(slc_refill_rd,         0x109),
+	ARM_CSPMU_EVENT_ATTR(slc_refill_wr,         0x10A),
+	ARM_CSPMU_EVENT_ATTR(slc_hit_rd,            0x119),
+
+	ARM_CSPMU_EVENT_ATTR(slc_access_dataless,   0x183),
+	ARM_CSPMU_EVENT_ATTR(slc_access_atomic,     0x184),
+
+	ARM_CSPMU_EVENT_ATTR(slc_access_rd,         0x111),
+	ARM_CSPMU_EVENT_ATTR(slc_access_wr,         0x112),
+	ARM_CSPMU_EVENT_ATTR(slc_bytes_rd,          0x113),
+	ARM_CSPMU_EVENT_ATTR(slc_bytes_wr,          0x114),
+
+	ARM_CSPMU_EVENT_ATTR(mem_access_rd,         0x121),
+	ARM_CSPMU_EVENT_ATTR(mem_access_wr,         0x122),
+	ARM_CSPMU_EVENT_ATTR(mem_bytes_rd,          0x123),
+	ARM_CSPMU_EVENT_ATTR(mem_bytes_wr,          0x124),
+
+	ARM_CSPMU_EVENT_ATTR(local_snoop,           0x180),
+	ARM_CSPMU_EVENT_ATTR(ext_snp_access,        0x181),
+	ARM_CSPMU_EVENT_ATTR(ext_snp_evict,         0x182),
+
+	ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT),
+	NULL
+};
+
+static struct attribute *pcie_v2_pmu_event_attrs[] = {
+	ARM_CSPMU_EVENT_ATTR(rd_bytes,		0x0),
+	ARM_CSPMU_EVENT_ATTR(wr_bytes,		0x1),
+	ARM_CSPMU_EVENT_ATTR(rd_req,		0x2),
+	ARM_CSPMU_EVENT_ATTR(wr_req,		0x3),
+	ARM_CSPMU_EVENT_ATTR(rd_cum_outs,	0x4),
+	ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT),
+	NULL
+};
+
+static struct attribute *pcie_tgt_pmu_event_attrs[] = {
+	ARM_CSPMU_EVENT_ATTR(rd_bytes,		0x0),
+	ARM_CSPMU_EVENT_ATTR(wr_bytes,		0x1),
+	ARM_CSPMU_EVENT_ATTR(rd_req,		0x2),
+	ARM_CSPMU_EVENT_ATTR(wr_req,		0x3),
+	ARM_CSPMU_EVENT_ATTR(cycles, NV_PCIE_TGT_EV_TYPE_CC),
+	NULL
+};
+
 static struct attribute *generic_pmu_event_attrs[] = {
 	ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT),
 	NULL,
@@ -152,6 +240,40 @@ static struct attribute *cnvlink_pmu_format_attrs[] = {
 	NULL,
 };
 
+static struct attribute *ucf_pmu_format_attrs[] = {
+	ARM_CSPMU_FORMAT_EVENT_ATTR,
+	ARM_CSPMU_FORMAT_ATTR(src_loc_noncpu, "config1:0"),
+	ARM_CSPMU_FORMAT_ATTR(src_loc_cpu, "config1:1"),
+	ARM_CSPMU_FORMAT_ATTR(src_rem, "config1:2"),
+	ARM_CSPMU_FORMAT_ATTR(dst_loc_cmem, "config1:8"),
+	ARM_CSPMU_FORMAT_ATTR(dst_loc_gmem, "config1:9"),
+	ARM_CSPMU_FORMAT_ATTR(dst_loc_other, "config1:10"),
+	ARM_CSPMU_FORMAT_ATTR(dst_rem, "config1:11"),
+	NULL
+};
+
+static struct attribute *pcie_v2_pmu_format_attrs[] = {
+	ARM_CSPMU_FORMAT_EVENT_ATTR,
+	ARM_CSPMU_FORMAT_ATTR(src_rp_mask, "config1:0-7"),
+	ARM_CSPMU_FORMAT_ATTR(src_bdf, "config1:8-23"),
+	ARM_CSPMU_FORMAT_ATTR(src_bdf_en, "config1:24"),
+	ARM_CSPMU_FORMAT_ATTR(dst_loc_cmem, "config2:0"),
+	ARM_CSPMU_FORMAT_ATTR(dst_loc_gmem, "config2:1"),
+	ARM_CSPMU_FORMAT_ATTR(dst_loc_pcie_p2p, "config2:2"),
+	ARM_CSPMU_FORMAT_ATTR(dst_loc_pcie_cxl, "config2:3"),
+	ARM_CSPMU_FORMAT_ATTR(dst_rem, "config2:4"),
+	NULL
+};
+
+static struct attribute *pcie_tgt_pmu_format_attrs[] = {
+	ARM_CSPMU_FORMAT_ATTR(event, "config:0-2"),
+	ARM_CSPMU_FORMAT_ATTR(dst_rp_mask, "config:3-10"),
+	ARM_CSPMU_FORMAT_ATTR(dst_addr_en, "config:11"),
+	ARM_CSPMU_FORMAT_ATTR(dst_addr_base, "config1:0-63"),
+	ARM_CSPMU_FORMAT_ATTR(dst_addr_mask, "config2:0-63"),
+	NULL
+};
+
 static struct attribute *generic_pmu_format_attrs[] = {
 	ARM_CSPMU_FORMAT_EVENT_ATTR,
 	ARM_CSPMU_FORMAT_FILTER_ATTR,
@@ -183,6 +305,32 @@ nv_cspmu_get_name(const struct arm_cspmu *cspmu)
 	return ctx->name;
 }
 
+#if defined(CONFIG_ACPI) && defined(CONFIG_ARM64)
+/* Read the "instance_id" property of the PMU's ACPI device into @id. */
+static int nv_cspmu_get_inst_id(const struct arm_cspmu *cspmu, u32 *id)
+{
+	struct acpi_device *adev = arm_cspmu_acpi_dev_get(cspmu);
+	int err;
+
+	if (!adev)
+		return -ENODEV;
+
+	err = fwnode_property_read_u32(acpi_fwnode_handle(adev),
+				       "instance_id", id);
+	if (err)
+		dev_err(cspmu->dev, "Failed to get instance ID\n");
+
+	/* Drop the reference taken by arm_cspmu_acpi_dev_get(). */
+	acpi_dev_put(adev);
+	return err;
+}
+#else
+static int nv_cspmu_get_inst_id(const struct arm_cspmu *cspmu, u32 *id)
+{
+	return -EINVAL;
+}
+#endif
+
 static u32 nv_cspmu_event_filter(const struct perf_event *event)
 {
 	const struct nv_cspmu_ctx *ctx =
@@ -228,6 +376,20 @@ static void nv_cspmu_set_ev_filter(struct arm_cspmu *cspmu,
 	}
 }
 
+/* Clear the event's PMEVFILTR/PMEVFILT2R slot for each filter hook in use. */
+static void nv_cspmu_reset_ev_filter(struct arm_cspmu *cspmu,
+				     const struct perf_event *event)
+{
+	const struct nv_cspmu_ctx *ctx =
+		to_nv_cspmu_ctx(to_arm_cspmu(event->pmu));
+	const u32 offset = 4 * event->hw.idx;
+
+	if (ctx->get_filter)
+		writel(0, cspmu->base0 + offset + PMEVFILTR);
+	if (ctx->get_filter2)
+		writel(0, cspmu->base0 + offset + PMEVFILT2R);
+}
+
 static void nv_cspmu_set_cc_filter(struct arm_cspmu *cspmu,
 				   const struct perf_event *event)
 {
@@ -236,10 +398,386 @@ static void nv_cspmu_set_cc_filter(struct arm_cspmu *cspmu,
 	writel(filter, cspmu->base0 + PMCCFILTR);
 }
 
+/*
+ * Build the UCF filter value: an empty source or destination selection is
+ * widened to "all sources" / "all destinations" respectively.
+ */
+static u32 ucf_pmu_event_filter(const struct perf_event *event)
+{
+	const u32 requested = nv_cspmu_event_filter(event);
+	u32 src = FIELD_GET(NV_UCF_FILTER_SRC, requested);
+	u32 dst = FIELD_GET(NV_UCF_FILTER_DST, requested);
+
+	/* Monitor all sources if none is selected. */
+	if (!src)
+		src = GENMASK_ULL(NV_UCF_SRC_COUNT - 1, 0);
+
+	/* Monitor all destinations if none is selected. */
+	if (!dst)
+		dst = GENMASK_ULL(NV_UCF_DST_COUNT - 1, 0);
+
+	return FIELD_PREP(NV_UCF_FILTER_SRC, src) |
+	       FIELD_PREP(NV_UCF_FILTER_DST, dst);
+}
+
+/*
+ * Extract the BDF value together with its enable bit from @filter.
+ * Returns 0 when BDF filtering is disabled, so the BDF value is ignored.
+ */
+static u32 pcie_v2_pmu_bdf_val_en(u32 filter)
+{
+	if (!(filter & NV_PCIE_V2_FILTER_BDF_EN))
+		return 0;
+
+	return FIELD_GET(NV_PCIE_V2_FILTER_BDF_VAL_EN, filter);
+}
+
+/*
+ * config1 filter: explicit value, else a BDF-enabled leader's, else default.
+ */
+static u32 pcie_v2_pmu_event_filter(const struct perf_event *event)
+{
+	const struct perf_event *leader = event->group_leader;
+	const struct nv_cspmu_ctx *ctx =
+		to_nv_cspmu_ctx(to_arm_cspmu(event->pmu));
+	const u32 own_filter = event->attr.config1 & ctx->filter_mask;
+	u32 inherited;
+
+	if (own_filter)
+		return own_filter;
+
+	if (event == leader)
+		return ctx->filter_default_val;
+
+	/* Siblings follow the leader only when it enables BDF filtering. */
+	inherited = pcie_v2_pmu_event_filter(leader);
+	if (pcie_v2_pmu_bdf_val_en(inherited))
+		return inherited;
+
+	return ctx->filter_default_val;
+}
+
+static int pcie_v2_pmu_validate_event(struct arm_cspmu *cspmu,
+				   struct perf_event *new_ev)
+{
+	/*
+	 * Make sure the events are using the same BDF filter since the PCIE-SRC
+	 * PMU only supports one common BDF filter setting for all of the
+	 * counters. Cycle counter events skip these checks.
+	 */
+	int idx;
+	u32 new_filter, new_rp, new_bdf, new_lead_filter, new_lead_bdf;
+	struct perf_event *new_leader;
+
+	if (cspmu->impl.ops.is_cycle_counter_event(new_ev))
+		return 0;
+
+	new_leader = new_ev->group_leader;
+
+	new_filter = pcie_v2_pmu_event_filter(new_ev);
+	new_lead_filter = pcie_v2_pmu_event_filter(new_leader);
+
+	new_bdf = pcie_v2_pmu_bdf_val_en(new_filter);
+	new_lead_bdf = pcie_v2_pmu_bdf_val_en(new_lead_filter);
+
+	new_rp = FIELD_GET(NV_PCIE_V2_FILTER_PORT, new_filter);
+
+	if (new_rp != 0 && new_bdf != 0) {
+		dev_err(cspmu->dev,
+			"RP and BDF filtering are mutually exclusive\n");
+		return -EINVAL;
+	}
+
+	if (new_bdf != new_lead_bdf) {
+		dev_err(cspmu->dev,
+			"sibling and leader BDF value should be equal\n");
+		return -EINVAL;
+	}
+
+	/* Compare against the group leader of the first counter in use. */
+	idx = find_first_bit(cspmu->hw_events.used_ctrs,
+			     cspmu->cycle_counter_logical_idx);
+
+	if (idx != cspmu->cycle_counter_logical_idx) {
+		struct perf_event *leader = cspmu->hw_events.events[idx]->group_leader;
+
+		const u32 lead_filter = pcie_v2_pmu_event_filter(leader);
+		const u32 lead_bdf = pcie_v2_pmu_bdf_val_en(lead_filter);
+
+		if (new_lead_bdf != lead_bdf) {
+			dev_err(cspmu->dev, "only one BDF value is supported\n");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+struct pcie_tgt_addr_filter {
+	u32 refcount;	/* events currently using this slot; 0 means free */
+	u64 base;	/* value written to ADDR_BASE_LO/HI for the slot */
+	u64 mask;	/* value written to ADDR_MASK_LO/HI for the slot */
+};
+
+struct pcie_tgt_data {
+	struct pcie_tgt_addr_filter addr_filter[NV_PCIE_TGT_ADDR_COUNT];
+	void __iomem *addr_filter_reg;	/* mapped base of the filter registers */
+};
+
+#if defined(CONFIG_ACPI) && defined(CONFIG_ARM64)
+/* Map the PCIE-TGT address filter registers from the PMU's ACPI device. */
+static int pcie_tgt_init_data(struct arm_cspmu *cspmu)
+{
+	int ret;
+	struct acpi_device *adev;
+	struct pcie_tgt_data *data;
+	struct resource_entry *rentry;
+	LIST_HEAD(resource_list);
+	struct nv_cspmu_ctx *ctx = to_nv_cspmu_ctx(cspmu);
+	struct device *dev = cspmu->dev;
+
+	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	adev = arm_cspmu_acpi_dev_get(cspmu);
+	if (!adev) {
+		dev_err(dev, "failed to get associated PCIE-TGT device\n");
+		return -ENODEV;
+	}
+
+	ret = acpi_dev_get_memory_resources(adev, &resource_list);
+	if (ret < 0) {
+		dev_err(dev, "failed to get PCIE-TGT device memory resources\n");
+		acpi_dev_put(adev);
+		return ret;
+	}
+
+	/* An empty list must fail: the filter registers would be unmapped. */
+	rentry = list_first_entry_or_null(
+		&resource_list, struct resource_entry, node);
+	if (rentry)
+		data->addr_filter_reg = devm_ioremap_resource(dev, rentry->res);
+	else
+		data->addr_filter_reg = IOMEM_ERR_PTR(-ENODEV);
+
+	ret = PTR_ERR_OR_ZERO(data->addr_filter_reg);
+	if (ret)
+		dev_err(dev, "failed to get address filter resource\n");
+
+	acpi_dev_free_resource_list(&resource_list);
+	acpi_dev_put(adev);
+
+	ctx->data = data;
+
+	return ret;
+}
+#else
+static int pcie_tgt_init_data(struct arm_cspmu *cspmu)
+{
+	return -ENODEV;
+}
+#endif
+
+/* Fetch the PCIE-TGT private data hanging off the NVIDIA PMU context. */
+static struct pcie_tgt_data *pcie_tgt_get_data(struct arm_cspmu *cspmu)
+{
+	/* Only meaningful once pcie_tgt_init_data() has stored ctx->data. */
+	return to_nv_cspmu_ctx(cspmu)->data;
+}
+
+/* Prefer the in-use slot holding (base, mask); else first free if !is_reset. */
+static int pcie_tgt_find_addr_idx(struct arm_cspmu *cspmu, u64 base, u64 mask,
+	bool is_reset)
+{
+	int i, free_idx = -ENODEV;
+	struct pcie_tgt_data *data = pcie_tgt_get_data(cspmu);
+
+	for (i = 0; i < NV_PCIE_TGT_ADDR_COUNT; i++) {
+		const struct pcie_tgt_addr_filter *f = &data->addr_filter[i];
+
+		if (f->refcount && f->base == base && f->mask == mask)
+			return i;
+		if (!is_reset && !f->refcount && free_idx < 0)
+			free_idx = i;
+	}
+
+	return free_idx;	/* -ENODEV: no match and no usable free slot */
+}
+
+/*
+ * Strip the event type from config, keeping the port mask and address enable.
+ */
+static u32 pcie_tgt_pmu_event_filter(const struct perf_event *event)
+{
+	const u64 above_type = event->attr.config >> NV_PCIE_TGT_EV_TYPE_COUNT;
+
+	return above_type & NV_PCIE_TGT_FILTER2_MASK;
+}
+
+/* True when the event asks for destination address filtering. */
+static bool pcie_tgt_pmu_addr_en(const struct perf_event *event)
+{
+	/* Bit NV_PCIE_TGT_PORT_COUNT of the filter is the address enable. */
+	return pcie_tgt_pmu_event_filter(event) & NV_PCIE_TGT_FILTER2_ADDR_EN;
+}
+
+/* Return the destination port mask (low filter bits) of the event. */
+static u32 pcie_tgt_pmu_port_filter(const struct perf_event *event)
+{
+	/* The port mask sits at bit 0, so masking alone extracts it. */
+	return pcie_tgt_pmu_event_filter(event) & NV_PCIE_TGT_FILTER2_PORT;
+}
+
+static u64 pcie_tgt_pmu_dst_addr_base(const struct perf_event *event)
+{
+	return event->attr.config1;	/* dst_addr_base spans all of config1 */
+}
+
+static u64 pcie_tgt_pmu_dst_addr_mask(const struct perf_event *event)
+{
+	return event->attr.config2;	/* dst_addr_mask spans all of config2 */
+}
+
+/*
+ * Accept the event unless it wants address filtering and no address filter
+ * slot can be found for its base/mask pair.
+ */
+static int pcie_tgt_pmu_validate_event(struct arm_cspmu *cspmu,
+				   struct perf_event *new_ev)
+{
+	const u64 base = pcie_tgt_pmu_dst_addr_base(new_ev);
+	const u64 mask = pcie_tgt_pmu_dst_addr_mask(new_ev);
+
+	/* Events without address filtering need no slot. */
+	if (!pcie_tgt_pmu_addr_en(new_ev))
+		return 0;
+
+	if (pcie_tgt_find_addr_idx(cspmu, base, mask, false) < 0)
+		return -EINVAL;
+	return 0;
+}
+
+/* Get/put a reference on slot @idx; program on first use, clear on last. */
+static void pcie_tgt_pmu_config_addr_filter(struct arm_cspmu *cspmu,
+	bool en, u64 base, u64 mask, int idx)
+{
+	struct pcie_tgt_data *data = pcie_tgt_get_data(cspmu);
+	struct pcie_tgt_addr_filter *slot = &data->addr_filter[idx];
+	void __iomem *regs =
+		data->addr_filter_reg + (idx * NV_PCIE_TGT_ADDR_STRIDE);
+
+	if (en) {
+		/* Later users share the already programmed slot. */
+		if (++slot->refcount != 1)
+			return;
+
+		slot->base = base;
+		slot->mask = mask;
+
+		writel(lower_32_bits(base), regs + NV_PCIE_TGT_ADDR_BASE_LO);
+		writel(upper_32_bits(base), regs + NV_PCIE_TGT_ADDR_BASE_HI);
+		writel(lower_32_bits(mask), regs + NV_PCIE_TGT_ADDR_MASK_LO);
+		writel(upper_32_bits(mask), regs + NV_PCIE_TGT_ADDR_MASK_HI);
+		writel(1, regs + NV_PCIE_TGT_ADDR_CTRL);
+		return;
+	}
+
+	/* Disable and wipe the slot once its last user is gone. */
+	if (--slot->refcount != 0)
+		return;
+
+	writel(0, regs + NV_PCIE_TGT_ADDR_CTRL);
+	writel(0, regs + NV_PCIE_TGT_ADDR_BASE_LO);
+	writel(0, regs + NV_PCIE_TGT_ADDR_BASE_HI);
+	writel(0, regs + NV_PCIE_TGT_ADDR_MASK_LO);
+	writel(0, regs + NV_PCIE_TGT_ADDR_MASK_HI);
+	slot->base = 0;
+	slot->mask = 0;
+}
+
+/*
+ * Program PMEVFILT2R for @event: the destination port mask plus, when address
+ * filtering is requested, the one-hot select of its address filter slot.
+ * If no slot can be found the register is zeroed and an error is logged.
+ */
+static void pcie_tgt_pmu_set_ev_filter(struct arm_cspmu *cspmu,
+				const struct perf_event *event)
+{
+	const bool use_addr = pcie_tgt_pmu_addr_en(event);
+	const u32 reg_off = PMEVFILT2R + (4 * event->hw.idx);
+	u32 ports = pcie_tgt_pmu_port_filter(event);
+	u32 val = 0;
+	u64 base, mask;
+	int slot;
+
+	if (use_addr) {
+		base = pcie_tgt_pmu_dst_addr_base(event);
+		mask = pcie_tgt_pmu_dst_addr_mask(event);
+		slot = pcie_tgt_find_addr_idx(cspmu, base, mask, false);
+
+		if (slot < 0) {
+			dev_err(cspmu->dev,
+				"Unable to find a slot for address filtering\n");
+			writel(0, cspmu->base0 + reg_off);
+			return;
+		}
+
+		/* Configure address range filter registers. */
+		pcie_tgt_pmu_config_addr_filter(cspmu, true, base, mask, slot);
+
+		/* Config the counter to use the selected address filter slot. */
+		val |= FIELD_PREP(NV_PCIE_TGT_FILTER2_ADDR, 1U << slot);
+	} else if (ports == 0) {
+		/* Monitor all ports if no filter is selected. */
+		ports = NV_PCIE_TGT_FILTER2_PORT;
+	}
+
+	val |= FIELD_PREP(NV_PCIE_TGT_FILTER2_PORT, ports);
+
+	writel(val, cspmu->base0 + reg_off);
+}
+
+/*
+ * Release the address filter slot referenced by @event, if it uses one.
+ * Events without address filtering need no work here.
+ */
+static void pcie_tgt_pmu_reset_ev_filter(struct arm_cspmu *cspmu,
+				     const struct perf_event *event)
+{
+	const u64 base = pcie_tgt_pmu_dst_addr_base(event);
+	const u64 mask = pcie_tgt_pmu_dst_addr_mask(event);
+	int slot;
+
+	if (!pcie_tgt_pmu_addr_en(event))
+		return;
+
+	slot = pcie_tgt_find_addr_idx(cspmu, base, mask, true);
+	if (slot >= 0) {
+		pcie_tgt_pmu_config_addr_filter(cspmu, false, base, mask, slot);
+		return;
+	}
+
+	dev_err(cspmu->dev,
+		"Unable to find the address filter slot to reset\n");
+}
+
+static u32 pcie_tgt_pmu_event_type(const struct perf_event *event)
+{
+	return event->attr.config & NV_PCIE_TGT_EV_TYPE_MASK;	/* config:0-2 */
+}
+
+/* Tell whether @event is the "cycles" event of the PCIE-TGT PMU. */
+static bool pcie_tgt_pmu_is_cycle_counter_event(const struct perf_event *event)
+{
+	/* This PMU encodes cycles as NV_PCIE_TGT_EV_TYPE_CC. */
+	return pcie_tgt_pmu_event_type(event) == NV_PCIE_TGT_EV_TYPE_CC;
+}
 
 enum nv_cspmu_name_fmt {
 	NAME_FMT_GENERIC,
-	NAME_FMT_SOCKET
+	NAME_FMT_SOCKET,
+	NAME_FMT_SOCKET_INST,
 };
 
 struct nv_cspmu_match {
@@ -342,6 +880,63 @@ static const struct nv_cspmu_match nv_cspmu_match[] = {
 		.init_data = NULL
 	  },
 	},
+	{
+	  .prodid = 0x2CF20000,
+	  .prodid_mask = NV_PRODID_MASK,
+	  .name_pattern = "nvidia_ucf_pmu_%u",
+	  .name_fmt = NAME_FMT_SOCKET,
+	  .template_ctx = {
+		.event_attr = ucf_pmu_event_attrs,
+		.format_attr = ucf_pmu_format_attrs,
+		.filter_mask = NV_UCF_FILTER_ID_MASK,
+		.filter_default_val = NV_UCF_FILTER_DEFAULT,
+		.filter2_mask = 0x0,
+		.filter2_default_val = 0x0,
+		.get_filter = ucf_pmu_event_filter,
+	  },
+	},
+	{
+	  .prodid = 0x10301000,
+	  .prodid_mask = NV_PRODID_MASK,
+	  .name_pattern = "nvidia_pcie_pmu_%u_rc_%u",
+	  .name_fmt = NAME_FMT_SOCKET_INST,
+	  .template_ctx = {
+		.event_attr = pcie_v2_pmu_event_attrs,
+		.format_attr = pcie_v2_pmu_format_attrs,
+		.filter_mask = NV_PCIE_V2_FILTER_ID_MASK,
+		.filter_default_val = NV_PCIE_V2_FILTER_DEFAULT,
+		.filter2_mask = NV_PCIE_V2_FILTER2_ID_MASK,
+		.filter2_default_val = NV_PCIE_V2_FILTER2_DEFAULT,
+		.get_filter = pcie_v2_pmu_event_filter,
+		.get_filter2 = nv_cspmu_event_filter2,
+	  },
+	  .ops = {
+		.validate_event = pcie_v2_pmu_validate_event,
+		.reset_ev_filter = nv_cspmu_reset_ev_filter,
+	  }
+	},
+	{
+	  .prodid = 0x10700000,
+	  .prodid_mask = NV_PRODID_MASK,
+	  .name_pattern = "nvidia_pcie_tgt_pmu_%u_rc_%u",
+	  .name_fmt = NAME_FMT_SOCKET_INST,
+	  .template_ctx = {
+		.event_attr = pcie_tgt_pmu_event_attrs,
+		.format_attr = pcie_tgt_pmu_format_attrs,
+		.filter_mask = 0x0,
+		.filter_default_val = 0x0,
+		.filter2_mask = NV_PCIE_TGT_FILTER2_MASK,
+		.filter2_default_val = NV_PCIE_TGT_FILTER2_DEFAULT,
+		.init_data = pcie_tgt_init_data
+	  },
+	  .ops = {
+		.is_cycle_counter_event = pcie_tgt_pmu_is_cycle_counter_event,
+		.event_type = pcie_tgt_pmu_event_type,
+		.validate_event = pcie_tgt_pmu_validate_event,
+		.set_ev_filter = pcie_tgt_pmu_set_ev_filter,
+		.reset_ev_filter = pcie_tgt_pmu_reset_ev_filter,
+	  }
+	},
 	{
 	  .prodid = 0,
 	  .prodid_mask = 0,
@@ -365,7 +960,7 @@ static const struct nv_cspmu_match nv_cspmu_match[] = {
 static char *nv_cspmu_format_name(const struct arm_cspmu *cspmu,
 				  const struct nv_cspmu_match *match)
 {
-	char *name;
+	char *name = NULL;
 	struct device *dev = cspmu->dev;
 
 	static atomic_t pmu_generic_idx = {0};
@@ -379,13 +974,20 @@ static char *nv_cspmu_format_name(const struct arm_cspmu *cspmu,
 				       socket);
 		break;
 	}
+	case NAME_FMT_SOCKET_INST: {
+		const int cpu = cpumask_first(&cspmu->associated_cpus);
+		const int socket = cpu_to_node(cpu);
+		u32 inst_id;
+
+		if (!nv_cspmu_get_inst_id(cspmu, &inst_id))
+			name = devm_kasprintf(dev, GFP_KERNEL,
+					match->name_pattern, socket, inst_id);
+		break;
+	}
 	case NAME_FMT_GENERIC:
 		name = devm_kasprintf(dev, GFP_KERNEL, match->name_pattern,
 				       atomic_fetch_inc(&pmu_generic_idx));
 		break;
-	default:
-		name = NULL;
-		break;
 	}
 
 	return name;
@@ -426,8 +1028,12 @@ static int nv_cspmu_init_ops(struct arm_cspmu *cspmu)
 	cspmu->impl.ctx = ctx;
 
 	/* NVIDIA specific callbacks. */
+	SET_OP(validate_event, impl_ops, match, NULL);
+	SET_OP(event_type, impl_ops, match, NULL);
+	SET_OP(is_cycle_counter_event, impl_ops, match, NULL);
 	SET_OP(set_cc_filter, impl_ops, match, nv_cspmu_set_cc_filter);
 	SET_OP(set_ev_filter, impl_ops, match, nv_cspmu_set_ev_filter);
+	SET_OP(reset_ev_filter, impl_ops, match, NULL);
 	SET_OP(get_event_attrs, impl_ops, match, nv_cspmu_get_event_attrs);
 	SET_OP(get_format_attrs, impl_ops, match, nv_cspmu_get_format_attrs);
 	SET_OP(get_name, impl_ops, match, nv_cspmu_get_name);
diff --git a/drivers/perf/nvidia_t410_c2c_pmu.c b/drivers/perf/nvidia_t410_c2c_pmu.c
new file mode 100644
index 000000000000..411987153ff3
--- /dev/null
+++ b/drivers/perf/nvidia_t410_c2c_pmu.c
@@ -0,0 +1,1051 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVIDIA Tegra410 C2C PMU driver.
+ *
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+
+#include <linux/acpi.h>
+#include <linux/bitops.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+#include <linux/property.h>
+
+/* The C2C interface types in Tegra410. */
+#define C2C_TYPE_NVLINK          0x0
+#define C2C_TYPE_NVCLINK         0x1
+#define C2C_TYPE_NVDLINK         0x2
+#define C2C_TYPE_COUNT           0x3
+
+/* The type of the peer device connected to the C2C interface. */
+#define C2C_PEER_TYPE_CPU        0x0
+#define C2C_PEER_TYPE_GPU        0x1
+#define C2C_PEER_TYPE_CXLMEM     0x2
+#define C2C_PEER_TYPE_COUNT      0x3
+
+/* The number of peer devices that can be connected to the C2C interface. */
+#define C2C_NR_PEER_CPU          0x1
+#define C2C_NR_PEER_GPU          0x2
+#define C2C_NR_PEER_CXLMEM       0x1
+#define C2C_NR_PEER_MAX          0x2
+
+/* Number of instances on each interface. */
+#define C2C_NR_INST_NVLINK       14
+#define C2C_NR_INST_NVCLINK      12
+#define C2C_NR_INST_NVDLINK      16
+#define C2C_NR_INST_MAX          16
+
+/* Register offsets. */
+#define C2C_CTRL                    0x864
+#define C2C_IN_STATUS               0x868
+#define C2C_CYCLE_CNTR              0x86c
+#define C2C_IN_RD_CUM_OUTS_CNTR     0x874
+#define C2C_IN_RD_REQ_CNTR          0x87c
+#define C2C_IN_WR_CUM_OUTS_CNTR     0x884
+#define C2C_IN_WR_REQ_CNTR          0x88c
+#define C2C_OUT_STATUS              0x890
+#define C2C_OUT_RD_CUM_OUTS_CNTR    0x898
+#define C2C_OUT_RD_REQ_CNTR         0x8a0
+#define C2C_OUT_WR_CUM_OUTS_CNTR    0x8a8
+#define C2C_OUT_WR_REQ_CNTR         0x8b0
+
+/* C2C_IN_STATUS register field. */
+#define C2C_IN_STATUS_CYCLE_OVF             BIT(0)
+#define C2C_IN_STATUS_IN_RD_CUM_OUTS_OVF    BIT(1)
+#define C2C_IN_STATUS_IN_RD_REQ_OVF         BIT(2)
+#define C2C_IN_STATUS_IN_WR_CUM_OUTS_OVF    BIT(3)
+#define C2C_IN_STATUS_IN_WR_REQ_OVF         BIT(4)
+
+/* C2C_OUT_STATUS register field. */
+#define C2C_OUT_STATUS_OUT_RD_CUM_OUTS_OVF    BIT(0)
+#define C2C_OUT_STATUS_OUT_RD_REQ_OVF         BIT(1)
+#define C2C_OUT_STATUS_OUT_WR_CUM_OUTS_OVF    BIT(2)
+#define C2C_OUT_STATUS_OUT_WR_REQ_OVF         BIT(3)
+
+/* Events. */
+#define C2C_EVENT_CYCLES                0x0
+#define C2C_EVENT_IN_RD_CUM_OUTS        0x1
+#define C2C_EVENT_IN_RD_REQ             0x2
+#define C2C_EVENT_IN_WR_CUM_OUTS        0x3
+#define C2C_EVENT_IN_WR_REQ             0x4
+#define C2C_EVENT_OUT_RD_CUM_OUTS       0x5
+#define C2C_EVENT_OUT_RD_REQ            0x6
+#define C2C_EVENT_OUT_WR_CUM_OUTS       0x7
+#define C2C_EVENT_OUT_WR_REQ            0x8
+
+#define C2C_NUM_EVENTS           0x9
+#define C2C_MASK_EVENT           0xFF
+#define C2C_MAX_ACTIVE_EVENTS    32
+
+#define C2C_ACTIVE_CPU_MASK        0x0
+#define C2C_ASSOCIATED_CPU_MASK    0x1
+
+/*
+ * Maximum poll count for reading counter value using high-low-high sequence.
+ */
+#define HILOHI_MAX_POLL    1000
+
+static unsigned long nv_c2c_pmu_cpuhp_state;
+
+/* PMU descriptor. */
+
+/* C2C type information. */
+struct nv_c2c_pmu_data {
+	unsigned int c2c_type;
+	unsigned int nr_inst;
+	const char *name_fmt;
+};
+
+static const struct nv_c2c_pmu_data nv_c2c_pmu_data[] = {
+	[C2C_TYPE_NVLINK] = {
+		.c2c_type = C2C_TYPE_NVLINK,
+		.nr_inst = C2C_NR_INST_NVLINK,
+		.name_fmt = "nvidia_nvlink_c2c_pmu_%u",
+	},
+	[C2C_TYPE_NVCLINK] = {
+		.c2c_type = C2C_TYPE_NVCLINK,
+		.nr_inst = C2C_NR_INST_NVCLINK,
+		.name_fmt = "nvidia_nvclink_pmu_%u",
+	},
+	[C2C_TYPE_NVDLINK] = {
+		.c2c_type = C2C_TYPE_NVDLINK,
+		.nr_inst = C2C_NR_INST_NVDLINK,
+		.name_fmt = "nvidia_nvdlink_pmu_%u",
+	},
+};
+
+/* Tracks the events assigned to the PMU for a given logical index. */
+struct nv_c2c_pmu_hw_events {
+	/* The events that are active. */
+	struct perf_event *events[C2C_MAX_ACTIVE_EVENTS];
+
+	/*
+	 * Each bit indicates a logical counter is being used (or not) for an
+	 * event.
+	 */
+	DECLARE_BITMAP(used_ctrs, C2C_MAX_ACTIVE_EVENTS);
+};
+
+struct nv_c2c_pmu {
+	struct pmu pmu;
+	struct device *dev;
+	struct acpi_device *acpi_dev;
+
+	const char *name;
+	const char *identifier;
+
+	const struct nv_c2c_pmu_data *data;
+	unsigned int peer_type;
+	unsigned int socket;
+	unsigned int nr_peer;
+	unsigned long peer_insts[C2C_NR_PEER_MAX][BITS_TO_LONGS(C2C_NR_INST_MAX)];
+	u32 filter_default;
+
+	struct nv_c2c_pmu_hw_events hw_events;
+
+	cpumask_t associated_cpus;
+	cpumask_t active_cpu;
+
+	struct hlist_node cpuhp_node;
+
+	const struct attribute_group **attr_groups;
+
+	void __iomem *base_broadcast;
+	void __iomem *base[C2C_NR_INST_MAX];
+};
+
+#define to_c2c_pmu(p) (container_of(p, struct nv_c2c_pmu, pmu))
+
+/* Get event type from perf_event. */
+static inline u32 get_event_type(struct perf_event *event)
+{
+	return (event->attr.config) & C2C_MASK_EVENT;
+}
+
+/*
+ * config1 filter restricted to filter_default; empty selects the default.
+ */
+static inline u32 get_filter_mask(struct perf_event *event)
+{
+	const struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu);
+	const u32 all = c2c_pmu->filter_default;
+	const u32 chosen = (u32)event->attr.config1 & all;
+
+	return chosen ? chosen : all;
+}
+
+/* PMU operations. */
+
+/* Claim the lowest free logical counter; -EAGAIN when all are in use. */
+static int nv_c2c_pmu_get_event_idx(struct nv_c2c_pmu_hw_events *hw_events,
+				    struct perf_event *event)
+{
+	const u32 free_ctr = find_first_zero_bit(hw_events->used_ctrs,
+						 C2C_MAX_ACTIVE_EVENTS);
+
+	if (free_ctr < C2C_MAX_ACTIVE_EVENTS) {
+		set_bit(free_ctr, hw_events->used_ctrs);
+		return free_ctr;
+	}
+	return -EAGAIN;
+}
+
+/* Check that @event can take a counter of @pmu in the trial @hw_events. */
+static bool
+nv_c2c_pmu_validate_event(struct pmu *pmu,
+			  struct nv_c2c_pmu_hw_events *hw_events,
+			  struct perf_event *event)
+{
+	/* Software events are accepted without consuming a counter. */
+	if (is_software_event(event))
+		return true;
+
+	/* Reject groups spanning multiple HW PMUs. */
+	return event->pmu == pmu &&
+	       nv_c2c_pmu_get_event_idx(hw_events, event) >= 0;
+}
+
+/*
+ * Dry-run counter allocation for the leader, its siblings and @event on a
+ * scratch counter map to check that the whole group fits on the PMU at once.
+ */
+static bool nv_c2c_pmu_validate_group(struct perf_event *event)
+{
+	struct perf_event *leader = event->group_leader;
+	struct nv_c2c_pmu_hw_events scratch = {};
+	struct pmu *pmu = event->pmu;
+	struct perf_event *sibling;
+
+	/* An event that is its own leader needs no group check. */
+	if (leader == event)
+		return true;
+
+	if (!nv_c2c_pmu_validate_event(pmu, &scratch, leader))
+		return false;
+
+	for_each_sibling_event(sibling, leader) {
+		if (!nv_c2c_pmu_validate_event(pmu, &scratch, sibling))
+			return false;
+	}
+
+	return nv_c2c_pmu_validate_event(pmu, &scratch, event);
+}
+
+/* perf event_init hook: validate @event and bind it to the active CPU. */
+static int nv_c2c_pmu_event_init(struct perf_event *event)
+{
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	const u32 event_type = get_event_type(event);
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+	if (event_type >= C2C_NUM_EVENTS)
+		return -ENOENT;
+
+	/*
+	 * Following other "uncore" PMUs, we do not support sampling mode or
+	 * attach to a task (per-process mode).
+	 */
+	if (is_sampling_event(event)) {
+		dev_dbg(c2c_pmu->pmu.dev, "Can't support sampling events\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK) {
+		dev_dbg(c2c_pmu->pmu.dev, "Can't support per-task counters\n");
+		return -EINVAL;
+	}
+
+	/* Only CPUs associated with this PMU may request events on it. */
+	if (!cpumask_test_cpu(event->cpu, &c2c_pmu->associated_cpus)) {
+		dev_dbg(c2c_pmu->pmu.dev,
+			"Requested cpu is not associated with the PMU\n");
+		return -EINVAL;
+	}
+
+	/* Enforce the current active CPU to handle the events in this PMU. */
+	event->cpu = cpumask_first(&c2c_pmu->active_cpu);
+	if (event->cpu >= nr_cpu_ids)
+		return -EINVAL;
+
+	if (!nv_c2c_pmu_validate_group(event))
+		return -EINVAL;
+
+	/* Start with no counter assigned (idx -1). */
+	hwc->idx = -1;
+	hwc->config = event_type;
+
+	return 0;
+}
+
+/*
+ * Read a 64-bit counter exposed as a pair of 32-bit registers. The high word
+ * is sampled before and after the low word and the sequence is retried, at
+ * most @max_poll_count times, until both samples match. Returns 0 on timeout.
+ */
+static u64 read_reg64_hilohi(const void __iomem *addr, u32 max_poll_count)
+{
+	u32 hi, lo;
+
+	for (;;) {
+		if (max_poll_count-- == 0) {
+			pr_err("NV C2C PMU: timeout hi-low-high sequence\n");
+			return 0;
+		}
+		hi = readl(addr + 4);
+		lo = readl(addr);
+		/* Stable high word: hi and lo form a consistent value. */
+		if (hi == readl(addr + 4))
+			break;
+	}
+
+	return ((u64)hi << 32) | lo;
+}
+
+/* Warn when instance @instance reports a non-zero IN or OUT status. */
+static void nv_c2c_pmu_check_status(struct nv_c2c_pmu *c2c_pmu, u32 instance)
+{
+	void __iomem *regs = c2c_pmu->base[instance];
+	u32 in_status = readl(regs + C2C_IN_STATUS);
+	u32 out_status = readl(regs + C2C_OUT_STATUS);
+
+	if (!in_status && !out_status)
+		return;
+	dev_warn(c2c_pmu->dev, "C2C PMU overflow in: 0x%x, out: 0x%x\n",
+		 in_status, out_status);
+}
+
+static const u32 nv_c2c_ctr_offset[C2C_NUM_EVENTS] = { /* event id -> offset */
+	[C2C_EVENT_CYCLES] = C2C_CYCLE_CNTR,
+	[C2C_EVENT_IN_RD_CUM_OUTS] = C2C_IN_RD_CUM_OUTS_CNTR,
+	[C2C_EVENT_IN_RD_REQ] = C2C_IN_RD_REQ_CNTR,
+	[C2C_EVENT_IN_WR_CUM_OUTS] = C2C_IN_WR_CUM_OUTS_CNTR,
+	[C2C_EVENT_IN_WR_REQ] = C2C_IN_WR_REQ_CNTR,
+	[C2C_EVENT_OUT_RD_CUM_OUTS] = C2C_OUT_RD_CUM_OUTS_CNTR,
+	[C2C_EVENT_OUT_RD_REQ] = C2C_OUT_RD_REQ_CNTR,
+	[C2C_EVENT_OUT_WR_CUM_OUTS] = C2C_OUT_WR_CUM_OUTS_CNTR,
+	[C2C_EVENT_OUT_WR_REQ] = C2C_OUT_WR_REQ_CNTR,
+};
+
+static u64 nv_c2c_pmu_read_counter(struct perf_event *event)
+{
+	u32 ctr_id, ctr_offset, filter_mask, filter_idx, inst_idx;
+	unsigned long *inst_mask;
+	DECLARE_BITMAP(filter_bitmap, C2C_NR_PEER_MAX);
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu);
+	u64 val = 0;	/* sum over every selected instance */
+
+	filter_mask = get_filter_mask(event);	/* one bit per selected peer */
+	bitmap_from_arr32(filter_bitmap, &filter_mask, c2c_pmu->nr_peer);
+
+	ctr_id = event->hw.config;	/* event id stored by ->event_init() */
+	ctr_offset = nv_c2c_ctr_offset[ctr_id];
+
+	for_each_set_bit(filter_idx, filter_bitmap, c2c_pmu->nr_peer) {
+		inst_mask = c2c_pmu->peer_insts[filter_idx];
+		for_each_set_bit(inst_idx, inst_mask, c2c_pmu->data->nr_inst) {
+			nv_c2c_pmu_check_status(c2c_pmu, inst_idx);
+
+			/*
+			 * All instances share the same clock and the driver
+			 * always enables all of them, so the cycle count of
+			 * the first selected instance is returned as is.
+			 */
+			if (ctr_id == C2C_EVENT_CYCLES)
+				return read_reg64_hilohi(
+					c2c_pmu->base[inst_idx] + ctr_offset,
+					HILOHI_MAX_POLL);
+
+			/*
+			 * Every other event is summed over all selected instances.
+			 */
+			val += read_reg64_hilohi(
+				c2c_pmu->base[inst_idx] + ctr_offset,
+				HILOHI_MAX_POLL);
+		}
+	}
+
+	return val;
+}
+
+/* Fold the counter delta since the last snapshot into event->count. */
+static void nv_c2c_pmu_event_update(struct perf_event *event)
+{
+	local64_t *snapshot = &event->hw.prev_count;
+	u64 old, cur;
+
+	do {
+		old = local64_read(snapshot);
+		cur = nv_c2c_pmu_read_counter(event);
+	} while (local64_cmpxchg(snapshot, old, cur) != old);
+	local64_add(cur - old, &event->count);
+}
+
+static void nv_c2c_pmu_start(struct perf_event *event, int pmu_flags)
+{
+	event->hw.state = 0;	/* clears all state flags; pmu_flags is not used */
+}
+
+static void nv_c2c_pmu_stop(struct perf_event *event, int pmu_flags)
+{
+	event->hw.state |= PERF_HES_STOPPED;	/* flag only; pmu_flags is not used */
+}
+
+/* perf core ->add(): reserve a counter slot for @event on this PMU. */
+static int nv_c2c_pmu_add(struct perf_event *event, int flags)
+{
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu);
+	struct nv_c2c_pmu_hw_events *hw_events = &c2c_pmu->hw_events;
+	int slot;
+
+	/* Must be running on one of the CPUs associated with the PMU. */
+	if (WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(),
+					   &c2c_pmu->associated_cpus)))
+		return -ENOENT;
+
+	slot = nv_c2c_pmu_get_event_idx(hw_events, event);
+	if (slot < 0)
+		return slot;
+
+	hw_events->events[slot] = event;
+	event->hw.idx = slot;
+	event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+	if (flags & PERF_EF_START)
+		nv_c2c_pmu_start(event, PERF_EF_RELOAD);
+
+	/* Propagate changes to the userspace mapping. */
+	perf_event_update_userpage(event);
+
+	return 0;
+}
+
+/* perf core ->del(): stop @event and give its counter slot back. */
+static void nv_c2c_pmu_del(struct perf_event *event, int flags)
+{
+	struct nv_c2c_pmu_hw_events *hw_events =
+		&to_c2c_pmu(event->pmu)->hw_events;
+	int slot = event->hw.idx;
+
+	nv_c2c_pmu_stop(event, PERF_EF_UPDATE);
+
+	/* Release the slot claimed in ->add(). */
+	hw_events->events[slot] = NULL;
+	clear_bit(slot, hw_events->used_ctrs);
+
+	perf_event_update_userpage(event);
+}
+
+static void nv_c2c_pmu_read(struct perf_event *event)
+{
+	nv_c2c_pmu_event_update(event);	/* fold the latest delta into event->count */
+}
+
+/* perf core ->pmu_enable(): start counting if any slot is in use. */
+static void nv_c2c_pmu_enable(struct pmu *pmu)
+{
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(pmu);
+	const unsigned long *used = c2c_pmu->hw_events.used_ctrs;
+
+	/* Nothing scheduled: leave the counters off. */
+	if (bitmap_empty(used, C2C_MAX_ACTIVE_EVENTS))
+		return;
+
+	/* Enable all the counters through the broadcast mapping. */
+	writel(0x1UL, c2c_pmu->base_broadcast + C2C_CTRL);
+}
+
+/*
+ * perf core ->pmu_disable(): stop all the counters and fold the current
+ * values of the scheduled events into their totals.
+ */
+static void nv_c2c_pmu_disable(struct pmu *pmu)
+{
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(pmu);
+	struct nv_c2c_pmu_hw_events *hw_events = &c2c_pmu->hw_events;
+	unsigned int slot;
+
+	/* Disable all the counters. */
+	writel(0x0UL, c2c_pmu->base_broadcast + C2C_CTRL);
+
+	/*
+	 * The counters will start from 0 again on restart.
+	 * Update the events immediately to avoid losing the counts.
+	 */
+	for_each_set_bit(slot, hw_events->used_ctrs, C2C_MAX_ACTIVE_EVENTS) {
+		struct perf_event *event = hw_events->events[slot];
+
+		if (event) {
+			nv_c2c_pmu_event_update(event);
+			/* The next delta is measured from zero. */
+			local64_set(&event->hw.prev_count, 0ULL);
+		}
+	}
+}
+
+/* PMU identifier attribute. */
+
+/* sysfs "identifier": print the PMU's identifier string. */
+static ssize_t nv_c2c_pmu_identifier_show(struct device *dev,
+					  struct device_attribute *attr, char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+
+	return sysfs_emit(page, "%s\n", to_c2c_pmu(pmu)->identifier);
+}
+
+static struct device_attribute nv_c2c_pmu_identifier_attr =	/* read-only */
+	__ATTR(identifier, 0444, nv_c2c_pmu_identifier_show, NULL);
+
+static struct attribute *nv_c2c_pmu_identifier_attrs[] = {
+	&nv_c2c_pmu_identifier_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group nv_c2c_pmu_identifier_attr_group = {
+	.attrs = nv_c2c_pmu_identifier_attrs,	/* group is never modified: const */
+};
+
+/* Peer attribute. */
+
+/* sysfs "peer": report the peer type and peer count as "nr_<type>=<n>". */
+static ssize_t nv_c2c_pmu_peer_show(struct device *dev,
+				    struct device_attribute *attr, char *page)
+{
+	static const char * const peer_type[C2C_PEER_TYPE_COUNT] = {
+		[C2C_PEER_TYPE_CPU] = "cpu",
+		[C2C_PEER_TYPE_GPU] = "gpu",
+		[C2C_PEER_TYPE_CXLMEM] = "cxlmem",
+	};
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(page, "nr_%s=%u\n", peer_type[c2c_pmu->peer_type],
+			  c2c_pmu->nr_peer);
+}
+
+static struct device_attribute nv_c2c_pmu_peer_attr =	/* read-only */
+	__ATTR(peer, 0444, nv_c2c_pmu_peer_show, NULL);
+
+static struct attribute *nv_c2c_pmu_peer_attrs[] = {
+	&nv_c2c_pmu_peer_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group nv_c2c_pmu_peer_attr_group = {
+	.attrs = nv_c2c_pmu_peer_attrs,	/* group is never modified: const */
+};
+
+/* Format attributes: EXT_ATTR yields the &attr of an anonymous dev_ext_attribute. */
+
+#define NV_C2C_PMU_EXT_ATTR(_name, _func, _config)			\
+	(&((struct dev_ext_attribute[]){				\
+		{							\
+			.attr = __ATTR(_name, 0444, _func, NULL),	\
+			.var = (void *)(_config)			\
+		}							\
+	})[0].attr.attr)
+
+#define NV_C2C_PMU_FORMAT_ATTR(_name, _config) \
+	NV_C2C_PMU_EXT_ATTR(_name, device_show_string, _config)
+
+#define NV_C2C_PMU_FORMAT_EVENT_ATTR \
+	NV_C2C_PMU_FORMAT_ATTR(event, "config:0-3")
+
+static struct attribute *nv_c2c_pmu_gpu_formats[] = {	/* event + gpu_mask */
+	NV_C2C_PMU_FORMAT_EVENT_ATTR,
+	NV_C2C_PMU_FORMAT_ATTR(gpu_mask, "config1:0-1"),	/* read by get_filter_mask() */
+	NULL,
+};
+
+static const struct attribute_group nv_c2c_pmu_gpu_format_group = {
+	.name = "format",
+	.attrs = nv_c2c_pmu_gpu_formats,
+};
+
+static struct attribute *nv_c2c_pmu_formats[] = {	/* event field only */
+	NV_C2C_PMU_FORMAT_EVENT_ATTR,
+	NULL,
+};
+
+static const struct attribute_group nv_c2c_pmu_format_group = {
+	.name = "format",
+	.attrs = nv_c2c_pmu_formats,
+};
+
+/* Event attributes. */
+
+/* sysfs "events/<name>": print the event encoding as "event=0x<id>". */
+static ssize_t nv_c2c_pmu_sysfs_event_show(struct device *dev,
+					   struct device_attribute *attr, char *buf)
+{
+	struct perf_pmu_events_attr *pmu_attr =
+		container_of(attr, struct perf_pmu_events_attr, attr);
+
+	return sysfs_emit(buf, "event=0x%llx\n", pmu_attr->id);
+}
+
+#define NV_C2C_PMU_EVENT_ATTR(_name, _config)	\
+	PMU_EVENT_ATTR_ID(_name, nv_c2c_pmu_sysfs_event_show, _config)
+
+static struct attribute *nv_c2c_pmu_gpu_events[] = {	/* all nine events */
+	NV_C2C_PMU_EVENT_ATTR(cycles, C2C_EVENT_CYCLES),
+	NV_C2C_PMU_EVENT_ATTR(in_rd_cum_outs, C2C_EVENT_IN_RD_CUM_OUTS),
+	NV_C2C_PMU_EVENT_ATTR(in_rd_req, C2C_EVENT_IN_RD_REQ),
+	NV_C2C_PMU_EVENT_ATTR(in_wr_cum_outs, C2C_EVENT_IN_WR_CUM_OUTS),
+	NV_C2C_PMU_EVENT_ATTR(in_wr_req, C2C_EVENT_IN_WR_REQ),
+	NV_C2C_PMU_EVENT_ATTR(out_rd_cum_outs, C2C_EVENT_OUT_RD_CUM_OUTS),
+	NV_C2C_PMU_EVENT_ATTR(out_rd_req, C2C_EVENT_OUT_RD_REQ),
+	NV_C2C_PMU_EVENT_ATTR(out_wr_cum_outs, C2C_EVENT_OUT_WR_CUM_OUTS),
+	NV_C2C_PMU_EVENT_ATTR(out_wr_req, C2C_EVENT_OUT_WR_REQ),
+	NULL
+};
+
+static const struct attribute_group nv_c2c_pmu_gpu_events_group = {
+	.name = "events",
+	.attrs = nv_c2c_pmu_gpu_events,
+};
+
+static struct attribute *nv_c2c_pmu_cpu_events[] = {	/* cycles and read events */
+	NV_C2C_PMU_EVENT_ATTR(cycles, C2C_EVENT_CYCLES),
+	NV_C2C_PMU_EVENT_ATTR(in_rd_cum_outs, C2C_EVENT_IN_RD_CUM_OUTS),
+	NV_C2C_PMU_EVENT_ATTR(in_rd_req, C2C_EVENT_IN_RD_REQ),
+	NV_C2C_PMU_EVENT_ATTR(out_rd_cum_outs, C2C_EVENT_OUT_RD_CUM_OUTS),
+	NV_C2C_PMU_EVENT_ATTR(out_rd_req, C2C_EVENT_OUT_RD_REQ),
+	NULL
+};
+
+static const struct attribute_group nv_c2c_pmu_cpu_events_group = {
+	.name = "events",
+	.attrs = nv_c2c_pmu_cpu_events,
+};
+
+static struct attribute *nv_c2c_pmu_cxlmem_events[] = {	/* cycles and in_rd_* only */
+	NV_C2C_PMU_EVENT_ATTR(cycles, C2C_EVENT_CYCLES),
+	NV_C2C_PMU_EVENT_ATTR(in_rd_cum_outs, C2C_EVENT_IN_RD_CUM_OUTS),
+	NV_C2C_PMU_EVENT_ATTR(in_rd_req, C2C_EVENT_IN_RD_REQ),
+	NULL
+};
+
+static const struct attribute_group nv_c2c_pmu_cxlmem_events_group = {
+	.name = "events",
+	.attrs = nv_c2c_pmu_cxlmem_events,
+};
+
+/* Cpumask attributes. */
+
+/* sysfs show handler shared by "cpumask" and "associated_cpus". */
+static ssize_t nv_c2c_pmu_cpumask_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(dev_get_drvdata(dev));
+	struct dev_ext_attribute *eattr =
+		container_of(attr, struct dev_ext_attribute, attr);
+	const cpumask_t *cpumask;
+
+	/* eattr->var carries the mask selector set by NV_C2C_PMU_CPUMASK_ATTR. */
+	switch ((unsigned long)eattr->var) {
+	case C2C_ACTIVE_CPU_MASK:
+		cpumask = &c2c_pmu->active_cpu;
+		break;
+	case C2C_ASSOCIATED_CPU_MASK:
+		cpumask = &c2c_pmu->associated_cpus;
+		break;
+	default:
+		return 0;
+	}
+	return cpumap_print_to_pagebuf(true, buf, cpumask);
+}
+
+#define NV_C2C_PMU_CPUMASK_ATTR(_name, _config)			\
+	NV_C2C_PMU_EXT_ATTR(_name, nv_c2c_pmu_cpumask_show,	\
+				(unsigned long)_config)	/* selector read back in the show handler */
+
+static struct attribute *nv_c2c_pmu_cpumask_attrs[] = {
+	NV_C2C_PMU_CPUMASK_ATTR(cpumask, C2C_ACTIVE_CPU_MASK),	/* active_cpu */
+	NV_C2C_PMU_CPUMASK_ATTR(associated_cpus, C2C_ASSOCIATED_CPU_MASK),
+	NULL,
+};
+
+static const struct attribute_group nv_c2c_pmu_cpumask_attr_group = {
+	.attrs = nv_c2c_pmu_cpumask_attrs,
+};
+
+/* Attribute groups for C2C PMU connecting SoC and GPU (format adds gpu_mask). */
+static const struct attribute_group *nv_c2c_pmu_gpu_attr_groups[] = {
+	&nv_c2c_pmu_gpu_format_group,
+	&nv_c2c_pmu_gpu_events_group,
+	&nv_c2c_pmu_cpumask_attr_group,
+	&nv_c2c_pmu_identifier_attr_group,
+	&nv_c2c_pmu_peer_attr_group,
+	NULL
+};
+
+/* Attribute groups for C2C PMU connecting multiple SoCs (CPU peer events). */
+static const struct attribute_group *nv_c2c_pmu_cpu_attr_groups[] = {
+	&nv_c2c_pmu_format_group,
+	&nv_c2c_pmu_cpu_events_group,
+	&nv_c2c_pmu_cpumask_attr_group,
+	&nv_c2c_pmu_identifier_attr_group,
+	&nv_c2c_pmu_peer_attr_group,
+	NULL
+};
+
+/* Attribute groups for C2C PMU connecting SoC and CXLMEM (CXLMEM events). */
+static const struct attribute_group *nv_c2c_pmu_cxlmem_attr_groups[] = {
+	&nv_c2c_pmu_format_group,
+	&nv_c2c_pmu_cxlmem_events_group,
+	&nv_c2c_pmu_cpumask_attr_group,
+	&nv_c2c_pmu_identifier_attr_group,
+	&nv_c2c_pmu_peer_attr_group,
+	NULL
+};
+
+/* CPU hotplug online callback: adopt @cpu as owner if the PMU has none. */
+static int nv_c2c_pmu_online_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct nv_c2c_pmu *c2c_pmu =
+		hlist_entry_safe(node, struct nv_c2c_pmu, cpuhp_node);
+	bool associated = cpumask_test_cpu(cpu, &c2c_pmu->associated_cpus);
+
+	/* CPUs outside the associated set never manage this PMU. */
+	if (!associated)
+		return 0;
+
+	/* Use this CPU for event counting unless the PMU is already managed. */
+	if (cpumask_empty(&c2c_pmu->active_cpu))
+		cpumask_set_cpu(cpu, &c2c_pmu->active_cpu);
+
+	return 0;
+}
+
+/* CPU hotplug teardown callback: hand the PMU over when its owner goes down. */
+static int nv_c2c_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node)
+{
+	struct nv_c2c_pmu *c2c_pmu =
+		hlist_entry_safe(node, struct nv_c2c_pmu, cpuhp_node);
+	unsigned int target;
+
+	/* Nothing to do if this CPU doesn't own the PMU */
+	if (!cpumask_test_and_clear_cpu(cpu, &c2c_pmu->active_cpu))
+		return 0;
+
+	/* Pick another online CPU associated with the PMU as the new owner. */
+	target = cpumask_any_and_but(&c2c_pmu->associated_cpus,
+				     cpu_online_mask, cpu);
+	if (target >= nr_cpu_ids)
+		return 0;	/* no candidate: the PMU is left without an owner */
+
+	/* Move the perf context first, then record the new owner. */
+	perf_pmu_migrate_context(&c2c_pmu->pmu, cpu, target);
+	cpumask_set_cpu(target, &c2c_pmu->active_cpu);
+
+	return 0;
+}
+
+/* Collect every possible CPU whose NUMA node equals the PMU's socket. */
+static int nv_c2c_pmu_get_cpus(struct nv_c2c_pmu *c2c_pmu)
+{
+	int socket = c2c_pmu->socket;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		if (cpu_to_node(cpu) != socket)
+			continue;
+		cpumask_set_cpu(cpu, &c2c_pmu->associated_cpus);
+	}
+
+	if (!cpumask_empty(&c2c_pmu->associated_cpus))
+		return 0;
+	dev_dbg(c2c_pmu->dev, "No cpu associated with C2C PMU socket-%u\n", socket);
+	return -ENODEV;
+}
+
+/* Parse the ACPI device UID string into the socket number of this PMU. */
+static int nv_c2c_pmu_init_socket(struct nv_c2c_pmu *c2c_pmu)
+{
+	const char *uid_str = acpi_device_uid(c2c_pmu->acpi_dev);
+	u32 socket;	/* kstrtou32() stores through a u32 pointer */
+	int ret;
+
+	if (!uid_str) {
+		dev_err(c2c_pmu->dev, "No ACPI device UID\n");
+		return -ENODEV;
+	}
+
+	ret = kstrtou32(uid_str, 0, &socket);
+	if (ret) {
+		dev_err(c2c_pmu->dev, "Failed to parse ACPI device UID\n");
+		return ret;
+	}
+	c2c_pmu->socket = socket;
+	return 0;
+}
+
+/* Build the PMU name from the per-type format and take the ACPI HID as id. */
+static int nv_c2c_pmu_init_id(struct nv_c2c_pmu *c2c_pmu)
+{
+	char *pmu_name = devm_kasprintf(c2c_pmu->dev, GFP_KERNEL,
+					c2c_pmu->data->name_fmt,
+					c2c_pmu->socket);
+
+	if (!pmu_name)
+		return -ENOMEM;
+
+	c2c_pmu->name = pmu_name;
+	c2c_pmu->identifier = acpi_device_hid(c2c_pmu->acpi_dev);
+
+	return 0;
+}
+
+static int nv_c2c_pmu_init_filter(struct nv_c2c_pmu *c2c_pmu)
+{
+	u32 cpu_en = 0;
+	struct device *dev = c2c_pmu->dev;
+	const struct nv_c2c_pmu_data *data = c2c_pmu->data;
+
+	if (data->c2c_type == C2C_TYPE_NVDLINK) {	/* CXLMEM peer: no property is read */
+		c2c_pmu->peer_type = C2C_PEER_TYPE_CXLMEM;
+
+		c2c_pmu->peer_insts[0][0] = (1UL << data->nr_inst) - 1;	/* all instances */
+
+		c2c_pmu->nr_peer = C2C_NR_PEER_CXLMEM;
+		c2c_pmu->filter_default = (1 << c2c_pmu->nr_peer) - 1;
+
+		c2c_pmu->attr_groups = nv_c2c_pmu_cxlmem_attr_groups;
+
+		return 0;
+	}
+
+	if (device_property_read_u32(dev, "cpu_en_mask", &cpu_en))	/* missing: cpu_en stays 0 */
+		dev_dbg(dev, "no cpu_en_mask property\n");
+
+	if (cpu_en) {
+		c2c_pmu->peer_type = C2C_PEER_TYPE_CPU;
+
+		/* Bits of cpu_en select the instances connected to the peer CPU. */
+		bitmap_from_arr32(c2c_pmu->peer_insts[0], &cpu_en, data->nr_inst);
+
+		c2c_pmu->nr_peer = 1;
+		c2c_pmu->attr_groups = nv_c2c_pmu_cpu_attr_groups;
+	} else {
+		u32 i;
+		const char *props[C2C_NR_PEER_MAX] = {
+			"gpu0_en_mask", "gpu1_en_mask"
+		};
+
+		for (i = 0; i < C2C_NR_PEER_MAX; i++) {
+			u32 gpu_en = 0;
+
+			if (device_property_read_u32(dev, props[i], &gpu_en))
+				dev_dbg(dev, "no %s property\n", props[i]);
+
+			if (gpu_en) {
+				/* NOTE(review): stored at index i, but nr_peer only counts; check gpu1-only. */
+				bitmap_from_arr32(c2c_pmu->peer_insts[i], &gpu_en,
+						data->nr_inst);
+
+				c2c_pmu->nr_peer++;
+			}
+		}
+
+		if (c2c_pmu->nr_peer == 0) {
+			dev_err(dev, "No GPU is enabled\n");
+			return -EINVAL;
+		}
+
+		c2c_pmu->peer_type = C2C_PEER_TYPE_GPU;
+		c2c_pmu->attr_groups = nv_c2c_pmu_gpu_attr_groups;
+	}
+
+	c2c_pmu->filter_default = (1 << c2c_pmu->nr_peer) - 1;	/* low nr_peer bits set */
+
+	return 0;
+}
+
+/*
+ * Allocate the driver state for @pdev and fill in the socket, name/identifier
+ * and peer filter. Returns the new PMU or an ERR_PTR() on failure.
+ */
+static void *nv_c2c_pmu_init_pmu(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct acpi_device *acpi_dev = ACPI_COMPANION(dev);
+	struct nv_c2c_pmu *c2c_pmu;
+	int ret;
+
+	if (!acpi_dev)
+		return ERR_PTR(-ENODEV);
+
+	c2c_pmu = devm_kzalloc(dev, sizeof(*c2c_pmu), GFP_KERNEL);
+	if (!c2c_pmu)
+		return ERR_PTR(-ENOMEM);
+
+	c2c_pmu->dev = dev;
+	c2c_pmu->acpi_dev = acpi_dev;
+	c2c_pmu->data = device_get_match_data(dev);
+	if (!c2c_pmu->data)
+		return ERR_PTR(-EINVAL);
+
+	platform_set_drvdata(pdev, c2c_pmu);
+
+	/* Run the init steps in order and stop at the first failure. */
+	ret = nv_c2c_pmu_init_socket(c2c_pmu);
+	if (!ret)
+		ret = nv_c2c_pmu_init_id(c2c_pmu);
+	if (!ret)
+		ret = nv_c2c_pmu_init_filter(c2c_pmu);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return c2c_pmu;
+}
+
+static int nv_c2c_pmu_init_mmio(struct nv_c2c_pmu *c2c_pmu)
+{
+	int i;
+	struct device *dev = c2c_pmu->dev;
+	struct platform_device *pdev = to_platform_device(dev);
+	const struct nv_c2c_pmu_data *data = c2c_pmu->data;
+
+	/* Map platform resources 0..nr_inst-1, one per PMU instance. */
+	for (i = 0; i < data->nr_inst; i++) {
+		c2c_pmu->base[i] = devm_platform_ioremap_resource(pdev, i);
+		if (IS_ERR(c2c_pmu->base[i])) {
+			dev_err(dev, "Failed map address for instance %d\n", i);
+			return PTR_ERR(c2c_pmu->base[i]);
+		}
+	}
+
+	/* The broadcast address is the resource right after the instances. */
+	c2c_pmu->base_broadcast = devm_platform_ioremap_resource(pdev,
+								 data->nr_inst);
+	if (IS_ERR(c2c_pmu->base_broadcast)) {
+		dev_err(dev, "Failed map broadcast address\n");
+		return PTR_ERR(c2c_pmu->base_broadcast);
+	}
+
+	return 0;
+}
+
+/* Hook the PMU into CPU hotplug and register it with the perf core. */
+static int nv_c2c_pmu_register_pmu(struct nv_c2c_pmu *c2c_pmu)
+{
+	int ret;
+
+	ret = cpuhp_state_add_instance(nv_c2c_pmu_cpuhp_state, &c2c_pmu->cpuhp_node);
+	if (ret) {
+		dev_err(c2c_pmu->dev, "Error %d registering hotplug\n", ret);
+		return ret;
+	}
+
+	c2c_pmu->pmu = (struct pmu) {
+		.module		= THIS_MODULE,	/* pin the module while events exist */
+		.parent		= c2c_pmu->dev,
+		.task_ctx_nr	= perf_invalid_context,
+		.pmu_enable	= nv_c2c_pmu_enable,
+		.pmu_disable	= nv_c2c_pmu_disable,
+		.event_init	= nv_c2c_pmu_event_init,
+		.add		= nv_c2c_pmu_add,
+		.del		= nv_c2c_pmu_del,
+		.start		= nv_c2c_pmu_start,
+		.stop		= nv_c2c_pmu_stop,
+		.read		= nv_c2c_pmu_read,
+		.attr_groups	= c2c_pmu->attr_groups,
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
+	};
+
+	ret = perf_pmu_register(&c2c_pmu->pmu, c2c_pmu->name, -1);
+	if (ret) {
+		dev_err(c2c_pmu->dev, "Failed to register C2C PMU: %d\n", ret);
+		cpuhp_state_remove_instance(nv_c2c_pmu_cpuhp_state,
+					  &c2c_pmu->cpuhp_node);
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Platform probe: build the PMU state, map its MMIO resources, collect the
+ * associated CPUs and register the PMU.
+ */
+static int nv_c2c_pmu_probe(struct platform_device *pdev)
+{
+	struct nv_c2c_pmu *c2c_pmu = nv_c2c_pmu_init_pmu(pdev);
+	int ret;
+
+	if (IS_ERR(c2c_pmu))
+		return PTR_ERR(c2c_pmu);
+
+	/* Run the remaining setup steps in order; stop at the first error. */
+	ret = nv_c2c_pmu_init_mmio(c2c_pmu);
+	if (!ret)
+		ret = nv_c2c_pmu_get_cpus(c2c_pmu);
+	if (!ret)
+		ret = nv_c2c_pmu_register_pmu(c2c_pmu);
+	if (ret)
+		return ret;
+
+	dev_dbg(c2c_pmu->dev, "Registered %s PMU\n", c2c_pmu->name);
+
+	return 0;
+}
+
+static void nv_c2c_pmu_device_remove(struct platform_device *pdev)
+{
+	struct nv_c2c_pmu *c2c_pmu = platform_get_drvdata(pdev);
+
+	perf_pmu_unregister(&c2c_pmu->pmu);	/* reverse order of nv_c2c_pmu_register_pmu() */
+	cpuhp_state_remove_instance(nv_c2c_pmu_cpuhp_state, &c2c_pmu->cpuhp_node);
+}
+
+static const struct acpi_device_id nv_c2c_pmu_acpi_match[] = {	/* HID -> per-type data */
+	{ "NVDA2023", (kernel_ulong_t)&nv_c2c_pmu_data[C2C_TYPE_NVLINK] },
+	{ "NVDA2022", (kernel_ulong_t)&nv_c2c_pmu_data[C2C_TYPE_NVCLINK] },
+	{ "NVDA2020", (kernel_ulong_t)&nv_c2c_pmu_data[C2C_TYPE_NVDLINK] },
+	{ }	/* terminating entry */
+};
+MODULE_DEVICE_TABLE(acpi, nv_c2c_pmu_acpi_match);
+
+static struct platform_driver nv_c2c_pmu_driver = {
+	.driver = {
+		.name = "nvidia-t410-c2c-pmu",
+		.acpi_match_table = nv_c2c_pmu_acpi_match,	/* devices are matched by ACPI HID */
+		.suppress_bind_attrs = true,	/* no manual bind/unbind via sysfs */
+	},
+	.probe = nv_c2c_pmu_probe,
+	.remove = nv_c2c_pmu_device_remove,
+};
+
+/* Module init: set up the dynamic hotplug state, then register the driver. */
+static int __init nv_c2c_pmu_init(void)
+{
+	int ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "perf/nvidia/c2c:online",
+					  nv_c2c_pmu_online_cpu, nv_c2c_pmu_cpu_teardown);
+
+	if (ret < 0)
+		return ret;
+	nv_c2c_pmu_cpuhp_state = ret;
+	ret = platform_driver_register(&nv_c2c_pmu_driver);
+	if (ret)	/* do not leak the hotplug state on failure */
+		cpuhp_remove_multi_state(nv_c2c_pmu_cpuhp_state);
+	return ret;
+}
+
+static void __exit nv_c2c_pmu_exit(void)
+{
+	platform_driver_unregister(&nv_c2c_pmu_driver);	/* reverse order of nv_c2c_pmu_init() */
+	cpuhp_remove_multi_state(nv_c2c_pmu_cpuhp_state);
+}
+
+module_init(nv_c2c_pmu_init);
+module_exit(nv_c2c_pmu_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("NVIDIA Tegra410 C2C PMU driver");
+MODULE_AUTHOR("Besar Wicaksono <bwicaksono@nvidia.com>");
diff --git a/drivers/perf/nvidia_t410_cmem_latency_pmu.c b/drivers/perf/nvidia_t410_cmem_latency_pmu.c
new file mode 100644
index 000000000000..acb8f5571522
--- /dev/null
+++ b/drivers/perf/nvidia_t410_cmem_latency_pmu.c
@@ -0,0 +1,736 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVIDIA Tegra410 CPU Memory (CMEM) Latency PMU driver.
+ *
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+
+#include <linux/acpi.h>
+#include <linux/bitops.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+
+#define NUM_INSTANCES    14	/* size of cmem_lat_pmu.base[] */
+
+/* Register offsets, added to an instance base or to the broadcast base. */
+#define CMEM_LAT_CG_CTRL         0x800
+#define CMEM_LAT_CTRL            0x808
+#define CMEM_LAT_STATUS          0x810
+#define CMEM_LAT_CYCLE_CNTR      0x818
+#define CMEM_LAT_MC0_REQ_CNTR    0x820
+#define CMEM_LAT_MC0_AOR_CNTR    0x830
+#define CMEM_LAT_MC1_REQ_CNTR    0x838
+#define CMEM_LAT_MC1_AOR_CNTR    0x848
+#define CMEM_LAT_MC2_REQ_CNTR    0x850
+#define CMEM_LAT_MC2_AOR_CNTR    0x860
+
+/* CMEM_LAT_CTRL values. */
+#define CMEM_LAT_CTRL_DISABLE    0x0ULL
+#define CMEM_LAT_CTRL_ENABLE     0x1ULL
+#define CMEM_LAT_CTRL_CLR        0x2ULL
+
+/* CMEM_LAT_CG_CTRL values. */
+#define CMEM_LAT_CG_CTRL_DISABLE    0x0ULL
+#define CMEM_LAT_CG_CTRL_ENABLE     0x1ULL
+
+/* CMEM_LAT_STATUS register fields: one overflow flag per counter. */
+#define CMEM_LAT_STATUS_CYCLE_OVF      BIT(0)
+#define CMEM_LAT_STATUS_MC0_AOR_OVF    BIT(1)
+#define CMEM_LAT_STATUS_MC0_REQ_OVF    BIT(3)
+#define CMEM_LAT_STATUS_MC1_AOR_OVF    BIT(4)
+#define CMEM_LAT_STATUS_MC1_REQ_OVF    BIT(6)
+#define CMEM_LAT_STATUS_MC2_AOR_OVF    BIT(7)
+#define CMEM_LAT_STATUS_MC2_REQ_OVF    BIT(9)
+
+/* Events: ids carried in attr.config under CMEM_LAT_MASK_EVENT. */
+#define CMEM_LAT_EVENT_CYCLES    0x0
+#define CMEM_LAT_EVENT_REQ       0x1
+#define CMEM_LAT_EVENT_AOR       0x2
+
+#define CMEM_LAT_NUM_EVENTS           0x3
+#define CMEM_LAT_MASK_EVENT           0x3
+#define CMEM_LAT_MAX_ACTIVE_EVENTS    32	/* slots in used_ctrs / events[] */
+
+#define CMEM_LAT_ACTIVE_CPU_MASK        0x0
+#define CMEM_LAT_ASSOCIATED_CPU_MASK    0x1
+
+static unsigned long cmem_lat_pmu_cpuhp_state;	/* NOTE(review): presumably the cpuhp state id; confirm */
+
+struct cmem_lat_pmu_hw_events {
+	struct perf_event *events[CMEM_LAT_MAX_ACTIVE_EVENTS];	/* event per slot, NULL when free */
+	DECLARE_BITMAP(used_ctrs, CMEM_LAT_MAX_ACTIVE_EVENTS);	/* claimed slots */
+};
+
+struct cmem_lat_pmu {
+	struct pmu pmu;	/* embedded; to_cmem_lat_pmu() maps back from it */
+	struct device *dev;	/* used for the overflow warnings */
+	const char *name;
+	const char *identifier;	/* printed by the "identifier" sysfs file */
+	void __iomem *base_broadcast;	/* target of the CG_CTRL / CTRL writes */
+	void __iomem *base[NUM_INSTANCES];	/* per-instance status and counters */
+	cpumask_t associated_cpus;	/* CPUs accepted in event->cpu */
+	cpumask_t active_cpu;	/* CPU that events get redirected to */
+	struct hlist_node node;	/* NOTE(review): presumably the cpuhp instance node; confirm */
+	struct cmem_lat_pmu_hw_events hw_events;	/* slot bookkeeping */
+};
+
+#define to_cmem_lat_pmu(p) \
+	container_of(p, struct cmem_lat_pmu, pmu)
+
+
+/* Event id: the bits of attr.config selected by CMEM_LAT_MASK_EVENT. */
+static inline u32 get_event_type(struct perf_event *event)
+{
+	return (event->attr.config) & CMEM_LAT_MASK_EVENT;
+}
+
+/* PMU operations. */
+/* Reserve the first unused slot in @hw_events, or fail with -EAGAIN. */
+static int cmem_lat_pmu_get_event_idx(struct cmem_lat_pmu_hw_events *hw_events,
+				struct perf_event *event)
+{
+	unsigned int idx = find_first_zero_bit(hw_events->used_ctrs,
+					       CMEM_LAT_MAX_ACTIVE_EVENTS);
+
+	if (idx >= CMEM_LAT_MAX_ACTIVE_EVENTS)
+		return -EAGAIN;	/* all slots are in use */
+
+	set_bit(idx, hw_events->used_ctrs);
+	return idx;
+}
+
+/*
+ * Account @event against the trial counter set @hw_events. Returns false for
+ * a hardware event of another PMU or when no slot is left.
+ */
+static bool cmem_lat_pmu_validate_event(struct pmu *pmu,
+				 struct cmem_lat_pmu_hw_events *hw_events,
+				 struct perf_event *event)
+{
+	/* Software events are always accepted and use no slot. */
+	if (is_software_event(event))
+		return true;
+
+	/* Reject groups spanning multiple HW PMUs. */
+	if (event->pmu != pmu)
+		return false;
+
+	/* Otherwise the event must be able to claim a free slot. */
+	return cmem_lat_pmu_get_event_idx(hw_events, event) >= 0;
+}
+
+/* Make sure the group of events can be scheduled at once on the PMU. */
+static bool cmem_lat_pmu_validate_group(struct perf_event *event)
+{
+	struct perf_event *leader = event->group_leader;
+	struct cmem_lat_pmu_hw_events trial;
+	struct perf_event *sibling;
+
+	if (leader == event)
+		return true;
+
+	/* Replay the allocation of every group member on an empty counter set. */
+	memset(&trial, 0, sizeof(trial));
+	if (!cmem_lat_pmu_validate_event(event->pmu, &trial, leader))
+		return false;
+
+	for_each_sibling_event(sibling, leader) {
+		if (!cmem_lat_pmu_validate_event(event->pmu, &trial, sibling))
+			return false;
+	}
+	return cmem_lat_pmu_validate_event(event->pmu, &trial, event);
+}
+
+/* perf core ->event_init(): check @event and route it to the owner CPU. */
+static int cmem_lat_pmu_event_init(struct perf_event *event)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+	struct device *pmu_dev = cmem_lat_pmu->pmu.dev;
+	u32 event_type = get_event_type(event);
+	bool per_task = event->attach_state & PERF_ATTACH_TASK;
+
+	/* Not an event of this PMU, or an unknown event id. */
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+	if (event_type >= CMEM_LAT_NUM_EVENTS)
+		return -ENOENT;
+
+	/*
+	 * Sampling, per-process mode, and per-task counters are not supported
+	 * since this PMU is shared across all CPUs.
+	 */
+	if (is_sampling_event(event) || per_task) {
+		dev_dbg(pmu_dev, "Can't support sampling and per-process mode\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (event->cpu < 0) {
+		dev_dbg(pmu_dev, "Can't support per-task counters\n");
+		return -EINVAL;
+	}
+
+	/* Only CPUs associated with this PMU may be requested. */
+	if (!cpumask_test_cpu(event->cpu, &cmem_lat_pmu->associated_cpus)) {
+		dev_dbg(pmu_dev, "Requested cpu is not associated with the PMU\n");
+		return -EINVAL;
+	}
+
+	/* Enforce the current active CPU to handle the events in this PMU. */
+	event->cpu = cpumask_first(&cmem_lat_pmu->active_cpu);
+	if (event->cpu >= nr_cpu_ids)
+		return -EINVAL;
+
+	if (!cmem_lat_pmu_validate_group(event))
+		return -EINVAL;
+
+	/* The counter slot is only assigned in ->add(). */
+	event->hw.idx = -1;
+	event->hw.config = event_type;
+
+	return 0;
+}
+
+static u64 cmem_lat_pmu_read_status(struct cmem_lat_pmu *cmem_lat_pmu,
+				   unsigned int inst)
+{
+	return readq(cmem_lat_pmu->base[inst] + CMEM_LAT_STATUS);	/* status of instance @inst */
+}
+
+/* Read the cycle counter, warning when its overflow status bit is set. */
+static u64 cmem_lat_pmu_read_cycle_counter(struct perf_event *event)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+	const unsigned int instance = 0;
+	void __iomem *regs = cmem_lat_pmu->base[instance];
+
+	/*
+	 * Use the reading from first instance since all instances are
+	 * identical.
+	 */
+	if (cmem_lat_pmu_read_status(cmem_lat_pmu, instance) &
+	    CMEM_LAT_STATUS_CYCLE_OVF)
+		dev_warn(cmem_lat_pmu->dev, "Cycle counter overflow\n");
+
+	return readq(regs + CMEM_LAT_CYCLE_CNTR);
+}
+
+/* Sum the MC0..MC2 request counters of all instances, warning on overflow. */
+static u64 cmem_lat_pmu_read_req_counter(struct perf_event *event)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+	struct device *dev = cmem_lat_pmu->dev;
+	unsigned int inst;
+	u64 total = 0;
+
+	for (inst = 0; inst < NUM_INSTANCES; inst++) {
+		void __iomem *regs = cmem_lat_pmu->base[inst];
+		u64 status = cmem_lat_pmu_read_status(cmem_lat_pmu, inst);
+
+		if (status & CMEM_LAT_STATUS_MC0_REQ_OVF)
+			dev_warn(dev, "MC0 request counter overflow\n");
+		if (status & CMEM_LAT_STATUS_MC1_REQ_OVF)
+			dev_warn(dev, "MC1 request counter overflow\n");
+		if (status & CMEM_LAT_STATUS_MC2_REQ_OVF)
+			dev_warn(dev, "MC2 request counter overflow\n");
+		total += readq(regs + CMEM_LAT_MC0_REQ_CNTR);
+		total += readq(regs + CMEM_LAT_MC1_REQ_CNTR);
+		total += readq(regs + CMEM_LAT_MC2_REQ_CNTR);
+	}
+	return total;
+}
+
+/* Sum the MC0..MC2 AOR counters of all instances, warning on overflow. */
+static u64 cmem_lat_pmu_read_aor_counter(struct perf_event *event)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+	struct device *dev = cmem_lat_pmu->dev;
+	unsigned int inst;
+	u64 total = 0;
+
+	for (inst = 0; inst < NUM_INSTANCES; inst++) {
+		void __iomem *regs = cmem_lat_pmu->base[inst];
+		u64 status = cmem_lat_pmu_read_status(cmem_lat_pmu, inst);
+
+		if (status & CMEM_LAT_STATUS_MC0_AOR_OVF)
+			dev_warn(dev, "MC0 AOR counter overflow\n");
+		if (status & CMEM_LAT_STATUS_MC1_AOR_OVF)
+			dev_warn(dev, "MC1 AOR counter overflow\n");
+		if (status & CMEM_LAT_STATUS_MC2_AOR_OVF)
+			dev_warn(dev, "MC2 AOR counter overflow\n");
+		total += readq(regs + CMEM_LAT_MC0_AOR_CNTR);
+		total += readq(regs + CMEM_LAT_MC1_AOR_CNTR);
+		total += readq(regs + CMEM_LAT_MC2_AOR_CNTR);
+	}
+	return total;
+}
+
+static u64 (* const read_counter_fn[CMEM_LAT_NUM_EVENTS])(struct perf_event *) = {
+	[CMEM_LAT_EVENT_CYCLES] = cmem_lat_pmu_read_cycle_counter,
+	[CMEM_LAT_EVENT_REQ] = cmem_lat_pmu_read_req_counter,
+	[CMEM_LAT_EVENT_AOR] = cmem_lat_pmu_read_aor_counter,
+};	/* read-only dispatch table, indexed by event id */
+
+/* Fold the counter delta since the last snapshot into the event count. */
+static void cmem_lat_pmu_event_update(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u32 event_type = hwc->config;
+	u64 old, cur;
+
+	/* A stopped event is not updated. */
+	if (hwc->state & PERF_HES_STOPPED)
+		return;
+
+	/* Retry until the snapshot is swapped without a concurrent change. */
+	do {
+		old = local64_read(&hwc->prev_count);
+		cur = read_counter_fn[event_type](event);
+	} while (local64_cmpxchg(&hwc->prev_count, old, cur) != old);
+
+	local64_add(cur - old, &event->count);
+	hwc->state |= PERF_HES_UPTODATE;
+}
+
+static void cmem_lat_pmu_start(struct perf_event *event, int pmu_flags)
+{
+	event->hw.state = 0;	/* clears all state flags; pmu_flags is not used */
+}
+
+static void cmem_lat_pmu_stop(struct perf_event *event, int pmu_flags)
+{
+	event->hw.state |= PERF_HES_STOPPED;	/* makes cmem_lat_pmu_event_update() a no-op */
+}
+
+/* perf core ->add(): give @event a counter slot, optionally starting it. */
+static int cmem_lat_pmu_add(struct perf_event *event, int flags)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+	struct cmem_lat_pmu_hw_events *hw_events = &cmem_lat_pmu->hw_events;
+	int this_cpu = smp_processor_id();
+	int idx;
+
+	/* ->add() is expected to run on a CPU associated with the PMU. */
+	if (WARN_ON_ONCE(!cpumask_test_cpu(this_cpu, &cmem_lat_pmu->associated_cpus)))
+		return -ENOENT;
+
+	idx = cmem_lat_pmu_get_event_idx(hw_events, event);
+	if (idx < 0)
+		return idx;
+
+	hw_events->events[idx] = event;
+	event->hw.idx = idx;
+	event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+	if (flags & PERF_EF_START)
+		cmem_lat_pmu_start(event, PERF_EF_RELOAD);
+
+	/* Propagate changes to the userspace mapping. */
+	perf_event_update_userpage(event);
+	return 0;
+}
+
+/* perf core ->del(): stop @event and free the counter slot it occupied. */
+static void cmem_lat_pmu_del(struct perf_event *event, int flags)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+	struct cmem_lat_pmu_hw_events *hw_events = &cmem_lat_pmu->hw_events;
+	const int idx = event->hw.idx;
+
+	cmem_lat_pmu_stop(event, PERF_EF_UPDATE);
+
+	/* Forget the event before marking its slot as free again. */
+	hw_events->events[idx] = NULL;
+	clear_bit(idx, hw_events->used_ctrs);
+
+	perf_event_update_userpage(event);
+}
+
+static void cmem_lat_pmu_read(struct perf_event *event)
+{
+	cmem_lat_pmu_event_update(event);	/* does nothing while the event is stopped */
+}
+
+/* Write @val to CMEM_LAT_CG_CTRL through the broadcast mapping. */
+static inline void cmem_lat_pmu_cg_ctrl(struct cmem_lat_pmu *cmem_lat_pmu, u64 val)
+{
+	writeq(val, cmem_lat_pmu->base_broadcast + CMEM_LAT_CG_CTRL);
+}
+
+static inline void cmem_lat_pmu_ctrl(struct cmem_lat_pmu *cmem_lat_pmu, u64 val)
+{
+	writeq(val, cmem_lat_pmu->base_broadcast + CMEM_LAT_CTRL);
+}
+
+/*
+ * pmu_enable callback. Does nothing while no counter is in use.
+ */
+static void cmem_lat_pmu_enable(struct pmu *pmu)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu);
+
+	if (bitmap_empty(cmem_lat_pmu->hw_events.used_ctrs,
+			 CMEM_LAT_MAX_ACTIVE_EVENTS))
+		return;
+
+	/* Enable all the counters. */
+	cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_ENABLE);
+	cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_ENABLE);
+}
+
+static void cmem_lat_pmu_disable(struct pmu *pmu)
+{
+	int idx;
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu);
+
+	/* Disable all the counters before their values are collected. */
+	cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_DISABLE);
+
+	/*
+	 * The counters will start from 0 again on restart (CMEM_LAT_CTRL_CLR
+	 * is written below), so update every in-use event now to avoid
+	 * losing the counts.
+	 */
+	for_each_set_bit(idx, cmem_lat_pmu->hw_events.used_ctrs,
+						CMEM_LAT_MAX_ACTIVE_EVENTS) {
+		struct perf_event *event = cmem_lat_pmu->hw_events.events[idx];
+
+		if (!event)
+			continue;
+
+		cmem_lat_pmu_event_update(event);
+		/* Rebase so the next delta is taken against a counter at 0. */
+		local64_set(&event->hw.prev_count, 0ULL);
+	}
+
+	cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_CLR);
+	cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_DISABLE);
+}
+
+/* PMU identifier attribute. */
+
+static ssize_t cmem_lat_pmu_identifier_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *page)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(page, "%s\n", cmem_lat_pmu->identifier);
+}
+
+static struct device_attribute cmem_lat_pmu_identifier_attr =
+	__ATTR(identifier, 0444, cmem_lat_pmu_identifier_show, NULL);
+
+static struct attribute *cmem_lat_pmu_identifier_attrs[] = {
+	&cmem_lat_pmu_identifier_attr.attr,
+	NULL
+};
+
+static struct attribute_group cmem_lat_pmu_identifier_attr_group = {
+	.attrs = cmem_lat_pmu_identifier_attrs,
+};
+
+/* Format attributes. */
+
+#define NV_PMU_EXT_ATTR(_name, _func, _config)			\
+	(&((struct dev_ext_attribute[]){				\
+		{							\
+			.attr = __ATTR(_name, 0444, _func, NULL),	\
+			.var = (void *)_config				\
+		}							\
+	})[0].attr.attr)
+
+static struct attribute *cmem_lat_pmu_formats[] = {
+	NV_PMU_EXT_ATTR(event, device_show_string, "config:0-1"),
+	NULL
+};
+
+static const struct attribute_group cmem_lat_pmu_format_group = {
+	.name = "format",
+	.attrs = cmem_lat_pmu_formats,
+};
+
+/* Event attributes. */
+
+static ssize_t cmem_lat_pmu_sysfs_event_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct perf_pmu_events_attr *pmu_attr;
+
+	pmu_attr = container_of(attr, typeof(*pmu_attr), attr);
+	return sysfs_emit(buf, "event=0x%llx\n", pmu_attr->id);
+}
+
+#define NV_PMU_EVENT_ATTR(_name, _config)	\
+	PMU_EVENT_ATTR_ID(_name, cmem_lat_pmu_sysfs_event_show, _config)
+
+static struct attribute *cmem_lat_pmu_events[] = {
+	NV_PMU_EVENT_ATTR(cycles, CMEM_LAT_EVENT_CYCLES),
+	NV_PMU_EVENT_ATTR(rd_req, CMEM_LAT_EVENT_REQ),
+	NV_PMU_EVENT_ATTR(rd_cum_outs, CMEM_LAT_EVENT_AOR),
+	NULL
+};
+
+static const struct attribute_group cmem_lat_pmu_events_group = {
+	.name = "events",
+	.attrs = cmem_lat_pmu_events,
+};
+
+/* Cpumask attributes. */
+
+/*
+ * sysfs show() shared by the "cpumask" and "associated_cpus" attributes; the
+ * id stored in the extended attribute's ->var selects which mask is printed.
+ */
+static ssize_t cmem_lat_pmu_cpumask_show(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(dev_get_drvdata(dev));
+	struct dev_ext_attribute *ext =
+		container_of(attr, struct dev_ext_attribute, attr);
+
+	switch ((unsigned long)ext->var) {
+	case CMEM_LAT_ACTIVE_CPU_MASK:
+		return cpumap_print_to_pagebuf(true, buf,
+					       &cmem_lat_pmu->active_cpu);
+	case CMEM_LAT_ASSOCIATED_CPU_MASK:
+		return cpumap_print_to_pagebuf(true, buf,
+					       &cmem_lat_pmu->associated_cpus);
+	default:
+		return 0;
+	}
+}
+
+#define NV_PMU_CPUMASK_ATTR(_name, _config)			\
+	NV_PMU_EXT_ATTR(_name, cmem_lat_pmu_cpumask_show,	\
+				(unsigned long)_config)
+
+static struct attribute *cmem_lat_pmu_cpumask_attrs[] = {
+	NV_PMU_CPUMASK_ATTR(cpumask, CMEM_LAT_ACTIVE_CPU_MASK),
+	NV_PMU_CPUMASK_ATTR(associated_cpus, CMEM_LAT_ASSOCIATED_CPU_MASK),
+	NULL
+};
+
+static const struct attribute_group cmem_lat_pmu_cpumask_attr_group = {
+	.attrs = cmem_lat_pmu_cpumask_attrs,
+};
+
+/* Per PMU device attribute groups. */
+
+static const struct attribute_group *cmem_lat_pmu_attr_groups[] = {
+	&cmem_lat_pmu_identifier_attr_group,
+	&cmem_lat_pmu_format_group,
+	&cmem_lat_pmu_events_group,
+	&cmem_lat_pmu_cpumask_attr_group,
+	NULL
+};
+
+/*
+ * CPU hotplug online callback: make @cpu the CPU used for event counting if
+ * it is associated with this PMU and the PMU is not managed by a CPU yet.
+ */
+static int cmem_lat_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu =
+		hlist_entry_safe(node, struct cmem_lat_pmu, node);
+
+	if (!cpumask_test_cpu(cpu, &cmem_lat_pmu->associated_cpus))
+		return 0;
+
+	if (cpumask_empty(&cmem_lat_pmu->active_cpu))
+		cpumask_set_cpu(cpu, &cmem_lat_pmu->active_cpu);
+
+	return 0;
+}
+
+/*
+ * CPU hotplug teardown callback: if the outgoing @cpu owns the PMU, hand the
+ * perf context over to another online CPU associated with the PMU, if any.
+ */
+static int cmem_lat_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu =
+		hlist_entry_safe(node, struct cmem_lat_pmu, node);
+	unsigned int target;
+
+	if (!cpumask_test_and_clear_cpu(cpu, &cmem_lat_pmu->active_cpu))
+		return 0;
+
+	/* Choose a new CPU to migrate ownership of the PMU to */
+	target = cpumask_any_and_but(&cmem_lat_pmu->associated_cpus,
+				     cpu_online_mask, cpu);
+	if (target < nr_cpu_ids) {
+		perf_pmu_migrate_context(&cmem_lat_pmu->pmu, cpu, target);
+		cpumask_set_cpu(target, &cmem_lat_pmu->active_cpu);
+	}
+
+	return 0;
+}
+
+/* Build associated_cpus from the possible CPUs on NUMA node @socket. */
+static int cmem_lat_pmu_get_cpus(struct cmem_lat_pmu *cmem_lat_pmu,
+				unsigned int socket)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		if (cpu_to_node(cpu) == socket)
+			cpumask_set_cpu(cpu, &cmem_lat_pmu->associated_cpus);
+	}
+
+	if (!cpumask_empty(&cmem_lat_pmu->associated_cpus))
+		return 0;
+
+	dev_dbg(cmem_lat_pmu->dev,
+		"No cpu associated with PMU socket-%u\n", socket);
+	return -ENODEV;
+}
+
+static int cmem_lat_pmu_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct acpi_device *acpi_dev;
+	struct cmem_lat_pmu *cmem_lat_pmu;
+	char *name, *uid_str;
+	int ret, i;
+	u32 socket;
+
+	acpi_dev = ACPI_COMPANION(dev);
+	if (!acpi_dev)
+		return -ENODEV;
+
+	uid_str = acpi_device_uid(acpi_dev);
+	if (!uid_str)
+		return -ENODEV;
+
+	ret = kstrtou32(uid_str, 0, &socket);
+	if (ret)
+		return ret;
+
+	cmem_lat_pmu = devm_kzalloc(dev, sizeof(*cmem_lat_pmu), GFP_KERNEL);
+	name = devm_kasprintf(dev, GFP_KERNEL, "nvidia_cmem_latency_pmu_%u", socket);
+	if (!cmem_lat_pmu || !name)
+		return -ENOMEM;
+
+	cmem_lat_pmu->dev = dev;
+	cmem_lat_pmu->name = name;
+	cmem_lat_pmu->identifier = acpi_device_hid(acpi_dev);
+	platform_set_drvdata(pdev, cmem_lat_pmu);
+
+	cmem_lat_pmu->pmu = (struct pmu) {
+		.parent		= &pdev->dev,
+		.task_ctx_nr	= perf_invalid_context,
+		.pmu_enable	= cmem_lat_pmu_enable,
+		.pmu_disable	= cmem_lat_pmu_disable,
+		.event_init	= cmem_lat_pmu_event_init,
+		.add		= cmem_lat_pmu_add,
+		.del		= cmem_lat_pmu_del,
+		.start		= cmem_lat_pmu_start,
+		.stop		= cmem_lat_pmu_stop,
+		.read		= cmem_lat_pmu_read,
+		.attr_groups	= cmem_lat_pmu_attr_groups,
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE |
+					PERF_PMU_CAP_NO_INTERRUPT,
+	};
+
+	/* Map the address of all the instances. */
+	for (i = 0; i < NUM_INSTANCES; i++) {
+		cmem_lat_pmu->base[i] = devm_platform_ioremap_resource(pdev, i);
+		if (IS_ERR(cmem_lat_pmu->base[i])) {
+			dev_err(dev, "Failed map address for instance %d\n", i);
+			return PTR_ERR(cmem_lat_pmu->base[i]);
+		}
+	}
+
+	/* Map broadcast address. */
+	cmem_lat_pmu->base_broadcast = devm_platform_ioremap_resource(pdev,
+										NUM_INSTANCES);
+	if (IS_ERR(cmem_lat_pmu->base_broadcast)) {
+		dev_err(dev, "Failed map broadcast address\n");
+		return PTR_ERR(cmem_lat_pmu->base_broadcast);
+	}
+
+	ret = cmem_lat_pmu_get_cpus(cmem_lat_pmu, socket);
+	if (ret)
+		return ret;
+
+	ret = cpuhp_state_add_instance(cmem_lat_pmu_cpuhp_state,
+				       &cmem_lat_pmu->node);
+	if (ret) {
+		dev_err(&pdev->dev, "Error %d registering hotplug\n", ret);
+		return ret;
+	}
+
+	cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_ENABLE);
+	cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_CLR);
+	cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_DISABLE);
+
+	ret = perf_pmu_register(&cmem_lat_pmu->pmu, name, -1);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to register PMU: %d\n", ret);
+		cpuhp_state_remove_instance(cmem_lat_pmu_cpuhp_state,
+					    &cmem_lat_pmu->node);
+		return ret;
+	}
+
+	dev_dbg(&pdev->dev, "Registered %s PMU\n", name);
+
+	return 0;
+}
+
+static void cmem_lat_pmu_device_remove(struct platform_device *pdev)
+{
+	struct cmem_lat_pmu *cmem_lat_pmu = platform_get_drvdata(pdev);
+
+	perf_pmu_unregister(&cmem_lat_pmu->pmu);
+	cpuhp_state_remove_instance(cmem_lat_pmu_cpuhp_state,
+				    &cmem_lat_pmu->node);
+}
+
+static const struct acpi_device_id cmem_lat_pmu_acpi_match[] = {
+	{ "NVDA2021" },
+	{ }
+};
+MODULE_DEVICE_TABLE(acpi, cmem_lat_pmu_acpi_match);
+
+static struct platform_driver cmem_lat_pmu_driver = {
+	.driver = {
+		.name = "nvidia-t410-cmem-latency-pmu",
+		.acpi_match_table = ACPI_PTR(cmem_lat_pmu_acpi_match),
+		.suppress_bind_attrs = true,
+	},
+	.probe = cmem_lat_pmu_probe,
+	.remove = cmem_lat_pmu_device_remove,
+};
+
+static int __init cmem_lat_pmu_init(void)
+{
+	int ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+					  "perf/nvidia/cmem_latency:online",
+					  cmem_lat_pmu_cpu_online,
+					  cmem_lat_pmu_cpu_teardown);
+
+	if (ret < 0)
+		return ret;
+	cmem_lat_pmu_cpuhp_state = ret;
+	ret = platform_driver_register(&cmem_lat_pmu_driver);
+	if (ret) /* don't leave the dynamic hotplug state allocated */
+		cpuhp_remove_multi_state(cmem_lat_pmu_cpuhp_state);
+	return ret;
+}
+
+static void __exit cmem_lat_pmu_exit(void)
+{
+	platform_driver_unregister(&cmem_lat_pmu_driver);
+	cpuhp_remove_multi_state(cmem_lat_pmu_cpuhp_state);
+}
+
+module_init(cmem_lat_pmu_init);
+module_exit(cmem_lat_pmu_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("NVIDIA Tegra410 CPU Memory Latency PMU driver");
+MODULE_AUTHOR("Besar Wicaksono <bwicaksono@nvidia.com>");
diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig
index c808e0470394..672abea3b03c 100644
--- a/drivers/resctrl/Kconfig
+++ b/drivers/resctrl/Kconfig
@@ -1,6 +1,7 @@
 menuconfig ARM64_MPAM_DRIVER
 	bool "MPAM driver"
-	depends on ARM64 && ARM64_MPAM && EXPERT
+	depends on ARM64 && ARM64_MPAM
+	select ACPI_MPAM if ACPI
 	help
 	  Memory System Resource Partitioning and Monitoring (MPAM) driver for
 	  System IP, e.g. caches and memory controllers.
@@ -22,3 +23,9 @@ config MPAM_KUNIT_TEST
 	  If unsure, say N.
 
 endif
+
+config ARM64_MPAM_RESCTRL_FS
+	bool
+	default y if ARM64_MPAM_DRIVER && RESCTRL_FS
+	select RESCTRL_RMID_DEPENDS_ON_CLOSID
+	select RESCTRL_ASSIGN_FIXED
diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile
index 898199dcf80d..4f6d0e81f9b8 100644
--- a/drivers/resctrl/Makefile
+++ b/drivers/resctrl/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_ARM64_MPAM_DRIVER)			+= mpam.o
 mpam-y						+= mpam_devices.o
+mpam-$(CONFIG_ARM64_MPAM_RESCTRL_FS)		+= mpam_resctrl.o
 
 ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG)	+= -DDEBUG
diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
index 0666be6b0e88..41b14344b16f 100644
--- a/drivers/resctrl/mpam_devices.c
+++ b/drivers/resctrl/mpam_devices.c
@@ -29,7 +29,15 @@
 
 #include "mpam_internal.h"
 
-DEFINE_STATIC_KEY_FALSE(mpam_enabled); /* This moves to arch code */
+/* Values for the T241 errata workaround */
+#define T241_CHIPS_MAX			4
+#define T241_CHIP_NSLICES		12
+#define T241_SPARE_REG0_OFF		0x1b0000
+#define T241_SPARE_REG1_OFF		0x1c0000
+#define T241_CHIP_ID(phys)		FIELD_GET(GENMASK_ULL(44, 43), phys)
+#define T241_SHADOW_REG_OFF(sidx, pid)	(0x360048 + (sidx) * 0x10000 + (pid) * 8)
+#define SMCCC_SOC_ID_T241		0x036b0241
+static void __iomem *t241_scratch_regs[T241_CHIPS_MAX];
 
 /*
  * mpam_list_lock protects the SRCU lists when writing. Once the
@@ -75,6 +83,14 @@ static DECLARE_WORK(mpam_broken_work, &mpam_disable);
 /* When mpam is disabled, the printed reason to aid debugging */
 static char *mpam_disable_reason;
 
+/*
+ * Whether resctrl has been set up. Used by cpuhp in preference to
+ * mpam_is_enabled(). The disable call after an error interrupt makes
+ * mpam_is_enabled() false before the cpuhp callbacks are made.
+ * Reads/writes should hold mpam_cpuhp_state_lock (or be cpuhp callbacks).
+ */
+static bool mpam_resctrl_enabled;
+
 /*
  * An MSC is a physical container for controls and monitors, each identified by
  * their RIS index. These share a base-address, interrupts and some MMIO
@@ -624,6 +640,86 @@ static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc,
 	return ERR_PTR(-ENOENT);
 }
 
+static int mpam_enable_quirk_nvidia_t241_1(struct mpam_msc *msc,
+					   const struct mpam_quirk *quirk)
+{
+	s32 soc_id = arm_smccc_get_soc_id_version();
+	struct resource *r;
+	phys_addr_t phys;
+
+	/*
+	 * A mapping to a device other than the MSC is needed, so check that
+	 * the SOC_ID is the NVIDIA T241 chip (036b:0241)
+	 */
+	if (soc_id < 0 || soc_id != SMCCC_SOC_ID_T241)
+		return -EINVAL;
+
+	r = platform_get_resource(msc->pdev, IORESOURCE_MEM, 0);
+	if (!r)
+		return -EINVAL;
+
+	/* Find the internal registers base addr from the CHIP ID */
+	msc->t241_id = T241_CHIP_ID(r->start);
+	phys = FIELD_PREP(GENMASK_ULL(45, 44), msc->t241_id) | 0x19000000ULL;
+
+	t241_scratch_regs[msc->t241_id] = ioremap(phys, SZ_8M);
+	if (WARN_ON_ONCE(!t241_scratch_regs[msc->t241_id]))
+		return -EINVAL;
+
+	pr_info_once("Enabled workaround for NVIDIA T241 erratum T241-MPAM-1\n");
+
+	return 0;
+}
+
+static const struct mpam_quirk mpam_quirks[] = {
+	{
+		/* NVIDIA t241 erratum T241-MPAM-1 */
+		.init       = mpam_enable_quirk_nvidia_t241_1,
+		.iidr       = MPAM_IIDR_NVIDIA_T241,
+		.iidr_mask  = MPAM_IIDR_MATCH_ONE,
+		.workaround = T241_SCRUB_SHADOW_REGS,
+	},
+	{
+		/* NVIDIA t241 erratum T241-MPAM-4 */
+		.iidr       = MPAM_IIDR_NVIDIA_T241,
+		.iidr_mask  = MPAM_IIDR_MATCH_ONE,
+		.workaround = T241_FORCE_MBW_MIN_TO_ONE,
+	},
+	{
+		/* NVIDIA t241 erratum T241-MPAM-6 */
+		.iidr       = MPAM_IIDR_NVIDIA_T241,
+		.iidr_mask  = MPAM_IIDR_MATCH_ONE,
+		.workaround = T241_MBW_COUNTER_SCALE_64,
+	},
+	{
+		/* ARM CMN-650 CSU erratum 3642720 */
+		.iidr       = MPAM_IIDR_ARM_CMN_650,
+		.iidr_mask  = MPAM_IIDR_MATCH_ONE,
+		.workaround = IGNORE_CSU_NRDY,
+	},
+	{ NULL } /* Sentinel */
+};
+
+/*
+ * Walk mpam_quirks[] and set every workaround whose IIDR pattern matches
+ * @msc. A quirk with an init hook is only set if that hook returns 0.
+ */
+static void mpam_enable_quirks(struct mpam_msc *msc)
+{
+	const struct mpam_quirk *q;
+
+	for (q = mpam_quirks; q->iidr_mask; q++) {
+		if (q->iidr != (msc->iidr & q->iidr_mask))
+			continue;
+
+		/* The optional init hook must succeed. */
+		if (q->init && q->init(msc, q))
+			continue;
+
+		mpam_set_quirk(q->workaround, msc);
+	}
+}
+
 /*
  * IHI009A.a has this nugget: "If a monitor does not support automatic behaviour
  * of NRDY, software can use this bit for any purpose" - so hardware might not
@@ -715,6 +811,13 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris)
 			mpam_set_feature(mpam_feat_mbw_part, props);
 
 		props->bwa_wd = FIELD_GET(MPAMF_MBW_IDR_BWA_WD, mbw_features);
+
+		/*
+		 * The BWA_WD field can represent 0-63, but the control fields it
+		 * describes have a maximum of 16 bits.
+		 */
+		props->bwa_wd = min(props->bwa_wd, 16);
+
 		if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features))
 			mpam_set_feature(mpam_feat_mbw_max, props);
 
@@ -851,8 +954,11 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc)
 	/* Grab an IDR value to find out how many RIS there are */
 	mutex_lock(&msc->part_sel_lock);
 	idr = mpam_msc_read_idr(msc);
+	msc->iidr = mpam_read_partsel_reg(msc, IIDR);
 	mutex_unlock(&msc->part_sel_lock);
 
+	mpam_enable_quirks(msc);
+
 	msc->ris_max = FIELD_GET(MPAMF_IDR_RIS_MAX, idr);
 
 	/* Use these values so partid/pmg always starts with a valid value */
@@ -903,6 +1009,7 @@ struct mon_read {
 	enum mpam_device_features	type;
 	u64				*val;
 	int				err;
+	bool				waited_timeout;
 };
 
 static bool mpam_ris_has_mbwu_long_counter(struct mpam_msc_ris *ris)
@@ -1052,7 +1159,7 @@ static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val,
 	}
 }
 
-static u64 mpam_msmon_overflow_val(enum mpam_device_features type)
+static u64 __mpam_msmon_overflow_val(enum mpam_device_features type)
 {
 	/* TODO: implement scaling counters */
 	switch (type) {
@@ -1067,6 +1174,18 @@ static u64 mpam_msmon_overflow_val(enum mpam_device_features type)
 	}
 }
 
+static u64 mpam_msmon_overflow_val(enum mpam_device_features type,
+				   struct mpam_msc *msc)
+{
+	u64 overflow_val = __mpam_msmon_overflow_val(type);
+
+	if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc) &&
+	    type != mpam_feat_msmon_mbwu_63counter)
+		overflow_val *= 64;
+
+	return overflow_val;
+}
+
 static void __ris_msmon_read(void *arg)
 {
 	u64 now;
@@ -1137,6 +1256,10 @@ static void __ris_msmon_read(void *arg)
 		if (mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, rprops))
 			nrdy = now & MSMON___NRDY;
 		now = FIELD_GET(MSMON___VALUE, now);
+
+		if (mpam_has_quirk(IGNORE_CSU_NRDY, msc) && m->waited_timeout)
+			nrdy = false;
+
 		break;
 	case mpam_feat_msmon_mbwu_31counter:
 	case mpam_feat_msmon_mbwu_44counter:
@@ -1157,13 +1280,17 @@ static void __ris_msmon_read(void *arg)
 			now = FIELD_GET(MSMON___VALUE, now);
 		}
 
+		if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc) &&
+		    m->type != mpam_feat_msmon_mbwu_63counter)
+			now *= 64;
+
 		if (nrdy)
 			break;
 
 		mbwu_state = &ris->mbwu_state[ctx->mon];
 
 		if (overflow)
-			mbwu_state->correction += mpam_msmon_overflow_val(m->type);
+			mbwu_state->correction += mpam_msmon_overflow_val(m->type, msc);
 
 		/*
 		 * Include bandwidth consumed before the last hardware reset and
@@ -1270,6 +1397,7 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx,
 			.ctx = ctx,
 			.type = type,
 			.val = val,
+			.waited_timeout = true,
 		};
 		*val = 0;
 
@@ -1338,6 +1466,75 @@ static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd)
 	__mpam_write_reg(msc, reg, bm);
 }
 
+/*
+ * T241-MPAM-1 workaround, run after @partid has been reprogrammed: poll until
+ * the shadow register for @partid reads the same in every slice (warning once
+ * if it never does), then write zero to the two spare registers.
+ */
+static void mpam_apply_t241_erratum(struct mpam_msc_ris *ris, u16 partid)
+{
+	int sidx, i, lcount = 1000;
+	void __iomem *regs = t241_scratch_regs[ris->vmsc->msc->t241_id];
+	u64 val0, val;
+
+	for (i = 0; i < lcount; i++) {
+		val0 = readq_relaxed(regs + T241_SHADOW_REG_OFF(0, partid));
+		for (sidx = 1; sidx < T241_CHIP_NSLICES; sidx++) {
+			val = readq_relaxed(regs +
+					    T241_SHADOW_REG_OFF(sidx, partid));
+			if (val != val0)
+				break;
+		}
+		if (sidx == T241_CHIP_NSLICES)
+			break;
+	}
+
+	if (i == lcount)
+		pr_warn_once("t241: inconsistent values in shadow regs\n");
+
+	/* Zero the spare registers so the MBW configuration takes effect */
+	writeq_relaxed(0, regs + T241_SPARE_REG0_OFF);
+	writeq_relaxed(0, regs + T241_SPARE_REG1_OFF);
+}
+
+static void mpam_quirk_post_config_change(struct mpam_msc_ris *ris, u16 partid,
+					  struct mpam_config *cfg)
+{
+	if (mpam_has_quirk(T241_SCRUB_SHADOW_REGS, ris->vmsc->msc))
+		mpam_apply_t241_erratum(ris, partid);
+}
+
+/* Lowest non-zero value the bwa_wd-bit wide MBW_MIN field can hold. */
+static u16 mpam_wa_t241_force_mbw_min_to_one(struct mpam_props *props)
+{
+	u16 res0_bits = 16 - props->bwa_wd;
+	u16 max_hw_value = ((1 << props->bwa_wd) - 1) << res0_bits;
+	u16 min_hw_granule = ~max_hw_value;
+
+	/* All res0 bits are set here; +1 carries into the field's LSB */
+	return min_hw_granule + 1;
+}
+
+/*
+ * Derive a minimum from the configured mbw_max (or the RIS default when
+ * resetting): 5% of MPAMCFG_MBW_MAX_MAX, less one, below it; never negative.
+ */
+static u16 mpam_wa_t241_calc_min_from_max(struct mpam_props *props,
+					  struct mpam_config *cfg)
+{
+	u16 delta = ((5 * MPAMCFG_MBW_MAX_MAX) / 100) - 1;
+	u16 limit;
+
+	if (mpam_has_feature(mpam_feat_mbw_max, cfg))
+		limit = cfg->mbw_max;
+	else	/* Resetting. Hence, use the ris specific default. */
+		limit = GENMASK(15, 16 - props->bwa_wd);
+
+	if (limit <= delta)
+		return 0;
+	return limit - delta;
+}
+
 /* Called via IPI. Call while holding an SRCU reference */
 static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid,
 				      struct mpam_config *cfg)
@@ -1364,36 +1561,41 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid,
 		__mpam_intpart_sel(ris->ris_idx, partid, msc);
 	}
 
-	if (mpam_has_feature(mpam_feat_cpor_part, rprops) &&
-	    mpam_has_feature(mpam_feat_cpor_part, cfg)) {
-		if (cfg->reset_cpbm)
-			mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd);
-		else
+	if (mpam_has_feature(mpam_feat_cpor_part, rprops)) {
+		if (mpam_has_feature(mpam_feat_cpor_part, cfg))
 			mpam_write_partsel_reg(msc, CPBM, cfg->cpbm);
+		else
+			mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd);
 	}
 
-	if (mpam_has_feature(mpam_feat_mbw_part, rprops) &&
-	    mpam_has_feature(mpam_feat_mbw_part, cfg)) {
-		if (cfg->reset_mbw_pbm)
+	if (mpam_has_feature(mpam_feat_mbw_part, rprops)) {
+		if (mpam_has_feature(mpam_feat_mbw_part, cfg))
+			mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm);
+		else
 			mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, rprops->mbw_pbm_bits);
-		else
-			mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm);
 	}
 
-	if (mpam_has_feature(mpam_feat_mbw_min, rprops) &&
-	    mpam_has_feature(mpam_feat_mbw_min, cfg))
-		mpam_write_partsel_reg(msc, MBW_MIN, 0);
+	if (mpam_has_feature(mpam_feat_mbw_min, rprops)) {
+		u16 val = 0;
 
-	if (mpam_has_feature(mpam_feat_mbw_max, rprops) &&
-	    mpam_has_feature(mpam_feat_mbw_max, cfg)) {
-		if (cfg->reset_mbw_max)
-			mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX);
-		else
-			mpam_write_partsel_reg(msc, MBW_MAX, cfg->mbw_max);
+		if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, msc)) {
+			u16 min = mpam_wa_t241_force_mbw_min_to_one(rprops);
+
+			val = mpam_wa_t241_calc_min_from_max(rprops, cfg);
+			val = max(val, min);
+		}
+
+		mpam_write_partsel_reg(msc, MBW_MIN, val);
 	}
 
-	if (mpam_has_feature(mpam_feat_mbw_prop, rprops) &&
-	    mpam_has_feature(mpam_feat_mbw_prop, cfg))
+	if (mpam_has_feature(mpam_feat_mbw_max, rprops)) {
+		if (mpam_has_feature(mpam_feat_mbw_max, cfg))
+			mpam_write_partsel_reg(msc, MBW_MAX, cfg->mbw_max);
+		else
+			mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX);
+	}
+
+	if (mpam_has_feature(mpam_feat_mbw_prop, rprops))
 		mpam_write_partsel_reg(msc, MBW_PROP, 0);
 
 	if (mpam_has_feature(mpam_feat_cmax_cmax, rprops))
@@ -1421,6 +1623,8 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid,
 		mpam_write_partsel_reg(msc, PRI, pri_val);
 	}
 
+	mpam_quirk_post_config_change(ris, partid, cfg);
+
 	mutex_unlock(&msc->part_sel_lock);
 }
 
@@ -1493,16 +1697,6 @@ static int mpam_save_mbwu_state(void *arg)
 	return 0;
 }
 
-static void mpam_init_reset_cfg(struct mpam_config *reset_cfg)
-{
-	*reset_cfg = (struct mpam_config) {
-		.reset_cpbm = true,
-		.reset_mbw_pbm = true,
-		.reset_mbw_max = true,
-	};
-	bitmap_fill(reset_cfg->features, MPAM_FEATURE_LAST);
-}
-
 /*
  * Called via smp_call_on_cpu() to prevent migration, while still being
  * pre-emptible. Caller must hold mpam_srcu.
@@ -1510,14 +1704,12 @@ static void mpam_init_reset_cfg(struct mpam_config *reset_cfg)
 static int mpam_reset_ris(void *arg)
 {
 	u16 partid, partid_max;
-	struct mpam_config reset_cfg;
+	struct mpam_config reset_cfg = {};
 	struct mpam_msc_ris *ris = arg;
 
 	if (ris->in_reset_state)
 		return 0;
 
-	mpam_init_reset_cfg(&reset_cfg);
-
 	spin_lock(&partid_max_lock);
 	partid_max = mpam_partid_max;
 	spin_unlock(&partid_max_lock);
@@ -1632,6 +1824,9 @@ static int mpam_cpu_online(unsigned int cpu)
 			mpam_reprogram_msc(msc);
 	}
 
+	if (mpam_resctrl_enabled)
+		return mpam_resctrl_online_cpu(cpu);
+
 	return 0;
 }
 
@@ -1675,6 +1870,9 @@ static int mpam_cpu_offline(unsigned int cpu)
 {
 	struct mpam_msc *msc;
 
+	if (mpam_resctrl_enabled)
+		mpam_resctrl_offline_cpu(cpu);
+
 	guard(srcu)(&mpam_srcu);
 	list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list,
 				 srcu_read_lock_held(&mpam_srcu)) {
@@ -1971,6 +2169,7 @@ static bool mpam_has_cmax_wd_feature(struct mpam_props *props)
  * resulting safe value must be compatible with both. When merging values in
  * the tree, all the aliasing resources must be handled first.
  * On mismatch, parent is modified.
+ * Quirks on an MSC will apply to all MSC in that class.
  */
 static void __props_mismatch(struct mpam_props *parent,
 			     struct mpam_props *child, bool alias)
@@ -2090,6 +2289,7 @@ static void __props_mismatch(struct mpam_props *parent,
  * nobble the class feature, as we can't configure all the resources.
  * e.g. The L3 cache is composed of two resources with 13 and 17 portion
  * bitmaps respectively.
+ * Quirks on an MSC will apply to all MSC in that class.
  */
 static void
 __class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc)
@@ -2103,6 +2303,9 @@ __class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc)
 	dev_dbg(dev, "Merging features for class:0x%lx &= vmsc:0x%lx\n",
 		(long)cprops->features, (long)vprops->features);
 
+	/* Merge quirks */
+	class->quirks |= vmsc->msc->quirks;
+
 	/* Take the safe value for any common features */
 	__props_mismatch(cprops, vprops, false);
 }
@@ -2167,6 +2370,9 @@ static void mpam_enable_merge_class_features(struct mpam_component *comp)
 
 	list_for_each_entry(vmsc, &comp->vmsc, comp_list)
 		__class_props_mismatch(class, vmsc);
+
+	if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, class))
+		mpam_clear_feature(mpam_feat_mbw_min, &class->props);
 }
 
 /*
@@ -2520,6 +2726,12 @@ static void mpam_enable_once(void)
 	mutex_unlock(&mpam_list_lock);
 	cpus_read_unlock();
 
+	if (!err) {
+		err = mpam_resctrl_setup();
+		if (err)
+			pr_err("Failed to initialise resctrl: %d\n", err);
+	}
+
 	if (err) {
 		mpam_disable_reason = "Failed to enable.";
 		schedule_work(&mpam_broken_work);
@@ -2527,6 +2739,7 @@ static void mpam_enable_once(void)
 	}
 
 	static_branch_enable(&mpam_enabled);
+	mpam_resctrl_enabled = true;
 	mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline,
 				      "mpam:online");
 
@@ -2559,7 +2772,7 @@ static void mpam_reset_component_locked(struct mpam_component *comp)
 	}
 }
 
-static void mpam_reset_class_locked(struct mpam_class *class)
+void mpam_reset_class_locked(struct mpam_class *class)
 {
 	struct mpam_component *comp;
 
@@ -2586,24 +2799,39 @@ static void mpam_reset_class(struct mpam_class *class)
 void mpam_disable(struct work_struct *ignored)
 {
 	int idx;
+	bool do_resctrl_exit;
 	struct mpam_class *class;
 	struct mpam_msc *msc, *tmp;
 
+	if (mpam_is_enabled())
+		static_branch_disable(&mpam_enabled);
+
 	mutex_lock(&mpam_cpuhp_state_lock);
 	if (mpam_cpuhp_state) {
 		cpuhp_remove_state(mpam_cpuhp_state);
 		mpam_cpuhp_state = 0;
 	}
+
+	/*
+	 * Removing the cpuhp state called mpam_cpu_offline() and told resctrl
+	 * all the CPUs are offline.
+	 */
+	do_resctrl_exit = mpam_resctrl_enabled;
+	mpam_resctrl_enabled = false;
 	mutex_unlock(&mpam_cpuhp_state_lock);
 
-	static_branch_disable(&mpam_enabled);
+	if (do_resctrl_exit)
+		mpam_resctrl_exit();
 
 	mpam_unregister_irqs();
 
 	idx = srcu_read_lock(&mpam_srcu);
 	list_for_each_entry_srcu(class, &mpam_classes, classes_list,
-				 srcu_read_lock_held(&mpam_srcu))
+				 srcu_read_lock_held(&mpam_srcu)) {
 		mpam_reset_class(class);
+		if (do_resctrl_exit)
+			mpam_resctrl_teardown_class(class);
+	}
 	srcu_read_unlock(&mpam_srcu, idx);
 
 	mutex_lock(&mpam_list_lock);
@@ -2694,6 +2922,7 @@ int mpam_apply_config(struct mpam_component *comp, u16 partid,
 					 srcu_read_lock_held(&mpam_srcu)) {
 			arg.ris = ris;
 			mpam_touch_msc(msc, __write_config, &arg);
+			ris->in_reset_state = false;
 		}
 		mutex_unlock(&msc->cfg_lock);
 	}
diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
index e8971842b124..1914aefdcba9 100644
--- a/drivers/resctrl/mpam_internal.h
+++ b/drivers/resctrl/mpam_internal.h
@@ -12,22 +12,31 @@
 #include <linux/jump_label.h>
 #include <linux/llist.h>
 #include <linux/mutex.h>
+#include <linux/resctrl.h>
 #include <linux/spinlock.h>
 #include <linux/srcu.h>
 #include <linux/types.h>
 
+#include <asm/mpam.h>
+
 #define MPAM_MSC_MAX_NUM_RIS	16
 
 struct platform_device;
 
-DECLARE_STATIC_KEY_FALSE(mpam_enabled);
-
 #ifdef CONFIG_MPAM_KUNIT_TEST
 #define PACKED_FOR_KUNIT __packed
 #else
 #define PACKED_FOR_KUNIT
 #endif
 
+/*
+ * These 'mon' values must not alias an actual monitor, so must be larger than
+ * U16_MAX, but must not be confused with an errno value, so must be smaller
+ * than (u32)-SZ_4K.
+ * USE_PRE_ALLOCATED is used to avoid confusion with an actual monitor.
+ */
+#define USE_PRE_ALLOCATED	(U16_MAX + 1)
+
 static inline bool mpam_is_enabled(void)
 {
 	return static_branch_likely(&mpam_enabled);
@@ -76,6 +85,8 @@ struct mpam_msc {
 	u8			pmg_max;
 	unsigned long		ris_idxs;
 	u32			ris_max;
+	u32			iidr;
+	u16			quirks;
 
 	/*
 	 * error_irq_lock is taken when registering/unregistering the error
@@ -119,6 +130,9 @@ struct mpam_msc {
 	void __iomem		*mapped_hwpage;
 	size_t			mapped_hwpage_sz;
 
+	/* Values only used on some platforms for quirks */
+	u32			t241_id;
+
 	struct mpam_garbage	garbage;
 };
 
@@ -207,6 +221,42 @@ struct mpam_props {
 #define mpam_set_feature(_feat, x)	__set_bit(_feat, (x)->features)
 #define mpam_clear_feature(_feat, x)	__clear_bit(_feat, (x)->features)
 
+/* Workaround bits for msc->quirks */
+enum mpam_device_quirks {
+	T241_SCRUB_SHADOW_REGS,
+	T241_FORCE_MBW_MIN_TO_ONE,
+	T241_MBW_COUNTER_SCALE_64,
+	IGNORE_CSU_NRDY,
+	MPAM_QUIRK_LAST
+};
+
+#define mpam_has_quirk(_quirk, x)	((x)->quirks & (1 << (_quirk)))
+#define mpam_set_quirk(_quirk, x)	((x)->quirks |= (1 << (_quirk)))
+
+struct mpam_quirk {
+	int (*init)(struct mpam_msc *msc, const struct mpam_quirk *quirk);
+
+	u32 iidr;
+	u32 iidr_mask;
+
+	enum mpam_device_quirks workaround;
+};
+
+#define MPAM_IIDR_MATCH_ONE	(FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID,   0xfff) | \
+				 FIELD_PREP_CONST(MPAMF_IIDR_VARIANT,     0xf)	 | \
+				 FIELD_PREP_CONST(MPAMF_IIDR_REVISION,    0xf)	 | \
+				 FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0xfff))
+
+#define MPAM_IIDR_NVIDIA_T241	(FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID,   0x241) | \
+				 FIELD_PREP_CONST(MPAMF_IIDR_VARIANT,     0)	 | \
+				 FIELD_PREP_CONST(MPAMF_IIDR_REVISION,    0)	 | \
+				 FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x36b))
+
+#define MPAM_IIDR_ARM_CMN_650	(FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID,   0)	 | \
+				 FIELD_PREP_CONST(MPAMF_IIDR_VARIANT,     0)	 | \
+				 FIELD_PREP_CONST(MPAMF_IIDR_REVISION,    0)	 | \
+				 FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x43b))
+
 /* The values for MSMON_CFG_MBWU_FLT.RWBW */
 enum mon_filter_options {
 	COUNT_BOTH	= 0,
@@ -215,7 +265,11 @@ enum mon_filter_options {
 };
 
 struct mon_cfg {
-	u16			mon;
+	/*
+	 * mon must be large enough to hold out of range values like
+	 * USE_PRE_ALLOCATED
+	 */
+	u32			mon;
 	u8			pmg;
 	bool			match_pmg;
 	bool			csu_exclude_clean;
@@ -246,6 +300,7 @@ struct mpam_class {
 
 	struct mpam_props	props;
 	u32			nrdy_usec;
+	u16			quirks;
 	u8			level;
 	enum mpam_class_types	type;
 
@@ -266,10 +321,6 @@ struct mpam_config {
 	u32	mbw_pbm;
 	u16	mbw_max;
 
-	bool	reset_cpbm;
-	bool	reset_mbw_pbm;
-	bool	reset_mbw_max;
-
 	struct mpam_garbage	garbage;
 };
 
@@ -337,6 +388,32 @@ struct mpam_msc_ris {
 	struct mpam_garbage	garbage;
 };
 
+struct mpam_resctrl_dom {
+	struct mpam_component		*ctrl_comp;
+
+	/*
+	 * There is no single mon_comp because different events may be backed
+	 * by different class/components. mon_comp is indexed by the event
+	 * number.
+	 */
+	struct mpam_component		*mon_comp[QOS_NUM_EVENTS];
+
+	struct rdt_ctrl_domain		resctrl_ctrl_dom;
+	struct rdt_l3_mon_domain	resctrl_mon_dom;
+};
+
+struct mpam_resctrl_res {
+	struct mpam_class	*class;
+	struct rdt_resource	resctrl_res;
+	bool			cdp_enabled;
+};
+
+struct mpam_resctrl_mon {
+	struct mpam_class	*class;
+
+	/* per-class data that resctrl needs will live here */
+};
+
 static inline int mpam_alloc_csu_mon(struct mpam_class *class)
 {
 	struct mpam_props *cprops = &class->props;
@@ -381,6 +458,9 @@ extern u8 mpam_pmg_max;
 void mpam_enable(struct work_struct *work);
 void mpam_disable(struct work_struct *work);
 
+/* Reset all the RIS in a class under cpus_read_lock() */
+void mpam_reset_class_locked(struct mpam_class *class);
+
 int mpam_apply_config(struct mpam_component *comp, u16 partid,
 		      struct mpam_config *cfg);
 
@@ -391,6 +471,20 @@ void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx);
 int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level,
 				   cpumask_t *affinity);
 
+#ifdef CONFIG_RESCTRL_FS
+int mpam_resctrl_setup(void);
+void mpam_resctrl_exit(void);
+int mpam_resctrl_online_cpu(unsigned int cpu);
+void mpam_resctrl_offline_cpu(unsigned int cpu);
+void mpam_resctrl_teardown_class(struct mpam_class *class);
+#else
+static inline int mpam_resctrl_setup(void) { return 0; }
+static inline void mpam_resctrl_exit(void) { }
+static inline int mpam_resctrl_online_cpu(unsigned int cpu) { return 0; }
+static inline void mpam_resctrl_offline_cpu(unsigned int cpu) { }
+static inline void mpam_resctrl_teardown_class(struct mpam_class *class) { }
+#endif /* CONFIG_RESCTRL_FS */
+
 /*
  * MPAM MSCs have the following register layout. See:
  * Arm Memory System Resource Partitioning and Monitoring (MPAM) System
diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c
new file mode 100644
index 000000000000..a9938006d0e6
--- /dev/null
+++ b/drivers/resctrl/mpam_resctrl.c
@@ -0,0 +1,1704 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2025 Arm Ltd.
+
+#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
+
+#include <linux/arm_mpam.h>
+#include <linux/cacheinfo.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/errno.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/math.h>
+#include <linux/printk.h>
+#include <linux/rculist.h>
+#include <linux/resctrl.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+#include <asm/mpam.h>
+
+#include "mpam_internal.h"
+
+DECLARE_WAIT_QUEUE_HEAD(resctrl_mon_ctx_waiters);
+
+/*
+ * The classes we've picked to map to resctrl resources, wrapped
+ * in with their resctrl structure.
+ * Class pointer may be NULL.
+ */
+static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES];
+
+#define for_each_mpam_resctrl_control(res, rid)					\
+	for (rid = 0, res = &mpam_resctrl_controls[rid];			\
+	     rid < RDT_NUM_RESOURCES;						\
+	     rid++, res = &mpam_resctrl_controls[rid])
+
+/*
+ * The classes we've picked to map to resctrl events.
+ * Resctrl believes all the world's a Xeon, and these are all on the L3. This
+ * array lets us find the actual class backing the event counters. e.g.
+ * the only memory bandwidth counters may be on the memory controller, but to
+ * make use of them, we pretend they are on L3. Restrict the events considered
+ * to those supported by MPAM.
+ * Class pointer may be NULL.
+ */
+#define MPAM_MAX_EVENT QOS_L3_MBM_TOTAL_EVENT_ID
+static struct mpam_resctrl_mon mpam_resctrl_counters[MPAM_MAX_EVENT + 1];
+
+#define for_each_mpam_resctrl_mon(mon, eventid)					\
+	for (eventid = QOS_FIRST_EVENT, mon = &mpam_resctrl_counters[eventid];	\
+	     eventid <= MPAM_MAX_EVENT;						\
+	     eventid++, mon = &mpam_resctrl_counters[eventid])
+
+/* The lock for modifying resctrl's domain lists from cpuhp callbacks. */
+static DEFINE_MUTEX(domain_list_lock);
+
+/*
+ * MPAM emulates CDP by setting different PARTID in the I/D fields of MPAM0_EL1.
+ * This applies globally to all traffic the CPU generates.
+ */
+static bool cdp_enabled;
+
+/*
+ * We use cacheinfo to discover the size of the caches and their id. cacheinfo
+ * populates this from a device_initcall(). mpam_resctrl_setup() must wait.
+ */
+static bool cacheinfo_ready;
+static DECLARE_WAIT_QUEUE_HEAD(wait_cacheinfo_ready);
+
+/*
+ * If resctrl_init() succeeded, resctrl_exit() can be used to remove support
+ * for the filesystem in the event of an error.
+ */
+static bool resctrl_enabled;
+
+bool resctrl_arch_alloc_capable(void)
+{
+	struct mpam_resctrl_res *res;
+	enum resctrl_res_level rid;
+
+	for_each_mpam_resctrl_control(res, rid) {
+		if (res->resctrl_res.alloc_capable)
+			return true;
+	}
+
+	return false;
+}
+
+bool resctrl_arch_mon_capable(void)
+{
+	struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
+	struct rdt_resource *l3 = &res->resctrl_res;
+
+	/* All monitors are presented as being on the L3 cache */
+	return l3->mon_capable;
+}
+
+bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
+{
+	return false;
+}
+
+void resctrl_arch_mon_event_config_read(void *info)
+{
+}
+
+void resctrl_arch_mon_event_config_write(void *info)
+{
+}
+
+void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d)
+{
+}
+
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
+			     u32 closid, u32 rmid, enum resctrl_event_id eventid)
+{
+}
+
+void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
+			     u32 closid, u32 rmid, int cntr_id,
+			     enum resctrl_event_id eventid)
+{
+}
+
+void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
+			      enum resctrl_event_id evtid, u32 rmid, u32 closid,
+			      u32 cntr_id, bool assign)
+{
+}
+
+int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
+			   u32 unused, u32 rmid, int cntr_id,
+			   enum resctrl_event_id eventid, u64 *val)
+{
+	return -EOPNOTSUPP;
+}
+
+bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r)
+{
+	return false;
+}
+
+int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)
+{
+	return -EINVAL;
+}
+
+int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable)
+{
+	return -EOPNOTSUPP;
+}
+
+bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r)
+{
+	return false;
+}
+
+void resctrl_arch_pre_mount(void)
+{
+}
+
+bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid)
+{
+	return mpam_resctrl_controls[rid].cdp_enabled;
+}
+
+/**
+ * resctrl_reset_task_closids() - Reset the PARTID/PMG values for all tasks.
+ *
+ * At boot, all existing tasks use partid zero for D and I.
+ * To enable/disable CDP emulation, all these tasks need relabelling.
+ */
+static void resctrl_reset_task_closids(void)
+{
+	struct task_struct *p, *t;
+
+	read_lock(&tasklist_lock);
+	for_each_process_thread(p, t) {
+		resctrl_arch_set_closid_rmid(t, RESCTRL_RESERVED_CLOSID,
+					     RESCTRL_RESERVED_RMID);
+	}
+	read_unlock(&tasklist_lock);
+}
+
+/*
+ * Set CDP emulation for @rid. Only called with enable set to false on error
+ * and unmount. Enabling fails, changing nothing, without a pair of PARTIDs.
+ */
+int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable)
+{
+	u32 partid_i = RESCTRL_RESERVED_CLOSID, partid_d = RESCTRL_RESERVED_CLOSID;
+	struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
+	struct rdt_resource *l3 = &res->resctrl_res;
+	int cpu;
+
+	if (!IS_ENABLED(CONFIG_EXPERT) && enable) {
+		/*
+		 * If the resctrl fs is mounted more than once, sequentially,
+		 * then CDP can lead to the use of out of range PARTIDs.
+		 */
+		pr_warn("CDP not supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (enable) {
+		if (mpam_partid_max < 1)
+			return -EINVAL;
+		pr_warn("CDP is an expert feature and may cause MPAM to malfunction.\n");
+	}
+
+	cdp_enabled = enable;
+	mpam_resctrl_controls[rid].cdp_enabled = enable;
+
+	if (enable)
+		l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx() / 2;
+	else
+		l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx();
+
+	/* The mbw_max feature can't hide cdp as it's a per-partid maximum. */
+	if (cdp_enabled && !mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled)
+		mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = false;
+
+	if (mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled &&
+	    mpam_resctrl_controls[RDT_RESOURCE_MBA].class)
+		mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = true;
+
+	if (enable) {
+		partid_d = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_DATA);
+		partid_i = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_CODE);
+	}
+
+	mpam_set_task_partid_pmg(current, partid_d, partid_i, 0, 0);
+	WRITE_ONCE(arm64_mpam_global_default, mpam_get_regval(current));
+
+	resctrl_reset_task_closids();
+
+	for_each_possible_cpu(cpu)
+		mpam_set_cpu_defaults(cpu, partid_d, partid_i, 0, 0);
+	on_each_cpu(resctrl_arch_sync_cpu_closid_rmid, NULL, 1);
+
+	return 0;
+}
+
+static bool mpam_resctrl_hide_cdp(enum resctrl_res_level rid)
+{
+	return cdp_enabled && !resctrl_arch_get_cdp_enabled(rid);
+}
+
+/*
+ * MSC may raise an error interrupt if it sees an out of range partid/pmg,
+ * and go on to truncate the value. Regardless of what the hardware supports,
+ * only the system wide safe value is safe to use.
+ */
+u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored)
+{
+	return mpam_partid_max + 1;
+}
+
+u32 resctrl_arch_system_num_rmid_idx(void)
+{
+	return (mpam_pmg_max + 1) * (mpam_partid_max + 1);
+}
+
+u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid)
+{
+	return closid * (mpam_pmg_max + 1) + rmid;
+}
+
+void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid)
+{
+	*closid = idx / (mpam_pmg_max + 1);
+	*rmid = idx % (mpam_pmg_max + 1);
+}
+
+void resctrl_arch_sched_in(struct task_struct *tsk)
+{
+	lockdep_assert_preemption_disabled();
+
+	mpam_thread_switch(tsk);
+}
+
+void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid)
+{
+	WARN_ON_ONCE(closid > U16_MAX);
+	WARN_ON_ONCE(rmid > U8_MAX);
+
+	if (!cdp_enabled) {
+		mpam_set_cpu_defaults(cpu, closid, closid, rmid, rmid);
+	} else {
+		/*
+		 * When CDP is enabled, resctrl halves the closid range and we
+		 * use odd/even partid for one closid.
+		 */
+		u32 partid_d = resctrl_get_config_index(closid, CDP_DATA);
+		u32 partid_i = resctrl_get_config_index(closid, CDP_CODE);
+
+		mpam_set_cpu_defaults(cpu, partid_d, partid_i, rmid, rmid);
+	}
+}
+
+void resctrl_arch_sync_cpu_closid_rmid(void *info)
+{
+	struct resctrl_cpu_defaults *r = info;
+
+	lockdep_assert_preemption_disabled();
+
+	if (r) {
+		resctrl_arch_set_cpu_default_closid_rmid(smp_processor_id(),
+							 r->closid, r->rmid);
+	}
+
+	resctrl_arch_sched_in(current);
+}
+
+void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid)
+{
+	WARN_ON_ONCE(closid > U16_MAX);
+	WARN_ON_ONCE(rmid > U8_MAX);
+
+	if (!cdp_enabled) {
+		mpam_set_task_partid_pmg(tsk, closid, closid, rmid, rmid);
+	} else {
+		u32 partid_d = resctrl_get_config_index(closid, CDP_DATA);
+		u32 partid_i = resctrl_get_config_index(closid, CDP_CODE);
+
+		mpam_set_task_partid_pmg(tsk, partid_d, partid_i, rmid, rmid);
+	}
+}
+
+bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid)
+{
+	u64 regval = mpam_get_regval(tsk);
+	u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval);
+
+	if (cdp_enabled)
+		tsk_closid >>= 1;
+
+	return tsk_closid == closid;
+}
+
+/* The task's pmg is not unique, the partid must be considered too */
+bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid)
+{
+	u64 regval = mpam_get_regval(tsk);
+	u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval);
+	u32 tsk_rmid = FIELD_GET(MPAM0_EL1_PMG_D, regval);
+
+	if (cdp_enabled)
+		tsk_closid >>= 1;
+
+	return (tsk_closid == closid) && (tsk_rmid == rmid);
+}
+
+struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
+{
+	if (l >= RDT_NUM_RESOURCES)
+		return NULL;
+
+	return &mpam_resctrl_controls[l].resctrl_res;
+}
+
+/* One attempt at picking the monitor for @evtid. Failures are negative errnos. */
+static int resctrl_arch_mon_ctx_alloc_no_wait(enum resctrl_event_id evtid)
+{
+	if (!mpam_is_enabled())
+		return -EINVAL;
+
+	/* Don't index beyond the end of mpam_resctrl_counters[] */
+	if (evtid > MPAM_MAX_EVENT || !mpam_resctrl_counters[evtid].class)
+		return -EINVAL;
+
+	switch (evtid) {
+	case QOS_L3_OCCUP_EVENT_ID:
+		/* With CDP, one monitor gets used for both code/data reads */
+		return mpam_alloc_csu_mon(mpam_resctrl_counters[evtid].class);
+	case QOS_L3_MBM_LOCAL_EVENT_ID:
+	case QOS_L3_MBM_TOTAL_EVENT_ID:
+		return USE_PRE_ALLOCATED;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+/* Get a monitor context for @evtid. May sleep. Returns an ERR_PTR() on failure. */
+void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid)
+{
+	DEFINE_WAIT(wait);
+	int err, *ret;
+
+	ret = kmalloc_obj(*ret);
+	if (!ret)
+		return ERR_PTR(-ENOMEM);
+
+	do {
+		prepare_to_wait(&resctrl_mon_ctx_waiters, &wait, TASK_INTERRUPTIBLE);
+		err = *ret = resctrl_arch_mon_ctx_alloc_no_wait(evtid);
+		if (err == -ENOSPC)
+			schedule();
+	} while (err == -ENOSPC && !signal_pending(current));
+	finish_wait(&resctrl_mon_ctx_waiters, &wait);
+	if (err < 0)
+		kfree(ret);	/* an errno must not be handed back as a monitor index */
+	return err < 0 ? ERR_PTR(err) : ret;
+}
+
+/* Free the CSU monitor @mon_idx (other events have nothing to free), wake waiters. */
+static void resctrl_arch_mon_ctx_free_no_wait(enum resctrl_event_id evtid,
+					      u32 mon_idx)
+{
+	if (!mpam_is_enabled())
+		return;
+
+	/* Don't index beyond the end of mpam_resctrl_counters[] */
+	if (evtid > MPAM_MAX_EVENT || !mpam_resctrl_counters[evtid].class)
+		return;
+
+	if (evtid == QOS_L3_OCCUP_EVENT_ID)
+		mpam_free_csu_mon(mpam_resctrl_counters[evtid].class, mon_idx);
+
+	wake_up(&resctrl_mon_ctx_waiters);
+}
+
+void resctrl_arch_mon_ctx_free(struct rdt_resource *r,
+			       enum resctrl_event_id evtid, void *arch_mon_ctx)
+{
+	u32 mon_idx = *(u32 *)arch_mon_ctx;
+
+	kfree(arch_mon_ctx);
+
+	resctrl_arch_mon_ctx_free_no_wait(evtid, mon_idx);
+}
+
+static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp,
+		      enum mpam_device_features mon_type,
+		      int mon_idx,
+		      enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, u64 *val)
+{
+	struct mon_cfg cfg;
+
+	if (!mpam_is_enabled())
+		return -EINVAL;
+
+	/* Shift closid to account for CDP */
+	closid = resctrl_get_config_index(closid, cdp_type);
+
+	if (irqs_disabled()) {
+		/* Check if we can access this domain without an IPI */
+		return -EIO;
+	}
+
+	cfg = (struct mon_cfg) {
+		.mon = mon_idx,
+		.match_pmg = true,
+		.partid = closid,
+		.pmg = rmid,
+	};
+
+	return mpam_msmon_read(mon_comp, &cfg, mon_type, val);
+}
+
+static int read_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp,
+			     enum mpam_device_features mon_type,
+			     int mon_idx, u32 closid, u32 rmid, u64 *val)
+{
+	if (cdp_enabled) {
+		u64 code_val = 0, data_val = 0;
+		int err;
+
+		err = __read_mon(mon, mon_comp, mon_type, mon_idx,
+				 CDP_CODE, closid, rmid, &code_val);
+		if (err)
+			return err;
+
+		err = __read_mon(mon, mon_comp, mon_type, mon_idx,
+				 CDP_DATA, closid, rmid, &data_val);
+		if (err)
+			return err;
+
+		*val += code_val + data_val;
+		return 0;
+	}
+
+	return __read_mon(mon, mon_comp, mon_type, mon_idx,
+			  CDP_NONE, closid, rmid, val);
+}
+
+/*
+ * MBWU when not in ABMC mode (not supported), and CSU counters.
+ * Only QOS_L3_OCCUP_EVENT_ID can be read here, anything else is -EINVAL.
+ */
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr,
+			   u32 closid, u32 rmid, enum resctrl_event_id eventid,
+			   void *arch_priv, u64 *val, void *arch_mon_ctx)
+{
+	struct mpam_resctrl_dom *l3_dom;
+	u32 mon_idx = *(u32 *)arch_mon_ctx;
+	struct mpam_resctrl_mon *mon;
+
+	resctrl_arch_rmid_read_context_check();
+
+	if (!mpam_is_enabled())
+		return -EINVAL;
+
+	/* Bounds check against the array being indexed: mpam_resctrl_counters[] */
+	if (eventid > MPAM_MAX_EVENT)
+		return -EINVAL;
+
+	mon = &mpam_resctrl_counters[eventid];
+	if (!mon->class || eventid != QOS_L3_OCCUP_EVENT_ID)
+		return -EINVAL;
+
+	l3_dom = container_of(hdr, struct mpam_resctrl_dom, resctrl_mon_dom.hdr);
+
+	return read_mon_cdp_safe(mon, l3_dom->mon_comp[eventid], mpam_feat_msmon_csu,
+				 mon_idx, closid, rmid, val);
+}
+
+/*
+ * The rmid realloc threshold should be for the smallest cache exposed to
+ * resctrl.
+ */
+static int update_rmid_limits(struct mpam_class *class)
+{
+	u32 num_unique_pmg = resctrl_arch_system_num_rmid_idx();
+	struct mpam_props *cprops = &class->props;
+	struct cacheinfo *ci;
+
+	lockdep_assert_cpus_held();
+
+	if (!mpam_has_feature(mpam_feat_msmon_csu, cprops))
+		return 0;
+
+	/*
+	 * Assume cache levels are the same size for all CPUs...
+	 * The check just requires any online CPU and it can't go offline as we
+	 * hold the cpu lock.
+	 */
+	ci = get_cpu_cacheinfo_level(raw_smp_processor_id(), class->level);
+	if (!ci || ci->size == 0) {
+		pr_debug("Could not read cache size for class %u\n",
+			 class->level);
+		return -EINVAL;
+	}
+
+	if (!resctrl_rmid_realloc_limit ||
+	    ci->size < resctrl_rmid_realloc_limit) {
+		resctrl_rmid_realloc_limit = ci->size;
+		resctrl_rmid_realloc_threshold = ci->size / num_unique_pmg;
+	}
+
+	return 0;
+}
+
+static bool cache_has_usable_cpor(struct mpam_class *class)
+{
+	struct mpam_props *cprops = &class->props;
+
+	if (!mpam_has_feature(mpam_feat_cpor_part, cprops))
+		return false;
+
+	/* resctrl uses u32 for all bitmap configurations */
+	return class->props.cpbm_wd <= 32;
+}
+
+static bool mba_class_use_mbw_max(struct mpam_props *cprops)
+{
+	return (mpam_has_feature(mpam_feat_mbw_max, cprops) &&
+		cprops->bwa_wd);
+}
+
+static bool class_has_usable_mba(struct mpam_props *cprops)
+{
+	return mba_class_use_mbw_max(cprops);
+}
+
+static bool cache_has_usable_csu(struct mpam_class *class)
+{
+	struct mpam_props *cprops;
+
+	if (!class)
+		return false;
+
+	cprops = &class->props;
+
+	if (!mpam_has_feature(mpam_feat_msmon_csu, cprops))
+		return false;
+
+	/*
+	 * CSU counters settle on the value, so we can get away with
+	 * having only one.
+	 */
+	if (!cprops->num_csu_mon)
+		return false;
+
+	return true;
+}
+
+/*
+ * Calculate the worst-case percentage change from each implemented step
+ * in the control.
+ */
+static u32 get_mba_granularity(struct mpam_props *cprops)
+{
+	if (!mba_class_use_mbw_max(cprops))
+		return 0;
+
+	/*
+	 * bwa_wd is the number of bits implemented in the 0.xxx
+	 * fixed point fraction. 1 bit is 50%, 2 is 25% etc.
+	 */
+	return DIV_ROUND_UP(MAX_MBA_BW, 1 << cprops->bwa_wd);
+}
+
+/*
+ * Each fixed-point hardware value architecturally represents a range
+ * of values: the full range 0% - 100% is split contiguously into
+ * (1 << cprops->bwa_wd) equal bands.
+ *
+ * Although the bwa_wd fields have 6 bits the maximum valid value is 16
+ * as it reports the width of fields that are at most 16 bits. When
+ * fewer than 16 bits are valid the least significant bits are
+ * ignored. The implied binary point is kept between bits 15 and 16 and
+ * so the valid bits are leftmost.
+ *
+ * See ARM IHI0099B.a "MPAM system component specification", Section 9.3,
+ * "The fixed-point fractional format" for more information.
+ *
+ * Find the nearest percentage value to the upper bound of the selected band:
+ */
+static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops)
+{
+	u32 val = mbw_max;
+
+	val >>= 16 - cprops->bwa_wd;
+	val += 1;
+	val *= MAX_MBA_BW;
+	val = DIV_ROUND_CLOSEST(val, 1 << cprops->bwa_wd);
+
+	return val;
+}
+
+/*
+ * Find the band whose upper bound is closest to the specified percentage.
+ *
+ * A round-to-nearest policy is followed here as a balanced compromise
+ * between unexpected under-commit of the resource (where the total of
+ * a set of resource allocations after conversion is less than the
+ * expected total, due to rounding of the individual converted
+ * percentages) and over-commit (where the total of the converted
+ * allocations is greater than expected).
+ */
+static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops)
+{
+	u32 val = pc;
+
+	val <<= cprops->bwa_wd;
+	val = DIV_ROUND_CLOSEST(val, MAX_MBA_BW);
+	val = max(val, 1) - 1;
+	val <<= 16 - cprops->bwa_wd;
+
+	return val;
+}
+
+static u32 get_mba_min(struct mpam_props *cprops)
+{
+	if (!mba_class_use_mbw_max(cprops)) {
+		WARN_ON_ONCE(1);
+		return 0;
+	}
+
+	return mbw_max_to_percent(0, cprops);
+}
+
+/* Find the L3 cache that has affinity with this CPU */
+static int find_l3_equivalent_bitmask(int cpu, cpumask_var_t tmp_cpumask)
+{
+	u32 cache_id = get_cpu_cacheinfo_id(cpu, 3);
+
+	lockdep_assert_cpus_held();
+
+	return mpam_get_cpumask_from_cache_id(cache_id, 3, tmp_cpumask);
+}
+
+/*
+ * topology_matches_l3() - Is the provided class the same shape as L3
+ * @victim:		The class we'd like to pretend is L3.
+ *
+ * resctrl expects all the world's a Xeon, and all counters are on the
+ * L3. We allow some mapping counters on other classes. This requires
+ * that the CPU->domain mapping is the same kind of shape.
+ *
+ * Using cacheinfo directly would make this work even if resctrl can't
+ * use the L3 - but cacheinfo can't tell us anything about offline CPUs.
+ * Using the L3 resctrl domain list also depends on CPUs being online.
+ * Using the mpam_class we picked for L3 so we can use its domain list
+ * assumes that there are MPAM controls on the L3.
+ * Instead, this path eventually uses the mpam_get_cpumask_from_cache_id()
+ * helper which can tell us about offline CPUs ... but getting the cache_id
+ * to start with relies on at least one CPU per L3 cache being online at
+ * boot.
+ *
+ * Walk the victim component list and compare the affinity mask with the
+ * corresponding L3. The topology matches if each victim:component's affinity
+ * mask is the same as the CPU's corresponding L3's. These lists/masks are
+ * computed from firmware tables so don't change at runtime.
+ */
+static bool topology_matches_l3(struct mpam_class *victim)
+{
+	int cpu, err;
+	struct mpam_component *victim_iter;
+
+	lockdep_assert_cpus_held();
+
+	cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL;
+	if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL))
+		return false;
+
+	guard(srcu)(&mpam_srcu);
+	list_for_each_entry_srcu(victim_iter, &victim->components, class_list,
+				 srcu_read_lock_held(&mpam_srcu)) {
+		if (cpumask_empty(&victim_iter->affinity)) {
+			pr_debug("class %u has CPU-less component %u - can't match L3!\n",
+				 victim->level, victim_iter->comp_id);
+			return false;
+		}
+
+		cpu = cpumask_any_and(&victim_iter->affinity, cpu_online_mask);
+		if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+			return false;
+
+		cpumask_clear(tmp_cpumask);
+		err = find_l3_equivalent_bitmask(cpu, tmp_cpumask);
+		if (err) {
+			pr_debug("Failed to find L3's equivalent component to class %u component %u\n",
+				 victim->level, victim_iter->comp_id);
+			return false;
+		}
+
+		/* Any differing bits in the affinity mask? */
+		if (!cpumask_equal(tmp_cpumask, &victim_iter->affinity)) {
+			pr_debug("class %u component %u has Mismatched CPU mask with L3 equivalent\n"
+				 "L3:%*pbl != victim:%*pbl\n",
+				 victim->level, victim_iter->comp_id,
+				 cpumask_pr_args(tmp_cpumask),
+				 cpumask_pr_args(&victim_iter->affinity));
+
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/*
+ * Test if the traffic for a class matches that at egress from the L3. For
+ * MSC at memory controllers this is only possible if there is a single L3
+ * as otherwise the counters at the memory can include bandwidth from the
+ * non-local L3.
+ */
+static bool traffic_matches_l3(struct mpam_class *class)
+{
+	int cpu;
+
+	lockdep_assert_cpus_held();
+
+	if (class->type == MPAM_CLASS_CACHE && class->level == 3)
+		return true;
+
+	if (class->type == MPAM_CLASS_CACHE && class->level != 3) {
+		pr_debug("class %u is a different cache from L3\n", class->level);
+		return false;
+	}
+
+	if (class->type != MPAM_CLASS_MEMORY) {
+		pr_debug("class %u is neither of type cache or memory\n", class->level);
+		return false;
+	}
+
+	cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL;
+	if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) {
+		pr_debug("cpumask allocation failed\n");
+		return false;
+	}
+
+	/* Without an online CPU in the class there is no L3 to look up */
+	cpu = cpumask_any_and(&class->affinity, cpu_online_mask);
+	if (cpu >= nr_cpu_ids || find_l3_equivalent_bitmask(cpu, tmp_cpumask)) {
+		pr_debug("Failed to find L3 downstream to cpu %d\n", cpu);
+		return false;
+	}
+
+	if (!cpumask_equal(tmp_cpumask, cpu_possible_mask)) {
+		pr_debug("There is more than one L3\n");
+		return false;
+	}
+
+	/* Be strict; the traffic might stop in the intermediate cache. */
+	if (get_cpu_cacheinfo_id(cpu, 4) != -1) {
+		pr_debug("L3 isn't the last level of cache\n");
+		return false;
+	}
+
+	if (num_possible_nodes() > 1) {
+		pr_debug("There is more than one numa node\n");
+		return false;
+	}
+
+#ifdef CONFIG_HMEM_REPORTING
+	if (node_devices[cpu_to_node(cpu)]->cache_dev) {
+		pr_debug("There is a memory side cache\n");
+		return false;
+	}
+#endif
+
+	return true;
+}
+
+/* Test whether we can export MPAM_CLASS_CACHE:{2,3}? */
+static void mpam_resctrl_pick_caches(void)
+{
+	struct mpam_class *class;
+	struct mpam_resctrl_res *res;
+
+	lockdep_assert_cpus_held();
+
+	guard(srcu)(&mpam_srcu);
+	list_for_each_entry_srcu(class, &mpam_classes, classes_list,
+				 srcu_read_lock_held(&mpam_srcu)) {
+		if (class->type != MPAM_CLASS_CACHE) {
+			pr_debug("class %u is not a cache\n", class->level);
+			continue;
+		}
+
+		if (class->level != 2 && class->level != 3) {
+			pr_debug("class %u is not L2 or L3\n", class->level);
+			continue;
+		}
+
+		if (!cache_has_usable_cpor(class)) {
+			pr_debug("class %u cache misses CPOR\n", class->level);
+			continue;
+		}
+
+		if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
+			pr_debug("class %u has missing CPUs, mask %*pb != %*pb\n", class->level,
+				 cpumask_pr_args(&class->affinity),
+				 cpumask_pr_args(cpu_possible_mask));
+			continue;
+		}
+
+		if (class->level == 2)
+			res = &mpam_resctrl_controls[RDT_RESOURCE_L2];
+		else
+			res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
+		res->class = class;
+	}
+}
+
+static void mpam_resctrl_pick_mba(void)
+{
+	struct mpam_class *class, *candidate_class = NULL;
+	struct mpam_resctrl_res *res;
+
+	lockdep_assert_cpus_held();
+
+	guard(srcu)(&mpam_srcu);
+	list_for_each_entry_srcu(class, &mpam_classes, classes_list,
+				 srcu_read_lock_held(&mpam_srcu)) {
+		struct mpam_props *cprops = &class->props;
+
+		if (class->level != 3 && class->type == MPAM_CLASS_CACHE) {
+			pr_debug("class %u is a cache but not the L3\n", class->level);
+			continue;
+		}
+
+		if (!class_has_usable_mba(cprops)) {
+			pr_debug("class %u has no bandwidth control\n",
+				 class->level);
+			continue;
+		}
+
+		if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
+			pr_debug("class %u has missing CPUs\n", class->level);
+			continue;
+		}
+
+		if (!topology_matches_l3(class)) {
+			pr_debug("class %u topology doesn't match L3\n",
+				 class->level);
+			continue;
+		}
+
+		if (!traffic_matches_l3(class)) {
+			pr_debug("class %u traffic doesn't match L3 egress\n",
+				 class->level);
+			continue;
+		}
+
+		/*
+		 * Pick a resource to be MBA that is as close as possible to
+		 * the L3. mbm_total counts the bandwidth leaving the L3
+		 * cache and MBA should correspond as closely as possible
+		 * for proper operation of mba_sc.
+		 */
+		if (!candidate_class || class->level < candidate_class->level)
+			candidate_class = class;
+	}
+
+	if (candidate_class) {
+		pr_debug("selected class %u to back MBA\n",
+			 candidate_class->level);
+		res = &mpam_resctrl_controls[RDT_RESOURCE_MBA];
+		res->class = candidate_class;
+	}
+}
+
+/* Record @class as backing @evt_id, unless a better placed class already does. */
+static void counter_update_class(enum resctrl_event_id evt_id, struct mpam_class *class)
+{
+	struct mpam_class *existing_class = mpam_resctrl_counters[evt_id].class;
+
+	if (existing_class) {
+		if (existing_class->level == 3) {
+			pr_debug("Existing class is L3 - L3 wins\n");
+			return;
+		}
+
+		if (existing_class->level < class->level) {
+			pr_debug("Existing class is closer to L3, %u versus %u - closer is better\n",
+				 existing_class->level, class->level);
+			return;
+		}
+	}
+
+	mpam_resctrl_counters[evt_id].class = class;
+}
+
+/* Pick the class backing each resctrl monitor event. Only CSU on a cache is used. */
+static void mpam_resctrl_pick_counters(void)
+{
+	struct mpam_class *class;
+
+	lockdep_assert_cpus_held();
+
+	guard(srcu)(&mpam_srcu);
+	list_for_each_entry_srcu(class, &mpam_classes, classes_list,
+				 srcu_read_lock_held(&mpam_srcu)) {
+		/* The name of the resource is L3... */
+		if (class->type == MPAM_CLASS_CACHE && class->level != 3) {
+			pr_debug("class %u is a cache but not the L3\n", class->level);
+			continue;
+		}
+
+		/* Skip classes that don't span every possible CPU */
+		if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
+			pr_debug("class %u does not cover all CPUs\n", class->level);
+			continue;
+		}
+
+		if (cache_has_usable_csu(class)) {
+			pr_debug("class %u has usable CSU\n", class->level);
+
+			/* CSU counters only make sense on a cache. */
+			switch (class->type) {
+			case MPAM_CLASS_CACHE:
+				if (update_rmid_limits(class))
+					break;
+
+				counter_update_class(QOS_L3_OCCUP_EVENT_ID, class);
+				break;
+			default:
+				break;
+			}
+		}
+	}
+}
+
+static int mpam_resctrl_control_init(struct mpam_resctrl_res *res)
+{
+	struct mpam_class *class = res->class;
+	struct mpam_props *cprops = &class->props;
+	struct rdt_resource *r = &res->resctrl_res;
+
+	switch (r->rid) {
+	case RDT_RESOURCE_L2:
+	case RDT_RESOURCE_L3:
+		r->schema_fmt = RESCTRL_SCHEMA_BITMAP;
+		r->cache.arch_has_sparse_bitmasks = true;
+
+		r->cache.cbm_len = class->props.cpbm_wd;
+		/* mpam_devices will reject empty bitmaps */
+		r->cache.min_cbm_bits = 1;
+
+		if (r->rid == RDT_RESOURCE_L2) {
+			r->name = "L2";
+			r->ctrl_scope = RESCTRL_L2_CACHE;
+			r->cdp_capable = true;
+		} else {
+			r->name = "L3";
+			r->ctrl_scope = RESCTRL_L3_CACHE;
+			r->cdp_capable = true;
+		}
+
+		/*
+		 * Which bits are shared with other ...things...  Unknown
+		 * devices use partid-0 which uses all the bitmap fields. Until
+		 * we have configured the SMMU and GIC not to do this 'all the
+		 * bits' is the correct answer here.
+		 */
+		r->cache.shareable_bits = resctrl_get_default_ctrl(r);
+		r->alloc_capable = true;
+		break;
+	case RDT_RESOURCE_MBA:
+		r->schema_fmt = RESCTRL_SCHEMA_RANGE;
+		r->ctrl_scope = RESCTRL_L3_CACHE;
+
+		r->membw.delay_linear = true;
+		r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
+		r->membw.min_bw = get_mba_min(cprops);
+		r->membw.max_bw = MAX_MBA_BW;
+		r->membw.bw_gran = get_mba_granularity(cprops);
+
+		r->name = "MB";
+		r->alloc_capable = true;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp)
+{
+	struct mpam_class *class = comp->class;
+
+	if (class->type == MPAM_CLASS_CACHE)
+		return comp->comp_id;
+
+	if (topology_matches_l3(class)) {
+		/* Use the corresponding L3 component ID as the domain ID */
+		int id = get_cpu_cacheinfo_id(cpu, 3);
+
+		/* Implies topology_matches_l3() made a mistake */
+		if (WARN_ON_ONCE(id == -1))
+			return comp->comp_id;
+
+		return id;
+	}
+
+	/* Otherwise, expose the ID used by the firmware table code. */
+	return comp->comp_id;
+}
+
+/* Prepare the L3 rdt_resource to report monitor event @type; @mon is unused. */
+static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon,
+				     enum resctrl_event_id type)
+{
+	struct mpam_resctrl_res *l3_res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
+	struct rdt_resource *r = &l3_res->resctrl_res;
+
+	lockdep_assert_cpus_held();
+
+	/*
+	 * Monitors are only set up when an L3 cache is present. Any online CPU
+	 * can answer that, and it can't go offline as we hold the cpu lock.
+	 */
+	if (get_cpu_cacheinfo_id(raw_smp_processor_id(), 3) == -1)
+		return 0;
+
+	/*
+	 * If there are no MPAM resources on L3, force it into existence.
+	 * topology_matches_l3() already ensures this looks like the L3.
+	 * The domain-ids will be fixed up by mpam_resctrl_domain_hdr_init().
+	 */
+	if (!l3_res->class) {
+		pr_warn_once("Faking L3 MSC to enable counters.\n");
+		l3_res->class = mpam_resctrl_counters[type].class;
+	}
+
+	/*
+	 * This runs once per event type that has a monitoring class, so the
+	 * assignments below are repeated. Setting the name here is necessary
+	 * on monitor only platforms.
+	 */
+	r->mon_scope = RESCTRL_L3_CACHE;
+	r->name = "L3";
+
+	/*
+	 * num-rmid is an upper bound on how many monitoring groups can exist
+	 * at once, counting each control group's default monitoring group.
+	 * So the whole rmid_idx space is advertised, even though every
+	 * control group has its own pmg/rmid space. Unfortunately, this
+	 * does mean userspace needs to know the architecture to correctly
+	 * interpret this value.
+	 */
+	r->mon.num_rmid = resctrl_arch_system_num_rmid_idx();
+
+	if (resctrl_enable_mon_event(type, false, 0, NULL))
+		r->mon_capable = true;
+
+	return 0;
+}
+
+/*
+ * Report the control value currently configured for @closid in domain @d of
+ * @r, or the resource's default control value when there is none to report.
+ */
+u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+			    u32 closid, enum resctrl_conf_type type)
+{
+	enum mpam_device_features configured_by;
+	struct mpam_resctrl_res *res;
+	struct mpam_resctrl_dom *dom;
+	struct mpam_props *cprops;
+	struct mpam_config *cfg;
+	u32 partid;
+
+	lockdep_assert_cpus_held();
+
+	if (!mpam_is_enabled())
+		return resctrl_get_default_ctrl(r);
+
+	dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom);
+	res = container_of(r, struct mpam_resctrl_res, resctrl_res);
+	cprops = &res->class->props;
+
+	/*
+	 * A resource that can't do CDP has its control cloned across both
+	 * partids while CDP is enabled, so either one can be read back.
+	 */
+	if (mpam_resctrl_hide_cdp(r->rid))
+		type = CDP_DATA;
+
+	partid = resctrl_get_config_index(closid, type);
+	cfg = &dom->ctrl_comp->cfg[partid];
+
+	/* Work out which MPAM feature backs this resctrl resource. */
+	if (r->rid == RDT_RESOURCE_L2 || r->rid == RDT_RESOURCE_L3)
+		configured_by = mpam_feat_cpor_part;
+	else if (r->rid == RDT_RESOURCE_MBA &&
+		 mpam_has_feature(mpam_feat_mbw_max, cprops))
+		configured_by = mpam_feat_mbw_max;
+	else
+		return resctrl_get_default_ctrl(r);
+
+	/*
+	 * Nothing to report unless the control is usable and has been set.
+	 * @cfg is only looked at once @partid has passed the range check.
+	 */
+	if (!r->alloc_capable)
+		return resctrl_get_default_ctrl(r);
+	if (partid >= resctrl_arch_get_num_closid(r))
+		return resctrl_get_default_ctrl(r);
+	if (!mpam_has_feature(configured_by, cfg))
+		return resctrl_get_default_ctrl(r);
+
+	if (configured_by == mpam_feat_mbw_max)
+		return mbw_max_to_percent(cfg->mbw_max, cprops);
+
+	return cfg->cpbm;
+}
+
+/*
+ * Apply control value @cfg_val for @closid to domain @d of resource @r. Returns
+ * -EINVAL if the request can't be honoured, else mpam_apply_config()'s result.
+ */
+int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+			    u32 closid, enum resctrl_conf_type t, u32 cfg_val)
+{
+	struct mpam_resctrl_res *res;
+	struct mpam_resctrl_dom *dom;
+	struct mpam_component *comp;
+	struct mpam_props *cprops;
+	struct mpam_config new_cfg;
+	u32 partid;
+	int err;
+
+	lockdep_assert_cpus_held();
+	lockdep_assert_irqs_enabled();
+
+	if (!mpam_is_enabled())
+		return -EINVAL;
+
+	/*
+	 * No need to check the CPU as mpam_apply_config() doesn't care, and
+	 * resctrl_arch_update_domains() relies on this.
+	 */
+	dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom);
+	res = container_of(r, struct mpam_resctrl_res, resctrl_res);
+	cprops = &res->class->props;
+	comp = dom->ctrl_comp;
+
+	/* A resource that hides CDP is always driven from the CDP_DATA index. */
+	if (mpam_resctrl_hide_cdp(r->rid))
+		t = CDP_DATA;
+
+	partid = resctrl_get_config_index(closid, t);
+	if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r)) {
+		pr_debug("Not alloc capable or computed PARTID out of range\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Start from the current config so that other resources backed by the
+	 * same component, and exposed separately through resctrl, keep theirs.
+	 */
+	new_cfg = comp->cfg[partid];
+
+	if (r->rid == RDT_RESOURCE_L2 || r->rid == RDT_RESOURCE_L3) {
+		new_cfg.cpbm = cfg_val;
+		mpam_set_feature(mpam_feat_cpor_part, &new_cfg);
+	} else if (r->rid == RDT_RESOURCE_MBA &&
+		   mpam_has_feature(mpam_feat_mbw_max, cprops)) {
+		new_cfg.mbw_max = percent_to_mbw_max(cfg_val, cprops);
+		mpam_set_feature(mpam_feat_mbw_max, &new_cfg);
+	} else {
+		return -EINVAL;
+	}
+
+	if (!mpam_resctrl_hide_cdp(r->rid))
+		return mpam_apply_config(comp, partid, &new_cfg);
+
+	/*
+	 * When CDP is enabled, but the resource doesn't support it, the same
+	 * configuration has to be applied to both the CODE and DATA partids.
+	 */
+	partid = resctrl_get_config_index(closid, CDP_CODE);
+	err = mpam_apply_config(comp, partid, &new_cfg);
+	if (err)
+		return err;
+
+	partid = resctrl_get_config_index(closid, CDP_DATA);
+	return mpam_apply_config(comp, partid, &new_cfg);
+}
+
+/* Apply each staged config of @r for @closid via resctrl_arch_update_one(). */
+int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
+{
+	struct resctrl_staged_config *staged;
+	struct rdt_ctrl_domain *d;
+	enum resctrl_conf_type t;
+	int err;
+
+	lockdep_assert_cpus_held();
+	lockdep_assert_irqs_enabled();
+
+	if (!mpam_is_enabled())
+		return -EINVAL;
+
+	list_for_each_entry_rcu(d, &r->ctrl_domains, hdr.list) {
+		for (t = 0; t < CDP_NUM_TYPES; t++) {
+			staged = &d->staged_config[t];
+			if (!staged->have_new_ctrl)
+				continue;
+			err = resctrl_arch_update_one(r, d, closid, t, staged->new_ctrl);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+/* Pass the class backing @r to mpam_reset_class_locked(), if MPAM is enabled. */
+void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
+{
+	lockdep_assert_cpus_held();
+
+	if (mpam_is_enabled()) {
+		struct mpam_resctrl_res *res;
+
+		res = container_of(r, struct mpam_resctrl_res, resctrl_res);
+		mpam_reset_class_locked(res->class);
+	}
+}
+
+/* Fill in a new domain header: resource level, domain id, list node, and @cpu. */
+static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp,
+					 enum resctrl_res_level rid,
+					 struct rdt_domain_hdr *hdr)
+{
+	lockdep_assert_cpus_held();
+	hdr->rid = rid;
+	hdr->id = mpam_resctrl_pick_domain_id(cpu, comp);
+	INIT_LIST_HEAD(&hdr->list);
+	cpumask_set_cpu(cpu, &hdr->cpu_mask);
+}
+
+static void mpam_resctrl_online_domain_hdr(unsigned int cpu,
+					   struct rdt_domain_hdr *hdr)
+{
+	lockdep_assert_cpus_held();
+	/* Record @cpu as a member of the domain described by @hdr. */
+	cpumask_set_cpu(cpu, &hdr->cpu_mask);
+}
+
+/**
+ * mpam_resctrl_offline_domain_hdr() - Drop a CPU from a domain header.
+ * @cpu:	The CPU leaving the domain.
+ * @hdr:	Header of the domain that @cpu is leaving.
+ *
+ * Clears @cpu in the header's CPU mask. Once the mask is empty the header is
+ * also unlinked from its list, and an RCU grace period is waited for.
+ *
+ * Return: true if the domain is now empty and its parent can be freed.
+ */
+static bool mpam_resctrl_offline_domain_hdr(unsigned int cpu,
+					    struct rdt_domain_hdr *hdr)
+{
+	lockdep_assert_held(&domain_list_lock);
+
+	cpumask_clear_cpu(cpu, &hdr->cpu_mask);
+	if (!cpumask_empty(&hdr->cpu_mask))
+		return false;
+
+	/* Last CPU gone: unlink the header, then wait for an RCU grace period. */
+	list_del_rcu(&hdr->list);
+	synchronize_rcu();
+	return true;
+}
+
+/* Link @new into @list at the position reported by resctrl_find_domain(). */
+static void mpam_resctrl_domain_insert(struct list_head *list,
+				       struct rdt_domain_hdr *new)
+{
+	struct list_head *pos = NULL;
+
+	lockdep_assert_held(&domain_list_lock);
+
+	/* A non-NULL result for new->id is unexpected: warn and don't insert. */
+	if (WARN_ON_ONCE(resctrl_find_domain(list, new->id, &pos)))
+		return;
+
+	list_add_tail_rcu(&new->list, pos);
+}
+
+/* Return the first component of @class whose affinity has @cpu, else NULL. */
+static struct mpam_component *find_component(struct mpam_class *class, int cpu)
+{
+	struct mpam_component *pos;
+	guard(srcu)(&mpam_srcu);
+
+	list_for_each_entry_srcu(pos, &class->components, class_list,
+				 srcu_read_lock_held(&mpam_srcu)) {
+		if (cpumask_test_cpu(cpu, &pos->affinity))
+			return pos;
+	}
+	return NULL;
+}
+
+/*
+ * Allocate the resctrl domain that @cpu belongs to for @res, and bring the
+ * control and/or monitor halves that @res supports online with resctrl.
+ * Returns the new domain, or an ERR_PTR() on failure.
+ */
+static struct mpam_resctrl_dom *
+mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res)
+{
+	int err;
+	struct mpam_resctrl_dom *dom;
+	struct rdt_l3_mon_domain *mon_d;
+	struct rdt_ctrl_domain *ctrl_d;
+	struct mpam_class *class = res->class;
+	struct mpam_component *comp_iter, *ctrl_comp = NULL;
+	struct rdt_resource *r = &res->resctrl_res;
+
+	lockdep_assert_held(&domain_list_lock);
+
+	guard(srcu)(&mpam_srcu);
+	list_for_each_entry_srcu(comp_iter, &class->components, class_list,
+				 srcu_read_lock_held(&mpam_srcu)) {
+		if (cpumask_test_cpu(cpu, &comp_iter->affinity)) {
+			ctrl_comp = comp_iter;
+			break;
+		}
+	}
+
+	/* class has no component for this CPU */
+	if (WARN_ON_ONCE(!ctrl_comp))
+		return ERR_PTR(-EINVAL);
+
+	dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, cpu_to_node(cpu));
+	if (!dom)
+		return ERR_PTR(-ENOMEM);
+
+	if (r->alloc_capable) {
+		dom->ctrl_comp = ctrl_comp;
+
+		ctrl_d = &dom->resctrl_ctrl_dom;
+		mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, r->rid, &ctrl_d->hdr);
+		ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN;
+		err = resctrl_online_ctrl_domain(r, ctrl_d);
+		if (err)
+			goto free_domain;
+
+		mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr);
+	} else {
+		pr_debug("Skipped control domain online - no controls\n");
+	}
+
+	if (r->mon_capable) {
+		struct mpam_component *any_mon_comp = NULL;
+		struct mpam_resctrl_mon *mon;
+		enum resctrl_event_id eventid;
+
+		/*
+		 * Even if the monitor domain is backed by a different
+		 * component, the L3 component IDs need to be used... only
+		 * there may be no ctrl_comp for the L3. Search each event's
+		 * class list for a component with overlapping CPUs to fill
+		 * dom->mon_comp[]; any_mon_comp stays NULL if none is found.
+		 */
+		for_each_mpam_resctrl_mon(mon, eventid) {
+			struct mpam_component *mon_comp;
+
+			if (!mon->class)
+				continue;       // dummy resource
+
+			mon_comp = find_component(mon->class, cpu);
+			dom->mon_comp[eventid] = mon_comp;
+			if (mon_comp)
+				any_mon_comp = mon_comp;
+		}
+		if (WARN_ON_ONCE(!any_mon_comp)) {
+			err = -EFAULT;
+			goto offline_ctrl_domain;
+		}
+
+		mon_d = &dom->resctrl_mon_dom;
+		mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, r->rid, &mon_d->hdr);
+		mon_d->hdr.type = RESCTRL_MON_DOMAIN;
+		err = resctrl_online_mon_domain(r, &mon_d->hdr);
+		if (err)
+			goto offline_ctrl_domain;
+
+		mpam_resctrl_domain_insert(&r->mon_domains, &mon_d->hdr);
+	} else {
+		pr_debug("Skipped monitor domain online - no monitors\n");
+	}
+
+	return dom;
+
+offline_ctrl_domain:
+	if (r->alloc_capable) {
+		mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr);
+		resctrl_offline_ctrl_domain(r, ctrl_d);
+	}
+free_domain:
+	kfree(dom);
+	return ERR_PTR(err);
+}
+
+/*
+ * Every monitor is associated with the L3, even when there are no controls and
+ * therefore no control component. Look the monitor domain up by the cache-id of
+ * @cpu's L3, which relies on mpam_resctrl_pick_domain_id() using the L3
+ * cache-id for anything that is not a cache.
+ */
+static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu)
+{
+	struct mpam_resctrl_res *l3 = &mpam_resctrl_controls[RDT_RESOURCE_L3];
+	struct rdt_resource *r = &l3->resctrl_res;
+	struct mpam_resctrl_dom *pos;
+	int l3_id;
+
+	lockdep_assert_cpus_held();
+
+	if (!l3->class)
+		return NULL;
+	l3_id = get_cpu_cacheinfo_id(cpu, 3);
+	if (l3_id < 0)
+		return NULL;
+
+	list_for_each_entry_rcu(pos, &r->mon_domains, resctrl_mon_dom.hdr.list) {
+		if (pos->resctrl_mon_dom.hdr.id == l3_id)
+			return pos;
+	}
+
+	return NULL;
+}
+
+/* Find the existing domain of @res whose CPUs include @cpu, or NULL. */
+static struct mpam_resctrl_dom *
+mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res)
+{
+	struct rdt_resource *r = &res->resctrl_res;
+	struct mpam_resctrl_dom *pos;
+
+	lockdep_assert_cpus_held();
+
+	list_for_each_entry_rcu(pos, &r->ctrl_domains, resctrl_ctrl_dom.hdr.list) {
+		if (cpumask_test_cpu(cpu, &pos->ctrl_comp->affinity))
+			return pos;
+	}
+
+	/* Search the mon domain list too - needed on monitor only platforms. */
+	if (r->rid == RDT_RESOURCE_L3)
+		return mpam_resctrl_get_mon_domain_from_cpu(cpu);
+	return NULL;
+}
+
+/* Add @cpu to its domain in every resource in use, creating domains as needed. */
+int mpam_resctrl_online_cpu(unsigned int cpu)
+{
+	struct mpam_resctrl_res *res;
+	enum resctrl_res_level rid;
+
+	guard(mutex)(&domain_list_lock);
+	for_each_mpam_resctrl_control(res, rid) {
+		struct rdt_resource *r = &res->resctrl_res;
+		struct mpam_resctrl_dom *dom;
+
+		if (!res->class)
+			continue;	// dummy_resource;
+
+		dom = mpam_resctrl_get_domain_from_cpu(cpu, res);
+		if (!dom) {
+			/* No domain holds @cpu yet: allocate one for it. */
+			dom = mpam_resctrl_alloc_domain(cpu, res);
+			if (IS_ERR(dom))
+				return PTR_ERR(dom);
+			continue;
+		}
+
+		/* The domain exists: add @cpu to whichever halves are in use. */
+		if (r->alloc_capable)
+			mpam_resctrl_online_domain_hdr(cpu,
+						       &dom->resctrl_ctrl_dom.hdr);
+		if (r->mon_capable)
+			mpam_resctrl_online_domain_hdr(cpu,
+						       &dom->resctrl_mon_dom.hdr);
+	}
+
+	resctrl_online_cpu(cpu);
+
+	return 0;
+}
+
+/* Remove @cpu from its domain in every resource, freeing emptied domains. */
+void mpam_resctrl_offline_cpu(unsigned int cpu)
+{
+	struct mpam_resctrl_res *res;
+	enum resctrl_res_level rid;
+
+	/* resctrl hears about the CPU going away before any domain changes. */
+	resctrl_offline_cpu(cpu);
+
+	guard(mutex)(&domain_list_lock);
+	for_each_mpam_resctrl_control(res, rid) {
+		struct rdt_resource *r = &res->resctrl_res;
+		bool ctrl_gone = true, mon_gone = true;
+		struct rdt_l3_mon_domain *mon_d;
+		struct rdt_ctrl_domain *ctrl_d;
+		struct mpam_resctrl_dom *dom;
+
+		if (!res->class)
+			continue;	// dummy resource
+
+		dom = mpam_resctrl_get_domain_from_cpu(cpu, res);
+		if (WARN_ON_ONCE(!dom))
+			continue;
+
+		/* A half of the domain that is not in use counts as empty. */
+		if (r->alloc_capable) {
+			ctrl_d = &dom->resctrl_ctrl_dom;
+			ctrl_gone = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr);
+			if (ctrl_gone)
+				resctrl_offline_ctrl_domain(r, ctrl_d);
+		}
+
+		if (r->mon_capable) {
+			mon_d = &dom->resctrl_mon_dom;
+			mon_gone = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr);
+			if (mon_gone)
+				resctrl_offline_mon_domain(r, &mon_d->hdr);
+		}
+
+		/* Both halves live in @dom, so free it only when both are empty. */
+		if (ctrl_gone && mon_gone)
+			kfree(dom);
+	}
+}
+
+/* Pick the MPAM classes resctrl will use, describe them, then start resctrl. */
+int mpam_resctrl_setup(void)
+{
+	struct mpam_resctrl_res *res;
+	struct mpam_resctrl_mon *mon;
+	enum resctrl_event_id eventid;
+	enum resctrl_res_level rid;
+	int err = 0;
+
+	wait_event(wait_cacheinfo_ready, cacheinfo_ready);
+
+	cpus_read_lock();
+	for_each_mpam_resctrl_control(res, rid) {
+		res->resctrl_res.rid = rid;
+		INIT_LIST_HEAD_RCU(&res->resctrl_res.ctrl_domains);
+		INIT_LIST_HEAD_RCU(&res->resctrl_res.mon_domains);
+	}
+
+	/* Choose the MPAM classes that will back the resctrl controls... */
+	mpam_resctrl_pick_caches();
+	mpam_resctrl_pick_mba();
+
+	/* ...and initialise the resctrl structures of each one chosen. */
+	for_each_mpam_resctrl_control(res, rid) {
+		if (!res->class)
+			continue;	// dummy resource
+
+		err = mpam_resctrl_control_init(res);
+		if (err) {
+			pr_debug("Failed to initialise rid %u\n", rid);
+			goto internal_error;
+		}
+	}
+
+	/* Same again for the classes that will provide the monitors. */
+	mpam_resctrl_pick_counters();
+
+	for_each_mpam_resctrl_mon(mon, eventid) {
+		if (!mon->class)
+			continue;	// dummy resource
+
+		err = mpam_resctrl_monitor_init(mon, eventid);
+		if (err) {
+			pr_debug("Failed to initialise event %u\n", eventid);
+			goto internal_error;
+		}
+	}
+
+	cpus_read_unlock();
+
+	if (!(resctrl_arch_alloc_capable() || resctrl_arch_mon_capable())) {
+		pr_debug("No alloc(%u) or monitor(%u) found - resctrl not supported\n",
+			 resctrl_arch_alloc_capable(), resctrl_arch_mon_capable());
+		return -EOPNOTSUPP;
+	}
+
+	/* resctrl is only marked enabled once resctrl_init() has succeeded. */
+	err = resctrl_init();
+	if (!err)
+		WRITE_ONCE(resctrl_enabled, true);
+
+	return err;
+
+internal_error:
+	cpus_read_unlock();
+	pr_debug("Internal error %d - resctrl not supported\n", err);
+	return err;
+}
+
+/* Call resctrl_exit() if resctrl was enabled, clearing the enabled flag first. */
+void mpam_resctrl_exit(void)
+{
+	if (READ_ONCE(resctrl_enabled)) {
+		WRITE_ONCE(resctrl_enabled, false);
+		resctrl_exit();
+	}
+}
+
+/*
+ * The driver is detaching an MSC from @class. Drop the first control and the
+ * first monitor that refer to it; resctrl_exit() is not called from here.
+ */
+void mpam_resctrl_teardown_class(struct mpam_class *class)
+{
+	struct mpam_resctrl_mon *mon;
+	struct mpam_resctrl_res *res;
+	enum resctrl_event_id eventid;
+	enum resctrl_res_level rid;
+
+	might_sleep();
+
+	for_each_mpam_resctrl_control(res, rid) {
+		if (res->class != class)
+			continue;
+		res->class = NULL;
+		break;
+	}
+	for_each_mpam_resctrl_mon(mon, eventid) {
+		if (mon->class != class)
+			continue;
+		mon->class = NULL;
+		break;
+	}
+}
+
+static int __init __cacheinfo_ready(void)
+{
+	/* Set the flag before the wake-up: mpam_resctrl_setup() waits on it. */
+	cacheinfo_ready = true;
+	wake_up(&wait_cacheinfo_ready);
+	return 0;
+}
+device_initcall_sync(__cacheinfo_ready);
+
+#ifdef CONFIG_MPAM_KUNIT_TEST
+#include "test_mpam_resctrl.c"
+#endif
diff --git a/drivers/resctrl/test_mpam_resctrl.c b/drivers/resctrl/test_mpam_resctrl.c
new file mode 100644
index 000000000000..b93d6ad87e43
--- /dev/null
+++ b/drivers/resctrl/test_mpam_resctrl.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2025 Arm Ltd.
+/* This file is intended to be included into mpam_resctrl.c */
+
+#include <kunit/test.h>
+#include <linux/array_size.h>
+#include <linux/bits.h>
+#include <linux/math.h>
+#include <linux/sprintf.h>
+
+struct percent_value_case {
+	u8	pc;	/* percentage */
+	u8	width;	/* used as bwa_wd: width of @value, in bits */
+	u16	value;	/* reference fixed-point value, below 1 << @width */
+};
+
+/*
+ * Reference vectors, each { pc, width, value }, taken from the union of
+ * ARM DDI 0598D.b, "Arm Architecture Reference Manual Supplement - Memory
+ * System Resource Partitioning and Monitoring (MPAM), for A-profile
+ * architecture", Section 9.8, "About the fixed-point fractional format"
+ * (exact percentage entries only) and ARM IHI0099B.a "MPAM system
+ * component specification", Section 9.3, "The fixed-point fractional
+ * format":
+ */
+static const struct percent_value_case percent_value_cases[] = {
+	/* Architectural cases: */
+	{   1,  8,    1 },	{   1, 12,  0x27 },	{   1, 16,  0x28e },
+	{  25,  8, 0x3f },	{  25, 12, 0x3ff },	{  25, 16, 0x3fff },
+	{  33,  8, 0x53 },	{  33, 12, 0x546 },	{  33, 16, 0x5479 },
+	{  35,  8, 0x58 },	{  35, 12, 0x598 },	{  35, 16, 0x5998 },
+	{  45,  8, 0x72 },	{  45, 12, 0x732 },	{  45, 16, 0x7332 },
+	{  50,  8, 0x7f },	{  50, 12, 0x7ff },	{  50, 16, 0x7fff },
+	{  52,  8, 0x84 },	{  52, 12, 0x850 },	{  52, 16, 0x851d },
+	{  55,  8, 0x8b },	{  55, 12, 0x8cb },	{  55, 16, 0x8ccb },
+	{  58,  8, 0x93 },	{  58, 12, 0x946 },	{  58, 16, 0x9479 },
+	{  75,  8, 0xbf },	{  75, 12, 0xbff },	{  75, 16, 0xbfff },
+	{  80,  8, 0xcb },	{  80, 12, 0xccb },	{  80, 16, 0xcccb },
+	{  88,  8, 0xe0 },	{  88, 12, 0xe13 },	{  88, 16, 0xe146 },
+	{  95,  8, 0xf2 },	{  95, 12, 0xf32 },	{  95, 16, 0xf332 },
+	{ 100,  8, 0xff },	{ 100, 12, 0xfff },	{ 100, 16, 0xffff },
+};
+
+/* Format the KUnit description string for one percent_value_cases[] entry. */
+static void test_percent_value_desc(const struct percent_value_case *param,
+				    char *desc)
+{
+	snprintf(desc, KUNIT_PARAM_DESC_SIZE, "pc=%d, width=%d, value=0x%.*x\n",
+		 param->pc, param->width, DIV_ROUND_UP(param->width, 4),
+		 param->value);
+}
+
+KUNIT_ARRAY_PARAM(test_percent_value, percent_value_cases,
+		  test_percent_value_desc);
+
+struct percent_value_test_info {
+	u32 pc;			/* result of value-to-percent conversion */
+	u32 value;		/* result of percent-to-value conversion */
+	u32 max_value;		/* largest raw value that fits in the case's width */
+	unsigned int shift;	/* 16 - width: promotes raw testcase value to 16 bits */
+};
+
+/*
+ * Run both conversions for @param (not test->param_value!): its percentage
+ * to a fixed-point MAX value in res->value, and its reference value back to
+ * a percentage in res->pc.
+ */
+static void __prepare_percent_value_test(struct kunit *test,
+					 struct percent_value_test_info *res,
+					 const struct percent_value_case *param)
+{
+	struct mpam_props props = { .bwa_wd = param->width };
+
+	/* Reject bogus test parameters that would break the tests: */
+	KUNIT_ASSERT_GE(test, param->width, 1);
+	KUNIT_ASSERT_LE(test, param->width, 16);
+	KUNIT_ASSERT_LT(test, param->value, 1 << param->width);
+
+	mpam_set_feature(mpam_feat_mbw_max, &props);
+
+	res->shift = 16 - param->width;
+	res->max_value = GENMASK_U32(param->width - 1, 0);
+	res->pc = mbw_max_to_percent(param->value << res->shift, &props);
+	res->value = percent_to_mbw_max(param->pc, &props);
+}
+
+/* Exercise mba_class_use_mbw_max() and get_mba_granularity() across bwa_wd values. */
+static void test_get_mba_granularity(struct kunit *test)
+{
+	int ret;
+	struct mpam_props fake_props = { };
+
+	/* Use MBW_MAX */
+	mpam_set_feature(mpam_feat_mbw_max, &fake_props);
+	fake_props.bwa_wd = 0;
+	KUNIT_EXPECT_FALSE(test, mba_class_use_mbw_max(&fake_props));
+
+	fake_props.bwa_wd = 1;
+	KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props));
+
+	/* Architectural maximum: */
+	fake_props.bwa_wd = 16;
+	KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props));
+
+	/* No usable control... */
+	fake_props.bwa_wd = 0;
+	ret = get_mba_granularity(&fake_props);
+	KUNIT_EXPECT_EQ(test, ret, 0);
+
+	fake_props.bwa_wd = 1;
+	ret = get_mba_granularity(&fake_props);
+	KUNIT_EXPECT_EQ(test, ret, 50);	/* DIV_ROUND_UP(100, 1 << 1)% = 50% */
+
+	fake_props.bwa_wd = 2;
+	ret = get_mba_granularity(&fake_props);
+	KUNIT_EXPECT_EQ(test, ret, 25);	/* DIV_ROUND_UP(100, 1 << 2)% = 25% */
+
+	fake_props.bwa_wd = 3;
+	ret = get_mba_granularity(&fake_props);
+	KUNIT_EXPECT_EQ(test, ret, 13);	/* DIV_ROUND_UP(100, 1 << 3)% = 13% */
+
+	fake_props.bwa_wd = 6;
+	ret = get_mba_granularity(&fake_props);
+	KUNIT_EXPECT_EQ(test, ret, 2);	/* DIV_ROUND_UP(100, 1 << 6)% = 2% */
+
+	fake_props.bwa_wd = 7;
+	ret = get_mba_granularity(&fake_props);
+	KUNIT_EXPECT_EQ(test, ret, 1);	/* DIV_ROUND_UP(100, 1 << 7)% = 1% */
+
+	/* Granularity saturates at 1% */
+	fake_props.bwa_wd = 16; /* architectural maximum */
+	ret = get_mba_granularity(&fake_props);
+	KUNIT_EXPECT_EQ(test, ret, 1);	/* DIV_ROUND_UP(100, 1 << 16)% = 1% */
+}
+
+static void test_mbw_max_to_percent(struct kunit *test)
+{
+	const struct percent_value_case *param = test->param_value;
+	struct percent_value_test_info res;
+
+	/*
+	 * Since the reference values in percent_value_cases[] all
+	 * correspond to exact percentages, round-to-nearest will
+	 * always give the exact percentage back when the MPAM max
+	 * value has precision of 0.5% or finer.  (Always true for the
+	 * reference data, since they all specify 8 bits or more of
+	 * precision.)
+	 *
+	 * So, keep it simple and demand an exact match:
+	 */
+	__prepare_percent_value_test(test, &res, param);
+	KUNIT_EXPECT_EQ(test, res.pc, param->pc);
+}
+
+static void test_percent_to_mbw_max(struct kunit *test)
+{
+	const struct percent_value_case *param = test->param_value;
+	struct percent_value_test_info res;
+
+	__prepare_percent_value_test(test, &res, param);
+	/* Allow up to one step above the reference value, but not past the max: */
+	KUNIT_EXPECT_GE(test, res.value, param->value << res.shift);
+	KUNIT_EXPECT_LE(test, res.value, (param->value + 1) << res.shift);
+	KUNIT_EXPECT_LE(test, res.value, res.max_value << res.shift);
+
+	/* No flexibility allowed for 0% and 100%! */
+
+	if (param->pc == 0)
+		KUNIT_EXPECT_EQ(test, res.value, 0);
+
+	if (param->pc == 100)
+		KUNIT_EXPECT_EQ(test, res.value, res.max_value << res.shift);
+}
+
+/* KUnit parameter generator: returns @prev plus one as a fake pointer, up to 16. */
+static const void *test_all_bwa_wd_gen_params(struct kunit *test, const void *prev,
+					      char *desc)
+{
+	uintptr_t last = (uintptr_t)prev;
+
+	/* Stop once the previously generated value was already 16 (or more). */
+	if (last >= 16)
+		return NULL;
+
+	snprintf(desc, KUNIT_PARAM_DESC_SIZE, "wd=%u\n", (unsigned int)(last + 1));
+
+	return (void *)(last + 1);
+}
+
+/* Decode the current test parameter as a bwa_wd, asserting it is within 1..16. */
+static unsigned int test_get_bwa_wd(struct kunit *test)
+{
+	uintptr_t wd = (uintptr_t)test->param_value;
+
+	KUNIT_ASSERT_GE(test, wd, 1);
+	KUNIT_ASSERT_LE(test, wd, 16);
+	return wd;
+}
+
+static void test_mbw_max_to_percent_limits(struct kunit *test)
+{
+	struct mpam_props fake_props = {0};
+	u32 max_value;
+
+	mpam_set_feature(mpam_feat_mbw_max, &fake_props);
+	fake_props.bwa_wd = test_get_bwa_wd(test);
+	max_value = GENMASK(15, 16 - fake_props.bwa_wd);
+	/* Top bwa_wd bits all set must give MAX_MBA_BW, zero must give get_mba_min(): */
+	KUNIT_EXPECT_EQ(test, mbw_max_to_percent(max_value, &fake_props),
+			MAX_MBA_BW);
+	KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props),
+			get_mba_min(&fake_props));
+
+	/*
+	 * Rounding policy dependent 0% sanity-check:
+	 * With round-to-nearest, the minimum mbw_max value really
+	 * should map to 0% if there are at least 200 steps.
+	 * (100 steps may be enough for some other rounding policies.)
+	 */
+	if (fake_props.bwa_wd >= 8)
+		KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props), 0);
+
+	if (fake_props.bwa_wd < 8 &&
+	    mbw_max_to_percent(0, &fake_props) == 0)
+		kunit_warn(test, "wd=%d: Testsuite/driver Rounding policy mismatch?",
+			   fake_props.bwa_wd);
+}
+
+/*
+ * Check that converting a percentage to mbw_max and back again (or, as
+ * appropriate, vice-versa) always restores the original value:
+ */
+static void test_percent_max_roundtrip_stability(struct kunit *test)
+{
+	struct mpam_props fake_props = {0};
+	unsigned int shift;
+	u32 pc, max, pc2, max2;
+
+	mpam_set_feature(mpam_feat_mbw_max, &fake_props);
+	fake_props.bwa_wd = test_get_bwa_wd(test);
+	shift = 16 - fake_props.bwa_wd;	/* tested mbw_max values step by 1 << shift */
+
+	/*
+	 * Converting a valid value from the coarser scale to the finer
+	 * scale and back again must yield the original value:
+	 */
+	if (fake_props.bwa_wd >= 7) {
+		/* More than 100 steps: only test exact pc values: */
+		for (pc = get_mba_min(&fake_props); pc <= MAX_MBA_BW; pc++) {
+			max = percent_to_mbw_max(pc, &fake_props);
+			pc2 = mbw_max_to_percent(max, &fake_props);
+			KUNIT_EXPECT_EQ(test, pc2, pc);
+		}
+	} else {
+		/* Fewer than 100 steps: only test exact mbw_max values: */
+		for (max = 0; max < 1 << 16; max += 1 << shift) {
+			pc = mbw_max_to_percent(max, &fake_props);
+			max2 = percent_to_mbw_max(pc, &fake_props);
+			KUNIT_EXPECT_EQ(test, max2, max);
+		}
+	}
+}
+
+/* Check how often percent_to_mbw_max() rounds up relative to the reference table. */
+static void test_percent_to_max_rounding(struct kunit *test)
+{
+	struct percent_value_test_info res;
+	unsigned int num_rounded_up = 0;
+	unsigned int total;
+
+	for (total = 0; total < ARRAY_SIZE(percent_value_cases); total++) {
+		const struct percent_value_case *param = &percent_value_cases[total];
+
+		__prepare_percent_value_test(test, &res, param);
+		if (res.value > param->value << res.shift)
+			num_rounded_up++;
+	}
+
+	/*
+	 * The MPAM driver applies a round-to-nearest policy, whereas the
+	 * reference table that the test vectors were selected from seems
+	 * to have been produced by rounding down.
+	 *
+	 * For a large and well-distributed suite of test vectors, about
+	 * half should be rounded up and half down compared with the
+	 * reference table.  The actual test vectors are few in number and
+	 * probably not very well distributed however, so tolerate a
+	 * round-up rate of between 1/4 and 3/4 before crying foul:
+	 */
+
+	kunit_info(test, "Round-up rate: %u%% (%u/%u)\n",
+		   DIV_ROUND_CLOSEST(num_rounded_up * 100, total),
+		   num_rounded_up, total);
+
+	KUNIT_EXPECT_GE(test, 4 * num_rounded_up, 1 * total);
+	KUNIT_EXPECT_LE(test, 4 * num_rounded_up, 3 * total);
+}
+
+static struct kunit_case mpam_resctrl_test_cases[] = {
+	KUNIT_CASE(test_get_mba_granularity),
+	KUNIT_CASE_PARAM(test_mbw_max_to_percent, test_percent_value_gen_params),
+	KUNIT_CASE_PARAM(test_percent_to_mbw_max, test_percent_value_gen_params),
+	KUNIT_CASE_PARAM(test_mbw_max_to_percent_limits, test_all_bwa_wd_gen_params),
+	KUNIT_CASE(test_percent_to_max_rounding),
+	KUNIT_CASE_PARAM(test_percent_max_roundtrip_stability,
+			 test_all_bwa_wd_gen_params),
+	{}	/* empty final entry */
+};
+/* The suite running every case above; registered by kunit_test_suites(). */
+static struct kunit_suite mpam_resctrl_test_suite = {
+	.name = "mpam_resctrl_test_suite",
+	.test_cases = mpam_resctrl_test_cases,
+};
+
+kunit_test_suites(&mpam_resctrl_test_suite);
diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h
index 7f00c5285a32..f92a36187a52 100644
--- a/include/linux/arm_mpam.h
+++ b/include/linux/arm_mpam.h
@@ -5,6 +5,7 @@
 #define __LINUX_ARM_MPAM_H
 
 #include <linux/acpi.h>
+#include <linux/resctrl_types.h>
 #include <linux/types.h>
 
 struct mpam_msc;
@@ -49,6 +50,37 @@ static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,
 }
 #endif
 
+bool resctrl_arch_alloc_capable(void);
+bool resctrl_arch_mon_capable(void);
+
+void resctrl_arch_set_cpu_default_closid(int cpu, u32 closid);
+void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid);
+void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid);
+void resctrl_arch_sched_in(struct task_struct *tsk);
+bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid);
+bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid);
+u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid);
+void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid);
+u32 resctrl_arch_system_num_rmid_idx(void);
+
+struct rdt_resource;
+void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid);
+void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx);
+
+/*
+ * The CPU configuration for MPAM is cheap to write, and is only written if it
+ * has changed. No need for fine grained enables, so these hooks are empty.
+ */
+static inline void resctrl_arch_enable_mon(void) { }
+static inline void resctrl_arch_disable_mon(void) { }
+static inline void resctrl_arch_enable_alloc(void) { }
+static inline void resctrl_arch_disable_alloc(void) { }
+
+static inline unsigned int resctrl_arch_round_mon_val(unsigned int val)
+{
+	return val;	/* @val is handed back unchanged */
+}
+
 /**
  * mpam_register_requestor() - Register a requestor with the MPAM driver
  * @partid_max:		The maximum PARTID value the requestor can generate.
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index d223246401bc..e04d67e999a1 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -324,7 +324,7 @@ static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
 {
 	instrumentation_begin();
 	syscall_exit_to_user_mode_work(regs);
-	local_irq_disable_exit_to_user();
+	local_irq_disable();
 	syscall_exit_to_user_mode_prepare(regs);
 	instrumentation_end();
 	exit_to_user_mode();
diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index b976946b3cdb..7ab41eec549f 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -109,37 +109,6 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
 	instrumentation_end();
 }
 
-/**
- * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable()
- * @ti_work:	Cached TIF flags gathered with interrupts disabled
- *
- * Defaults to local_irq_enable(). Can be supplied by architecture specific
- * code.
- */
-static inline void local_irq_enable_exit_to_user(unsigned long ti_work);
-
-#ifndef local_irq_enable_exit_to_user
-static __always_inline void local_irq_enable_exit_to_user(unsigned long ti_work)
-{
-	local_irq_enable();
-}
-#endif
-
-/**
- * local_irq_disable_exit_to_user - Exit to user variant of local_irq_disable()
- *
- * Defaults to local_irq_disable(). Can be supplied by architecture specific
- * code.
- */
-static inline void local_irq_disable_exit_to_user(void);
-
-#ifndef local_irq_disable_exit_to_user
-static __always_inline void local_irq_disable_exit_to_user(void)
-{
-	local_irq_disable();
-}
-#endif
-
 /**
  * arch_exit_to_user_mode_work - Architecture specific TIF work for exit
  *				 to user mode.
@@ -348,6 +317,8 @@ static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs)
  */
 static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs)
 {
+	lockdep_assert_irqs_disabled();
+
 	instrumentation_begin();
 	irqentry_exit_to_user_mode_prepare(regs);
 	instrumentation_end();
@@ -378,6 +349,207 @@ typedef struct irqentry_state {
 } irqentry_state_t;
 #endif
 
+/**
+ * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt
+ *
+ * Conditional reschedule with additional sanity checks.
+ */
+void raw_irqentry_exit_cond_resched(void);
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#define irqentry_exit_cond_resched_dynamic_enabled	raw_irqentry_exit_cond_resched
+#define irqentry_exit_cond_resched_dynamic_disabled	NULL
+DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
+#define irqentry_exit_cond_resched()	static_call(irqentry_exit_cond_resched)()
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
+void dynamic_irqentry_exit_cond_resched(void);
+#define irqentry_exit_cond_resched()	dynamic_irqentry_exit_cond_resched()
+#endif
+#else /* CONFIG_PREEMPT_DYNAMIC */
+#define irqentry_exit_cond_resched()	raw_irqentry_exit_cond_resched()
+#endif /* CONFIG_PREEMPT_DYNAMIC */
+
+/**
+ * irqentry_enter_from_kernel_mode - Establish state before invoking the irq handler
+ * @regs:	Pointer to current's pt_regs
+ *
+ * Invoked from architecture specific entry code with interrupts disabled.
+ * Can only be called when the interrupt entry came from kernel mode. The
+ * calling code must be non-instrumentable.  When the function returns all
+ * state is correct and the subsequent functions can be instrumented.
+ *
+ * The function establishes state (lockdep, RCU (context tracking), tracing) and
+ * is provided for architectures which require a strict split between entry from
+ * kernel and user mode and therefore cannot use irqentry_enter() which handles
+ * both entry modes.
+ *
+ * Returns: An opaque object that must be passed to irqentry_exit_to_kernel_mode().
+ */
+static __always_inline irqentry_state_t irqentry_enter_from_kernel_mode(struct pt_regs *regs)
+{
+	irqentry_state_t ret = {
+		.exit_rcu = false,
+	};
+
+	/*
+	 * If this entry hit the idle task invoke ct_irq_enter() whether
+	 * RCU is watching or not.
+	 *
+	 * Interrupts can nest when the first interrupt invokes softirq
+	 * processing on return which enables interrupts.
+	 *
+	 * Scheduler ticks in the idle task can mark quiescent state and
+	 * terminate a grace period, if and only if the timer interrupt is
+	 * not nested into another interrupt.
+	 *
+	 * Checking for rcu_is_watching() here would prevent the nesting
+	 * interrupt to invoke ct_irq_enter(). If that nested interrupt is
+	 * the tick then rcu_flavor_sched_clock_irq() would wrongfully
+	 * assume that it is the first interrupt and eventually claim
+	 * quiescent state and end grace periods prematurely.
+	 *
+	 * Unconditionally invoke ct_irq_enter() so RCU state stays
+	 * consistent.
+	 *
+	 * TINY_RCU does not support EQS, so let the compiler eliminate
+	 * this part when enabled.
+	 */
+	if (!IS_ENABLED(CONFIG_TINY_RCU) &&
+	    (is_idle_task(current) || arch_in_rcu_eqs())) {
+		/*
+		 * If RCU is not watching then the same careful
+		 * sequence vs. lockdep and tracing is required
+		 * as in irqentry_enter_from_user_mode().
+		 */
+		lockdep_hardirqs_off(CALLER_ADDR0);
+		ct_irq_enter();
+		instrumentation_begin();
+		kmsan_unpoison_entry_regs(regs);
+		trace_hardirqs_off_finish();
+		instrumentation_end();
+
+		ret.exit_rcu = true;
+		return ret;
+	}
+
+	/*
+	 * If RCU is watching then RCU only wants to check whether it needs
+	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
+	 * already contains a warning when RCU is not watching, so no point
+	 * in having another one here.
+	 */
+	lockdep_hardirqs_off(CALLER_ADDR0);
+	instrumentation_begin();
+	kmsan_unpoison_entry_regs(regs);
+	rcu_irq_enter_check_tick();
+	trace_hardirqs_off_finish();
+	instrumentation_end();
+
+	return ret;
+}
+
+/**
+ * irqentry_exit_to_kernel_mode_preempt - Run preempt checks on return to kernel mode
+ * @regs:	Pointer to current's pt_regs
+ * @state:	Return value from matching call to irqentry_enter_from_kernel_mode()
+ *
+ * To be invoked before irqentry_exit_to_kernel_mode_after_preempt(), with
+ * interrupts disabled and CPU state which allows kernel preemption.
+ *
+ * Does nothing when the interrupted context had interrupts disabled. Otherwise
+ * reschedules if required, except when RCU was not watching on entry, and
+ * invokes hrtimer_rearm_deferred(), also when @state.exit_rcu is set.
+ *
+ * After returning from this function, the caller can modify CPU state before
+ * invoking irqentry_exit_to_kernel_mode_after_preempt(), which is required to
+ * re-establish the tracing, lockdep and RCU state for returning to the
+ * interrupted context.
+ */
+static inline void irqentry_exit_to_kernel_mode_preempt(struct pt_regs *regs,
+							irqentry_state_t state)
+{
+	if (regs_irqs_disabled(regs))
+		return;
+
+	if (IS_ENABLED(CONFIG_PREEMPTION) && !state.exit_rcu)
+		irqentry_exit_cond_resched();
+	/* Not conditional on exit_rcu, unlike the reschedule above */
+	hrtimer_rearm_deferred();
+}
+
+/**
+ * irqentry_exit_to_kernel_mode_after_preempt - Establish trace, lockdep and RCU state
+ * @regs:	Pointer to current's pt_regs
+ * @state:	Return value from matching call to irqentry_enter_from_kernel_mode()
+ *
+ * This is to be invoked after irqentry_exit_to_kernel_mode_preempt() and before
+ * actually returning to the interrupted context.
+ *
+ * There are no requirements for the CPU state other than being able to complete
+ * the tracing, lockdep and RCU state transitions. After this function returns
+ * the caller must return directly to the interrupted context.
+ */
+static __always_inline void
+irqentry_exit_to_kernel_mode_after_preempt(struct pt_regs *regs, irqentry_state_t state)
+{
+	if (!regs_irqs_disabled(regs)) {
+		/*
+		 * If RCU was not watching on entry this needs to be done
+		 * carefully and needs the same ordering of lockdep/tracing
+		 * and RCU as the return to user mode path.
+		 */
+		if (state.exit_rcu) {
+			instrumentation_begin();
+			/* Tell the tracer that IRET will enable interrupts */
+			trace_hardirqs_on_prepare();
+			lockdep_hardirqs_on_prepare();
+			instrumentation_end();
+			ct_irq_exit();
+			lockdep_hardirqs_on(CALLER_ADDR0);
+			return;
+		}
+
+		instrumentation_begin();
+		/* Covers both tracing and lockdep */
+		trace_hardirqs_on();
+		instrumentation_end();
+	} else {
+		/*
+		 * IRQ flags state is correct already. Just tell RCU if it
+		 * was not watching on entry.
+		 */
+		if (state.exit_rcu)
+			ct_irq_exit();
+	}
+}
+
+/**
+ * irqentry_exit_to_kernel_mode - Run preempt checks and establish state after
+ *				  invoking the interrupt handler
+ * @regs:	Pointer to current's pt_regs
+ * @state:	Return value from matching call to irqentry_enter_from_kernel_mode()
+ *
+ * This is the counterpart of irqentry_enter_from_kernel_mode() and combines
+ * the calls to irqentry_exit_to_kernel_mode_preempt() and
+ * irqentry_exit_to_kernel_mode_after_preempt().
+ *
+ * The requirement for the CPU state is that it can schedule. After the function
+ * returns the tracing, lockdep and RCU state transitions are completed and the
+ * caller must return directly to the interrupted context.
+ */
+static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs,
+							 irqentry_state_t state)
+{
+	lockdep_assert_irqs_disabled();
+
+	instrumentation_begin();
+	irqentry_exit_to_kernel_mode_preempt(regs, state);
+	instrumentation_end();
+
+	irqentry_exit_to_kernel_mode_after_preempt(regs, state);
+}
+
 /**
  * irqentry_enter - Handle state tracking on ordinary interrupt entries
  * @regs:	Pointer to pt_regs of interrupted context
@@ -407,32 +579,10 @@ typedef struct irqentry_state {
  * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
  * would not be possible.
  *
- * Returns: An opaque object that must be passed to idtentry_exit()
+ * Returns: An opaque object that must be passed to irqentry_exit()
  */
 irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs);
 
-/**
- * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt
- *
- * Conditional reschedule with additional sanity checks.
- */
-void raw_irqentry_exit_cond_resched(void);
-
-#ifdef CONFIG_PREEMPT_DYNAMIC
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#define irqentry_exit_cond_resched_dynamic_enabled	raw_irqentry_exit_cond_resched
-#define irqentry_exit_cond_resched_dynamic_disabled	NULL
-DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
-#define irqentry_exit_cond_resched()	static_call(irqentry_exit_cond_resched)()
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
-DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
-void dynamic_irqentry_exit_cond_resched(void);
-#define irqentry_exit_cond_resched()	dynamic_irqentry_exit_cond_resched()
-#endif
-#else /* CONFIG_PREEMPT_DYNAMIC */
-#define irqentry_exit_cond_resched()	raw_irqentry_exit_cond_resched()
-#endif /* CONFIG_PREEMPT_DYNAMIC */
-
 /**
  * irqentry_exit - Handle return from exception that used irqentry_enter()
  * @regs:	Pointer to pt_regs (exception entry regs)
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 9e1a6afb07f2..19d2244a9fef 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -47,7 +47,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re
 	 */
 	while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) {
 
-		local_irq_enable_exit_to_user(ti_work);
+		local_irq_enable();
 
 		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
 			if (!rseq_grant_slice_extension(ti_work, TIF_SLICE_EXT_DENY))
@@ -74,7 +74,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re
 		 * might have changed while interrupts and preemption was
 		 * enabled above.
 		 */
-		local_irq_disable_exit_to_user();
+		local_irq_disable();
 
 		/* Check if any of the above work has queued a deferred wakeup */
 		tick_nohz_user_enter_prepare();
@@ -105,70 +105,16 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 
 noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
 {
-	irqentry_state_t ret = {
-		.exit_rcu = false,
-	};
-
 	if (user_mode(regs)) {
+		irqentry_state_t ret = {
+			.exit_rcu = false,
+		};
+
 		irqentry_enter_from_user_mode(regs);
 		return ret;
 	}
 
-	/*
-	 * If this entry hit the idle task invoke ct_irq_enter() whether
-	 * RCU is watching or not.
-	 *
-	 * Interrupts can nest when the first interrupt invokes softirq
-	 * processing on return which enables interrupts.
-	 *
-	 * Scheduler ticks in the idle task can mark quiescent state and
-	 * terminate a grace period, if and only if the timer interrupt is
-	 * not nested into another interrupt.
-	 *
-	 * Checking for rcu_is_watching() here would prevent the nesting
-	 * interrupt to invoke ct_irq_enter(). If that nested interrupt is
-	 * the tick then rcu_flavor_sched_clock_irq() would wrongfully
-	 * assume that it is the first interrupt and eventually claim
-	 * quiescent state and end grace periods prematurely.
-	 *
-	 * Unconditionally invoke ct_irq_enter() so RCU state stays
-	 * consistent.
-	 *
-	 * TINY_RCU does not support EQS, so let the compiler eliminate
-	 * this part when enabled.
-	 */
-	if (!IS_ENABLED(CONFIG_TINY_RCU) &&
-	    (is_idle_task(current) || arch_in_rcu_eqs())) {
-		/*
-		 * If RCU is not watching then the same careful
-		 * sequence vs. lockdep and tracing is required
-		 * as in irqentry_enter_from_user_mode().
-		 */
-		lockdep_hardirqs_off(CALLER_ADDR0);
-		ct_irq_enter();
-		instrumentation_begin();
-		kmsan_unpoison_entry_regs(regs);
-		trace_hardirqs_off_finish();
-		instrumentation_end();
-
-		ret.exit_rcu = true;
-		return ret;
-	}
-
-	/*
-	 * If RCU is watching then RCU only wants to check whether it needs
-	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
-	 * already contains a warning when RCU is not watching, so no point
-	 * in having another one here.
-	 */
-	lockdep_hardirqs_off(CALLER_ADDR0);
-	instrumentation_begin();
-	kmsan_unpoison_entry_regs(regs);
-	rcu_irq_enter_check_tick();
-	trace_hardirqs_off_finish();
-	instrumentation_end();
-
-	return ret;
+	return irqentry_enter_from_kernel_mode(regs);
 }
 
 /**
@@ -212,45 +158,10 @@ void dynamic_irqentry_exit_cond_resched(void)
 
 noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
 {
-	lockdep_assert_irqs_disabled();
-
-	/* Check whether this returns to user mode */
-	if (user_mode(regs)) {
+	if (user_mode(regs))
 		irqentry_exit_to_user_mode(regs);
-	} else if (!regs_irqs_disabled(regs)) {
-		/*
-		 * If RCU was not watching on entry this needs to be done
-		 * carefully and needs the same ordering of lockdep/tracing
-		 * and RCU as the return to user mode path.
-		 */
-		if (state.exit_rcu) {
-			instrumentation_begin();
-			hrtimer_rearm_deferred();
-			/* Tell the tracer that IRET will enable interrupts */
-			trace_hardirqs_on_prepare();
-			lockdep_hardirqs_on_prepare();
-			instrumentation_end();
-			ct_irq_exit();
-			lockdep_hardirqs_on(CALLER_ADDR0);
-			return;
-		}
-
-		instrumentation_begin();
-		if (IS_ENABLED(CONFIG_PREEMPTION))
-			irqentry_exit_cond_resched();
-
-		hrtimer_rearm_deferred();
-		/* Covers both tracing and lockdep */
-		trace_hardirqs_on();
-		instrumentation_end();
-	} else {
-		/*
-		 * IRQ flags state is correct already. Just tell RCU if it
-		 * was not watching on entry.
-		 */
-		if (state.exit_rcu)
-			ct_irq_exit();
-	}
+	else
+		irqentry_exit_to_kernel_mode(regs, state);
 }
 
 irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c
index c2661a312fc9..e22703d6b97c 100644
--- a/tools/testing/selftests/arm64/abi/hwcap.c
+++ b/tools/testing/selftests/arm64/abi/hwcap.c
@@ -56,7 +56,8 @@ static void atomics_sigill(void)
 
 static void cmpbr_sigill(void)
 {
-	/* Not implemented, too complicated and unreliable anyway */
+	asm volatile(".inst 0x74C00040\n" /* CBEQ w0, w0, +8 */
+		     "udf #0" : : : "cc"); /* UDF #0 */
 }
 
 static void crc32_sigill(void)
diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c
index 73de5be58bab..fa3478a6c914 100644
--- a/tools/testing/selftests/kvm/arm64/set_id_regs.c
+++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c
@@ -124,6 +124,7 @@ static const struct reg_ftr_bits ftr_id_aa64isar2_el1[] = {
 
 static const struct reg_ftr_bits ftr_id_aa64isar3_el1[] = {
 	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, FPRCVT, 0),
+	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, LSUI, 0),
 	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, LSFE, 0),
 	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, FAMINMAX, 0),
 	REG_FTR_END,