diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst index 47d9a3df6329..aa12708ddb96 100644 --- a/Documentation/admin-guide/perf/index.rst +++ b/Documentation/admin-guide/perf/index.rst @@ -24,7 +24,8 @@ Performance monitor support thunderx2-pmu alibaba_pmu dwc_pcie_pmu - nvidia-pmu + nvidia-tegra241-pmu + nvidia-tegra410-pmu meson-ddr-pmu cxl ampere_cspmu diff --git a/Documentation/admin-guide/perf/nvidia-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra241-pmu.rst similarity index 98% rename from Documentation/admin-guide/perf/nvidia-pmu.rst rename to Documentation/admin-guide/perf/nvidia-tegra241-pmu.rst index f538ef67e0e8..fad5bc4cee6c 100644 --- a/Documentation/admin-guide/perf/nvidia-pmu.rst +++ b/Documentation/admin-guide/perf/nvidia-tegra241-pmu.rst @@ -1,8 +1,8 @@ -========================================================= -NVIDIA Tegra SoC Uncore Performance Monitoring Unit (PMU) -========================================================= +============================================================ +NVIDIA Tegra241 SoC Uncore Performance Monitoring Unit (PMU) +============================================================ -The NVIDIA Tegra SoC includes various system PMUs to measure key performance +The NVIDIA Tegra241 SoC includes various system PMUs to measure key performance metrics like memory bandwidth, latency, and utilization: * Scalable Coherency Fabric (SCF) diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst new file mode 100644 index 000000000000..0656223b61d4 --- /dev/null +++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst @@ -0,0 +1,522 @@ +===================================================================== +NVIDIA Tegra410 SoC Uncore Performance Monitoring Unit (PMU) +===================================================================== + +The NVIDIA Tegra410 SoC includes various system PMUs to measure key 
performance +metrics like memory bandwidth, latency, and utilization: + +* Unified Coherence Fabric (UCF) +* PCIE +* PCIE-TGT +* CPU Memory (CMEM) Latency +* NVLink-C2C +* NV-CLink +* NV-DLink + +PMU Driver +---------- + +The PMU driver describes the available events and configuration of each PMU in +sysfs. Please see the sections below to get the sysfs path of each PMU. Like +other uncore PMU drivers, the driver provides a "cpumask" sysfs attribute to show +the CPU id used to handle the PMU event. There is also an "associated_cpus" +sysfs attribute, which contains a list of CPUs associated with the PMU instance. + +UCF PMU +------- + +The Unified Coherence Fabric (UCF) in the NVIDIA Tegra410 SoC serves as a +distributed cache, last level for CPU Memory and CXL Memory, and cache coherent +interconnect that supports hardware coherence across multiple coherently caching +agents, including: + + * CPU clusters + * GPU + * PCIe Ordering Controller Unit (OCU) + * Other IO-coherent requesters + +The events and configuration options of this PMU device are described in sysfs, +see /sys/bus/event_source/devices/nvidia_ucf_pmu_<socket-id>. + +Some of the events available in this PMU can be used to measure bandwidth and +utilization: + + * slc_access_rd: count the number of read requests to SLC. + * slc_access_wr: count the number of write requests to SLC. + * slc_bytes_rd: count the number of bytes transferred by slc_access_rd. + * slc_bytes_wr: count the number of bytes transferred by slc_access_wr. + * mem_access_rd: count the number of read requests to local or remote memory. + * mem_access_wr: count the number of write requests to local or remote memory. + * mem_bytes_rd: count the number of bytes transferred by mem_access_rd. + * mem_bytes_wr: count the number of bytes transferred by mem_access_wr. + * cycles: count the UCF cycles. 
+ +The average bandwidth is calculated as:: + + AVG_SLC_READ_BANDWIDTH_IN_GBPS = SLC_BYTES_RD / ELAPSED_TIME_IN_NS + AVG_SLC_WRITE_BANDWIDTH_IN_GBPS = SLC_BYTES_WR / ELAPSED_TIME_IN_NS + AVG_MEM_READ_BANDWIDTH_IN_GBPS = MEM_BYTES_RD / ELAPSED_TIME_IN_NS + AVG_MEM_WRITE_BANDWIDTH_IN_GBPS = MEM_BYTES_WR / ELAPSED_TIME_IN_NS + +The average request rate is calculated as:: + + AVG_SLC_READ_REQUEST_RATE = SLC_ACCESS_RD / CYCLES + AVG_SLC_WRITE_REQUEST_RATE = SLC_ACCESS_WR / CYCLES + AVG_MEM_READ_REQUEST_RATE = MEM_ACCESS_RD / CYCLES + AVG_MEM_WRITE_REQUEST_RATE = MEM_ACCESS_WR / CYCLES + +More details about what other events are available can be found in the Tegra410 SoC +technical reference manual. + +The events can be filtered based on source or destination. The source filter +indicates the traffic initiator to the SLC, e.g. local CPU, non-CPU device, or +remote socket. The destination filter specifies the destination memory type, +e.g. local system memory (CMEM), local GPU memory (GMEM), or remote memory. The +local/remote classification of the destination filter is based on the home +socket of the address, not where the data actually resides. The available +filters are described in +/sys/bus/event_source/devices/nvidia_ucf_pmu_<socket-id>/format/. + +The list of UCF PMU event filters: + +* Source filter: + + * src_loc_cpu: if set, count events from local CPU + * src_loc_noncpu: if set, count events from local non-CPU device + * src_rem: if set, count events from CPU, GPU, PCIE devices of remote socket + +* Destination filter: + + * dst_loc_cmem: if set, count events to local system memory (CMEM) address + * dst_loc_gmem: if set, count events to local GPU memory (GMEM) address + * dst_loc_other: if set, count events to local CXL memory address + * dst_rem: if set, count events to CPU, GPU, and CXL memory address of remote socket + +If the source is not specified, the PMU will count events from all sources. 
If +the destination is not specified, the PMU will count events to all destinations. + +Example usage: + +* Count event id 0x0 in socket 0 from all sources and to all destinations:: + + perf stat -a -e nvidia_ucf_pmu_0/event=0x0/ + +* Count event id 0x0 in socket 0 with source filter = local CPU and destination + filter = local system memory (CMEM):: + + perf stat -a -e nvidia_ucf_pmu_0/event=0x0,src_loc_cpu=0x1,dst_loc_cmem=0x1/ + +* Count event id 0x0 in socket 1 with source filter = local non-CPU device and + destination filter = remote memory:: + + perf stat -a -e nvidia_ucf_pmu_1/event=0x0,src_loc_noncpu=0x1,dst_rem=0x1/ + +PCIE PMU +-------- + +This PMU is located in the SOC fabric connecting the PCIE root complex (RC) and +the memory subsystem. It monitors all read/write traffic from the root port(s) +or a particular BDF in a PCIE RC to local or remote memory. There is one PMU per +PCIE RC in the SoC. Each RC can have up to 16 lanes that can be bifurcated into +up to 8 root ports. The traffic from each root port can be filtered using RP or +BDF filter. For example, specifying "src_rp_mask=0xFF" means the PMU counter will +capture traffic from all RPs. Please see below for more details. + +The events and configuration options of this PMU device are described in sysfs, +see /sys/bus/event_source/devices/nvidia_pcie_pmu_<socket-id>_rc_<pcie-rc-id>. + +The events in this PMU can be used to measure bandwidth, utilization, and +latency: + + * rd_req: count the number of read requests by PCIE device. + * wr_req: count the number of write requests by PCIE device. + * rd_bytes: count the number of bytes transferred by rd_req. + * wr_bytes: count the number of bytes transferred by wr_req. + * rd_cum_outs: count outstanding rd_req each cycle. + * cycles: count the clock cycles of SOC fabric connected to the PCIE interface. 
+ +The average bandwidth is calculated as:: + + AVG_RD_BANDWIDTH_IN_GBPS = RD_BYTES / ELAPSED_TIME_IN_NS + AVG_WR_BANDWIDTH_IN_GBPS = WR_BYTES / ELAPSED_TIME_IN_NS + +The average request rate is calculated as:: + + AVG_RD_REQUEST_RATE = RD_REQ / CYCLES + AVG_WR_REQUEST_RATE = WR_REQ / CYCLES + + +The average latency is calculated as:: + + FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS + AVG_LATENCY_IN_CYCLES = RD_CUM_OUTS / RD_REQ + AVERAGE_LATENCY_IN_NS = AVG_LATENCY_IN_CYCLES / FREQ_IN_GHZ + +The PMU events can be filtered based on the traffic source and destination. +The source filter indicates the PCIE devices that will be monitored. The +destination filter specifies the destination memory type, e.g. local system +memory (CMEM), local GPU memory (GMEM), or remote memory. The local/remote +classification of the destination filter is based on the home socket of the +address, not where the data actually resides. These filters can be found in +/sys/bus/event_source/devices/nvidia_pcie_pmu_<socket-id>_rc_<pcie-rc-id>/format/. + +The list of event filters: + +* Source filter: + + * src_rp_mask: bitmask of root ports that will be monitored. Each bit in this + bitmask represents the RP index in the RC. If the bit is set, all devices under + the associated RP will be monitored. E.g. "src_rp_mask=0xF" will monitor + devices in root port 0 to 3. + * src_bdf: the BDF that will be monitored. This is a 16-bit value that + follows formula: (bus << 8) + (device << 3) + (function). For example, the + value of BDF 27:01.1 is 0x2709. + * src_bdf_en: enable the BDF filter. If this is set, the BDF filter value in + "src_bdf" is used to filter the traffic. + + Note that Root-Port and BDF filters are mutually exclusive and the PMU in + each RC can only have one BDF filter for all counters. If BDF filter + is enabled, the BDF filter value will be applied to all events. 
+ +* Destination filter: + + * dst_loc_cmem: if set, count events to local system memory (CMEM) address + * dst_loc_gmem: if set, count events to local GPU memory (GMEM) address + * dst_loc_pcie_p2p: if set, count events to local PCIE peer address + * dst_loc_pcie_cxl: if set, count events to local CXL memory address + * dst_rem: if set, count events to remote memory address + +If the source filter is not specified, the PMU will count events from all root +ports. If the destination filter is not specified, the PMU will count events +to all destinations. + +Example usage: + +* Count event id 0x0 from root port 0 of PCIE RC-0 on socket 0 targeting all + destinations:: + + perf stat -a -e nvidia_pcie_pmu_0_rc_0/event=0x0,src_rp_mask=0x1/ + +* Count event id 0x1 from root port 0 and 1 of PCIE RC-1 on socket 0 and + targeting just local CMEM of socket 0:: + + perf stat -a -e nvidia_pcie_pmu_0_rc_1/event=0x1,src_rp_mask=0x3,dst_loc_cmem=0x1/ + +* Count event id 0x2 from root port 0 of PCIE RC-2 on socket 1 targeting all + destinations:: + + perf stat -a -e nvidia_pcie_pmu_1_rc_2/event=0x2,src_rp_mask=0x1/ + +* Count event id 0x3 from root port 0 and 1 of PCIE RC-3 on socket 1 and + targeting just local CMEM of socket 1:: + + perf stat -a -e nvidia_pcie_pmu_1_rc_3/event=0x3,src_rp_mask=0x3,dst_loc_cmem=0x1/ + +* Count event id 0x4 from BDF 01:01.0 of PCIE RC-4 on socket 0 targeting all + destinations:: + + perf stat -a -e nvidia_pcie_pmu_0_rc_4/event=0x4,src_bdf=0x0108,src_bdf_en=0x1/ + +.. _NVIDIA_T410_PCIE_PMU_RC_Mapping_Section: + +Mapping the RC# to lspci segment number +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Mapping the RC# to lspci segment number can be non-trivial; hence a new NVIDIA +Designated Vendor-Specific Extended Capability (DVSEC) register is added into the PCIE config space +for each RP. This DVSEC has vendor id "10de" and DVSEC id of "0x4". 
The DVSEC register +contains the following information to map PCIE devices under the RP back to its RC# : + + - Bus# (byte 0xc) : bus number as reported by the lspci output + - Segment# (byte 0xd) : segment number as reported by the lspci output + - RP# (byte 0xe) : port number as reported by LnkCap attribute from lspci for a device with Root Port capability + - RC# (byte 0xf): root complex number associated with the RP + - Socket# (byte 0x10): socket number associated with the RP + +Example script for mapping lspci BDF to RC# and socket#:: + + #!/bin/bash + while read bdf rest; do + dvsec4_reg=$(lspci -vv -s $bdf | awk ' + /Designated Vendor-Specific: Vendor=10de ID=0004/ { + match($0, /\[([0-9a-fA-F]+)/, arr); + print "0x" arr[1]; + exit + } + ') + if [ -n "$dvsec4_reg" ]; then + bus=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xc))).b) + segment=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xd))).b) + rp=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xe))).b) + rc=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xf))).b) + socket=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0x10))).b) + echo "$bdf: Bus=$bus, Segment=$segment, RP=$rp, RC=$rc, Socket=$socket" + fi + done < <(lspci -d 10de:) + +Example output:: + + 0001:00:00.0: Bus=00, Segment=01, RP=00, RC=00, Socket=00 + 0002:80:00.0: Bus=80, Segment=02, RP=01, RC=01, Socket=00 + 0002:a0:00.0: Bus=a0, Segment=02, RP=02, RC=01, Socket=00 + 0002:c0:00.0: Bus=c0, Segment=02, RP=03, RC=01, Socket=00 + 0002:e0:00.0: Bus=e0, Segment=02, RP=04, RC=01, Socket=00 + 0003:00:00.0: Bus=00, Segment=03, RP=00, RC=02, Socket=00 + 0004:00:00.0: Bus=00, Segment=04, RP=00, RC=03, Socket=00 + 0005:00:00.0: Bus=00, Segment=05, RP=00, RC=04, Socket=00 + 0005:40:00.0: Bus=40, Segment=05, RP=01, RC=04, Socket=00 + 0005:c0:00.0: Bus=c0, Segment=05, RP=02, RC=04, Socket=00 + 0006:00:00.0: Bus=00, Segment=06, RP=00, RC=05, Socket=00 + 0009:00:00.0: Bus=00, Segment=09, RP=00, RC=00, Socket=01 + 
000a:80:00.0: Bus=80, Segment=0a, RP=01, RC=01, Socket=01 + 000a:a0:00.0: Bus=a0, Segment=0a, RP=02, RC=01, Socket=01 + 000a:e0:00.0: Bus=e0, Segment=0a, RP=03, RC=01, Socket=01 + 000b:00:00.0: Bus=00, Segment=0b, RP=00, RC=02, Socket=01 + 000c:00:00.0: Bus=00, Segment=0c, RP=00, RC=03, Socket=01 + 000d:00:00.0: Bus=00, Segment=0d, RP=00, RC=04, Socket=01 + 000d:40:00.0: Bus=40, Segment=0d, RP=01, RC=04, Socket=01 + 000d:c0:00.0: Bus=c0, Segment=0d, RP=02, RC=04, Socket=01 + 000e:00:00.0: Bus=00, Segment=0e, RP=00, RC=05, Socket=01 + +PCIE-TGT PMU +------------ + +This PMU is located in the SOC fabric connecting the PCIE root complex (RC) and +the memory subsystem. It monitors traffic targeting PCIE BAR and CXL HDM ranges. +There is one PCIE-TGT PMU per PCIE RC in the SoC. Each RC in Tegra410 SoC can +have up to 16 lanes that can be bifurcated into up to 8 root ports (RP). The PMU +provides RP filter to count PCIE BAR traffic to each RP and address filter to +count access to PCIE BAR or CXL HDM ranges. The details of the filters are +described in the following sections. + +Mapping the RC# to lspci segment number is similar to the PCIE PMU. Please see +:ref:`NVIDIA_T410_PCIE_PMU_RC_Mapping_Section` for more info. + +The events and configuration options of this PMU device are available in sysfs, +see /sys/bus/event_source/devices/nvidia_pcie_tgt_pmu_<socket-id>_rc_<pcie-rc-id>. + +The events in this PMU can be used to measure bandwidth and utilization: + + * rd_req: count the number of read requests to PCIE. + * wr_req: count the number of write requests to PCIE. + * rd_bytes: count the number of bytes transferred by rd_req. + * wr_bytes: count the number of bytes transferred by wr_req. + * cycles: count the clock cycles of SOC fabric connected to the PCIE interface. 
+ +The average bandwidth is calculated as:: + + AVG_RD_BANDWIDTH_IN_GBPS = RD_BYTES / ELAPSED_TIME_IN_NS + AVG_WR_BANDWIDTH_IN_GBPS = WR_BYTES / ELAPSED_TIME_IN_NS + +The average request rate is calculated as:: + + AVG_RD_REQUEST_RATE = RD_REQ / CYCLES + AVG_WR_REQUEST_RATE = WR_REQ / CYCLES + +The PMU events can be filtered based on the destination root port or target +address range. Filtering based on RP is only available for PCIE BAR traffic. +Address filter works for both PCIE BAR and CXL HDM ranges. These filters can be +found in sysfs, see +/sys/bus/event_source/devices/nvidia_pcie_tgt_pmu_<socket-id>_rc_<pcie-rc-id>/format/. + +Destination filter settings: + +* dst_rp_mask: bitmask to select the root port(s) to monitor. E.g. "dst_rp_mask=0xFF" + corresponds to all root ports (from 0 to 7) in the PCIE RC. Note that this filter is + only available for PCIE BAR traffic. +* dst_addr_base: BAR or CXL HDM filter base address. +* dst_addr_mask: BAR or CXL HDM filter address mask. +* dst_addr_en: enable BAR or CXL HDM address range filter. If this is set, the + address range specified by "dst_addr_base" and "dst_addr_mask" will be used to filter + the PCIE BAR and CXL HDM traffic address. The PMU uses the following comparison + to determine if the traffic destination address falls within the filter range:: + + (txn's addr & dst_addr_mask) == (dst_addr_base & dst_addr_mask) + + If the comparison succeeds, then the event will be counted. + +If the destination filter is not specified, the RP filter will be configured by default +to count PCIE BAR traffic to all root ports. 
+ +Example usage: + +* Count event id 0x0 to root port 0 and 1 of PCIE RC-0 on socket 0:: + + perf stat -a -e nvidia_pcie_tgt_pmu_0_rc_0/event=0x0,dst_rp_mask=0x3/ + +* Count event id 0x1 for accesses to PCIE BAR or CXL HDM address range + 0x10000 to 0x100FF on socket 0's PCIE RC-1:: + + perf stat -a -e nvidia_pcie_tgt_pmu_0_rc_1/event=0x1,dst_addr_base=0x10000,dst_addr_mask=0xFFF00,dst_addr_en=0x1/ + +CPU Memory (CMEM) Latency PMU +----------------------------- + +This PMU monitors latency events of memory read requests from the edge of the +Unified Coherence Fabric (UCF) to local CPU DRAM: + + * RD_REQ counters: count read requests (32B per request). + * RD_CUM_OUTS counters: accumulated outstanding request counter, which tracks + how many cycles the read requests are in flight. + * CYCLES counter: counts the number of elapsed cycles. + +The average latency is calculated as:: + + FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS + AVG_LATENCY_IN_CYCLES = RD_CUM_OUTS / RD_REQ + AVERAGE_LATENCY_IN_NS = AVG_LATENCY_IN_CYCLES / FREQ_IN_GHZ + +The events and configuration options of this PMU device are described in sysfs, +see /sys/bus/event_source/devices/nvidia_cmem_latency_pmu_<socket-id>. + +Example usage:: + + perf stat -a -e '{nvidia_cmem_latency_pmu_0/rd_req/,nvidia_cmem_latency_pmu_0/rd_cum_outs/,nvidia_cmem_latency_pmu_0/cycles/}' + +NVLink-C2C PMU +-------------- + +This PMU monitors latency events of memory read/write requests that pass through +the NVIDIA Chip-to-Chip (C2C) interface. Bandwidth events are not available +in this PMU, unlike the C2C PMU in Grace (Tegra241 SoC). + +The events and configuration options of this PMU device are available in sysfs, +see /sys/bus/event_source/devices/nvidia_nvlink_c2c_pmu_<socket-id>. + +The list of events: + + * IN_RD_CUM_OUTS: accumulated outstanding request (in cycles) of incoming read requests. + * IN_RD_REQ: the number of incoming read requests. + * IN_WR_CUM_OUTS: accumulated outstanding request (in cycles) of incoming write requests. 
+ * IN_WR_REQ: the number of incoming write requests. + * OUT_RD_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing read requests. + * OUT_RD_REQ: the number of outgoing read requests. + * OUT_WR_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing write requests. + * OUT_WR_REQ: the number of outgoing write requests. + * CYCLES: NVLink-C2C interface cycle counts. + +The incoming events count the reads/writes from remote device to the SoC. +The outgoing events count the reads/writes from the SoC to remote device. + +The sysfs /sys/bus/event_source/devices/nvidia_nvlink_c2c_pmu_<socket-id>/peer +contains the information about the connected device. + +When the C2C interface is connected to GPU(s), the user can use the +"gpu_mask" parameter to filter traffic to/from specific GPU(s). Each bit represents the GPU +index, e.g. "gpu_mask=0x1" corresponds to GPU 0 and "gpu_mask=0x3" is for GPU 0 and 1. +The PMU will monitor all GPUs by default if not specified. + +When connected to another SoC, only the read events are available. 
+ +The events can be used to calculate the average latency of the read/write requests:: + + C2C_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS + + IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ + IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ + + IN_WR_AVG_LATENCY_IN_CYCLES = IN_WR_CUM_OUTS / IN_WR_REQ + IN_WR_AVG_LATENCY_IN_NS = IN_WR_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ + + OUT_RD_AVG_LATENCY_IN_CYCLES = OUT_RD_CUM_OUTS / OUT_RD_REQ + OUT_RD_AVG_LATENCY_IN_NS = OUT_RD_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ + + OUT_WR_AVG_LATENCY_IN_CYCLES = OUT_WR_CUM_OUTS / OUT_WR_REQ + OUT_WR_AVG_LATENCY_IN_NS = OUT_WR_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ + +Example usage: + + * Count incoming traffic from all GPUs connected via NVLink-C2C:: + + perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_req/ + + * Count incoming traffic from GPU 0 connected via NVLink-C2C:: + + perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_cum_outs,gpu_mask=0x1/ + + * Count incoming traffic from GPU 1 connected via NVLink-C2C:: + + perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_cum_outs,gpu_mask=0x2/ + + * Count outgoing traffic to all GPUs connected via NVLink-C2C:: + + perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_req/ + + * Count outgoing traffic to GPU 0 connected via NVLink-C2C:: + + perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_cum_outs,gpu_mask=0x1/ + + * Count outgoing traffic to GPU 1 connected via NVLink-C2C:: + + perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_cum_outs,gpu_mask=0x2/ + +NV-CLink PMU +------------ + +This PMU monitors latency events of memory read requests that pass through +the NV-CLINK interface. Bandwidth events are not available in this PMU. +In Tegra410 SoC, the NV-CLink interface is used to connect to another Tegra410 +SoC and this PMU only counts read traffic. + +The events and configuration options of this PMU device are available in sysfs, +see /sys/bus/event_source/devices/nvidia_nvclink_pmu_<socket-id>. 
+ +The list of events: + + * IN_RD_CUM_OUTS: accumulated outstanding request (in cycles) of incoming read requests. + * IN_RD_REQ: the number of incoming read requests. + * OUT_RD_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing read requests. + * OUT_RD_REQ: the number of outgoing read requests. + * CYCLES: NV-CLINK interface cycle counts. + +The incoming events count the reads from remote device to the SoC. +The outgoing events count the reads from the SoC to remote device. + +The events can be used to calculate the average latency of the read requests:: + + CLINK_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS + + IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ + IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / CLINK_FREQ_IN_GHZ + + OUT_RD_AVG_LATENCY_IN_CYCLES = OUT_RD_CUM_OUTS / OUT_RD_REQ + OUT_RD_AVG_LATENCY_IN_NS = OUT_RD_AVG_LATENCY_IN_CYCLES / CLINK_FREQ_IN_GHZ + +Example usage: + + * Count incoming read traffic from remote SoC connected via NV-CLINK:: + + perf stat -a -e nvidia_nvclink_pmu_0/in_rd_req/ + + * Count outgoing read traffic to remote SoC connected via NV-CLINK:: + + perf stat -a -e nvidia_nvclink_pmu_0/out_rd_req/ + +NV-DLink PMU +------------ + +This PMU monitors latency events of memory read requests that pass through +the NV-DLINK interface. Bandwidth events are not available in this PMU. +In Tegra410 SoC, this PMU only counts CXL memory read traffic. + +The events and configuration options of this PMU device are available in sysfs, +see /sys/bus/event_source/devices/nvidia_nvdlink_pmu_<socket-id>. + +The list of events: + + * IN_RD_CUM_OUTS: accumulated outstanding read requests (in cycles) to CXL memory. + * IN_RD_REQ: the number of read requests to CXL memory. + * CYCLES: NV-DLINK interface cycle counts. 
+ +The events can be used to calculate the average latency of the read requests:: + + DLINK_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS + + IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ + IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / DLINK_FREQ_IN_GHZ + +Example usage: + + * Count read events to CXL memory:: + + perf stat -a -e '{nvidia_nvdlink_pmu_0/in_rd_req/,nvidia_nvdlink_pmu_0/in_rd_cum_outs/}' diff --git a/Documentation/arch/arm64/index.rst b/Documentation/arch/arm64/index.rst index af52edc8c0ac..98052b4ef4a1 100644 --- a/Documentation/arch/arm64/index.rst +++ b/Documentation/arch/arm64/index.rst @@ -23,6 +23,7 @@ ARM64 Architecture memory memory-tagging-extension mops + mpam perf pointer-authentication ptdump diff --git a/Documentation/arch/arm64/mpam.rst b/Documentation/arch/arm64/mpam.rst new file mode 100644 index 000000000000..570f51a8d4eb --- /dev/null +++ b/Documentation/arch/arm64/mpam.rst @@ -0,0 +1,72 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==== +MPAM +==== + +What is MPAM +============ +MPAM (Memory System Resource Partitioning and Monitoring) is a feature in the CPUs and memory +system components such as the caches or memory controllers that allows memory +traffic to be labelled, partitioned and monitored. + +Traffic is labelled by the CPU, based on the control or monitor group the +current task is assigned to using resctrl. Partitioning policy can be set +using the schemata file in resctrl, and monitor values read via resctrl. +See Documentation/filesystems/resctrl.rst for more details. + +This allows tasks that share memory system resources, such as caches, to be +isolated from each other according to the partitioning policy (so-called noisy +neighbours). + +Supported Platforms +=================== +Use of this feature requires CPU support, support in the memory system +components, and a description from firmware of where the MPAM device controls +are in the MMIO address space (e.g. the 'MPAM' ACPI table). 
+ +The MMIO device that provides MPAM controls/monitors for a memory system +component is called a memory system component (MSC). + +Because the user interface to MPAM is via resctrl, only MPAM features that are +compatible with resctrl can be exposed to user-space. + +MSC are considered as a group based on the topology. MSC that correspond with +the L3 cache are considered together; it is not possible to mix MSC between L2 +and L3 to 'cover' a resctrl schema. + +The supported features are: + +* Cache portion bitmap controls (CPOR) on the L2 or L3 caches. To expose + CPOR at L2 or L3, every CPU must have a corresponding CPU cache at this + level that also supports the feature. Mismatched big/little platforms are + not supported as resctrl's controls would then also depend on task + placement. + +* Memory bandwidth maximum controls (MBW_MAX) on or after the L3 cache. + resctrl uses the L3 cache-id to identify where the memory bandwidth + control is applied. For this reason the platform must have an L3 cache + with cache-id's supplied by firmware. (It doesn't need to support MPAM.) + + To be exported as the 'MB' schema, the topology of the group of MSC chosen + must match the topology of the L3 cache so that the cache-id's can be + repainted. For example: Platforms with Memory bandwidth maximum controls + on CPU-less NUMA nodes cannot expose the 'MB' schema to resctrl as these + nodes do not have a corresponding L3 cache. If the memory bandwidth + control is on the memory rather than the L3 then there must be a single + global L3 as otherwise it is unknown which L3 the traffic came from. There + must be no caches between the L3 and the memory so that the two ends of + the path have equivalent traffic. + + When the MPAM driver finds multiple groups of MSC it can use for the 'MB' + schema, it prefers the group closest to the L3 cache. 
+ +* Cache Storage Usage (CSU) counters can expose the 'llc_occupancy' provided + there is at least one CSU monitor on each MSC that makes up the L3 group. + Exposing CSU counters from other caches or devices is not supported. + +Reporting Bugs +============== +If you are not seeing the counters or controls you expect please share the +debug messages produced when enabling dynamic debug and booting with: +dyndbg="file mpam_resctrl.c +pl" diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index 4c300caad901..65ed6ea33751 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -214,6 +214,9 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | ARM | SI L1 | #4311569 | ARM64_ERRATUM_4311569 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | CMN-650 | #3642720 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ | Broadcom | Brahma-B53 | N/A | ARM64_ERRATUM_845719 | +----------------+-----------------+-----------------+-----------------------------+ | Broadcom | Brahma-B53 | N/A | ARM64_ERRATUM_843419 | @@ -247,6 +250,12 @@ stable kernels. 
+----------------+-----------------+-----------------+-----------------------------+ | NVIDIA | T241 GICv3/4.x | T241-FABRIC-4 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-1 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-4 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-6 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h index 2ec0e5e83fc9..ecfede0c0348 100644 --- a/arch/arm/include/asm/arm_pmuv3.h +++ b/arch/arm/include/asm/arm_pmuv3.h @@ -238,6 +238,13 @@ static inline void kvm_vcpu_pmu_resync_el0(void) {} static inline bool pmuv3_implemented(int pmuver) { + /* + * PMUVer follows the standard ID scheme for an unsigned field with the + * exception of 0xF (IMP_DEF) which is treated specially and implies + * FEAT_PMUv3 is not implemented. + * + * See DDI0487L.a D24.1.3.2 for more details. 
+ */ return !(pmuver == ARMV8_PMU_DFR_VER_IMP_DEF || pmuver == ARMV8_PMU_DFR_VER_NI); } diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 4de630e398ca..1cf37b30b861 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -61,32 +61,6 @@ config ARM64 select ARCH_HAVE_ELF_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_HAVE_TRACE_MMIO_ACCESS - select ARCH_INLINE_READ_LOCK if !PREEMPTION - select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION - select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPTION - select ARCH_INLINE_READ_LOCK_IRQSAVE if !PREEMPTION - select ARCH_INLINE_READ_UNLOCK if !PREEMPTION - select ARCH_INLINE_READ_UNLOCK_BH if !PREEMPTION - select ARCH_INLINE_READ_UNLOCK_IRQ if !PREEMPTION - select ARCH_INLINE_READ_UNLOCK_IRQRESTORE if !PREEMPTION - select ARCH_INLINE_WRITE_LOCK if !PREEMPTION - select ARCH_INLINE_WRITE_LOCK_BH if !PREEMPTION - select ARCH_INLINE_WRITE_LOCK_IRQ if !PREEMPTION - select ARCH_INLINE_WRITE_LOCK_IRQSAVE if !PREEMPTION - select ARCH_INLINE_WRITE_UNLOCK if !PREEMPTION - select ARCH_INLINE_WRITE_UNLOCK_BH if !PREEMPTION - select ARCH_INLINE_WRITE_UNLOCK_IRQ if !PREEMPTION - select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE if !PREEMPTION - select ARCH_INLINE_SPIN_TRYLOCK if !PREEMPTION - select ARCH_INLINE_SPIN_TRYLOCK_BH if !PREEMPTION - select ARCH_INLINE_SPIN_LOCK if !PREEMPTION - select ARCH_INLINE_SPIN_LOCK_BH if !PREEMPTION - select ARCH_INLINE_SPIN_LOCK_IRQ if !PREEMPTION - select ARCH_INLINE_SPIN_LOCK_IRQSAVE if !PREEMPTION - select ARCH_INLINE_SPIN_UNLOCK if !PREEMPTION - select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPTION - select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION - select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION select ARCH_KEEP_MEMBLOCK select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_USE_CMPXCHG_LOCKREF @@ -2009,8 +1983,8 @@ config ARM64_TLB_RANGE config ARM64_MPAM bool "Enable support for MPAM" - select ARM64_MPAM_DRIVER if EXPERT # does nothing yet - select ACPI_MPAM if ACPI + select ARM64_MPAM_DRIVER + 
select ARCH_HAS_CPU_RESCTRL help Memory System Resource Partitioning and Monitoring (MPAM) is an optional extension to the Arm architecture that allows each @@ -2032,6 +2006,8 @@ config ARM64_MPAM MPAM is exposed to user-space via the resctrl pseudo filesystem. + This option enables the extra context switch code. + endmenu # "ARMv8.4 architectural features" menu "ARMv8.5 architectural features" @@ -2208,6 +2184,26 @@ config ARM64_GCS endmenu # "ARMv9.4 architectural features" +config AS_HAS_LSUI + def_bool $(as-instr,.arch_extension lsui) + help + Supported by LLVM 20+ and binutils 2.45+. + +menu "ARMv9.6 architectural features" + +config ARM64_LSUI + bool "Support Unprivileged Load Store Instructions (LSUI)" + default y + depends on AS_HAS_LSUI && !CPU_BIG_ENDIAN + help + The Unprivileged Load Store Instructions (LSUI) provide + variants of load/store instructions that access user-space memory + from the kernel without clearing the PSTATE.PAN bit. + + This feature is supported by LLVM 20+ and binutils 2.45+. + +endmenu # "ARMv9.6 architectural features" + config ARM64_SVE bool "ARM Scalable Vector Extension support" default y @@ -2365,7 +2361,7 @@ config CMDLINE default "" help Provide a set of default command-line options at build time by - entering them here. As a minimum, you should specify the the + entering them here. As a minimum, you should specify the root device (e.g. root=/dev/nfs).
choice diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index 9148f5a31968..12aa6a283249 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -15,7 +15,7 @@ #ifdef CONFIG_ARM64_SW_TTBR0_PAN .macro __uaccess_ttbr0_disable, tmp1 mrs \tmp1, ttbr1_el1 // swapper_pg_dir - bic \tmp1, \tmp1, #TTBR_ASID_MASK + bic \tmp1, \tmp1, #TTBRx_EL1_ASID_MASK sub \tmp1, \tmp1, #RESERVED_SWAPPER_OFFSET // reserved_pg_dir msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 add \tmp1, \tmp1, #RESERVED_SWAPPER_OFFSET diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 177c691914f8..6e3da333442e 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -71,6 +71,8 @@ cpucap_is_possible(const unsigned int cap) return true; case ARM64_HAS_PMUV3: return IS_ENABLED(CONFIG_HW_PERF_EVENTS); + case ARM64_HAS_LSUI: + return IS_ENABLED(CONFIG_ARM64_LSUI); } return true; diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 85f4c1615472..4d15071a4f3f 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -513,7 +513,8 @@ check_override id_aa64pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT, .Linit_mpam_\@, .Lskip_mpam_\@, x1, x2 .Linit_mpam_\@: - msr_s SYS_MPAM2_EL2, xzr // use the default partition + mov x0, #MPAM2_EL2_EnMPAMSM_MASK + msr_s SYS_MPAM2_EL2, x0 // use the default partition, // and disable lower traps mrs_s x0, SYS_MPAMIDR_EL1 tbz x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@ // skip if no MPAMHCR reg diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index bc06691d2062..d1d2ff9d323a 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -9,71 +9,292 @@ #include #include +#include #define FUTEX_MAX_LOOPS 128 /* What's the largest number you can think of? 
*/ -#define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \ -do { \ +#define LLSC_FUTEX_ATOMIC_OP(op, insn) \ +static __always_inline int \ +__llsc_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval) \ +{ \ unsigned int loops = FUTEX_MAX_LOOPS; \ + int ret, oldval, newval; \ \ uaccess_enable_privileged(); \ - asm volatile( \ -" prfm pstl1strm, %2\n" \ -"1: ldxr %w1, %2\n" \ + asm volatile("// __llsc_futex_atomic_" #op "\n" \ +" prfm pstl1strm, %[uaddr]\n" \ +"1: ldxr %w[oldval], %[uaddr]\n" \ insn "\n" \ -"2: stlxr %w0, %w3, %2\n" \ -" cbz %w0, 3f\n" \ -" sub %w4, %w4, %w0\n" \ -" cbnz %w4, 1b\n" \ -" mov %w0, %w6\n" \ +"2: stlxr %w[ret], %w[newval], %[uaddr]\n" \ +" cbz %w[ret], 3f\n" \ +" sub %w[loops], %w[loops], %w[ret]\n" \ +" cbnz %w[loops], 1b\n" \ +" mov %w[ret], %w[err]\n" \ "3:\n" \ " dmb ish\n" \ - _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0) \ - _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w0) \ - : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp), \ - "+r" (loops) \ - : "r" (oparg), "Ir" (-EAGAIN) \ + _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret]) \ + _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret]) \ + : [ret] "=&r" (ret), [oldval] "=&r" (oldval), \ + [uaddr] "+Q" (*uaddr), [newval] "=&r" (newval), \ + [loops] "+r" (loops) \ + : [oparg] "r" (oparg), [err] "Ir" (-EAGAIN) \ : "memory"); \ uaccess_disable_privileged(); \ -} while (0) + \ + if (!ret) \ + *oval = oldval; \ + \ + return ret; \ +} + +LLSC_FUTEX_ATOMIC_OP(add, "add %w[newval], %w[oldval], %w[oparg]") +LLSC_FUTEX_ATOMIC_OP(or, "orr %w[newval], %w[oldval], %w[oparg]") +LLSC_FUTEX_ATOMIC_OP(and, "and %w[newval], %w[oldval], %w[oparg]") +LLSC_FUTEX_ATOMIC_OP(eor, "eor %w[newval], %w[oldval], %w[oparg]") +LLSC_FUTEX_ATOMIC_OP(set, "mov %w[newval], %w[oparg]") + +static __always_inline int +__llsc_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) +{ + int ret = 0; + unsigned int loops = FUTEX_MAX_LOOPS; + u32 val, tmp; + + uaccess_enable_privileged(); + asm 
volatile("//__llsc_futex_cmpxchg\n" +" prfm pstl1strm, %[uaddr]\n" +"1: ldxr %w[curval], %[uaddr]\n" +" eor %w[tmp], %w[curval], %w[oldval]\n" +" cbnz %w[tmp], 4f\n" +"2: stlxr %w[tmp], %w[newval], %[uaddr]\n" +" cbz %w[tmp], 3f\n" +" sub %w[loops], %w[loops], %w[tmp]\n" +" cbnz %w[loops], 1b\n" +" mov %w[ret], %w[err]\n" +"3:\n" +" dmb ish\n" +"4:\n" + _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w[ret]) + _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w[ret]) + : [ret] "+r" (ret), [curval] "=&r" (val), + [uaddr] "+Q" (*uaddr), [tmp] "=&r" (tmp), + [loops] "+r" (loops) + : [oldval] "r" (oldval), [newval] "r" (newval), + [err] "Ir" (-EAGAIN) + : "memory"); + uaccess_disable_privileged(); + + if (!ret) + *oval = val; + + return ret; +} + +#ifdef CONFIG_ARM64_LSUI + +/* + * Wrap LSUI instructions with uaccess_ttbr0_enable()/disable(), as + * PAN toggling is not required. + */ + +#define LSUI_FUTEX_ATOMIC_OP(op, asm_op) \ +static __always_inline int \ +__lsui_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval) \ +{ \ + int ret = 0; \ + int oldval; \ + \ + uaccess_ttbr0_enable(); \ + \ + asm volatile("// __lsui_futex_atomic_" #op "\n" \ + __LSUI_PREAMBLE \ +"1: " #asm_op "al %w[oparg], %w[oldval], %[uaddr]\n" \ +"2:\n" \ + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret]) \ + : [ret] "+r" (ret), [uaddr] "+Q" (*uaddr), \ + [oldval] "=r" (oldval) \ + : [oparg] "r" (oparg) \ + : "memory"); \ + \ + uaccess_ttbr0_disable(); \ + \ + if (!ret) \ + *oval = oldval; \ + return ret; \ +} + +LSUI_FUTEX_ATOMIC_OP(add, ldtadd) +LSUI_FUTEX_ATOMIC_OP(or, ldtset) +LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr) +LSUI_FUTEX_ATOMIC_OP(set, swpt) + +static __always_inline int +__lsui_cmpxchg64(u64 __user *uaddr, u64 *oldval, u64 newval) +{ + int ret = 0; + + uaccess_ttbr0_enable(); + + asm volatile("// __lsui_cmpxchg64\n" + __LSUI_PREAMBLE +"1: casalt %[oldval], %[newval], %[uaddr]\n" +"2:\n" + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret]) + : [ret] "+r" (ret), [uaddr] "+Q" (*uaddr), + [oldval] "+r" (*oldval) + : [newval] "r" 
(newval) + : "memory"); + + uaccess_ttbr0_disable(); + + return ret; +} + +static __always_inline int +__lsui_cmpxchg32(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) +{ + u64 __user *uaddr64; + bool futex_pos, other_pos; + u32 other, orig_other; + union { + u32 futex[2]; + u64 raw; + } oval64, orig64, nval64; + + uaddr64 = (u64 __user *)PTR_ALIGN_DOWN(uaddr, sizeof(u64)); + futex_pos = !IS_ALIGNED((unsigned long)uaddr, sizeof(u64)); + other_pos = !futex_pos; + + oval64.futex[futex_pos] = oldval; + if (get_user(oval64.futex[other_pos], (u32 __user *)uaddr64 + other_pos)) + return -EFAULT; + + orig64.raw = oval64.raw; + + nval64.futex[futex_pos] = newval; + nval64.futex[other_pos] = oval64.futex[other_pos]; + + if (__lsui_cmpxchg64(uaddr64, &oval64.raw, nval64.raw)) + return -EFAULT; + + oldval = oval64.futex[futex_pos]; + other = oval64.futex[other_pos]; + orig_other = orig64.futex[other_pos]; + + if (other != orig_other) + return -EAGAIN; + + *oval = oldval; + + return 0; +} + +static __always_inline int +__lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval) +{ + /* + * Undo the bitwise negation applied to the oparg passed from + * arch_futex_atomic_op_inuser() with FUTEX_OP_ANDN. + */ + return __lsui_futex_atomic_andnot(~oparg, uaddr, oval); +} + +static __always_inline int +__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval) +{ + u32 oldval, newval, val; + int ret, i; + + if (get_user(oldval, uaddr)) + return -EFAULT; + + /* + * there are no ldteor/stteor instructions... 
+ */ + for (i = 0; i < FUTEX_MAX_LOOPS; i++) { + newval = oldval ^ oparg; + + ret = __lsui_cmpxchg32(uaddr, oldval, newval, &val); + switch (ret) { + case -EFAULT: + return ret; + case -EAGAIN: + continue; + } + + if (val == oldval) { + *oval = val; + return 0; + } + + oldval = val; + } + + return -EAGAIN; +} + +static __always_inline int +__lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) +{ + /* + * Callers of futex_atomic_cmpxchg_inatomic() already retry on + * -EAGAIN, no need for another loop of max retries. + */ + return __lsui_cmpxchg32(uaddr, oldval, newval, oval); +} +#endif /* CONFIG_ARM64_LSUI */ + + +#define FUTEX_ATOMIC_OP(op) \ +static __always_inline int \ +__futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval) \ +{ \ + return __lsui_llsc_body(futex_atomic_##op, oparg, uaddr, oval); \ +} + +FUTEX_ATOMIC_OP(add) +FUTEX_ATOMIC_OP(or) +FUTEX_ATOMIC_OP(and) +FUTEX_ATOMIC_OP(eor) +FUTEX_ATOMIC_OP(set) + +static __always_inline int +__futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval) +{ + return __lsui_llsc_body(futex_cmpxchg, uaddr, oldval, newval, oval); +} static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *_uaddr) { - int oldval = 0, ret, tmp; - u32 __user *uaddr = __uaccess_mask_ptr(_uaddr); + int ret; + u32 __user *uaddr; if (!access_ok(_uaddr, sizeof(u32))) return -EFAULT; + uaddr = __uaccess_mask_ptr(_uaddr); + switch (op) { case FUTEX_OP_SET: - __futex_atomic_op("mov %w3, %w5", - ret, oldval, uaddr, tmp, oparg); + ret = __futex_atomic_set(oparg, uaddr, oval); break; case FUTEX_OP_ADD: - __futex_atomic_op("add %w3, %w1, %w5", - ret, oldval, uaddr, tmp, oparg); + ret = __futex_atomic_add(oparg, uaddr, oval); break; case FUTEX_OP_OR: - __futex_atomic_op("orr %w3, %w1, %w5", - ret, oldval, uaddr, tmp, oparg); + ret = __futex_atomic_or(oparg, uaddr, oval); break; case FUTEX_OP_ANDN: - __futex_atomic_op("and %w3, %w1, %w5", - ret, oldval, uaddr, tmp, ~oparg); + ret = 
__futex_atomic_and(~oparg, uaddr, oval); break; case FUTEX_OP_XOR: - __futex_atomic_op("eor %w3, %w1, %w5", - ret, oldval, uaddr, tmp, oparg); + ret = __futex_atomic_eor(oparg, uaddr, oval); break; default: ret = -ENOSYS; } - if (!ret) - *oval = oldval; - return ret; } @@ -81,40 +302,14 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr, u32 oldval, u32 newval) { - int ret = 0; - unsigned int loops = FUTEX_MAX_LOOPS; - u32 val, tmp; u32 __user *uaddr; if (!access_ok(_uaddr, sizeof(u32))) return -EFAULT; uaddr = __uaccess_mask_ptr(_uaddr); - uaccess_enable_privileged(); - asm volatile("// futex_atomic_cmpxchg_inatomic\n" -" prfm pstl1strm, %2\n" -"1: ldxr %w1, %2\n" -" sub %w3, %w1, %w5\n" -" cbnz %w3, 4f\n" -"2: stlxr %w3, %w6, %2\n" -" cbz %w3, 3f\n" -" sub %w4, %w4, %w3\n" -" cbnz %w4, 1b\n" -" mov %w0, %w7\n" -"3:\n" -" dmb ish\n" -"4:\n" - _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0) - _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0) - : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops) - : "r" (oldval), "r" (newval), "Ir" (-EAGAIN) - : "memory"); - uaccess_disable_privileged(); - if (!ret) - *uval = val; - - return ret; + return __futex_cmpxchg(uaddr, oldval, newval, uval); } #endif /* __ASM_FUTEX_H */ diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index e6f8ff3cc630..d038ff14d16c 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -71,23 +71,23 @@ static inline void __flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long stride, - bool last_level) + tlbf_t flags) { switch (stride) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: - __flush_tlb_range(vma, start, end, PUD_SIZE, last_level, 1); + __flush_tlb_range(vma, start, end, PUD_SIZE, 1, flags); break; #endif case CONT_PMD_SIZE: case PMD_SIZE: - __flush_tlb_range(vma, start, end, PMD_SIZE, last_level, 2); + __flush_tlb_range(vma, start, end, PMD_SIZE, 2, 
flags); break; case CONT_PTE_SIZE: - __flush_tlb_range(vma, start, end, PAGE_SIZE, last_level, 3); + __flush_tlb_range(vma, start, end, PAGE_SIZE, 3, flags); break; default: - __flush_tlb_range(vma, start, end, PAGE_SIZE, last_level, TLBI_TTL_UNKNOWN); + __flush_tlb_range(vma, start, end, PAGE_SIZE, TLBI_TTL_UNKNOWN, flags); } } @@ -98,7 +98,7 @@ static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, { unsigned long stride = huge_page_size(hstate_vma(vma)); - __flush_hugetlb_tlb_range(vma, start, end, stride, false); + __flush_hugetlb_tlb_range(vma, start, end, stride, TLBF_NONE); } #endif /* __ASM_HUGETLB_H */ diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 72ea4bda79f3..abe8218b2325 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -60,126 +60,10 @@ * of KERNEL_HWCAP_{feature}. */ #define __khwcap_feature(x) const_ilog2(HWCAP_ ## x) -#define KERNEL_HWCAP_FP __khwcap_feature(FP) -#define KERNEL_HWCAP_ASIMD __khwcap_feature(ASIMD) -#define KERNEL_HWCAP_EVTSTRM __khwcap_feature(EVTSTRM) -#define KERNEL_HWCAP_AES __khwcap_feature(AES) -#define KERNEL_HWCAP_PMULL __khwcap_feature(PMULL) -#define KERNEL_HWCAP_SHA1 __khwcap_feature(SHA1) -#define KERNEL_HWCAP_SHA2 __khwcap_feature(SHA2) -#define KERNEL_HWCAP_CRC32 __khwcap_feature(CRC32) -#define KERNEL_HWCAP_ATOMICS __khwcap_feature(ATOMICS) -#define KERNEL_HWCAP_FPHP __khwcap_feature(FPHP) -#define KERNEL_HWCAP_ASIMDHP __khwcap_feature(ASIMDHP) -#define KERNEL_HWCAP_CPUID __khwcap_feature(CPUID) -#define KERNEL_HWCAP_ASIMDRDM __khwcap_feature(ASIMDRDM) -#define KERNEL_HWCAP_JSCVT __khwcap_feature(JSCVT) -#define KERNEL_HWCAP_FCMA __khwcap_feature(FCMA) -#define KERNEL_HWCAP_LRCPC __khwcap_feature(LRCPC) -#define KERNEL_HWCAP_DCPOP __khwcap_feature(DCPOP) -#define KERNEL_HWCAP_SHA3 __khwcap_feature(SHA3) -#define KERNEL_HWCAP_SM3 __khwcap_feature(SM3) -#define KERNEL_HWCAP_SM4 __khwcap_feature(SM4) -#define KERNEL_HWCAP_ASIMDDP 
__khwcap_feature(ASIMDDP) -#define KERNEL_HWCAP_SHA512 __khwcap_feature(SHA512) -#define KERNEL_HWCAP_SVE __khwcap_feature(SVE) -#define KERNEL_HWCAP_ASIMDFHM __khwcap_feature(ASIMDFHM) -#define KERNEL_HWCAP_DIT __khwcap_feature(DIT) -#define KERNEL_HWCAP_USCAT __khwcap_feature(USCAT) -#define KERNEL_HWCAP_ILRCPC __khwcap_feature(ILRCPC) -#define KERNEL_HWCAP_FLAGM __khwcap_feature(FLAGM) -#define KERNEL_HWCAP_SSBS __khwcap_feature(SSBS) -#define KERNEL_HWCAP_SB __khwcap_feature(SB) -#define KERNEL_HWCAP_PACA __khwcap_feature(PACA) -#define KERNEL_HWCAP_PACG __khwcap_feature(PACG) -#define KERNEL_HWCAP_GCS __khwcap_feature(GCS) -#define KERNEL_HWCAP_CMPBR __khwcap_feature(CMPBR) -#define KERNEL_HWCAP_FPRCVT __khwcap_feature(FPRCVT) -#define KERNEL_HWCAP_F8MM8 __khwcap_feature(F8MM8) -#define KERNEL_HWCAP_F8MM4 __khwcap_feature(F8MM4) -#define KERNEL_HWCAP_SVE_F16MM __khwcap_feature(SVE_F16MM) -#define KERNEL_HWCAP_SVE_ELTPERM __khwcap_feature(SVE_ELTPERM) -#define KERNEL_HWCAP_SVE_AES2 __khwcap_feature(SVE_AES2) -#define KERNEL_HWCAP_SVE_BFSCALE __khwcap_feature(SVE_BFSCALE) -#define KERNEL_HWCAP_SVE2P2 __khwcap_feature(SVE2P2) -#define KERNEL_HWCAP_SME2P2 __khwcap_feature(SME2P2) -#define KERNEL_HWCAP_SME_SBITPERM __khwcap_feature(SME_SBITPERM) -#define KERNEL_HWCAP_SME_AES __khwcap_feature(SME_AES) -#define KERNEL_HWCAP_SME_SFEXPA __khwcap_feature(SME_SFEXPA) -#define KERNEL_HWCAP_SME_STMOP __khwcap_feature(SME_STMOP) -#define KERNEL_HWCAP_SME_SMOP4 __khwcap_feature(SME_SMOP4) - #define __khwcap2_feature(x) (const_ilog2(HWCAP2_ ## x) + 64) -#define KERNEL_HWCAP_DCPODP __khwcap2_feature(DCPODP) -#define KERNEL_HWCAP_SVE2 __khwcap2_feature(SVE2) -#define KERNEL_HWCAP_SVEAES __khwcap2_feature(SVEAES) -#define KERNEL_HWCAP_SVEPMULL __khwcap2_feature(SVEPMULL) -#define KERNEL_HWCAP_SVEBITPERM __khwcap2_feature(SVEBITPERM) -#define KERNEL_HWCAP_SVESHA3 __khwcap2_feature(SVESHA3) -#define KERNEL_HWCAP_SVESM4 __khwcap2_feature(SVESM4) -#define KERNEL_HWCAP_FLAGM2 
__khwcap2_feature(FLAGM2) -#define KERNEL_HWCAP_FRINT __khwcap2_feature(FRINT) -#define KERNEL_HWCAP_SVEI8MM __khwcap2_feature(SVEI8MM) -#define KERNEL_HWCAP_SVEF32MM __khwcap2_feature(SVEF32MM) -#define KERNEL_HWCAP_SVEF64MM __khwcap2_feature(SVEF64MM) -#define KERNEL_HWCAP_SVEBF16 __khwcap2_feature(SVEBF16) -#define KERNEL_HWCAP_I8MM __khwcap2_feature(I8MM) -#define KERNEL_HWCAP_BF16 __khwcap2_feature(BF16) -#define KERNEL_HWCAP_DGH __khwcap2_feature(DGH) -#define KERNEL_HWCAP_RNG __khwcap2_feature(RNG) -#define KERNEL_HWCAP_BTI __khwcap2_feature(BTI) -#define KERNEL_HWCAP_MTE __khwcap2_feature(MTE) -#define KERNEL_HWCAP_ECV __khwcap2_feature(ECV) -#define KERNEL_HWCAP_AFP __khwcap2_feature(AFP) -#define KERNEL_HWCAP_RPRES __khwcap2_feature(RPRES) -#define KERNEL_HWCAP_MTE3 __khwcap2_feature(MTE3) -#define KERNEL_HWCAP_SME __khwcap2_feature(SME) -#define KERNEL_HWCAP_SME_I16I64 __khwcap2_feature(SME_I16I64) -#define KERNEL_HWCAP_SME_F64F64 __khwcap2_feature(SME_F64F64) -#define KERNEL_HWCAP_SME_I8I32 __khwcap2_feature(SME_I8I32) -#define KERNEL_HWCAP_SME_F16F32 __khwcap2_feature(SME_F16F32) -#define KERNEL_HWCAP_SME_B16F32 __khwcap2_feature(SME_B16F32) -#define KERNEL_HWCAP_SME_F32F32 __khwcap2_feature(SME_F32F32) -#define KERNEL_HWCAP_SME_FA64 __khwcap2_feature(SME_FA64) -#define KERNEL_HWCAP_WFXT __khwcap2_feature(WFXT) -#define KERNEL_HWCAP_EBF16 __khwcap2_feature(EBF16) -#define KERNEL_HWCAP_SVE_EBF16 __khwcap2_feature(SVE_EBF16) -#define KERNEL_HWCAP_CSSC __khwcap2_feature(CSSC) -#define KERNEL_HWCAP_RPRFM __khwcap2_feature(RPRFM) -#define KERNEL_HWCAP_SVE2P1 __khwcap2_feature(SVE2P1) -#define KERNEL_HWCAP_SME2 __khwcap2_feature(SME2) -#define KERNEL_HWCAP_SME2P1 __khwcap2_feature(SME2P1) -#define KERNEL_HWCAP_SME_I16I32 __khwcap2_feature(SME_I16I32) -#define KERNEL_HWCAP_SME_BI32I32 __khwcap2_feature(SME_BI32I32) -#define KERNEL_HWCAP_SME_B16B16 __khwcap2_feature(SME_B16B16) -#define KERNEL_HWCAP_SME_F16F16 __khwcap2_feature(SME_F16F16) -#define 
KERNEL_HWCAP_MOPS __khwcap2_feature(MOPS) -#define KERNEL_HWCAP_HBC __khwcap2_feature(HBC) -#define KERNEL_HWCAP_SVE_B16B16 __khwcap2_feature(SVE_B16B16) -#define KERNEL_HWCAP_LRCPC3 __khwcap2_feature(LRCPC3) -#define KERNEL_HWCAP_LSE128 __khwcap2_feature(LSE128) -#define KERNEL_HWCAP_FPMR __khwcap2_feature(FPMR) -#define KERNEL_HWCAP_LUT __khwcap2_feature(LUT) -#define KERNEL_HWCAP_FAMINMAX __khwcap2_feature(FAMINMAX) -#define KERNEL_HWCAP_F8CVT __khwcap2_feature(F8CVT) -#define KERNEL_HWCAP_F8FMA __khwcap2_feature(F8FMA) -#define KERNEL_HWCAP_F8DP4 __khwcap2_feature(F8DP4) -#define KERNEL_HWCAP_F8DP2 __khwcap2_feature(F8DP2) -#define KERNEL_HWCAP_F8E4M3 __khwcap2_feature(F8E4M3) -#define KERNEL_HWCAP_F8E5M2 __khwcap2_feature(F8E5M2) -#define KERNEL_HWCAP_SME_LUTV2 __khwcap2_feature(SME_LUTV2) -#define KERNEL_HWCAP_SME_F8F16 __khwcap2_feature(SME_F8F16) -#define KERNEL_HWCAP_SME_F8F32 __khwcap2_feature(SME_F8F32) -#define KERNEL_HWCAP_SME_SF8FMA __khwcap2_feature(SME_SF8FMA) -#define KERNEL_HWCAP_SME_SF8DP4 __khwcap2_feature(SME_SF8DP4) -#define KERNEL_HWCAP_SME_SF8DP2 __khwcap2_feature(SME_SF8DP2) -#define KERNEL_HWCAP_POE __khwcap2_feature(POE) - #define __khwcap3_feature(x) (const_ilog2(HWCAP3_ ## x) + 128) -#define KERNEL_HWCAP_MTE_FAR __khwcap3_feature(MTE_FAR) -#define KERNEL_HWCAP_MTE_STORE_ONLY __khwcap3_feature(MTE_STORE_ONLY) -#define KERNEL_HWCAP_LSFE __khwcap3_feature(LSFE) -#define KERNEL_HWCAP_LS64 __khwcap3_feature(LS64) + +#include "asm/kernel-hwcap.h" /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/asm/lsui.h b/arch/arm64/include/asm/lsui.h new file mode 100644 index 000000000000..8f0d81953eb6 --- /dev/null +++ b/arch/arm64/include/asm/lsui.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_LSUI_H +#define __ASM_LSUI_H + +#include +#include +#include +#include +#include + +#define __LSUI_PREAMBLE ".arch_extension lsui\n" + +#ifdef CONFIG_ARM64_LSUI + +#define 
__lsui_llsc_body(op, ...) \ +({ \ + alternative_has_cap_unlikely(ARM64_HAS_LSUI) ? \ + __lsui_##op(__VA_ARGS__) : __llsc_##op(__VA_ARGS__); \ +}) + +#else /* CONFIG_ARM64_LSUI */ + +#define __lsui_llsc_body(op, ...) __llsc_##op(__VA_ARGS__) + +#endif /* CONFIG_ARM64_LSUI */ + +#endif /* __ASM_LSUI_H */ diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 137a173df1ff..5e1211c540ab 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -10,20 +10,12 @@ #define MMCF_AARCH32 0x1 /* mm context flag for AArch32 executables */ #define USER_ASID_BIT 48 #define USER_ASID_FLAG (UL(1) << USER_ASID_BIT) -#define TTBR_ASID_MASK (UL(0xffff) << 48) #ifndef __ASSEMBLER__ #include #include -enum pgtable_type { - TABLE_PTE, - TABLE_PMD, - TABLE_PUD, - TABLE_P4D, -}; - typedef struct { atomic64_t id; #ifdef CONFIG_COMPAT @@ -112,5 +104,7 @@ void kpti_install_ng_mappings(void); static inline void kpti_install_ng_mappings(void) {} #endif +extern bool page_alloc_available; + #endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index cc80af59c69e..803b68758152 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -210,7 +210,8 @@ static inline void update_saved_ttbr0(struct task_struct *tsk, if (mm == &init_mm) ttbr = phys_to_ttbr(__pa_symbol(reserved_pg_dir)); else - ttbr = phys_to_ttbr(virt_to_phys(mm->pgd)) | ASID(mm) << 48; + ttbr = phys_to_ttbr(virt_to_phys(mm->pgd)) | + FIELD_PREP(TTBRx_EL1_ASID_MASK, ASID(mm)); WRITE_ONCE(task_thread_info(tsk)->ttbr0, ttbr); } diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h new file mode 100644 index 000000000000..70d396e7b6da --- /dev/null +++ b/arch/arm64/include/asm/mpam.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2025 Arm Ltd. 
*/ + +#ifndef __ASM__MPAM_H +#define __ASM__MPAM_H + +#include +#include +#include +#include +#include + +#include + +DECLARE_STATIC_KEY_FALSE(mpam_enabled); +DECLARE_PER_CPU(u64, arm64_mpam_default); +DECLARE_PER_CPU(u64, arm64_mpam_current); + +/* + * The value of the MPAM0_EL1 sysreg when a task is in resctrl's default group. + * This is used by the context switch code to use the resctrl CPU property + * instead. The value is modified when CDP is enabled/disabled by mounting + * the resctrl filesystem. + */ +extern u64 arm64_mpam_global_default; + +#ifdef CONFIG_ARM64_MPAM +static inline u64 __mpam_regval(u16 partid_d, u16 partid_i, u8 pmg_d, u8 pmg_i) +{ + return FIELD_PREP(MPAM0_EL1_PARTID_D, partid_d) | + FIELD_PREP(MPAM0_EL1_PARTID_I, partid_i) | + FIELD_PREP(MPAM0_EL1_PMG_D, pmg_d) | + FIELD_PREP(MPAM0_EL1_PMG_I, pmg_i); +} + +static inline void mpam_set_cpu_defaults(int cpu, u16 partid_d, u16 partid_i, + u8 pmg_d, u8 pmg_i) +{ + u64 default_val = __mpam_regval(partid_d, partid_i, pmg_d, pmg_i); + + WRITE_ONCE(per_cpu(arm64_mpam_default, cpu), default_val); +} + +/* + * The resctrl filesystem writes to the partid/pmg values for threads and CPUs, + * which may race with reads in mpam_thread_switch(). Ensure only one of the old + * or new values is used. Particular care should be taken with the pmg field as + * mpam_thread_switch() may read a partid and pmg that don't match, causing this + * value to be stored with cache allocations, despite being considered 'free' by + * resctrl.
+ */ +static inline u64 mpam_get_regval(struct task_struct *tsk) +{ + return READ_ONCE(task_thread_info(tsk)->mpam_partid_pmg); +} + +static inline void mpam_set_task_partid_pmg(struct task_struct *tsk, + u16 partid_d, u16 partid_i, + u8 pmg_d, u8 pmg_i) +{ + u64 regval = __mpam_regval(partid_d, partid_i, pmg_d, pmg_i); + + WRITE_ONCE(task_thread_info(tsk)->mpam_partid_pmg, regval); +} + +static inline void mpam_thread_switch(struct task_struct *tsk) +{ + u64 oldregval; + int cpu = smp_processor_id(); + u64 regval = mpam_get_regval(tsk); + + if (!static_branch_likely(&mpam_enabled)) + return; + + if (regval == READ_ONCE(arm64_mpam_global_default)) + regval = READ_ONCE(per_cpu(arm64_mpam_default, cpu)); + + oldregval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + if (oldregval == regval) + return; + + write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + if (system_supports_sme()) + write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), SYS_MPAMSM_EL1); + isb(); + + /* Synchronising the EL0 write is left until the ERET to EL0 */ + write_sysreg_s(regval, SYS_MPAM0_EL1); + + WRITE_ONCE(per_cpu(arm64_mpam_current, cpu), regval); +} +#else +static inline void mpam_thread_switch(struct task_struct *tsk) {} +#endif /* CONFIG_ARM64_MPAM */ + +#endif /* __ASM__MPAM_H */ diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 6d4a78b9dc3e..7f7b97e09996 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -252,6 +252,9 @@ static inline void mte_check_tfsr_entry(void) if (!kasan_hw_tags_enabled()) return; + if (!system_uses_mte_async_or_asymm_mode()) + return; + mte_check_tfsr_el1(); } @@ -260,6 +263,9 @@ static inline void mte_check_tfsr_exit(void) if (!kasan_hw_tags_enabled()) return; + if (!system_uses_mte_async_or_asymm_mode()) + return; + /* * The asynchronous faults are sync'ed automatically with * TFSR_EL1 on kernel entry but for exit an explicit dsb() diff --git 
a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index d49180bb7cb3..72f31800c703 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -223,8 +223,6 @@ */ #define S1_TABLE_AP (_AT(pmdval_t, 3) << 61) -#define TTBR_CNP_BIT (UL(1) << 0) - /* * TCR flags. */ @@ -287,9 +285,12 @@ #endif #ifdef CONFIG_ARM64_VA_BITS_52 +#define PTRS_PER_PGD_52_VA (UL(1) << (52 - PGDIR_SHIFT)) +#define PTRS_PER_PGD_48_VA (UL(1) << (48 - PGDIR_SHIFT)) +#define PTRS_PER_PGD_EXTRA (PTRS_PER_PGD_52_VA - PTRS_PER_PGD_48_VA) + /* Must be at least 64-byte aligned to prevent corruption of the TTBR */ -#define TTBR1_BADDR_4852_OFFSET (((UL(1) << (52 - PGDIR_SHIFT)) - \ - (UL(1) << (48 - PGDIR_SHIFT))) * 8) +#define TTBR1_BADDR_4852_OFFSET (PTRS_PER_PGD_EXTRA << PTDESC_ORDER) #endif #endif diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index f560e6420267..212ce1b02e15 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -25,6 +25,8 @@ */ #define PTE_PRESENT_INVALID (PTE_NG) /* only when !PTE_VALID */ +#define PTE_PRESENT_VALID_KERNEL (PTE_VALID | PTE_MAYBE_NG) + #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP #define PTE_UFFD_WP (_AT(pteval_t, 1) << 58) /* uffd-wp tracking */ #define PTE_SWP_UFFD_WP (_AT(pteval_t, 1) << 3) /* only for swp ptes */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b3e58735c49b..308e29e829b8 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -89,9 +89,9 @@ static inline void arch_leave_lazy_mmu_mode(void) /* Set stride and tlb_level in flush_*_tlb_range */ #define flush_pmd_tlb_range(vma, addr, end) \ - __flush_tlb_range(vma, addr, end, PMD_SIZE, false, 2) + __flush_tlb_range(vma, addr, end, PMD_SIZE, 2, TLBF_NONE) #define flush_pud_tlb_range(vma, addr, end) \ - __flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1) + 
__flush_tlb_range(vma, addr, end, PUD_SIZE, 1, TLBF_NONE) #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* @@ -101,10 +101,11 @@ static inline void arch_leave_lazy_mmu_mode(void) * entries exist. */ #define flush_tlb_fix_spurious_fault(vma, address, ptep) \ - local_flush_tlb_page_nonotify(vma, address) + __flush_tlb_page(vma, address, TLBF_NOBROADCAST | TLBF_NONOTIFY) -#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) \ - local_flush_tlb_page_nonotify(vma, address) +#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) \ + __flush_tlb_range(vma, address, address + PMD_SIZE, PMD_SIZE, 2, \ + TLBF_NOBROADCAST | TLBF_NONOTIFY | TLBF_NOWALKCACHE) /* * ZERO_PAGE is a global shared page that is always zero: used @@ -322,9 +323,11 @@ static inline pte_t pte_mknoncont(pte_t pte) return clear_pte_bit(pte, __pgprot(PTE_CONT)); } -static inline pte_t pte_mkvalid(pte_t pte) +static inline pte_t pte_mkvalid_k(pte_t pte) { - return set_pte_bit(pte, __pgprot(PTE_VALID)); + pte = clear_pte_bit(pte, __pgprot(PTE_PRESENT_INVALID)); + pte = set_pte_bit(pte, __pgprot(PTE_PRESENT_VALID_KERNEL)); + return pte; } static inline pte_t pte_mkinvalid(pte_t pte) @@ -594,6 +597,7 @@ static inline int pmd_protnone(pmd_t pmd) #define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) +#define pmd_mkvalid_k(pmd) pte_pmd(pte_mkvalid_k(pmd_pte(pmd))) #define pmd_mkinvalid(pmd) pte_pmd(pte_mkinvalid(pmd_pte(pmd))) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP #define pmd_uffd_wp(pmd) pte_uffd_wp(pmd_pte(pmd)) @@ -635,6 +639,8 @@ static inline pmd_t pmd_mkspecial(pmd_t pmd) #define pud_young(pud) pte_young(pud_pte(pud)) #define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) +#define pud_mkwrite_novma(pud) pte_pud(pte_mkwrite_novma(pud_pte(pud))) +#define pud_mkvalid_k(pud) pte_pud(pte_mkvalid_k(pud_pte(pud))) #define pud_write(pud) pte_write(pud_pte(pud)) static inline 
pud_t pud_mkhuge(pud_t pud) @@ -779,9 +785,13 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, #define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_TABLE) -#define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ - PMD_TYPE_SECT) -#define pmd_leaf(pmd) (pmd_present(pmd) && !pmd_table(pmd)) + +#define pmd_leaf pmd_leaf +static inline bool pmd_leaf(pmd_t pmd) +{ + return pmd_present(pmd) && !pmd_table(pmd); +} + #define pmd_bad(pmd) (!pmd_table(pmd)) #define pmd_leaf_size(pmd) (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE) @@ -799,11 +809,8 @@ static inline int pmd_trans_huge(pmd_t pmd) #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3 -static inline bool pud_sect(pud_t pud) { return false; } static inline bool pud_table(pud_t pud) { return true; } #else -#define pud_sect(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \ - PUD_TYPE_SECT) #define pud_table(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \ PUD_TYPE_TABLE) #endif @@ -873,7 +880,11 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) PUD_TYPE_TABLE) #define pud_present(pud) pte_present(pud_pte(pud)) #ifndef __PAGETABLE_PMD_FOLDED -#define pud_leaf(pud) (pud_present(pud) && !pud_table(pud)) +#define pud_leaf pud_leaf +static inline bool pud_leaf(pud_t pud) +{ + return pud_present(pud) && !pud_table(pud); +} #else #define pud_leaf(pud) false #endif @@ -1247,9 +1258,18 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) return pte_pmd(pte_modify(pmd_pte(pmd), newprot)); } -extern int __ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, - pte_t entry, int dirty); +extern int __ptep_set_access_flags_anysz(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty, + unsigned long pgsize); + +static inline int __ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + return 
__ptep_set_access_flags_anysz(vma, address, ptep, entry, dirty, + PAGE_SIZE); +} #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS @@ -1257,8 +1277,8 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { - return __ptep_set_access_flags(vma, address, (pte_t *)pmdp, - pmd_pte(entry), dirty); + return __ptep_set_access_flags_anysz(vma, address, (pte_t *)pmdp, + pmd_pte(entry), dirty, PMD_SIZE); } #endif @@ -1320,7 +1340,7 @@ static inline int __ptep_clear_flush_young(struct vm_area_struct *vma, * context-switch, which provides a DSB to complete the TLB * invalidation. */ - flush_tlb_page_nosync(vma, address); + __flush_tlb_page(vma, address, TLBF_NOSYNC); } return young; diff --git a/arch/arm64/include/asm/resctrl.h b/arch/arm64/include/asm/resctrl.h new file mode 100644 index 000000000000..b506e95cf6e3 --- /dev/null +++ b/arch/arm64/include/asm/resctrl.h @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h index 0fbc2e7867d3..a15a2968e7b6 100644 --- a/arch/arm64/include/asm/scs.h +++ b/arch/arm64/include/asm/scs.h @@ -10,6 +10,11 @@ #ifdef CONFIG_SHADOW_CALL_STACK scs_sp .req x18 + .macro scs_load_current_base + get_current_task scs_sp + ldr scs_sp, [scs_sp, #TSK_TI_SCS_BASE] + .endm + .macro scs_load_current get_current_task scs_sp ldr scs_sp, [scs_sp, #TSK_TI_SCS_SP] @@ -19,6 +24,9 @@ str scs_sp, [\tsk, #TSK_TI_SCS_SP] .endm #else + .macro scs_load_current_base + .endm + .macro scs_load_current .endm diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 7942478e4065..5d7fe3e153c8 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -41,6 +41,9 @@ struct thread_info { #ifdef CONFIG_SHADOW_CALL_STACK void *scs_base; void *scs_sp; +#endif +#ifdef CONFIG_ARM64_MPAM + u64 mpam_partid_pmg; #endif 
u32 cpu; }; diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index 8d762607285c..10869d7731b8 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -53,7 +53,7 @@ static inline int tlb_get_level(struct mmu_gather *tlb) static inline void tlb_flush(struct mmu_gather *tlb) { struct vm_area_struct vma = TLB_FLUSH_VMA(tlb->mm, 0); - bool last_level = !tlb->freed_tables; + tlbf_t flags = tlb->freed_tables ? TLBF_NONE : TLBF_NOWALKCACHE; unsigned long stride = tlb_get_unmap_size(tlb); int tlb_level = tlb_get_level(tlb); @@ -63,13 +63,13 @@ static inline void tlb_flush(struct mmu_gather *tlb) * reallocate our ASID without invalidating the entire TLB. */ if (tlb->fullmm) { - if (!last_level) + if (tlb->freed_tables) flush_tlb_mm(tlb->mm); return; } __flush_tlb_range(&vma, tlb->start, tlb->end, stride, - last_level, tlb_level); + tlb_level, flags); } static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 1416e652612b..47fa4d39a461 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -97,24 +97,69 @@ static inline unsigned long get_trans_granule(void) #define TLBI_TTL_UNKNOWN INT_MAX -#define __tlbi_level(op, addr, level) do { \ - u64 arg = addr; \ - \ - if (alternative_has_cap_unlikely(ARM64_HAS_ARMv8_4_TTL) && \ - level >= 0 && level <= 3) { \ - u64 ttl = level & 3; \ - ttl |= get_trans_granule() << 2; \ - arg &= ~TLBI_TTL_MASK; \ - arg |= FIELD_PREP(TLBI_TTL_MASK, ttl); \ - } \ - \ - __tlbi(op, arg); \ -} while(0) +typedef void (*tlbi_op)(u64 arg); -#define __tlbi_user_level(op, arg, level) do { \ - if (arm64_kernel_unmapped_at_el0()) \ - __tlbi_level(op, (arg | USER_ASID_FLAG), level); \ -} while (0) +static __always_inline void vae1is(u64 arg) +{ + __tlbi(vae1is, arg); + __tlbi_user(vae1is, arg); +} + +static __always_inline void vae2is(u64 arg) +{ + __tlbi(vae2is, arg); +} + 
+static __always_inline void vale1(u64 arg) +{ + __tlbi(vale1, arg); + __tlbi_user(vale1, arg); +} + +static __always_inline void vale1is(u64 arg) +{ + __tlbi(vale1is, arg); + __tlbi_user(vale1is, arg); +} + +static __always_inline void vale2is(u64 arg) +{ + __tlbi(vale2is, arg); +} + +static __always_inline void vaale1is(u64 arg) +{ + __tlbi(vaale1is, arg); +} + +static __always_inline void ipas2e1(u64 arg) +{ + __tlbi(ipas2e1, arg); +} + +static __always_inline void ipas2e1is(u64 arg) +{ + __tlbi(ipas2e1is, arg); +} + +static __always_inline void __tlbi_level_asid(tlbi_op op, u64 addr, u32 level, + u16 asid) +{ + u64 arg = __TLBI_VADDR(addr, asid); + + if (alternative_has_cap_unlikely(ARM64_HAS_ARMv8_4_TTL) && level <= 3) { + u64 ttl = level | (get_trans_granule() << 2); + + FIELD_MODIFY(TLBI_TTL_MASK, &arg, ttl); + } + + op(arg); +} + +static inline void __tlbi_level(tlbi_op op, u64 addr, u32 level) +{ + __tlbi_level_asid(op, addr, level, 0); +} /* * This macro creates a properly formatted VA operand for the TLB RANGE. The @@ -141,19 +186,6 @@ static inline unsigned long get_trans_granule(void) #define TLBIR_TTL_MASK GENMASK_ULL(38, 37) #define TLBIR_BADDR_MASK GENMASK_ULL(36, 0) -#define __TLBI_VADDR_RANGE(baddr, asid, scale, num, ttl) \ - ({ \ - unsigned long __ta = 0; \ - unsigned long __ttl = (ttl >= 1 && ttl <= 3) ? ttl : 0; \ - __ta |= FIELD_PREP(TLBIR_BADDR_MASK, baddr); \ - __ta |= FIELD_PREP(TLBIR_TTL_MASK, __ttl); \ - __ta |= FIELD_PREP(TLBIR_NUM_MASK, num); \ - __ta |= FIELD_PREP(TLBIR_SCALE_MASK, scale); \ - __ta |= FIELD_PREP(TLBIR_TG_MASK, get_trans_granule()); \ - __ta |= FIELD_PREP(TLBIR_ASID_MASK, asid); \ - __ta; \ - }) - /* These macros are used by the TLBI RANGE feature. */ #define __TLBI_RANGE_PAGES(num, scale) \ ((unsigned long)((num) + 1) << (5 * (scale) + 1)) @@ -167,11 +199,7 @@ static inline unsigned long get_trans_granule(void) * range. 
*/ #define __TLBI_RANGE_NUM(pages, scale) \ - ({ \ - int __pages = min((pages), \ - __TLBI_RANGE_PAGES(31, (scale))); \ - (__pages >> (5 * (scale) + 1)) - 1; \ - }) + (((pages) >> (5 * (scale) + 1)) - 1) #define __repeat_tlbi_sync(op, arg...) \ do { \ @@ -241,10 +269,7 @@ static inline void __tlbi_sync_s1ish_hyp(void) * unmapping pages from vmalloc/io space. * * flush_tlb_page(vma, addr) - * Invalidate a single user mapping for address 'addr' in the - * address space corresponding to 'vma->mm'. Note that this - * operation only invalidates a single, last-level page-table - * entry and therefore does not affect any walk-caches. + * Equivalent to __flush_tlb_page(..., flags=TLBF_NONE) * * * Next, we have some undocumented invalidation routines that you probably @@ -258,30 +283,28 @@ static inline void __tlbi_sync_s1ish_hyp(void) * CPUs, ensuring that any walk-cache entries associated with the * translation are also invalidated. * - * __flush_tlb_range(vma, start, end, stride, last_level, tlb_level) + * __flush_tlb_range(vma, start, end, stride, tlb_level, flags) * Invalidate the virtual-address range '[start, end)' on all * CPUs for the user address space corresponding to 'vma->mm'. * The invalidation operations are issued at a granularity - * determined by 'stride' and only affect any walk-cache entries - * if 'last_level' is equal to false. tlb_level is the level at + * determined by 'stride'. tlb_level is the level at * which the invalidation must take place. If the level is wrong, * no invalidation may take place. In the case where the level * cannot be easily determined, the value TLBI_TTL_UNKNOWN will - * perform a non-hinted invalidation. + * perform a non-hinted invalidation. flags may be TLBF_NONE (0) or + * any combination of TLBF_NOWALKCACHE (elide eviction of walk + * cache entries), TLBF_NONOTIFY (don't call mmu notifiers), + * TLBF_NOSYNC (don't issue trailing dsb) and TLBF_NOBROADCAST + * (only perform the invalidation for the local cpu). 
* - * local_flush_tlb_page(vma, addr) - * Local variant of flush_tlb_page(). Stale TLB entries may - * remain in remote CPUs. - * - * local_flush_tlb_page_nonotify(vma, addr) - * Same as local_flush_tlb_page() except MMU notifier will not be - * called. - * - * local_flush_tlb_contpte(vma, addr) - * Invalidate the virtual-address range - * '[addr, addr+CONT_PTE_SIZE)' mapped with contpte on local CPU - * for the user address space corresponding to 'vma->mm'. Stale - * TLB entries may remain in remote CPUs. + * __flush_tlb_page(vma, addr, flags) + * Invalidate a single user mapping for address 'addr' in the + * address space corresponding to 'vma->mm'. Note that this + * operation only invalidates a single level 3 page-table entry + * and therefore does not affect any walk-caches. flags may contain + * any combination of TLBF_NONOTIFY (don't call mmu notifiers), + * TLBF_NOSYNC (don't issue trailing dsb) and TLBF_NOBROADCAST + * (only perform the invalidation for the local cpu). * * Finally, take a look at asm/tlb.h to see how tlb_flush() is implemented * on top of these routines, since that is our interface to the mmu_gather @@ -315,59 +338,6 @@ static inline void flush_tlb_mm(struct mm_struct *mm) mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } -static inline void __local_flush_tlb_page_nonotify_nosync(struct mm_struct *mm, - unsigned long uaddr) -{ - unsigned long addr; - - dsb(nshst); - addr = __TLBI_VADDR(uaddr, ASID(mm)); - __tlbi(vale1, addr); - __tlbi_user(vale1, addr); -} - -static inline void local_flush_tlb_page_nonotify(struct vm_area_struct *vma, - unsigned long uaddr) -{ - __local_flush_tlb_page_nonotify_nosync(vma->vm_mm, uaddr); - dsb(nsh); -} - -static inline void local_flush_tlb_page(struct vm_area_struct *vma, - unsigned long uaddr) -{ - __local_flush_tlb_page_nonotify_nosync(vma->vm_mm, uaddr); - mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, uaddr & PAGE_MASK, - (uaddr & PAGE_MASK) + PAGE_SIZE); - dsb(nsh); -} - -static 
inline void __flush_tlb_page_nosync(struct mm_struct *mm, - unsigned long uaddr) -{ - unsigned long addr; - - dsb(ishst); - addr = __TLBI_VADDR(uaddr, ASID(mm)); - __tlbi(vale1is, addr); - __tlbi_user(vale1is, addr); - mmu_notifier_arch_invalidate_secondary_tlbs(mm, uaddr & PAGE_MASK, - (uaddr & PAGE_MASK) + PAGE_SIZE); -} - -static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, - unsigned long uaddr) -{ - return __flush_tlb_page_nosync(vma->vm_mm, uaddr); -} - -static inline void flush_tlb_page(struct vm_area_struct *vma, - unsigned long uaddr) -{ - flush_tlb_page_nosync(vma, uaddr); - __tlbi_sync_s1ish(); -} - static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) { return true; @@ -397,14 +367,13 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) /* * __flush_tlb_range_op - Perform TLBI operation upon a range * - * @op: TLBI instruction that operates on a range (has 'r' prefix) + * @lop: TLBI level operation to perform + * @rop: TLBI range operation to perform * @start: The start address of the range * @pages: Range as the number of pages from 'start' * @stride: Flush granularity * @asid: The ASID of the task (0 for IPA instructions) - * @tlb_level: Translation Table level hint, if known - * @tlbi_user: If 'true', call an additional __tlbi_user() - * (typically for user ASIDs). 'flase' for IPA instructions + * @level: Translation Table level hint, if known * @lpa2: If 'true', the lpa2 scheme is used as set out below * * When the CPU does not support TLB range operations, flush the TLB @@ -427,116 +396,181 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) * operations can only span an even number of pages. We save this for last to * ensure 64KB start alignment is maintained for the LPA2 case. 
*/ -#define __flush_tlb_range_op(op, start, pages, stride, \ - asid, tlb_level, tlbi_user, lpa2) \ -do { \ - typeof(start) __flush_start = start; \ - typeof(pages) __flush_pages = pages; \ - int num = 0; \ - int scale = 3; \ - int shift = lpa2 ? 16 : PAGE_SHIFT; \ - unsigned long addr; \ - \ - while (__flush_pages > 0) { \ - if (!system_supports_tlb_range() || \ - __flush_pages == 1 || \ - (lpa2 && __flush_start != ALIGN(__flush_start, SZ_64K))) { \ - addr = __TLBI_VADDR(__flush_start, asid); \ - __tlbi_level(op, addr, tlb_level); \ - if (tlbi_user) \ - __tlbi_user_level(op, addr, tlb_level); \ - __flush_start += stride; \ - __flush_pages -= stride >> PAGE_SHIFT; \ - continue; \ - } \ - \ - num = __TLBI_RANGE_NUM(__flush_pages, scale); \ - if (num >= 0) { \ - addr = __TLBI_VADDR_RANGE(__flush_start >> shift, asid, \ - scale, num, tlb_level); \ - __tlbi(r##op, addr); \ - if (tlbi_user) \ - __tlbi_user(r##op, addr); \ - __flush_start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \ - __flush_pages -= __TLBI_RANGE_PAGES(num, scale);\ - } \ - scale--; \ - } \ -} while (0) - -#define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \ - __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled()); - -static inline bool __flush_tlb_range_limit_excess(unsigned long start, - unsigned long end, unsigned long pages, unsigned long stride) +static __always_inline void rvae1is(u64 arg) { - /* - * When the system does not support TLB range based flush - * operation, (MAX_DVM_OPS - 1) pages can be handled. But - * with TLB range based operation, MAX_TLBI_RANGE_PAGES - * pages can be handled. 
- */ - if ((!system_supports_tlb_range() && - (end - start) >= (MAX_DVM_OPS * stride)) || - pages > MAX_TLBI_RANGE_PAGES) - return true; - - return false; + __tlbi(rvae1is, arg); + __tlbi_user(rvae1is, arg); } -static inline void __flush_tlb_range_nosync(struct mm_struct *mm, - unsigned long start, unsigned long end, - unsigned long stride, bool last_level, - int tlb_level) +static __always_inline void rvale1(u64 arg) { + __tlbi(rvale1, arg); + __tlbi_user(rvale1, arg); +} + +static __always_inline void rvale1is(u64 arg) +{ + __tlbi(rvale1is, arg); + __tlbi_user(rvale1is, arg); +} + +static __always_inline void rvaale1is(u64 arg) +{ + __tlbi(rvaale1is, arg); +} + +static __always_inline void ripas2e1is(u64 arg) +{ + __tlbi(ripas2e1is, arg); +} + +static __always_inline void __tlbi_range(tlbi_op op, u64 addr, + u16 asid, int scale, int num, + u32 level, bool lpa2) +{ + u64 arg = 0; + + arg |= FIELD_PREP(TLBIR_BADDR_MASK, addr >> (lpa2 ? 16 : PAGE_SHIFT)); + arg |= FIELD_PREP(TLBIR_TTL_MASK, level > 3 ? 
0 : level); + arg |= FIELD_PREP(TLBIR_NUM_MASK, num); + arg |= FIELD_PREP(TLBIR_SCALE_MASK, scale); + arg |= FIELD_PREP(TLBIR_TG_MASK, get_trans_granule()); + arg |= FIELD_PREP(TLBIR_ASID_MASK, asid); + + op(arg); +} + +static __always_inline void __flush_tlb_range_op(tlbi_op lop, tlbi_op rop, + u64 start, size_t pages, + u64 stride, u16 asid, + u32 level, bool lpa2) +{ + u64 addr = start, end = start + pages * PAGE_SIZE; + int scale = 3; + + while (addr != end) { + int num; + + pages = (end - addr) >> PAGE_SHIFT; + + if (!system_supports_tlb_range() || pages == 1) + goto invalidate_one; + + if (lpa2 && !IS_ALIGNED(addr, SZ_64K)) + goto invalidate_one; + + num = __TLBI_RANGE_NUM(pages, scale); + if (num >= 0) { + __tlbi_range(rop, addr, asid, scale, num, level, lpa2); + addr += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; + } + + scale--; + continue; +invalidate_one: + __tlbi_level_asid(lop, addr, level, asid); + addr += stride; + } +} + +#define __flush_s1_tlb_range_op(op, start, pages, stride, asid, tlb_level) \ + __flush_tlb_range_op(op, r##op, start, pages, stride, asid, tlb_level, lpa2_is_enabled()) + +#define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \ + __flush_tlb_range_op(op, r##op, start, pages, stride, 0, tlb_level, kvm_lpa2_is_enabled()) + +static inline bool __flush_tlb_range_limit_excess(unsigned long pages, + unsigned long stride) +{ + /* + * Assume that the worst case number of DVM ops required to flush a + * given range on a system that supports tlb-range is 20 (4 scales, 1 + * final page, 15 for alignment on LPA2 systems), which is much smaller + * than MAX_DVM_OPS. + */ + if (system_supports_tlb_range()) + return pages > MAX_TLBI_RANGE_PAGES; + + return pages >= (MAX_DVM_OPS * stride) >> PAGE_SHIFT; +} + +typedef unsigned __bitwise tlbf_t; + +/* No special behaviour. */ +#define TLBF_NONE ((__force tlbf_t)0) + +/* Invalidate tlb entries only, leaving the page table walk cache intact. 
*/ +#define TLBF_NOWALKCACHE ((__force tlbf_t)BIT(0)) + +/* Skip the trailing dsb after issuing tlbi. */ +#define TLBF_NOSYNC ((__force tlbf_t)BIT(1)) + +/* Suppress tlb notifier callbacks for this flush operation. */ +#define TLBF_NONOTIFY ((__force tlbf_t)BIT(2)) + +/* Perform the tlbi locally without broadcasting to other CPUs. */ +#define TLBF_NOBROADCAST ((__force tlbf_t)BIT(3)) + +static __always_inline void __do_flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long stride, int tlb_level, + tlbf_t flags) +{ + struct mm_struct *mm = vma->vm_mm; unsigned long asid, pages; - start = round_down(start, stride); - end = round_up(end, stride); pages = (end - start) >> PAGE_SHIFT; - if (__flush_tlb_range_limit_excess(start, end, pages, stride)) { + if (__flush_tlb_range_limit_excess(pages, stride)) { flush_tlb_mm(mm); return; } - dsb(ishst); + if (!(flags & TLBF_NOBROADCAST)) + dsb(ishst); + else + dsb(nshst); + asid = ASID(mm); - if (last_level) - __flush_tlb_range_op(vale1is, start, pages, stride, asid, - tlb_level, true, lpa2_is_enabled()); - else - __flush_tlb_range_op(vae1is, start, pages, stride, asid, - tlb_level, true, lpa2_is_enabled()); + switch (flags & (TLBF_NOWALKCACHE | TLBF_NOBROADCAST)) { + case TLBF_NONE: + __flush_s1_tlb_range_op(vae1is, start, pages, stride, + asid, tlb_level); + break; + case TLBF_NOWALKCACHE: + __flush_s1_tlb_range_op(vale1is, start, pages, stride, + asid, tlb_level); + break; + case TLBF_NOBROADCAST: + /* Combination unused */ + BUG(); + break; + case TLBF_NOWALKCACHE | TLBF_NOBROADCAST: + __flush_s1_tlb_range_op(vale1, start, pages, stride, + asid, tlb_level); + break; + } - mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); + if (!(flags & TLBF_NONOTIFY)) + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); + + if (!(flags & TLBF_NOSYNC)) { + if (!(flags & TLBF_NOBROADCAST)) + __tlbi_sync_s1ish(); + else + dsb(nsh); + } } static inline void 
__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long stride, bool last_level, - int tlb_level) + unsigned long stride, int tlb_level, + tlbf_t flags) { - __flush_tlb_range_nosync(vma->vm_mm, start, end, stride, - last_level, tlb_level); - __tlbi_sync_s1ish(); -} - -static inline void local_flush_tlb_contpte(struct vm_area_struct *vma, - unsigned long addr) -{ - unsigned long asid; - - addr = round_down(addr, CONT_PTE_SIZE); - - dsb(nshst); - asid = ASID(vma->vm_mm); - __flush_tlb_range_op(vale1, addr, CONT_PTES, PAGE_SIZE, asid, - 3, true, lpa2_is_enabled()); - mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, addr, - addr + CONT_PTE_SIZE); - dsb(nsh); + start = round_down(start, stride); + end = round_up(end, stride); + __do_flush_tlb_range(vma, start, end, stride, tlb_level, flags); } static inline void flush_tlb_range(struct vm_area_struct *vma, @@ -548,7 +582,23 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, * Set the tlb_level to TLBI_TTL_UNKNOWN because we can not get enough * information here. 
*/ - __flush_tlb_range(vma, start, end, PAGE_SIZE, false, TLBI_TTL_UNKNOWN); + __flush_tlb_range(vma, start, end, PAGE_SIZE, TLBI_TTL_UNKNOWN, TLBF_NONE); +} + +static inline void __flush_tlb_page(struct vm_area_struct *vma, + unsigned long uaddr, tlbf_t flags) +{ + unsigned long start = round_down(uaddr, PAGE_SIZE); + unsigned long end = start + PAGE_SIZE; + + __do_flush_tlb_range(vma, start, end, PAGE_SIZE, 3, + TLBF_NOWALKCACHE | flags); +} + +static inline void flush_tlb_page(struct vm_area_struct *vma, + unsigned long uaddr) +{ + __flush_tlb_page(vma, uaddr, TLBF_NONE); } static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) @@ -560,14 +610,14 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end end = round_up(end, stride); pages = (end - start) >> PAGE_SHIFT; - if (__flush_tlb_range_limit_excess(start, end, pages, stride)) { + if (__flush_tlb_range_limit_excess(pages, stride)) { flush_tlb_all(); return; } dsb(ishst); - __flush_tlb_range_op(vaale1is, start, pages, stride, 0, - TLBI_TTL_UNKNOWN, false, lpa2_is_enabled()); + __flush_s1_tlb_range_op(vaale1is, start, pages, stride, 0, + TLBI_TTL_UNKNOWN); __tlbi_sync_s1ish(); isb(); } @@ -589,7 +639,10 @@ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr) static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, struct mm_struct *mm, unsigned long start, unsigned long end) { - __flush_tlb_range_nosync(mm, start, end, PAGE_SIZE, true, 3); + struct vm_area_struct vma = { .vm_mm = mm, .vm_flags = 0 }; + + __flush_tlb_range(&vma, start, end, PAGE_SIZE, 3, + TLBF_NOWALKCACHE | TLBF_NOSYNC); } static inline bool __pte_flags_need_flush(ptdesc_t oldval, ptdesc_t newval) @@ -618,6 +671,8 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) } #define huge_pmd_needs_flush huge_pmd_needs_flush +#undef __tlbi_user +#undef __TLBI_VADDR #endif #endif diff --git a/arch/arm64/include/asm/uaccess.h 
b/arch/arm64/include/asm/uaccess.h index eafc83d255d8..b0c83a08dda9 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -62,7 +62,7 @@ static inline void __uaccess_ttbr0_disable(void) local_irq_save(flags); ttbr = read_sysreg(ttbr1_el1); - ttbr &= ~TTBR_ASID_MASK; + ttbr &= ~TTBRx_EL1_ASID_MASK; /* reserved_pg_dir placed before swapper_pg_dir */ write_sysreg(ttbr - RESERVED_SWAPPER_OFFSET, ttbr0_el1); /* Set reserved ASID */ @@ -85,8 +85,8 @@ static inline void __uaccess_ttbr0_enable(void) /* Restore active ASID */ ttbr1 = read_sysreg(ttbr1_el1); - ttbr1 &= ~TTBR_ASID_MASK; /* safety measure */ - ttbr1 |= ttbr0 & TTBR_ASID_MASK; + ttbr1 &= ~TTBRx_EL1_ASID_MASK; /* safety measure */ + ttbr1 |= ttbr0 & TTBRx_EL1_ASID_MASK; write_sysreg(ttbr1, ttbr1_el1); /* Restore user page table */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index fe627100d199..74b76bb70452 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -68,6 +68,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o +obj-$(CONFIG_ARM64_MPAM) += mpam.o obj-$(CONFIG_ARM64_MTE) += mte.o obj-y += vdso-wrap.o obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index e737c6295ec7..b7a1f8b788bb 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -610,6 +610,20 @@ static int __init armv8_deprecated_init(void) } #endif + +#ifdef CONFIG_SWP_EMULATION + /* + * The purpose of supporting LSUI is to eliminate PAN toggling. CPUs + * that support LSUI are unlikely to support a 32-bit runtime. Rather + * than emulating the SWP instruction using LSUI instructions, simply + * disable SWP emulation. 
+ */ + if (cpus_have_final_cap(ARM64_HAS_LSUI)) { + insn_swp.status = INSN_UNAVAILABLE; + pr_info("swp/swpb instruction emulation is not supported on this system\n"); + } +#endif + for (int i = 0; i < ARRAY_SIZE(insn_emulations); i++) { struct insn_emulation *ie = insn_emulations[i]; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 32c2dbcc0c64..29312081d2e3 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -77,6 +77,7 @@ #include #include +#include #include #include #include @@ -86,6 +87,7 @@ #include #include #include +#include #include #include #include @@ -281,6 +283,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { static const struct arm64_ftr_bits ftr_id_aa64isar3[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FPRCVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_LSUI_SHIFT, 4, ID_AA64ISAR3_EL1_LSUI_NI), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_LSFE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FAMINMAX_SHIFT, 4, 0), ARM64_FTR_END, @@ -565,7 +568,7 @@ static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = { * We can instantiate multiple PMU instances with different levels * of support. 
*/ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_EL1_PMUVer_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_EL1_PMUVer_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_EL1_DebugVer_SHIFT, 4, 0x6), ARM64_FTR_END, }; @@ -709,7 +712,7 @@ static const struct arm64_ftr_bits ftr_id_pfr2[] = { static const struct arm64_ftr_bits ftr_id_dfr0[] = { /* [31:28] TraceFilt */ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_DFR0_EL1_PerfMon_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_DFR0_EL1_PerfMon_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MProfDbg_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MMapTrc_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_CopTrc_SHIFT, 4, 0), @@ -1927,19 +1930,10 @@ static bool has_pmuv3(const struct arm64_cpu_capabilities *entry, int scope) u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); unsigned int pmuver; - /* - * PMUVer follows the standard ID scheme for an unsigned field with the - * exception of 0xF (IMP_DEF) which is treated specially and implies - * FEAT_PMUv3 is not implemented. - * - * See DDI0487L.a D24.1.3.2 for more details. - */ pmuver = cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMUVer_SHIFT); - if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) - return false; - return pmuver >= ID_AA64DFR0_EL1_PMUVer_IMP; + return pmuv3_implemented(pmuver); } #endif @@ -2501,13 +2495,19 @@ test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope) static void cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) { - /* - * Access by the kernel (at EL1) should use the reserved PARTID - * which is configured unrestricted. This avoids priority-inversion - * where latency sensitive tasks have to wait for a task that has - * been throttled to release the lock. 
- */ - write_sysreg_s(0, SYS_MPAM1_EL1); + int cpu = smp_processor_id(); + u64 regval = 0; + + if (IS_ENABLED(CONFIG_ARM64_MPAM) && static_branch_likely(&mpam_enabled)) + regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + + write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + if (cpus_have_cap(ARM64_SME)) + write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), SYS_MPAMSM_EL1); + isb(); + + /* Synchronising the EL0 write is left until the ERET to EL0 */ + write_sysreg_s(regval, SYS_MPAM0_EL1); } static bool @@ -3178,6 +3178,15 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_enable_ls64_v, ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, LS64, LS64_V) }, +#ifdef CONFIG_ARM64_LSUI + { + .desc = "Unprivileged Load Store Instructions (LSUI)", + .capability = ARM64_HAS_LSUI, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64ISAR3_EL1, LSUI, IMP) + }, +#endif {}, }; diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 3625797e9ee8..f42ce7b5c67f 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -35,11 +35,11 @@ * Before this function is called it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. */ -static noinstr irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs) +static noinstr irqentry_state_t arm64_enter_from_kernel_mode(struct pt_regs *regs) { irqentry_state_t state; - state = irqentry_enter(regs); + state = irqentry_enter_from_kernel_mode(regs); mte_check_tfsr_entry(); mte_disable_tco_entry(current); @@ -51,11 +51,14 @@ static noinstr irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs) * After this function returns it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. 
*/ -static void noinstr exit_to_kernel_mode(struct pt_regs *regs, - irqentry_state_t state) +static void noinstr arm64_exit_to_kernel_mode(struct pt_regs *regs, + irqentry_state_t state) { + local_irq_disable(); + irqentry_exit_to_kernel_mode_preempt(regs, state); + local_daif_mask(); mte_check_tfsr_exit(); - irqentry_exit(regs, state); + irqentry_exit_to_kernel_mode_after_preempt(regs, state); } /* @@ -298,11 +301,10 @@ static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) unsigned long far = read_sysreg(far_el1); irqentry_state_t state; - state = enter_from_kernel_mode(regs); + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_mem_abort(far, esr, regs); - local_daif_mask(); - exit_to_kernel_mode(regs, state); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) @@ -310,55 +312,50 @@ static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) unsigned long far = read_sysreg(far_el1); irqentry_state_t state; - state = enter_from_kernel_mode(regs); + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_sp_pc_abort(far, esr, regs); - local_daif_mask(); - exit_to_kernel_mode(regs, state); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr) { irqentry_state_t state; - state = enter_from_kernel_mode(regs); + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_undef(regs, esr); - local_daif_mask(); - exit_to_kernel_mode(regs, state); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) { irqentry_state_t state; - state = enter_from_kernel_mode(regs); + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_bti(regs, esr); - local_daif_mask(); - exit_to_kernel_mode(regs, state); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_gcs(struct pt_regs *regs, unsigned 
long esr) { irqentry_state_t state; - state = enter_from_kernel_mode(regs); + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_gcs(regs, esr); - local_daif_mask(); - exit_to_kernel_mode(regs, state); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr) { irqentry_state_t state; - state = enter_from_kernel_mode(regs); + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_mops(regs, esr); - local_daif_mask(); - exit_to_kernel_mode(regs, state); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_breakpt(struct pt_regs *regs, unsigned long esr) @@ -420,11 +417,10 @@ static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr) { irqentry_state_t state; - state = enter_from_kernel_mode(regs); + state = arm64_enter_from_kernel_mode(regs); local_daif_inherit(regs); do_el1_fpac(regs, esr); - local_daif_mask(); - exit_to_kernel_mode(regs, state); + arm64_exit_to_kernel_mode(regs, state); } asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) @@ -491,13 +487,13 @@ static __always_inline void __el1_irq(struct pt_regs *regs, { irqentry_state_t state; - state = enter_from_kernel_mode(regs); + state = arm64_enter_from_kernel_mode(regs); irq_enter_rcu(); do_interrupt_handler(regs, handler); irq_exit_rcu(); - exit_to_kernel_mode(regs, state); + arm64_exit_to_kernel_mode(regs, state); } static void noinstr el1_interrupt(struct pt_regs *regs, void (*handler)(struct pt_regs *)) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index f8018b5c1f9a..e0db14e9c843 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -273,7 +273,7 @@ alternative_if ARM64_HAS_ADDRESS_AUTH alternative_else_nop_endif 1: - scs_load_current + scs_load_current_base .else add x21, sp, #PT_REGS_SIZE get_current_task tsk @@ -378,8 +378,6 @@ alternative_if ARM64_WORKAROUND_845719 alternative_else_nop_endif #endif 3: - scs_save 
tsk - /* Ignore asynchronous tag check faults in the uaccess routines */ ldr x0, [tsk, THREAD_SCTLR_USER] clear_mte_async_tcf x0 @@ -473,7 +471,7 @@ alternative_else_nop_endif */ SYM_CODE_START_LOCAL(__swpan_entry_el1) mrs x21, ttbr0_el1 - tst x21, #TTBR_ASID_MASK // Check for the reserved ASID + tst x21, #TTBRx_EL1_ASID_MASK // Check for the reserved ASID orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR b.eq 1f // TTBR0 access already disabled and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 239c16e3d02f..c5693a32e49b 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -129,9 +129,6 @@ int machine_kexec_post_load(struct kimage *kimage) } /* Create a copy of the linear map */ - trans_pgd = kexec_page_alloc(kimage); - if (!trans_pgd) - return -ENOMEM; rc = trans_pgd_create_copy(&info, &trans_pgd, PAGE_OFFSET, PAGE_END); if (rc) return rc; diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c new file mode 100644 index 000000000000..3a490de4fa12 --- /dev/null +++ b/arch/arm64/kernel/mpam.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025 Arm Ltd. */ + +#include + +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE(mpam_enabled); +DEFINE_PER_CPU(u64, arm64_mpam_default); +DEFINE_PER_CPU(u64, arm64_mpam_current); + +u64 arm64_mpam_global_default; + +static int mpam_pm_notifier(struct notifier_block *self, + unsigned long cmd, void *v) +{ + u64 regval; + int cpu = smp_processor_id(); + + switch (cmd) { + case CPU_PM_EXIT: + /* + * Don't use mpam_thread_switch() as the system register + * value has changed under our feet. 
+ */ + regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + if (system_supports_sme()) { + write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), + SYS_MPAMSM_EL1); + } + isb(); + + write_sysreg_s(regval, SYS_MPAM0_EL1); + + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block mpam_pm_nb = { + .notifier_call = mpam_pm_notifier, +}; + +static int __init arm64_mpam_register_cpus(void) +{ + u64 mpamidr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + u16 partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, mpamidr); + u8 pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, mpamidr); + + if (!system_supports_mpam()) + return 0; + + cpu_pm_register_notifier(&mpam_pm_nb); + return mpam_register_requestor(partid_max, pmg_max); +} +/* Must occur before mpam_msc_driver_init() from subsys_initcall() */ +arch_initcall(arm64_mpam_register_cpus) diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 32148bf09c1d..6874b16d0657 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -291,6 +291,9 @@ void mte_thread_switch(struct task_struct *next) /* TCO may not have been disabled on exception entry for the current task. */ mte_disable_tco_entry(next); + if (!system_uses_mte_async_or_asymm_mode()) + return; + /* * Check if an async tag exception occurred at EL1. * @@ -315,8 +318,8 @@ void mte_cpu_setup(void) * CnP is not a boot feature so MTE gets enabled before CnP, but let's * make sure that is the case. 
*/ - BUG_ON(read_sysreg(ttbr0_el1) & TTBR_CNP_BIT); - BUG_ON(read_sysreg(ttbr1_el1) & TTBR_CNP_BIT); + BUG_ON(read_sysreg(ttbr0_el1) & TTBRx_EL1_CnP); + BUG_ON(read_sysreg(ttbr1_el1) & TTBRx_EL1_CnP); /* Normal Tagged memory type at the corresponding MAIR index */ sysreg_clear_set(mair_el1, @@ -350,6 +353,9 @@ void mte_suspend_enter(void) if (!system_supports_mte()) return; + if (!system_uses_mte_async_or_asymm_mode()) + return; + /* * The barriers are required to guarantee that the indirect writes * to TFSR_EL1 are synchronized before we report the state. diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 489554931231..c0bf1f46cdc6 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -699,6 +700,29 @@ void update_sctlr_el1(u64 sctlr) isb(); } +static inline void debug_switch_state(void) +{ + if (system_uses_irq_prio_masking()) { + unsigned long daif_expected = 0; + unsigned long daif_actual = read_sysreg(daif); + unsigned long pmr_expected = GIC_PRIO_IRQOFF; + unsigned long pmr_actual = read_sysreg_s(SYS_ICC_PMR_EL1); + + WARN_ONCE(daif_actual != daif_expected || + pmr_actual != pmr_expected, + "Unexpected DAIF + PMR: 0x%lx + 0x%lx (expected 0x%lx + 0x%lx)\n", + daif_actual, pmr_actual, + daif_expected, pmr_expected); + } else { + unsigned long daif_expected = DAIF_PROCCTX_NOIRQ; + unsigned long daif_actual = read_sysreg(daif); + + WARN_ONCE(daif_actual != daif_expected, + "Unexpected DAIF value: 0x%lx (expected 0x%lx)\n", + daif_actual, daif_expected); + } +} + /* * Thread switching. 
*/ @@ -708,6 +732,8 @@ struct task_struct *__switch_to(struct task_struct *prev, { struct task_struct *last; + debug_switch_state(); + fpsimd_thread_switch(next); tls_thread_switch(next); hw_breakpoint_thread_switch(next); @@ -738,6 +764,12 @@ struct task_struct *__switch_to(struct task_struct *prev, if (prev->thread.sctlr_user != next->thread.sctlr_user) update_sctlr_el1(next->thread.sctlr_user); + /* + * MPAM thread switch happens after the DSB to ensure prev's accesses + * use prev's MPAM settings. + */ + mpam_thread_switch(next); + /* the actual thread switch */ last = cpu_switch_to(prev, next); diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index 9e846ce4ef9c..92160f2e57ff 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -145,7 +145,7 @@ void __init arm64_rsi_init(void) return; if (!rsi_version_matches()) return; - if (WARN_ON(rsi_get_realm_config(&config))) + if (WARN_ON(rsi_get_realm_config(lm_alias(&config)))) return; prot_ns_shared = __phys_to_pte_val(BIT(config.ipa_bits - 1)); diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c index b9d4998c97ef..7e9860143add 100644 --- a/arch/arm64/kernel/sys_compat.c +++ b/arch/arm64/kernel/sys_compat.c @@ -36,7 +36,7 @@ __do_compat_cache_op(unsigned long start, unsigned long end) * The workaround requires an inner-shareable tlbi. * We pick the reserved-ASID to minimise the impact. 
*/ - __tlbi(aside1is, __TLBI_VADDR(0, 0)); + __tlbi(aside1is, 0UL); __tlbi_sync_s1ish(); } diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index a024d9a770dc..9f8f0ae8e86e 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -9,6 +9,7 @@ #include #include #include +#include static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw) { @@ -1679,6 +1680,35 @@ int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level) } } +static int __lsui_swap_desc(u64 __user *ptep, u64 old, u64 new) +{ + u64 tmp = old; + int ret = 0; + + /* + * Wrap LSUI instructions with uaccess_ttbr0_enable()/disable(), + * as PAN toggling is not required. + */ + uaccess_ttbr0_enable(); + + asm volatile(__LSUI_PREAMBLE + "1: cast %[old], %[new], %[addr]\n" + "2:\n" + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret]) + : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret) + : [new] "r" (new) + : "memory"); + + uaccess_ttbr0_disable(); + + if (ret) + return ret; + if (tmp != old) + return -EAGAIN; + + return ret; +} + static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new) { u64 tmp = old; @@ -1754,7 +1784,9 @@ int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new) return -EPERM; ptep = (void __user *)hva + offset; - if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS)) + if (cpus_have_final_cap(ARM64_HAS_LSUI)) + r = __lsui_swap_desc(ptep, old, new); + else if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS)) r = __lse_swap_desc(ptep, old, new); else r = __llsc_swap_desc(ptep, old, new); diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 3ad6b7c6e4ba..f4d7b12045e8 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -75,8 +76,10 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) void kvm_init_host_debug_data(void) { u64 dfr0 = read_sysreg(id_aa64dfr0_el1); + unsigned int pmuver = cpuid_feature_extract_unsigned_field(dfr0, + 
ID_AA64DFR0_EL1_PMUVer_SHIFT); - if (cpuid_feature_extract_signed_field(dfr0, ID_AA64DFR0_EL1_PMUVer_SHIFT) > 0) + if (pmuv3_implemented(pmuver)) *host_data_ptr(nr_event_counters) = FIELD_GET(ARMV8_PMU_PMCR_N, read_sysreg(pmcr_el0)); diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 2597e8bda867..0b50ddd530f3 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -267,7 +267,8 @@ static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) { - u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; + u64 clr = MPAM2_EL2_EnMPAMSM; + u64 set = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; if (!system_supports_mpam()) return; @@ -277,18 +278,21 @@ static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) write_sysreg_s(MPAMHCR_EL2_TRAP_MPAMIDR_EL1, SYS_MPAMHCR_EL2); } else { /* From v1.1 TIDR can trap MPAMIDR, set it unconditionally */ - r |= MPAM2_EL2_TIDR; + set |= MPAM2_EL2_TIDR; } - write_sysreg_s(r, SYS_MPAM2_EL2); + sysreg_clear_set_s(SYS_MPAM2_EL2, clr, set); } static inline void __deactivate_traps_mpam(void) { + u64 clr = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1 | MPAM2_EL2_TIDR; + u64 set = MPAM2_EL2_EnMPAMSM; + if (!system_supports_mpam()) return; - write_sysreg_s(0, SYS_MPAM2_EL2); + sysreg_clear_set_s(SYS_MPAM2_EL2, clr, set); if (system_supports_mpam_hcr()) write_sysreg_s(MPAMHCR_HOST_FLAGS, SYS_MPAMHCR_EL2); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 0d42eedc7167..445eb0743af2 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -130,7 +130,7 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init) ldr x1, [x0, #NVHE_INIT_PGD_PA] phys_to_ttbr x2, x1 alternative_if ARM64_HAS_CNP - orr x2, x2, #TTBR_CNP_BIT + orr x2, x2, #TTBRx_EL1_CnP alternative_else_nop_endif msr ttbr0_el2, x2 @@ -291,7 +291,7 @@ 
SYM_TYPED_FUNC_START(__pkvm_init_switch_pgd) /* Install the new pgtables */ phys_to_ttbr x5, x0 alternative_if ARM64_HAS_CNP - orr x5, x5, #TTBR_CNP_BIT + orr x5, x5, #TTBRx_EL1_CnP alternative_else_nop_endif msr ttbr0_el2, x5 diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 218976287d3f..4d8fcc7a3a41 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -270,7 +270,7 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) * https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03 */ dsb(ishst); - __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level); + __tlbi_level(vale2is, addr, level); __tlbi_sync_s1ish_hyp(); isb(); } diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 3dc1ce0d27fe..b29140995d48 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -158,7 +158,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, * Instead, we invalidate Stage-2 for this IPA, and the * whole of Stage-1. Weep... */ - ipa >>= 12; __tlbi_level(ipas2e1is, ipa, level); /* @@ -188,7 +187,6 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, * Instead, we invalidate Stage-2 for this IPA, and the * whole of Stage-1. Weep... 
*/ - ipa >>= 12; __tlbi_level(ipas2e1, ipa, level); /* diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 9b480f947da2..30226f2d5564 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -490,14 +490,14 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, kvm_clear_pte(ctx->ptep); dsb(ishst); - __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN); + __tlbi_level(vae2is, ctx->addr, TLBI_TTL_UNKNOWN); } else { if (ctx->end - ctx->addr < granule) return -EINVAL; kvm_clear_pte(ctx->ptep); dsb(ishst); - __tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level); + __tlbi_level(vale2is, ctx->addr, ctx->level); *unmapped += granule; } diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index b254d442e54e..be685b63e8cf 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -183,6 +183,21 @@ void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt) } NOKPROBE_SYMBOL(sysreg_restore_guest_state_vhe); +/* + * The _EL0 value was written by the host's context switch and belongs to the + * VMM. Copy this into the guest's _EL1 register. 
+ */ +static inline void __mpam_guest_load(void) +{ + u64 mask = MPAM0_EL1_PARTID_D | MPAM0_EL1_PARTID_I | MPAM0_EL1_PMG_D | MPAM0_EL1_PMG_I; + + if (system_supports_mpam()) { + u64 val = (read_sysreg_s(SYS_MPAM0_EL1) & mask) | MPAM1_EL1_MPAMEN; + + write_sysreg_el1(val, SYS_MPAM1); + } +} + /** * __vcpu_load_switch_sysregs - Load guest system registers to the physical CPU * @@ -222,6 +237,7 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) */ __sysreg32_restore_state(vcpu); __sysreg_restore_user_state(guest_ctxt); + __mpam_guest_load(); if (unlikely(is_hyp_ctxt(vcpu))) { __sysreg_restore_vel2_state(vcpu); diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index 35855dadfb1b..f7b9dfe3f3a5 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -104,7 +104,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, * Instead, we invalidate Stage-2 for this IPA, and the * whole of Stage-1. Weep... */ - ipa >>= 12; __tlbi_level(ipas2e1is, ipa, level); /* @@ -136,7 +135,6 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, * Instead, we invalidate Stage-2 for this IPA, and the * whole of Stage-1. Weep... 
*/ - ipa >>= 12; __tlbi_level(ipas2e1, ipa, level); /* diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1b4cacb6e918..c1e0dea903a1 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1805,7 +1805,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, break; case SYS_ID_AA64ISAR3_EL1: val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_LSFE | - ID_AA64ISAR3_EL1_FAMINMAX; + ID_AA64ISAR3_EL1_FAMINMAX | ID_AA64ISAR3_EL1_LSUI; break; case SYS_ID_AA64MMFR2_EL1: val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; @@ -3252,6 +3252,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64ISAR2_EL1_GPA3)), ID_WRITABLE(ID_AA64ISAR3_EL1, (ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_LSFE | + ID_AA64ISAR3_EL1_LSUI | ID_AA64ISAR3_EL1_FAMINMAX)), ID_UNALLOCATED(6,4), ID_UNALLOCATED(6,5), @@ -3376,6 +3377,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_MPAM1_EL1), undef_access }, { SYS_DESC(SYS_MPAM0_EL1), undef_access }, + { SYS_DESC(SYS_MPAMSM_EL1), undef_access }, + { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 }, { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index b2ac06246327..0f4a28b87469 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -354,15 +354,15 @@ void cpu_do_switch_mm(phys_addr_t pgd_phys, struct mm_struct *mm) /* Skip CNP for the reserved ASID */ if (system_supports_cnp() && asid) - ttbr0 |= TTBR_CNP_BIT; + ttbr0 |= TTBRx_EL1_CnP; /* SW PAN needs a copy of the ASID in TTBR0 for entry */ if (IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN)) - ttbr0 |= FIELD_PREP(TTBR_ASID_MASK, asid); + ttbr0 |= FIELD_PREP(TTBRx_EL1_ASID_MASK, asid); /* Set ASID in TTBR1 since TCR.A1 is set */ - ttbr1 &= ~TTBR_ASID_MASK; - ttbr1 |= FIELD_PREP(TTBR_ASID_MASK, asid); + ttbr1 &= ~TTBRx_EL1_ASID_MASK; + ttbr1 |= FIELD_PREP(TTBRx_EL1_ASID_MASK, asid); cpu_set_reserved_ttbr0_nosync(); write_sysreg(ttbr1, 
ttbr1_el1); diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 1519d090d5ea..3970392c4326 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -225,7 +225,8 @@ static void contpte_convert(struct mm_struct *mm, unsigned long addr, */ if (!system_supports_bbml2_noabort()) - __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3); + __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, 3, + TLBF_NOWALKCACHE); __set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES); } @@ -551,8 +552,8 @@ int contpte_clear_flush_young_ptes(struct vm_area_struct *vma, * See comment in __ptep_clear_flush_young(); same rationale for * eliding the trailing DSB applies here. */ - __flush_tlb_range_nosync(vma->vm_mm, addr, end, - PAGE_SIZE, true, 3); + __flush_tlb_range(vma, addr, end, PAGE_SIZE, 3, + TLBF_NOWALKCACHE | TLBF_NOSYNC); } return young; @@ -685,7 +686,10 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma, __ptep_set_access_flags(vma, addr, ptep, entry, 0); if (dirty) - local_flush_tlb_contpte(vma, start_addr); + __flush_tlb_range(vma, start_addr, + start_addr + CONT_PTE_SIZE, + PAGE_SIZE, 3, + TLBF_NOWALKCACHE | TLBF_NOBROADCAST); } else { __contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte); __ptep_set_access_flags(vma, addr, ptep, entry, dirty); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index be9dab2c7d6a..920a8b244d59 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -204,12 +204,13 @@ static void show_pte(unsigned long addr) * * Returns whether or not the PTE actually changed. 
*/ -int __ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, - pte_t entry, int dirty) +int __ptep_set_access_flags_anysz(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty, unsigned long pgsize) { pteval_t old_pteval, pteval; pte_t pte = __ptep_get(ptep); + int level; if (pte_same(pte, entry)) return 0; @@ -238,8 +239,27 @@ int __ptep_set_access_flags(struct vm_area_struct *vma, * may still cause page faults and be invalidated via * flush_tlb_fix_spurious_fault(). */ - if (dirty) - local_flush_tlb_page(vma, address); + if (dirty) { + switch (pgsize) { + case PAGE_SIZE: + level = 3; + break; + case PMD_SIZE: + level = 2; + break; +#ifndef __PAGETABLE_PMD_FOLDED + case PUD_SIZE: + level = 1; + break; +#endif + default: + level = TLBI_TTL_UNKNOWN; + WARN_ON(1); + } + + __flush_tlb_range(vma, address, address + pgsize, pgsize, level, + TLBF_NOWALKCACHE | TLBF_NOBROADCAST); + } return 1; } diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index a42c05cf5640..30772a909aea 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -181,7 +181,7 @@ static pte_t get_clear_contig_flush(struct mm_struct *mm, struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); unsigned long end = addr + (pgsize * ncontig); - __flush_hugetlb_tlb_range(&vma, addr, end, pgsize, true); + __flush_hugetlb_tlb_range(&vma, addr, end, pgsize, TLBF_NOWALKCACHE); return orig_pte; } @@ -209,7 +209,7 @@ static void clear_flush(struct mm_struct *mm, if (mm == &init_mm) flush_tlb_kernel_range(saddr, addr); else - __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, true); + __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, TLBF_NOWALKCACHE); } void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, @@ -427,11 +427,11 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, pte_t orig_pte; VM_WARN_ON(!pte_present(pte)); + ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), 
&pgsize); if (!pte_cont(pte)) - return __ptep_set_access_flags(vma, addr, ptep, pte, dirty); - - ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize); + return __ptep_set_access_flags_anysz(vma, addr, ptep, pte, + dirty, pgsize); if (!__cont_access_flags_changed(ptep, pte, ncontig)) return 0; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 96711b8578fd..b9b248d24fd1 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -350,7 +350,6 @@ void __init arch_mm_preinit(void) } swiotlb_init(swiotlb, flags); - swiotlb_update_mem_attributes(); /* * Check boundaries twice: Some fundamental inconsistencies can be @@ -377,6 +376,14 @@ void __init arch_mm_preinit(void) } } +bool page_alloc_available __ro_after_init; + +void __init mem_init(void) +{ + page_alloc_available = true; + swiotlb_update_mem_attributes(); +} + void free_initmem(void) { void *lm_init_begin = lm_alias(__init_begin); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a6a00accf4f9..7ea743996a61 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -112,7 +112,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, } EXPORT_SYMBOL(phys_mem_access_prot); -static phys_addr_t __init early_pgtable_alloc(enum pgtable_type pgtable_type) +static phys_addr_t __init early_pgtable_alloc(enum pgtable_level pgtable_level) { phys_addr_t phys; @@ -197,14 +197,14 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end, static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), + phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags) { unsigned long next; pmd_t pmd = READ_ONCE(*pmdp); pte_t *ptep; - BUG_ON(pmd_sect(pmd)); + BUG_ON(pmd_leaf(pmd)); if (pmd_none(pmd)) { pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF; phys_addr_t pte_phys; @@ -212,7 +212,7 @@ static int alloc_init_cont_pte(pmd_t *pmdp, 
unsigned long addr, if (flags & NO_EXEC_MAPPINGS) pmdval |= PMD_TABLE_PXN; BUG_ON(!pgtable_alloc); - pte_phys = pgtable_alloc(TABLE_PTE); + pte_phys = pgtable_alloc(PGTABLE_LEVEL_PTE); if (pte_phys == INVALID_PHYS_ADDR) return -ENOMEM; ptep = pte_set_fixmap(pte_phys); @@ -252,7 +252,7 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags) + phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags) { unsigned long next; @@ -292,7 +292,7 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), + phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags) { int ret; @@ -303,7 +303,7 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, /* * Check for initial section mappings in the pgd/pud. 
*/ - BUG_ON(pud_sect(pud)); + BUG_ON(pud_leaf(pud)); if (pud_none(pud)) { pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF; phys_addr_t pmd_phys; @@ -311,7 +311,7 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, if (flags & NO_EXEC_MAPPINGS) pudval |= PUD_TABLE_PXN; BUG_ON(!pgtable_alloc); - pmd_phys = pgtable_alloc(TABLE_PMD); + pmd_phys = pgtable_alloc(PGTABLE_LEVEL_PMD); if (pmd_phys == INVALID_PHYS_ADDR) return -ENOMEM; pmdp = pmd_set_fixmap(pmd_phys); @@ -349,7 +349,7 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), + phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags) { int ret = 0; @@ -364,7 +364,7 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, if (flags & NO_EXEC_MAPPINGS) p4dval |= P4D_TABLE_PXN; BUG_ON(!pgtable_alloc); - pud_phys = pgtable_alloc(TABLE_PUD); + pud_phys = pgtable_alloc(PGTABLE_LEVEL_PUD); if (pud_phys == INVALID_PHYS_ADDR) return -ENOMEM; pudp = pud_set_fixmap(pud_phys); @@ -415,7 +415,7 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), + phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags) { int ret; @@ -430,7 +430,7 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, if (flags & NO_EXEC_MAPPINGS) pgdval |= PGD_TABLE_PXN; BUG_ON(!pgtable_alloc); - p4d_phys = pgtable_alloc(TABLE_P4D); + p4d_phys = pgtable_alloc(PGTABLE_LEVEL_P4D); if (p4d_phys == INVALID_PHYS_ADDR) return -ENOMEM; p4dp = p4d_set_fixmap(p4d_phys); @@ -467,7 +467,7 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, static int __create_pgd_mapping_locked(pgd_t *pgdir, 
phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), + phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags) { int ret; @@ -500,7 +500,7 @@ static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), + phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags) { int ret; @@ -516,7 +516,7 @@ static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, static void early_create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(enum pgtable_type), + phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags) { int ret; @@ -528,7 +528,7 @@ static void early_create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, } static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp, - enum pgtable_type pgtable_type) + enum pgtable_level pgtable_level) { /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. 
*/ struct ptdesc *ptdesc = pagetable_alloc(gfp & ~__GFP_ZERO, 0); @@ -539,40 +539,43 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp, pa = page_to_phys(ptdesc_page(ptdesc)); - switch (pgtable_type) { - case TABLE_PTE: + switch (pgtable_level) { + case PGTABLE_LEVEL_PTE: BUG_ON(!pagetable_pte_ctor(mm, ptdesc)); break; - case TABLE_PMD: + case PGTABLE_LEVEL_PMD: BUG_ON(!pagetable_pmd_ctor(mm, ptdesc)); break; - case TABLE_PUD: + case PGTABLE_LEVEL_PUD: pagetable_pud_ctor(ptdesc); break; - case TABLE_P4D: + case PGTABLE_LEVEL_P4D: pagetable_p4d_ctor(ptdesc); break; + case PGTABLE_LEVEL_PGD: + VM_WARN_ON(1); + break; } return pa; } static phys_addr_t -pgd_pgtable_alloc_init_mm_gfp(enum pgtable_type pgtable_type, gfp_t gfp) +pgd_pgtable_alloc_init_mm_gfp(enum pgtable_level pgtable_level, gfp_t gfp) { - return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_type); + return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_level); } static phys_addr_t __maybe_unused -pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type) +pgd_pgtable_alloc_init_mm(enum pgtable_level pgtable_level) { - return pgd_pgtable_alloc_init_mm_gfp(pgtable_type, GFP_PGTABLE_KERNEL); + return pgd_pgtable_alloc_init_mm_gfp(pgtable_level, GFP_PGTABLE_KERNEL); } static phys_addr_t -pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type) +pgd_pgtable_alloc_special_mm(enum pgtable_level pgtable_level) { - return __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type); + return __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_level); } static void split_contpte(pte_t *ptep) @@ -593,7 +596,7 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont) pte_t *ptep; int i; - pte_phys = pgd_pgtable_alloc_init_mm_gfp(TABLE_PTE, gfp); + pte_phys = pgd_pgtable_alloc_init_mm_gfp(PGTABLE_LEVEL_PTE, gfp); if (pte_phys == INVALID_PHYS_ADDR) return -ENOMEM; ptep = (pte_t *)phys_to_virt(pte_phys); @@ -602,6 +605,8 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool 
to_cont) tableprot |= PMD_TABLE_PXN; prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) | PTE_TYPE_PAGE); + if (!pmd_valid(pmd)) + prot = pte_pgprot(pte_mkinvalid(pfn_pte(0, prot))); prot = __pgprot(pgprot_val(prot) & ~PTE_CONT); if (to_cont) prot = __pgprot(pgprot_val(prot) | PTE_CONT); @@ -638,7 +643,7 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont) pmd_t *pmdp; int i; - pmd_phys = pgd_pgtable_alloc_init_mm_gfp(TABLE_PMD, gfp); + pmd_phys = pgd_pgtable_alloc_init_mm_gfp(PGTABLE_LEVEL_PMD, gfp); if (pmd_phys == INVALID_PHYS_ADDR) return -ENOMEM; pmdp = (pmd_t *)phys_to_virt(pmd_phys); @@ -647,6 +652,8 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont) tableprot |= PUD_TABLE_PXN; prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT); + if (!pud_valid(pud)) + prot = pmd_pgprot(pmd_mkinvalid(pfn_pmd(0, prot))); prot = __pgprot(pgprot_val(prot) & ~PTE_CONT); if (to_cont) prot = __pgprot(pgprot_val(prot) | PTE_CONT); @@ -768,30 +775,51 @@ static inline bool force_pte_mapping(void) } static DEFINE_MUTEX(pgtable_split_lock); +static bool linear_map_requires_bbml2; int split_kernel_leaf_mapping(unsigned long start, unsigned long end) { int ret; - /* - * !BBML2_NOABORT systems should not be trying to change permissions on - * anything that is not pte-mapped in the first place. Just return early - * and let the permission change code raise a warning if not already - * pte-mapped. - */ - if (!system_supports_bbml2_noabort()) - return 0; - /* * If the region is within a pte-mapped area, there is no need to try to * split. Additionally, CONFIG_DEBUG_PAGEALLOC and CONFIG_KFENCE may * change permissions from atomic context so for those cases (which are * always pte-mapped), we must not go any further because taking the - * mutex below may sleep. + * mutex below may sleep. Do not call force_pte_mapping() here because + * it could return a confusing result if called from a secondary cpu + * prior to finalizing caps. 
Instead, linear_map_requires_bbml2 gives us + * what we need. */ - if (force_pte_mapping() || is_kfence_address((void *)start)) + if (!linear_map_requires_bbml2 || is_kfence_address((void *)start)) return 0; + if (!system_supports_bbml2_noabort()) { + /* + * !BBML2_NOABORT systems should not be trying to change + * permissions on anything that is not pte-mapped in the first + * place. Just return early and let the permission change code + * raise a warning if not already pte-mapped. + */ + if (system_capabilities_finalized()) + return 0; + + /* + * Boot-time: split_kernel_leaf_mapping_locked() allocates from + * page allocator. Can't split until it's available. + */ + if (WARN_ON(!page_alloc_available)) + return -EBUSY; + + /* + * Boot-time: Started secondary cpus but don't know if they + * support BBML2_NOABORT yet. Can't allow splitting in this + * window in case they don't. + */ + if (WARN_ON(num_online_cpus() > 1)) + return -EBUSY; + } + /* * Ensure start and end are at least page-aligned since this is the * finest granularity we can split to. 
@@ -891,8 +919,6 @@ static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp return ret; } -static bool linear_map_requires_bbml2 __initdata; - u32 idmap_kpti_bbml2_flag; static void __init init_idmap_kpti_bbml2_flag(void) @@ -1226,7 +1252,7 @@ static void __init declare_vma(struct vm_struct *vma, static phys_addr_t kpti_ng_temp_alloc __initdata; -static phys_addr_t __init kpti_ng_pgd_alloc(enum pgtable_type type) +static phys_addr_t __init kpti_ng_pgd_alloc(enum pgtable_level pgtable_level) { kpti_ng_temp_alloc -= PAGE_SIZE; return kpti_ng_temp_alloc; @@ -1458,10 +1484,14 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr, WARN_ON(!pte_present(pte)); __pte_clear(&init_mm, addr, ptep); - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - if (free_mapped) + if (free_mapped) { + /* CONT blocks are not supported in the vmemmap */ + WARN_ON(pte_cont(pte)); + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); free_hotplug_page_range(pte_page(pte), PAGE_SIZE, altmap); + } + /* unmap_hotplug_range() flushes TLB for !free_mapped */ } while (addr += PAGE_SIZE, addr < end); } @@ -1480,17 +1510,16 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr, continue; WARN_ON(!pmd_present(pmd)); - if (pmd_sect(pmd)) { + if (pmd_leaf(pmd)) { pmd_clear(pmdp); - - /* - * One TLBI should be sufficient here as the PMD_SIZE - * range is mapped with a single block entry. 
- */ - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - if (free_mapped) + if (free_mapped) { + /* CONT blocks are not supported in the vmemmap */ + WARN_ON(pmd_cont(pmd)); + flush_tlb_kernel_range(addr, addr + PMD_SIZE); free_hotplug_page_range(pmd_page(pmd), PMD_SIZE, altmap); + } + /* unmap_hotplug_range() flushes TLB for !free_mapped */ continue; } WARN_ON(!pmd_table(pmd)); @@ -1513,17 +1542,14 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr, continue; WARN_ON(!pud_present(pud)); - if (pud_sect(pud)) { + if (pud_leaf(pud)) { pud_clear(pudp); - - /* - * One TLBI should be sufficient here as the PUD_SIZE - * range is mapped with a single block entry. - */ - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - if (free_mapped) + if (free_mapped) { + flush_tlb_kernel_range(addr, addr + PUD_SIZE); free_hotplug_page_range(pud_page(pud), PUD_SIZE, altmap); + } + /* unmap_hotplug_range() flushes TLB for !free_mapped */ continue; } WARN_ON(!pud_table(pud)); @@ -1553,6 +1579,7 @@ static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr, static void unmap_hotplug_range(unsigned long addr, unsigned long end, bool free_mapped, struct vmem_altmap *altmap) { + unsigned long start = addr; unsigned long next; pgd_t *pgdp, pgd; @@ -1574,6 +1601,9 @@ static void unmap_hotplug_range(unsigned long addr, unsigned long end, WARN_ON(!pgd_present(pgd)); unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap); } while (addr = next, addr < end); + + if (!free_mapped) + flush_tlb_kernel_range(start, end); } static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, @@ -1627,7 +1657,7 @@ static void free_empty_pmd_table(pud_t *pudp, unsigned long addr, if (pmd_none(pmd)) continue; - WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd)); + WARN_ON(!pmd_present(pmd) || !pmd_table(pmd)); free_empty_pte_table(pmdp, addr, next, floor, ceiling); } while (addr = next, addr < end); @@ -1667,7 +1697,7 @@ static void free_empty_pud_table(p4d_t 
*p4dp, unsigned long addr, if (pud_none(pud)) continue; - WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud)); + WARN_ON(!pud_present(pud) || !pud_table(pud)); free_empty_pmd_table(pudp, addr, next, floor, ceiling); } while (addr = next, addr < end); @@ -1763,7 +1793,7 @@ int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, { vmemmap_verify((pte_t *)pmdp, node, addr, next); - return pmd_sect(READ_ONCE(*pmdp)); + return pmd_leaf(READ_ONCE(*pmdp)); } int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, @@ -1827,7 +1857,7 @@ void p4d_clear_huge(p4d_t *p4dp) int pud_clear_huge(pud_t *pudp) { - if (!pud_sect(READ_ONCE(*pudp))) + if (!pud_leaf(READ_ONCE(*pudp))) return 0; pud_clear(pudp); return 1; @@ -1835,7 +1865,7 @@ int pud_clear_huge(pud_t *pudp) int pmd_clear_huge(pmd_t *pmdp) { - if (!pmd_sect(READ_ONCE(*pmdp))) + if (!pmd_leaf(READ_ONCE(*pmdp))) return 0; pmd_clear(pmdp); return 1; @@ -2010,6 +2040,107 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size); } + +static bool addr_splits_kernel_leaf(unsigned long addr) +{ + pgd_t *pgdp, pgd; + p4d_t *p4dp, p4d; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; + + /* + * If the given address points at the start address of + * a possible leaf, we certainly won't split. Otherwise, + * check if we would actually split a leaf by traversing + * the page tables further.
+ */ + if (IS_ALIGNED(addr, PGDIR_SIZE)) + return false; + + pgdp = pgd_offset_k(addr); + pgd = pgdp_get(pgdp); + if (!pgd_present(pgd)) + return false; + + if (IS_ALIGNED(addr, P4D_SIZE)) + return false; + + p4dp = p4d_offset(pgdp, addr); + p4d = p4dp_get(p4dp); + if (!p4d_present(p4d)) + return false; + + if (IS_ALIGNED(addr, PUD_SIZE)) + return false; + + pudp = pud_offset(p4dp, addr); + pud = pudp_get(pudp); + if (!pud_present(pud)) + return false; + + if (pud_leaf(pud)) + return true; + + if (IS_ALIGNED(addr, CONT_PMD_SIZE)) + return false; + + pmdp = pmd_offset(pudp, addr); + pmd = pmdp_get(pmdp); + if (!pmd_present(pmd)) + return false; + + if (pmd_cont(pmd)) + return true; + + if (IS_ALIGNED(addr, PMD_SIZE)) + return false; + + if (pmd_leaf(pmd)) + return true; + + if (IS_ALIGNED(addr, CONT_PTE_SIZE)) + return false; + + ptep = pte_offset_kernel(pmdp, addr); + pte = __ptep_get(ptep); + if (!pte_present(pte)) + return false; + + if (pte_cont(pte)) + return true; + + return !IS_ALIGNED(addr, PAGE_SIZE); +} + +static bool can_unmap_without_split(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long phys_start, phys_end, start, end; + + phys_start = PFN_PHYS(pfn); + phys_end = phys_start + nr_pages * PAGE_SIZE; + + /* PFN range's linear map edges are leaf entry aligned */ + start = __phys_to_virt(phys_start); + end = __phys_to_virt(phys_end); + if (addr_splits_kernel_leaf(start) || addr_splits_kernel_leaf(end)) { + pr_warn("[%lx %lx] splits a leaf entry in linear map\n", + phys_start, phys_end); + return false; + } + + /* PFN range's vmemmap edges are leaf entry aligned */ + BUILD_BUG_ON(!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)); + start = (unsigned long)pfn_to_page(pfn); + end = (unsigned long)pfn_to_page(pfn + nr_pages); + if (addr_splits_kernel_leaf(start) || addr_splits_kernel_leaf(end)) { + pr_warn("[%lx %lx] splits a leaf entry in vmemmap\n", + phys_start, phys_end); + return false; + } + return true; +} + /* * This memory hotplug notifier helps 
prevent boot memory from being * inadvertently removed as it blocks pfn range offlining process in @@ -2018,8 +2149,11 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) * In future if and when boot memory could be removed, this notifier * should be dropped and free_hotplug_page_range() should handle any * reserved pages allocated during boot. + * + * This also blocks any memory remove that would have caused a split + * in leaf entry in kernel linear or vmemmap mapping. */ -static int prevent_bootmem_remove_notifier(struct notifier_block *nb, +static int prevent_memory_remove_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct mem_section *ms; @@ -2065,11 +2199,15 @@ static int prevent_bootmem_remove_notifier(struct notifier_block *nb, return NOTIFY_DONE; } } + + if (!can_unmap_without_split(pfn, arg->nr_pages)) + return NOTIFY_BAD; + return NOTIFY_OK; } -static struct notifier_block prevent_bootmem_remove_nb = { - .notifier_call = prevent_bootmem_remove_notifier, +static struct notifier_block prevent_memory_remove_nb = { + .notifier_call = prevent_memory_remove_notifier, }; /* @@ -2119,7 +2257,7 @@ static void validate_bootmem_online(void) } } -static int __init prevent_bootmem_remove_init(void) +static int __init prevent_memory_remove_init(void) { int ret = 0; @@ -2127,13 +2265,13 @@ static int __init prevent_bootmem_remove_init(void) return ret; validate_bootmem_online(); - ret = register_memory_notifier(&prevent_bootmem_remove_nb); + ret = register_memory_notifier(&prevent_memory_remove_nb); if (ret) pr_err("%s: Notifier registration failed %d\n", __func__, ret); return ret; } -early_initcall(prevent_bootmem_remove_init); +early_initcall(prevent_memory_remove_init); #endif pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr, @@ -2149,7 +2287,7 @@ pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr, */ if (pte_accessible(vma->vm_mm, pte) && pte_user_exec(pte)) 
__flush_tlb_range(vma, addr, nr * PAGE_SIZE, - PAGE_SIZE, true, 3); + PAGE_SIZE, 3, TLBF_NOWALKCACHE); } return pte; @@ -2188,7 +2326,7 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp)); if (cnp) - ttbr1 |= TTBR_CNP_BIT; + ttbr1 |= TTBRx_EL1_CnP; replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 358d1dc9a576..ce035e1b4eaf 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -25,6 +25,11 @@ static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk) { struct page_change_data *masks = walk->private; + /* + * Some users clear and set bits which alias each other (e.g. PTE_NG and + * PTE_PRESENT_INVALID). It is therefore important that we always clear + * first then set. + */ val &= ~(pgprot_val(masks->clear_mask)); val |= (pgprot_val(masks->set_mask)); @@ -36,7 +41,7 @@ static int pageattr_pud_entry(pud_t *pud, unsigned long addr, { pud_t val = pudp_get(pud); - if (pud_sect(val)) { + if (pud_leaf(val)) { if (WARN_ON_ONCE((next - addr) != PUD_SIZE)) return -EINVAL; val = __pud(set_pageattr_masks(pud_val(val), walk)); @@ -52,7 +57,7 @@ static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr, { pmd_t val = pmdp_get(pmd); - if (pmd_sect(val)) { + if (pmd_leaf(val)) { if (WARN_ON_ONCE((next - addr) != PMD_SIZE)) return -EINVAL; val = __pmd(set_pageattr_masks(pmd_val(val), walk)); @@ -132,11 +137,12 @@ static int __change_memory_common(unsigned long start, unsigned long size, ret = update_range_prot(start, size, set_mask, clear_mask); /* - * If the memory is being made valid without changing any other bits - * then a TLBI isn't required as a non-valid entry cannot be cached in - * the TLB. + * If the memory is being switched from present-invalid to valid without + * changing any other bits then a TLBI isn't required as a non-valid + * entry cannot be cached in the TLB. 
*/ - if (pgprot_val(set_mask) != PTE_VALID || pgprot_val(clear_mask)) + if (pgprot_val(set_mask) != PTE_PRESENT_VALID_KERNEL || + pgprot_val(clear_mask) != PTE_PRESENT_INVALID) flush_tlb_kernel_range(start, start + size); return ret; } @@ -237,18 +243,18 @@ int set_memory_valid(unsigned long addr, int numpages, int enable) { if (enable) return __change_memory_common(addr, PAGE_SIZE * numpages, - __pgprot(PTE_VALID), - __pgprot(0)); + __pgprot(PTE_PRESENT_VALID_KERNEL), + __pgprot(PTE_PRESENT_INVALID)); else return __change_memory_common(addr, PAGE_SIZE * numpages, - __pgprot(0), - __pgprot(PTE_VALID)); + __pgprot(PTE_PRESENT_INVALID), + __pgprot(PTE_PRESENT_VALID_KERNEL)); } int set_direct_map_invalid_noflush(struct page *page) { - pgprot_t clear_mask = __pgprot(PTE_VALID); - pgprot_t set_mask = __pgprot(0); + pgprot_t clear_mask = __pgprot(PTE_PRESENT_VALID_KERNEL); + pgprot_t set_mask = __pgprot(PTE_PRESENT_INVALID); if (!can_set_direct_map()) return 0; @@ -259,8 +265,8 @@ int set_direct_map_invalid_noflush(struct page *page) int set_direct_map_default_noflush(struct page *page) { - pgprot_t set_mask = __pgprot(PTE_VALID | PTE_WRITE); - pgprot_t clear_mask = __pgprot(PTE_RDONLY); + pgprot_t set_mask = __pgprot(PTE_PRESENT_VALID_KERNEL | PTE_WRITE); + pgprot_t clear_mask = __pgprot(PTE_PRESENT_INVALID | PTE_RDONLY); if (!can_set_direct_map()) return 0; @@ -296,8 +302,8 @@ static int __set_memory_enc_dec(unsigned long addr, * entries or Synchronous External Aborts caused by RIPAS_EMPTY */ ret = __change_memory_common(addr, PAGE_SIZE * numpages, - __pgprot(set_prot), - __pgprot(clear_prot | PTE_VALID)); + __pgprot(set_prot | PTE_PRESENT_INVALID), + __pgprot(clear_prot | PTE_PRESENT_VALID_KERNEL)); if (ret) return ret; @@ -311,8 +317,8 @@ static int __set_memory_enc_dec(unsigned long addr, return ret; return __change_memory_common(addr, PAGE_SIZE * numpages, - __pgprot(PTE_VALID), - __pgprot(0)); + __pgprot(PTE_PRESENT_VALID_KERNEL), + __pgprot(PTE_PRESENT_INVALID)); 
} static int realm_set_memory_encrypted(unsigned long addr, int numpages) @@ -404,15 +410,15 @@ bool kernel_page_present(struct page *page) pud = READ_ONCE(*pudp); if (pud_none(pud)) return false; - if (pud_sect(pud)) - return true; + if (pud_leaf(pud)) + return pud_valid(pud); pmdp = pmd_offset(pudp, addr); pmd = READ_ONCE(*pmdp); if (pmd_none(pmd)) return false; - if (pmd_sect(pmd)) - return true; + if (pmd_leaf(pmd)) + return pmd_valid(pmd); ptep = pte_offset_kernel(pmdp, addr); return pte_valid(__ptep_get(ptep)); diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 18543b603c77..cca9706a875c 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -31,36 +31,6 @@ static void *trans_alloc(struct trans_pgd_info *info) return info->trans_alloc_page(info->trans_alloc_arg); } -static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) -{ - pte_t pte = __ptep_get(src_ptep); - - if (pte_valid(pte)) { - /* - * Resume will overwrite areas that may be marked - * read only (code, rodata). Clear the RDONLY bit from - * the temporary mappings we use during restore. - */ - __set_pte(dst_ptep, pte_mkwrite_novma(pte)); - } else if (!pte_none(pte)) { - /* - * debug_pagealloc will removed the PTE_VALID bit if - * the page isn't in use by the resume kernel. It may have - * been in use by the original kernel, in which case we need - * to put it back in our copy to do the restore. - * - * Other cases include kfence / vmalloc / memfd_secret which - * may call `set_direct_map_invalid_noflush()`. - * - * Before marking this entry valid, check the pfn should - * be mapped. 
- */ - BUG_ON(!pfn_valid(pte_pfn(pte))); - - __set_pte(dst_ptep, pte_mkvalid(pte_mkwrite_novma(pte))); - } -} - static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start, unsigned long end) { @@ -76,7 +46,11 @@ static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp, src_ptep = pte_offset_kernel(src_pmdp, start); do { - _copy_pte(dst_ptep, src_ptep, addr); + pte_t pte = __ptep_get(src_ptep); + + if (pte_none(pte)) + continue; + __set_pte(dst_ptep, pte_mkvalid_k(pte_mkwrite_novma(pte))); } while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end); return 0; @@ -109,8 +83,7 @@ static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp, if (copy_pte(info, dst_pmdp, src_pmdp, addr, next)) return -ENOMEM; } else { - set_pmd(dst_pmdp, - __pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY)); + set_pmd(dst_pmdp, pmd_mkvalid_k(pmd_mkwrite_novma(pmd))); } } while (dst_pmdp++, src_pmdp++, addr = next, addr != end); @@ -145,8 +118,7 @@ static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp, if (copy_pmd(info, dst_pudp, src_pudp, addr, next)) return -ENOMEM; } else { - set_pud(dst_pudp, - __pud(pud_val(pud) & ~PUD_SECT_RDONLY)); + set_pud(dst_pudp, pud_mkvalid_k(pud_mkwrite_novma(pud))); } } while (dst_pudp++, src_pudp++, addr = next, addr != end); diff --git a/arch/arm64/tools/Makefile b/arch/arm64/tools/Makefile index c2b34e761006..a94b3d9caad6 100644 --- a/arch/arm64/tools/Makefile +++ b/arch/arm64/tools/Makefile @@ -3,7 +3,7 @@ gen := arch/$(ARCH)/include/generated kapi := $(gen)/asm -kapisyshdr-y := cpucap-defs.h sysreg-defs.h +kapisyshdr-y := cpucap-defs.h kernel-hwcap.h sysreg-defs.h kapi-hdrs-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) @@ -18,11 +18,17 @@ kapi: $(kapi-hdrs-y) quiet_cmd_gen_cpucaps = GEN $@ cmd_gen_cpucaps = mkdir -p $(dir $@); $(AWK) -f $(real-prereqs) > $@ +quiet_cmd_gen_kernel_hwcap = GEN $@ + cmd_gen_kernel_hwcap = mkdir -p $(dir $@); /bin/sh -e $(real-prereqs) > $@ + quiet_cmd_gen_sysreg = 
GEN $@ cmd_gen_sysreg = mkdir -p $(dir $@); $(AWK) -f $(real-prereqs) > $@ $(kapi)/cpucap-defs.h: $(src)/gen-cpucaps.awk $(src)/cpucaps FORCE $(call if_changed,gen_cpucaps) +$(kapi)/kernel-hwcap.h: $(src)/gen-kernel-hwcaps.sh $(srctree)/arch/arm64/include/uapi/asm/hwcap.h FORCE + $(call if_changed,gen_kernel_hwcap) + $(kapi)/sysreg-defs.h: $(src)/gen-sysreg.awk $(src)/sysreg FORCE $(call if_changed,gen_sysreg) diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 7261553b644b..b7286d977788 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -48,6 +48,7 @@ HAS_LPA2 HAS_LSE_ATOMICS HAS_LS64 HAS_LS64_V +HAS_LSUI HAS_MOPS HAS_NESTED_VIRT HAS_BBML2_NOABORT diff --git a/arch/arm64/tools/gen-kernel-hwcaps.sh b/arch/arm64/tools/gen-kernel-hwcaps.sh new file mode 100644 index 000000000000..e7cdcf428d91 --- /dev/null +++ b/arch/arm64/tools/gen-kernel-hwcaps.sh @@ -0,0 +1,23 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0 +# +# gen-kernel-hwcaps.sh - Generate kernel internal hwcap.h definitions +# +# Copyright 2026 Arm, Ltd.
+ +if [ "$1" = "" ]; then + echo "$0: no filename specified" + exit 1 +fi + +echo "#ifndef __ASM_KERNEL_HWCAPS_H" +echo "#define __ASM_KERNEL_HWCAPS_H" +echo "" +echo "/* Generated file - do not edit */" +echo "" + +grep -E '^#define HWCAP[0-9]*_[A-Z0-9_]+' $1 | \ + sed 's/.*HWCAP\([0-9]*\)_\([A-Z0-9_]\+\).*/#define KERNEL_HWCAP_\2\t__khwcap\1_feature(\2)/' + +echo "" +echo "#endif /* __ASM_KERNEL_HWCAPS_H */" diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 9d1c21108057..9d20ec6816d4 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1496,6 +1496,7 @@ UnsignedEnum 27:24 B16B16 0b0000 NI 0b0001 IMP 0b0010 BFSCALE + 0b0011 B16MM EndEnum UnsignedEnum 23:20 BF16 0b0000 NI @@ -1522,6 +1523,7 @@ UnsignedEnum 3:0 SVEver 0b0001 SVE2 0b0010 SVE2p1 0b0011 SVE2p2 + 0b0100 SVE2p3 EndEnum EndSysreg @@ -1530,7 +1532,11 @@ UnsignedEnum 63 FA64 0b0 NI 0b1 IMP EndEnum -Res0 62:61 +Res0 62 +UnsignedEnum 61 LUT6 + 0b0 NI + 0b1 IMP +EndEnum UnsignedEnum 60 LUTv2 0b0 NI 0b1 IMP @@ -1540,6 +1546,7 @@ UnsignedEnum 59:56 SMEver 0b0001 SME2 0b0010 SME2p1 0b0011 SME2p2 + 0b0100 SME2p3 EndEnum UnsignedEnum 55:52 I16I64 0b0000 NI @@ -1654,7 +1661,13 @@ UnsignedEnum 26 F8MM4 0b0 NI 0b1 IMP EndEnum -Res0 25:2 +Res0 25:16 +UnsignedEnum 15 F16MM2 + 0b0 NI + 0b1 IMP +EndEnum +Res0 14:8 +Raz 7:2 UnsignedEnum 1 F8E4M3 0b0 NI 0b1 IMP @@ -1835,6 +1848,8 @@ EndEnum UnsignedEnum 51:48 FHM 0b0000 NI 0b0001 IMP + 0b0010 F16F32DOT + 0b0011 F16F32MM EndEnum UnsignedEnum 47:44 DP 0b0000 NI @@ -1976,6 +1991,7 @@ EndEnum UnsignedEnum 59:56 LUT 0b0000 NI 0b0001 IMP + 0b0010 LUT6 EndEnum UnsignedEnum 55:52 CSSC 0b0000 NI @@ -3655,11 +3671,15 @@ Field 3:0 BS EndSysreg Sysreg SMIDR_EL1 3 1 0 0 6 -Res0 63:32 +Res0 63:60 +Field 59:56 NSMC +Field 55:52 HIP +Field 51:32 AFFINITY2 Field 31:24 IMPLEMENTER Field 23:16 REVISION Field 15 SMPS -Res0 14:12 +Field 14:13 SH +Res0 12 Field 11:0 AFFINITY EndSysreg @@ -5172,6 +5192,14 @@ Field 31:16 PARTID_D Field 15:0 PARTID_I EndSysreg 
+Sysreg MPAMSM_EL1 3 0 10 5 3 +Res0 63:48 +Field 47:40 PMG_D +Res0 39:32 +Field 31:16 PARTID_D +Res0 15:0 +EndSysreg + Sysreg ISR_EL1 3 0 12 1 0 Res0 63:11 Field 10 IS diff --git a/drivers/acpi/arm64/agdi.c b/drivers/acpi/arm64/agdi.c index feb4b2cb4618..0c2d9d6c160b 100644 --- a/drivers/acpi/arm64/agdi.c +++ b/drivers/acpi/arm64/agdi.c @@ -36,7 +36,7 @@ static int agdi_sdei_probe(struct platform_device *pdev, err = sdei_event_register(adata->sdei_event, agdi_sdei_handler, pdev); if (err) { - dev_err(&pdev->dev, "Failed to register for SDEI event %d", + dev_err(&pdev->dev, "Failed to register for SDEI event %d\n", adata->sdei_event); return err; } diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 638321fc9800..ab90932fc2d0 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -311,4 +311,18 @@ config MARVELL_PEM_PMU Enable support for PCIe Interface performance monitoring on Marvell platform. +config NVIDIA_TEGRA410_CMEM_LATENCY_PMU + tristate "NVIDIA Tegra410 CPU Memory Latency PMU" + depends on ARM64 && ACPI + help + Enable perf support for CPU memory latency counters monitoring on + NVIDIA Tegra410 SoC. + +config NVIDIA_TEGRA410_C2C_PMU + tristate "NVIDIA Tegra410 C2C PMU" + depends on ARM64 && ACPI + help + Enable perf support for counters in NVIDIA C2C interface of NVIDIA + Tegra410 SoC. 
+ endmenu diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index ea52711a87e3..eb8a022dad9a 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -35,3 +35,5 @@ obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o obj-$(CONFIG_ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += arm_cspmu/ obj-$(CONFIG_MESON_DDR_PMU) += amlogic/ obj-$(CONFIG_CXL_PMU) += cxl_pmu.o +obj-$(CONFIG_NVIDIA_TEGRA410_CMEM_LATENCY_PMU) += nvidia_t410_cmem_latency_pmu.o +obj-$(CONFIG_NVIDIA_TEGRA410_C2C_PMU) += nvidia_t410_c2c_pmu.o diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 40c05c519a1d..f5305c8fdca4 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -2132,6 +2132,8 @@ static void arm_cmn_init_dtm(struct arm_cmn_dtm *dtm, struct arm_cmn_node *xp, i static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int idx) { struct arm_cmn_dtc *dtc = cmn->dtc + idx; + const struct resource *cfg; + resource_size_t base, size; dtc->pmu_base = dn->pmu_base; dtc->base = dtc->pmu_base - arm_cmn_pmu_offset(cmn, dn); @@ -2139,6 +2141,13 @@ static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int id if (dtc->irq < 0) return dtc->irq; + cfg = platform_get_resource(to_platform_device(cmn->dev), IORESOURCE_MEM, 0); + base = dtc->base - cmn->base + cfg->start; + size = cmn->part == PART_CMN600 ? 
SZ_16K : SZ_64K; + if (!devm_request_mem_region(cmn->dev, base, size, dev_name(cmn->dev))) + return dev_err_probe(cmn->dev, -EBUSY, + "Failed to request DTC region 0x%pa\n", &base); + writel_relaxed(CMN_DT_DTC_CTL_DT_EN, dtc->base + CMN_DT_DTC_CTL); writel_relaxed(CMN_DT_PMCR_PMU_EN | CMN_DT_PMCR_OVFL_INTR_EN, CMN_DT_PMCR(dtc)); writeq_relaxed(0, CMN_DT_PMCCNTR(dtc)); @@ -2525,43 +2534,26 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) return 0; } -static int arm_cmn600_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn) -{ - struct resource *cfg, *root; - - cfg = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!cfg) - return -EINVAL; - - root = platform_get_resource(pdev, IORESOURCE_MEM, 1); - if (!root) - return -EINVAL; - - if (!resource_contains(cfg, root)) - swap(cfg, root); - /* - * Note that devm_ioremap_resource() is dumb and won't let the platform - * device claim cfg when the ACPI companion device has already claimed - * root within it. But since they *are* already both claimed in the - * appropriate name, we don't really need to do it again here anyway. - */ - cmn->base = devm_ioremap(cmn->dev, cfg->start, resource_size(cfg)); - if (!cmn->base) - return -ENOMEM; - - return root->start - cfg->start; -} - -static int arm_cmn600_of_probe(struct device_node *np) +static int arm_cmn_get_root(struct arm_cmn *cmn, const struct resource *cfg) { + const struct device_node *np = cmn->dev->of_node; + const struct resource *root; u32 rootnode; - return of_property_read_u32(np, "arm,root-node", &rootnode) ?: rootnode; + if (cmn->part != PART_CMN600) + return 0; + + if (np) + return of_property_read_u32(np, "arm,root-node", &rootnode) ?: rootnode; + + root = platform_get_resource(to_platform_device(cmn->dev), IORESOURCE_MEM, 1); + return root ? 
root->start - cfg->start : -EINVAL; } static int arm_cmn_probe(struct platform_device *pdev) { struct arm_cmn *cmn; + const struct resource *cfg; const char *name; static atomic_t id; int err, rootnode, this_id; @@ -2575,16 +2567,16 @@ static int arm_cmn_probe(struct platform_device *pdev) cmn->cpu = cpumask_local_spread(0, dev_to_node(cmn->dev)); platform_set_drvdata(pdev, cmn); - if (cmn->part == PART_CMN600 && has_acpi_companion(cmn->dev)) { - rootnode = arm_cmn600_acpi_probe(pdev, cmn); - } else { - rootnode = 0; - cmn->base = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(cmn->base)) - return PTR_ERR(cmn->base); - if (cmn->part == PART_CMN600) - rootnode = arm_cmn600_of_probe(pdev->dev.of_node); - } + cfg = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!cfg) + return -EINVAL; + + /* Map the whole region now, claim the DTCs once we've found them */ + cmn->base = devm_ioremap(cmn->dev, cfg->start, resource_size(cfg)); + if (!cmn->base) + return -ENOMEM; + + rootnode = arm_cmn_get_root(cmn, cfg); if (rootnode < 0) return rootnode; diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index ed72c3d1f796..80fb314d5135 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -16,7 +16,7 @@ * The user should refer to the vendor technical documentation to get details * about the supported events. * - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* */ @@ -1134,6 +1134,23 @@ static int arm_cspmu_acpi_get_cpus(struct arm_cspmu *cspmu) return 0; } + +struct acpi_device *arm_cspmu_acpi_dev_get(const struct arm_cspmu *cspmu) +{ + char hid[16] = {}; + char uid[16] = {}; + const struct acpi_apmt_node *apmt_node; + + apmt_node = arm_cspmu_apmt_node(cspmu->dev); + if (!apmt_node || apmt_node->type != ACPI_APMT_NODE_TYPE_ACPI) + return NULL; + + memcpy(hid, &apmt_node->inst_primary, sizeof(apmt_node->inst_primary)); + snprintf(uid, sizeof(uid), "%u", apmt_node->inst_secondary); + + return acpi_dev_get_first_match_dev(hid, uid, -1); +} +EXPORT_SYMBOL_GPL(arm_cspmu_acpi_dev_get); #else static int arm_cspmu_acpi_get_cpus(struct arm_cspmu *cspmu) { diff --git a/drivers/perf/arm_cspmu/arm_cspmu.h b/drivers/perf/arm_cspmu/arm_cspmu.h index cd65a58dbd88..3fc5c8d77266 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.h +++ b/drivers/perf/arm_cspmu/arm_cspmu.h @@ -1,13 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 * * ARM CoreSight Architecture PMU driver. - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * */ #ifndef __ARM_CSPMU_H__ #define __ARM_CSPMU_H__ +#include #include #include #include @@ -255,4 +256,18 @@ int arm_cspmu_impl_register(const struct arm_cspmu_impl_match *impl_match); /* Unregister vendor backend. */ void arm_cspmu_impl_unregister(const struct arm_cspmu_impl_match *impl_match); +#if defined(CONFIG_ACPI) && defined(CONFIG_ARM64) +/** + * Get ACPI device associated with the PMU. + * The caller is responsible for calling acpi_dev_put() on the returned device. 
+ */ +struct acpi_device *arm_cspmu_acpi_dev_get(const struct arm_cspmu *cspmu); +#else +static inline struct acpi_device * +arm_cspmu_acpi_dev_get(const struct arm_cspmu *cspmu) +{ + return NULL; +} +#endif + #endif /* __ARM_CSPMU_H__ */ diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index e06a06d3407b..bac83e424d6d 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * */ @@ -8,6 +8,7 @@ #include #include +#include #include #include "arm_cspmu.h" @@ -21,6 +22,44 @@ #define NV_CNVL_PORT_COUNT 4ULL #define NV_CNVL_FILTER_ID_MASK GENMASK_ULL(NV_CNVL_PORT_COUNT - 1, 0) +#define NV_UCF_SRC_COUNT 3ULL +#define NV_UCF_DST_COUNT 4ULL +#define NV_UCF_FILTER_ID_MASK GENMASK_ULL(11, 0) +#define NV_UCF_FILTER_SRC GENMASK_ULL(2, 0) +#define NV_UCF_FILTER_DST GENMASK_ULL(11, 8) +#define NV_UCF_FILTER_DEFAULT (NV_UCF_FILTER_SRC | NV_UCF_FILTER_DST) + +#define NV_PCIE_V2_PORT_COUNT 8ULL +#define NV_PCIE_V2_FILTER_ID_MASK GENMASK_ULL(24, 0) +#define NV_PCIE_V2_FILTER_PORT GENMASK_ULL(NV_PCIE_V2_PORT_COUNT - 1, 0) +#define NV_PCIE_V2_FILTER_BDF_VAL GENMASK_ULL(23, NV_PCIE_V2_PORT_COUNT) +#define NV_PCIE_V2_FILTER_BDF_EN BIT(24) +#define NV_PCIE_V2_FILTER_BDF_VAL_EN GENMASK_ULL(24, NV_PCIE_V2_PORT_COUNT) +#define NV_PCIE_V2_FILTER_DEFAULT NV_PCIE_V2_FILTER_PORT + +#define NV_PCIE_V2_DST_COUNT 5ULL +#define NV_PCIE_V2_FILTER2_ID_MASK GENMASK_ULL(4, 0) +#define NV_PCIE_V2_FILTER2_DST GENMASK_ULL(NV_PCIE_V2_DST_COUNT - 1, 0) +#define NV_PCIE_V2_FILTER2_DEFAULT NV_PCIE_V2_FILTER2_DST + +#define NV_PCIE_TGT_PORT_COUNT 8ULL +#define NV_PCIE_TGT_EV_TYPE_CC 0x4 +#define NV_PCIE_TGT_EV_TYPE_COUNT 3ULL +#define NV_PCIE_TGT_EV_TYPE_MASK GENMASK_ULL(NV_PCIE_TGT_EV_TYPE_COUNT - 1, 0) +#define 
NV_PCIE_TGT_FILTER2_MASK GENMASK_ULL(NV_PCIE_TGT_PORT_COUNT, 0) +#define NV_PCIE_TGT_FILTER2_PORT GENMASK_ULL(NV_PCIE_TGT_PORT_COUNT - 1, 0) +#define NV_PCIE_TGT_FILTER2_ADDR_EN BIT(NV_PCIE_TGT_PORT_COUNT) +#define NV_PCIE_TGT_FILTER2_ADDR GENMASK_ULL(15, NV_PCIE_TGT_PORT_COUNT) +#define NV_PCIE_TGT_FILTER2_DEFAULT NV_PCIE_TGT_FILTER2_PORT + +#define NV_PCIE_TGT_ADDR_COUNT 8ULL +#define NV_PCIE_TGT_ADDR_STRIDE 20 +#define NV_PCIE_TGT_ADDR_CTRL 0xD38 +#define NV_PCIE_TGT_ADDR_BASE_LO 0xD3C +#define NV_PCIE_TGT_ADDR_BASE_HI 0xD40 +#define NV_PCIE_TGT_ADDR_MASK_LO 0xD44 +#define NV_PCIE_TGT_ADDR_MASK_HI 0xD48 + #define NV_GENERIC_FILTER_ID_MASK GENMASK_ULL(31, 0) #define NV_PRODID_MASK (PMIIDR_PRODUCTID | PMIIDR_VARIANT | PMIIDR_REVISION) @@ -124,6 +163,55 @@ static struct attribute *mcf_pmu_event_attrs[] = { NULL, }; +static struct attribute *ucf_pmu_event_attrs[] = { + ARM_CSPMU_EVENT_ATTR(bus_cycles, 0x1D), + + ARM_CSPMU_EVENT_ATTR(slc_allocate, 0xF0), + ARM_CSPMU_EVENT_ATTR(slc_wb, 0xF3), + ARM_CSPMU_EVENT_ATTR(slc_refill_rd, 0x109), + ARM_CSPMU_EVENT_ATTR(slc_refill_wr, 0x10A), + ARM_CSPMU_EVENT_ATTR(slc_hit_rd, 0x119), + + ARM_CSPMU_EVENT_ATTR(slc_access_dataless, 0x183), + ARM_CSPMU_EVENT_ATTR(slc_access_atomic, 0x184), + + ARM_CSPMU_EVENT_ATTR(slc_access_rd, 0x111), + ARM_CSPMU_EVENT_ATTR(slc_access_wr, 0x112), + ARM_CSPMU_EVENT_ATTR(slc_bytes_rd, 0x113), + ARM_CSPMU_EVENT_ATTR(slc_bytes_wr, 0x114), + + ARM_CSPMU_EVENT_ATTR(mem_access_rd, 0x121), + ARM_CSPMU_EVENT_ATTR(mem_access_wr, 0x122), + ARM_CSPMU_EVENT_ATTR(mem_bytes_rd, 0x123), + ARM_CSPMU_EVENT_ATTR(mem_bytes_wr, 0x124), + + ARM_CSPMU_EVENT_ATTR(local_snoop, 0x180), + ARM_CSPMU_EVENT_ATTR(ext_snp_access, 0x181), + ARM_CSPMU_EVENT_ATTR(ext_snp_evict, 0x182), + + ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), + NULL +}; + +static struct attribute *pcie_v2_pmu_event_attrs[] = { + ARM_CSPMU_EVENT_ATTR(rd_bytes, 0x0), + ARM_CSPMU_EVENT_ATTR(wr_bytes, 0x1), + ARM_CSPMU_EVENT_ATTR(rd_req, 0x2), 
+ ARM_CSPMU_EVENT_ATTR(wr_req, 0x3), + ARM_CSPMU_EVENT_ATTR(rd_cum_outs, 0x4), + ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), + NULL +}; + +static struct attribute *pcie_tgt_pmu_event_attrs[] = { + ARM_CSPMU_EVENT_ATTR(rd_bytes, 0x0), + ARM_CSPMU_EVENT_ATTR(wr_bytes, 0x1), + ARM_CSPMU_EVENT_ATTR(rd_req, 0x2), + ARM_CSPMU_EVENT_ATTR(wr_req, 0x3), + ARM_CSPMU_EVENT_ATTR(cycles, NV_PCIE_TGT_EV_TYPE_CC), + NULL +}; + static struct attribute *generic_pmu_event_attrs[] = { ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), NULL, @@ -152,6 +240,40 @@ static struct attribute *cnvlink_pmu_format_attrs[] = { NULL, }; +static struct attribute *ucf_pmu_format_attrs[] = { + ARM_CSPMU_FORMAT_EVENT_ATTR, + ARM_CSPMU_FORMAT_ATTR(src_loc_noncpu, "config1:0"), + ARM_CSPMU_FORMAT_ATTR(src_loc_cpu, "config1:1"), + ARM_CSPMU_FORMAT_ATTR(src_rem, "config1:2"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_cmem, "config1:8"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_gmem, "config1:9"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_other, "config1:10"), + ARM_CSPMU_FORMAT_ATTR(dst_rem, "config1:11"), + NULL +}; + +static struct attribute *pcie_v2_pmu_format_attrs[] = { + ARM_CSPMU_FORMAT_EVENT_ATTR, + ARM_CSPMU_FORMAT_ATTR(src_rp_mask, "config1:0-7"), + ARM_CSPMU_FORMAT_ATTR(src_bdf, "config1:8-23"), + ARM_CSPMU_FORMAT_ATTR(src_bdf_en, "config1:24"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_cmem, "config2:0"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_gmem, "config2:1"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_pcie_p2p, "config2:2"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_pcie_cxl, "config2:3"), + ARM_CSPMU_FORMAT_ATTR(dst_rem, "config2:4"), + NULL +}; + +static struct attribute *pcie_tgt_pmu_format_attrs[] = { + ARM_CSPMU_FORMAT_ATTR(event, "config:0-2"), + ARM_CSPMU_FORMAT_ATTR(dst_rp_mask, "config:3-10"), + ARM_CSPMU_FORMAT_ATTR(dst_addr_en, "config:11"), + ARM_CSPMU_FORMAT_ATTR(dst_addr_base, "config1:0-63"), + ARM_CSPMU_FORMAT_ATTR(dst_addr_mask, "config2:0-63"), + NULL +}; + static struct attribute *generic_pmu_format_attrs[] = 
{ ARM_CSPMU_FORMAT_EVENT_ATTR, ARM_CSPMU_FORMAT_FILTER_ATTR, @@ -183,6 +305,32 @@ nv_cspmu_get_name(const struct arm_cspmu *cspmu) return ctx->name; } +#if defined(CONFIG_ACPI) && defined(CONFIG_ARM64) +static int nv_cspmu_get_inst_id(const struct arm_cspmu *cspmu, u32 *id) +{ + struct fwnode_handle *fwnode; + struct acpi_device *adev; + int ret; + + adev = arm_cspmu_acpi_dev_get(cspmu); + if (!adev) + return -ENODEV; + + fwnode = acpi_fwnode_handle(adev); + ret = fwnode_property_read_u32(fwnode, "instance_id", id); + if (ret) + dev_err(cspmu->dev, "Failed to get instance ID\n"); + + acpi_dev_put(adev); + return ret; +} +#else +static int nv_cspmu_get_inst_id(const struct arm_cspmu *cspmu, u32 *id) +{ + return -EINVAL; +} +#endif + static u32 nv_cspmu_event_filter(const struct perf_event *event) { const struct nv_cspmu_ctx *ctx = @@ -228,6 +376,20 @@ static void nv_cspmu_set_ev_filter(struct arm_cspmu *cspmu, } } +static void nv_cspmu_reset_ev_filter(struct arm_cspmu *cspmu, + const struct perf_event *event) +{ + const struct nv_cspmu_ctx *ctx = + to_nv_cspmu_ctx(to_arm_cspmu(event->pmu)); + const u32 offset = 4 * event->hw.idx; + + if (ctx->get_filter) + writel(0, cspmu->base0 + PMEVFILTR + offset); + + if (ctx->get_filter2) + writel(0, cspmu->base0 + PMEVFILT2R + offset); +} + static void nv_cspmu_set_cc_filter(struct arm_cspmu *cspmu, const struct perf_event *event) { @@ -236,10 +398,386 @@ static void nv_cspmu_set_cc_filter(struct arm_cspmu *cspmu, writel(filter, cspmu->base0 + PMCCFILTR); } +static u32 ucf_pmu_event_filter(const struct perf_event *event) +{ + u32 ret, filter, src, dst; + + filter = nv_cspmu_event_filter(event); + + /* Monitor all sources if none is selected. */ + src = FIELD_GET(NV_UCF_FILTER_SRC, filter); + if (src == 0) + src = GENMASK_ULL(NV_UCF_SRC_COUNT - 1, 0); + + /* Monitor all destinations if none is selected. 
*/ + dst = FIELD_GET(NV_UCF_FILTER_DST, filter); + if (dst == 0) + dst = GENMASK_ULL(NV_UCF_DST_COUNT - 1, 0); + + ret = FIELD_PREP(NV_UCF_FILTER_SRC, src); + ret |= FIELD_PREP(NV_UCF_FILTER_DST, dst); + + return ret; +} + +static u32 pcie_v2_pmu_bdf_val_en(u32 filter) +{ + const u32 bdf_en = FIELD_GET(NV_PCIE_V2_FILTER_BDF_EN, filter); + + /* Returns both BDF value and enable bit if BDF filtering is enabled. */ + if (bdf_en) + return FIELD_GET(NV_PCIE_V2_FILTER_BDF_VAL_EN, filter); + + /* Ignore the BDF value if BDF filter is not enabled. */ + return 0; +} + +static u32 pcie_v2_pmu_event_filter(const struct perf_event *event) +{ + u32 filter, lead_filter, lead_bdf; + struct perf_event *leader; + const struct nv_cspmu_ctx *ctx = + to_nv_cspmu_ctx(to_arm_cspmu(event->pmu)); + + filter = event->attr.config1 & ctx->filter_mask; + if (filter != 0) + return filter; + + leader = event->group_leader; + + /* Use leader's filter value if its BDF filtering is enabled. */ + if (event != leader) { + lead_filter = pcie_v2_pmu_event_filter(leader); + lead_bdf = pcie_v2_pmu_bdf_val_en(lead_filter); + if (lead_bdf != 0) + return lead_filter; + } + + /* Otherwise, return default filter value. */ + return ctx->filter_default_val; +} + +static int pcie_v2_pmu_validate_event(struct arm_cspmu *cspmu, + struct perf_event *new_ev) +{ + /* + * Make sure the events are using same BDF filter since the PCIE-SRC PMU + * only supports one common BDF filter setting for all of the counters. 
+	 */
+
+	int idx;
+	u32 new_filter, new_rp, new_bdf, new_lead_filter, new_lead_bdf;
+	struct perf_event *new_leader;
+
+	if (cspmu->impl.ops.is_cycle_counter_event(new_ev))
+		return 0;
+
+	new_leader = new_ev->group_leader;
+
+	new_filter = pcie_v2_pmu_event_filter(new_ev);
+	new_lead_filter = pcie_v2_pmu_event_filter(new_leader);
+
+	new_bdf = pcie_v2_pmu_bdf_val_en(new_filter);
+	new_lead_bdf = pcie_v2_pmu_bdf_val_en(new_lead_filter);
+
+	new_rp = FIELD_GET(NV_PCIE_V2_FILTER_PORT, new_filter);
+
+	if (new_rp != 0 && new_bdf != 0) {
+		dev_err(cspmu->dev,
+			"RP and BDF filtering are mutually exclusive\n");
+		return -EINVAL;
+	}
+
+	if (new_bdf != new_lead_bdf) {
+		dev_err(cspmu->dev,
+			"sibling and leader BDF value should be equal\n");
+		return -EINVAL;
+	}
+
+	/* Compare BDF filter on existing events. */
+	idx = find_first_bit(cspmu->hw_events.used_ctrs,
+			     cspmu->cycle_counter_logical_idx);
+
+	if (idx != cspmu->cycle_counter_logical_idx) {
+		struct perf_event *leader = cspmu->hw_events.events[idx]->group_leader;
+
+		const u32 lead_filter = pcie_v2_pmu_event_filter(leader);
+		const u32 lead_bdf = pcie_v2_pmu_bdf_val_en(lead_filter);
+
+		if (new_lead_bdf != lead_bdf) {
+			dev_err(cspmu->dev, "only one BDF value is supported\n");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/* Software state of one reference-counted PCIE-TGT address filter slot. */
+struct pcie_tgt_addr_filter {
+	u32 refcount;	/* number of events currently using this slot */
+	u64 base;	/* base value programmed into the slot */
+	u64 mask;	/* mask value programmed into the slot */
+};
+
+/* Per-PMU private data of the PCIE-TGT PMU. */
+struct pcie_tgt_data {
+	/* Software copy of every address filter slot. */
+	struct pcie_tgt_addr_filter addr_filter[NV_PCIE_TGT_ADDR_COUNT];
+	/* MMIO base of the address filter registers. */
+	void __iomem *addr_filter_reg;
+};
+
+#if defined(CONFIG_ACPI) && defined(CONFIG_ARM64)
+/*
+ * Allocate the PCIE-TGT private data and map the address filter registers
+ * from the first memory resource of the PMU's ACPI device.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails, -ENODEV if the ACPI
+ * device or its memory resource is missing, or the error reported by the
+ * resource lookup / mapping helpers.
+ */
+static int pcie_tgt_init_data(struct arm_cspmu *cspmu)
+{
+	int ret;
+	struct acpi_device *adev;
+	struct pcie_tgt_data *data;
+	struct list_head resource_list;
+	struct resource_entry *rentry;
+	struct nv_cspmu_ctx *ctx = to_nv_cspmu_ctx(cspmu);
+	struct device *dev = cspmu->dev;
+
+	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	adev = arm_cspmu_acpi_dev_get(cspmu);
+	if (!adev) {
+		dev_err(dev, "failed to get associated PCIE-TGT device\n");
+		return -ENODEV;
+	}
+
+	INIT_LIST_HEAD(&resource_list);
+	ret = acpi_dev_get_memory_resources(adev, &resource_list);
+	if (ret < 0) {
+		dev_err(dev, "failed to get PCIE-TGT device memory resources\n");
+		acpi_dev_put(adev);
+		return ret;
+	}
+
+	rentry = list_first_entry_or_null(&resource_list,
+					  struct resource_entry, node);
+	if (!rentry) {
+		/*
+		 * An empty resource list must be an error: otherwise the
+		 * filter register base stays NULL while success is reported.
+		 */
+		dev_err(dev, "failed to get address filter resource\n");
+		ret = -ENODEV;
+		goto out;
+	}
+
+	data->addr_filter_reg = devm_ioremap_resource(dev, rentry->res);
+	if (IS_ERR(data->addr_filter_reg)) {
+		dev_err(dev, "failed to get address filter resource\n");
+		ret = PTR_ERR(data->addr_filter_reg);
+		goto out;
+	}
+
+	/* Publish the data only once the register mapping is usable. */
+	ctx->data = data;
+	ret = 0;
+
+out:
+	acpi_dev_free_resource_list(&resource_list);
+	acpi_dev_put(adev);
+
+	return ret;
+}
+#else
+/* Without ACPI on arm64 there is no way to locate the filter registers. */
+static int pcie_tgt_init_data(struct arm_cspmu *cspmu)
+{
+	return -ENODEV;
+}
+#endif
+
+/* Return the PCIE-TGT private data stored in the NVIDIA PMU context. */
+static struct pcie_tgt_data *pcie_tgt_get_data(struct arm_cspmu *cspmu)
+{
+	struct nv_cspmu_ctx *ctx = to_nv_cspmu_ctx(cspmu);
+
+	return ctx->data;
+}
+
+/* Find the first available address filter slot.
*/
+/*
+ * Look up the address filter slot for the (base, mask) pair.
+ *
+ * A slot that is in use (refcount != 0) and already holds the same pair is
+ * always preferred, so events filtering on the same range share one slot.
+ * Free slots are never matched by value: a released slot holds base == 0 and
+ * mask == 0, which would otherwise alias a live (0, 0) filter and make the
+ * release path drop the refcount of the wrong slot.
+ *
+ * @is_reset: true when releasing a filter (only in-use slots qualify), false
+ *            when acquiring one (the first free slot is the fallback).
+ *
+ * Returns the slot index, or -ENODEV if no suitable slot exists.
+ */
+static int pcie_tgt_find_addr_idx(struct arm_cspmu *cspmu, u64 base, u64 mask,
+				  bool is_reset)
+{
+	int i, free_idx = -ENODEV;
+	struct pcie_tgt_data *data = pcie_tgt_get_data(cspmu);
+
+	for (i = 0; i < NV_PCIE_TGT_ADDR_COUNT; i++) {
+		const struct pcie_tgt_addr_filter *filter = &data->addr_filter[i];
+
+		if (filter->refcount == 0) {
+			/* Remember the first free slot for the acquire path. */
+			if (!is_reset && free_idx < 0)
+				free_idx = i;
+			continue;
+		}
+
+		if (filter->base == base && filter->mask == mask)
+			return i;
+	}
+
+	return free_idx;
+}
+
+/* Extract the filter bits that sit above the event type in attr.config. */
+static u32 pcie_tgt_pmu_event_filter(const struct perf_event *event)
+{
+	u32 filter;
+
+	filter = (event->attr.config >> NV_PCIE_TGT_EV_TYPE_COUNT) &
+		 NV_PCIE_TGT_FILTER2_MASK;
+
+	return filter;
+}
+
+/* True if the event requests destination address filtering. */
+static bool pcie_tgt_pmu_addr_en(const struct perf_event *event)
+{
+	u32 filter = pcie_tgt_pmu_event_filter(event);
+
+	return FIELD_GET(NV_PCIE_TGT_FILTER2_ADDR_EN, filter) != 0;
+}
+
+/* Return the port filter field of the event. */
+static u32 pcie_tgt_pmu_port_filter(const struct perf_event *event)
+{
+	u32 filter = pcie_tgt_pmu_event_filter(event);
+
+	return FIELD_GET(NV_PCIE_TGT_FILTER2_PORT, filter);
+}
+
+/* The destination address base is passed in attr.config1. */
+static u64 pcie_tgt_pmu_dst_addr_base(const struct perf_event *event)
+{
+	return event->attr.config1;
+}
+
+/* The destination address mask is passed in attr.config2. */
+static u64 pcie_tgt_pmu_dst_addr_mask(const struct perf_event *event)
+{
+	return event->attr.config2;
+}
+
+static int pcie_tgt_pmu_validate_event(struct arm_cspmu *cspmu,
+				       struct perf_event *new_ev)
+{
+	u64 base, mask;
+	int idx;
+
+	if (!pcie_tgt_pmu_addr_en(new_ev))
+		return 0;
+
+	/* Make sure there is a slot available for the address filter.
*/ + base = pcie_tgt_pmu_dst_addr_base(new_ev); + mask = pcie_tgt_pmu_dst_addr_mask(new_ev); + idx = pcie_tgt_find_addr_idx(cspmu, base, mask, false); + if (idx < 0) + return -EINVAL; + + return 0; +} + +static void pcie_tgt_pmu_config_addr_filter(struct arm_cspmu *cspmu, + bool en, u64 base, u64 mask, int idx) +{ + struct pcie_tgt_data *data; + struct pcie_tgt_addr_filter *filter; + void __iomem *filter_reg; + + data = pcie_tgt_get_data(cspmu); + filter = &data->addr_filter[idx]; + filter_reg = data->addr_filter_reg + (idx * NV_PCIE_TGT_ADDR_STRIDE); + + if (en) { + filter->refcount++; + if (filter->refcount == 1) { + filter->base = base; + filter->mask = mask; + + writel(lower_32_bits(base), filter_reg + NV_PCIE_TGT_ADDR_BASE_LO); + writel(upper_32_bits(base), filter_reg + NV_PCIE_TGT_ADDR_BASE_HI); + writel(lower_32_bits(mask), filter_reg + NV_PCIE_TGT_ADDR_MASK_LO); + writel(upper_32_bits(mask), filter_reg + NV_PCIE_TGT_ADDR_MASK_HI); + writel(1, filter_reg + NV_PCIE_TGT_ADDR_CTRL); + } + } else { + filter->refcount--; + if (filter->refcount == 0) { + writel(0, filter_reg + NV_PCIE_TGT_ADDR_CTRL); + writel(0, filter_reg + NV_PCIE_TGT_ADDR_BASE_LO); + writel(0, filter_reg + NV_PCIE_TGT_ADDR_BASE_HI); + writel(0, filter_reg + NV_PCIE_TGT_ADDR_MASK_LO); + writel(0, filter_reg + NV_PCIE_TGT_ADDR_MASK_HI); + + filter->base = 0; + filter->mask = 0; + } + } +} + +static void pcie_tgt_pmu_set_ev_filter(struct arm_cspmu *cspmu, + const struct perf_event *event) +{ + bool addr_filter_en; + int idx; + u32 filter2_val, filter2_offset, port_filter; + u64 base, mask; + + filter2_val = 0; + filter2_offset = PMEVFILT2R + (4 * event->hw.idx); + + addr_filter_en = pcie_tgt_pmu_addr_en(event); + if (addr_filter_en) { + base = pcie_tgt_pmu_dst_addr_base(event); + mask = pcie_tgt_pmu_dst_addr_mask(event); + idx = pcie_tgt_find_addr_idx(cspmu, base, mask, false); + + if (idx < 0) { + dev_err(cspmu->dev, + "Unable to find a slot for address filtering\n"); + writel(0, cspmu->base0 + 
filter2_offset); + return; + } + + /* Configure address range filter registers.*/ + pcie_tgt_pmu_config_addr_filter(cspmu, true, base, mask, idx); + + /* Config the counter to use the selected address filter slot. */ + filter2_val |= FIELD_PREP(NV_PCIE_TGT_FILTER2_ADDR, 1U << idx); + } + + port_filter = pcie_tgt_pmu_port_filter(event); + + /* Monitor all ports if no filter is selected. */ + if (!addr_filter_en && port_filter == 0) + port_filter = NV_PCIE_TGT_FILTER2_PORT; + + filter2_val |= FIELD_PREP(NV_PCIE_TGT_FILTER2_PORT, port_filter); + + writel(filter2_val, cspmu->base0 + filter2_offset); +} + +static void pcie_tgt_pmu_reset_ev_filter(struct arm_cspmu *cspmu, + const struct perf_event *event) +{ + bool addr_filter_en; + u64 base, mask; + int idx; + + addr_filter_en = pcie_tgt_pmu_addr_en(event); + if (!addr_filter_en) + return; + + base = pcie_tgt_pmu_dst_addr_base(event); + mask = pcie_tgt_pmu_dst_addr_mask(event); + idx = pcie_tgt_find_addr_idx(cspmu, base, mask, true); + + if (idx < 0) { + dev_err(cspmu->dev, + "Unable to find the address filter slot to reset\n"); + return; + } + + pcie_tgt_pmu_config_addr_filter(cspmu, false, base, mask, idx); +} + +static u32 pcie_tgt_pmu_event_type(const struct perf_event *event) +{ + return event->attr.config & NV_PCIE_TGT_EV_TYPE_MASK; +} + +static bool pcie_tgt_pmu_is_cycle_counter_event(const struct perf_event *event) +{ + u32 event_type = pcie_tgt_pmu_event_type(event); + + return event_type == NV_PCIE_TGT_EV_TYPE_CC; +} enum nv_cspmu_name_fmt { NAME_FMT_GENERIC, - NAME_FMT_SOCKET + NAME_FMT_SOCKET, + NAME_FMT_SOCKET_INST, }; struct nv_cspmu_match { @@ -342,6 +880,63 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { .init_data = NULL }, }, + { + .prodid = 0x2CF20000, + .prodid_mask = NV_PRODID_MASK, + .name_pattern = "nvidia_ucf_pmu_%u", + .name_fmt = NAME_FMT_SOCKET, + .template_ctx = { + .event_attr = ucf_pmu_event_attrs, + .format_attr = ucf_pmu_format_attrs, + .filter_mask = NV_UCF_FILTER_ID_MASK, + 
.filter_default_val = NV_UCF_FILTER_DEFAULT, + .filter2_mask = 0x0, + .filter2_default_val = 0x0, + .get_filter = ucf_pmu_event_filter, + }, + }, + { + .prodid = 0x10301000, + .prodid_mask = NV_PRODID_MASK, + .name_pattern = "nvidia_pcie_pmu_%u_rc_%u", + .name_fmt = NAME_FMT_SOCKET_INST, + .template_ctx = { + .event_attr = pcie_v2_pmu_event_attrs, + .format_attr = pcie_v2_pmu_format_attrs, + .filter_mask = NV_PCIE_V2_FILTER_ID_MASK, + .filter_default_val = NV_PCIE_V2_FILTER_DEFAULT, + .filter2_mask = NV_PCIE_V2_FILTER2_ID_MASK, + .filter2_default_val = NV_PCIE_V2_FILTER2_DEFAULT, + .get_filter = pcie_v2_pmu_event_filter, + .get_filter2 = nv_cspmu_event_filter2, + }, + .ops = { + .validate_event = pcie_v2_pmu_validate_event, + .reset_ev_filter = nv_cspmu_reset_ev_filter, + } + }, + { + .prodid = 0x10700000, + .prodid_mask = NV_PRODID_MASK, + .name_pattern = "nvidia_pcie_tgt_pmu_%u_rc_%u", + .name_fmt = NAME_FMT_SOCKET_INST, + .template_ctx = { + .event_attr = pcie_tgt_pmu_event_attrs, + .format_attr = pcie_tgt_pmu_format_attrs, + .filter_mask = 0x0, + .filter_default_val = 0x0, + .filter2_mask = NV_PCIE_TGT_FILTER2_MASK, + .filter2_default_val = NV_PCIE_TGT_FILTER2_DEFAULT, + .init_data = pcie_tgt_init_data + }, + .ops = { + .is_cycle_counter_event = pcie_tgt_pmu_is_cycle_counter_event, + .event_type = pcie_tgt_pmu_event_type, + .validate_event = pcie_tgt_pmu_validate_event, + .set_ev_filter = pcie_tgt_pmu_set_ev_filter, + .reset_ev_filter = pcie_tgt_pmu_reset_ev_filter, + } + }, { .prodid = 0, .prodid_mask = 0, @@ -365,7 +960,7 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { static char *nv_cspmu_format_name(const struct arm_cspmu *cspmu, const struct nv_cspmu_match *match) { - char *name; + char *name = NULL; struct device *dev = cspmu->dev; static atomic_t pmu_generic_idx = {0}; @@ -379,13 +974,20 @@ static char *nv_cspmu_format_name(const struct arm_cspmu *cspmu, socket); break; } + case NAME_FMT_SOCKET_INST: { + const int cpu = 
cpumask_first(&cspmu->associated_cpus); + const int socket = cpu_to_node(cpu); + u32 inst_id; + + if (!nv_cspmu_get_inst_id(cspmu, &inst_id)) + name = devm_kasprintf(dev, GFP_KERNEL, + match->name_pattern, socket, inst_id); + break; + } case NAME_FMT_GENERIC: name = devm_kasprintf(dev, GFP_KERNEL, match->name_pattern, atomic_fetch_inc(&pmu_generic_idx)); break; - default: - name = NULL; - break; } return name; @@ -426,8 +1028,12 @@ static int nv_cspmu_init_ops(struct arm_cspmu *cspmu) cspmu->impl.ctx = ctx; /* NVIDIA specific callbacks. */ + SET_OP(validate_event, impl_ops, match, NULL); + SET_OP(event_type, impl_ops, match, NULL); + SET_OP(is_cycle_counter_event, impl_ops, match, NULL); SET_OP(set_cc_filter, impl_ops, match, nv_cspmu_set_cc_filter); SET_OP(set_ev_filter, impl_ops, match, nv_cspmu_set_ev_filter); + SET_OP(reset_ev_filter, impl_ops, match, NULL); SET_OP(get_event_attrs, impl_ops, match, nv_cspmu_get_event_attrs); SET_OP(get_format_attrs, impl_ops, match, nv_cspmu_get_format_attrs); SET_OP(get_name, impl_ops, match, nv_cspmu_get_name); diff --git a/drivers/perf/nvidia_t410_c2c_pmu.c b/drivers/perf/nvidia_t410_c2c_pmu.c new file mode 100644 index 000000000000..411987153ff3 --- /dev/null +++ b/drivers/perf/nvidia_t410_c2c_pmu.c @@ -0,0 +1,1051 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVIDIA Tegra410 C2C PMU driver. + * + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* The C2C interface types in Tegra410. */ +#define C2C_TYPE_NVLINK 0x0 +#define C2C_TYPE_NVCLINK 0x1 +#define C2C_TYPE_NVDLINK 0x2 +#define C2C_TYPE_COUNT 0x3 + +/* The type of the peer device connected to the C2C interface. */ +#define C2C_PEER_TYPE_CPU 0x0 +#define C2C_PEER_TYPE_GPU 0x1 +#define C2C_PEER_TYPE_CXLMEM 0x2 +#define C2C_PEER_TYPE_COUNT 0x3 + +/* The number of peer devices can be connected to the C2C interface. 
*/ +#define C2C_NR_PEER_CPU 0x1 +#define C2C_NR_PEER_GPU 0x2 +#define C2C_NR_PEER_CXLMEM 0x1 +#define C2C_NR_PEER_MAX 0x2 + +/* Number of instances on each interface. */ +#define C2C_NR_INST_NVLINK 14 +#define C2C_NR_INST_NVCLINK 12 +#define C2C_NR_INST_NVDLINK 16 +#define C2C_NR_INST_MAX 16 + +/* Register offsets. */ +#define C2C_CTRL 0x864 +#define C2C_IN_STATUS 0x868 +#define C2C_CYCLE_CNTR 0x86c +#define C2C_IN_RD_CUM_OUTS_CNTR 0x874 +#define C2C_IN_RD_REQ_CNTR 0x87c +#define C2C_IN_WR_CUM_OUTS_CNTR 0x884 +#define C2C_IN_WR_REQ_CNTR 0x88c +#define C2C_OUT_STATUS 0x890 +#define C2C_OUT_RD_CUM_OUTS_CNTR 0x898 +#define C2C_OUT_RD_REQ_CNTR 0x8a0 +#define C2C_OUT_WR_CUM_OUTS_CNTR 0x8a8 +#define C2C_OUT_WR_REQ_CNTR 0x8b0 + +/* C2C_IN_STATUS register field. */ +#define C2C_IN_STATUS_CYCLE_OVF BIT(0) +#define C2C_IN_STATUS_IN_RD_CUM_OUTS_OVF BIT(1) +#define C2C_IN_STATUS_IN_RD_REQ_OVF BIT(2) +#define C2C_IN_STATUS_IN_WR_CUM_OUTS_OVF BIT(3) +#define C2C_IN_STATUS_IN_WR_REQ_OVF BIT(4) + +/* C2C_OUT_STATUS register field. */ +#define C2C_OUT_STATUS_OUT_RD_CUM_OUTS_OVF BIT(0) +#define C2C_OUT_STATUS_OUT_RD_REQ_OVF BIT(1) +#define C2C_OUT_STATUS_OUT_WR_CUM_OUTS_OVF BIT(2) +#define C2C_OUT_STATUS_OUT_WR_REQ_OVF BIT(3) + +/* Events. */ +#define C2C_EVENT_CYCLES 0x0 +#define C2C_EVENT_IN_RD_CUM_OUTS 0x1 +#define C2C_EVENT_IN_RD_REQ 0x2 +#define C2C_EVENT_IN_WR_CUM_OUTS 0x3 +#define C2C_EVENT_IN_WR_REQ 0x4 +#define C2C_EVENT_OUT_RD_CUM_OUTS 0x5 +#define C2C_EVENT_OUT_RD_REQ 0x6 +#define C2C_EVENT_OUT_WR_CUM_OUTS 0x7 +#define C2C_EVENT_OUT_WR_REQ 0x8 + +#define C2C_NUM_EVENTS 0x9 +#define C2C_MASK_EVENT 0xFF +#define C2C_MAX_ACTIVE_EVENTS 32 + +#define C2C_ACTIVE_CPU_MASK 0x0 +#define C2C_ASSOCIATED_CPU_MASK 0x1 + +/* + * Maximum poll count for reading counter value using high-low-high sequence. + */ +#define HILOHI_MAX_POLL 1000 + +static unsigned long nv_c2c_pmu_cpuhp_state; + +/* PMU descriptor. */ + +/* C2C type information. 
*/ +struct nv_c2c_pmu_data { + unsigned int c2c_type; + unsigned int nr_inst; + const char *name_fmt; +}; + +static const struct nv_c2c_pmu_data nv_c2c_pmu_data[] = { + [C2C_TYPE_NVLINK] = { + .c2c_type = C2C_TYPE_NVLINK, + .nr_inst = C2C_NR_INST_NVLINK, + .name_fmt = "nvidia_nvlink_c2c_pmu_%u", + }, + [C2C_TYPE_NVCLINK] = { + .c2c_type = C2C_TYPE_NVCLINK, + .nr_inst = C2C_NR_INST_NVCLINK, + .name_fmt = "nvidia_nvclink_pmu_%u", + }, + [C2C_TYPE_NVDLINK] = { + .c2c_type = C2C_TYPE_NVDLINK, + .nr_inst = C2C_NR_INST_NVDLINK, + .name_fmt = "nvidia_nvdlink_pmu_%u", + }, +}; + +/* Tracks the events assigned to the PMU for a given logical index. */ +struct nv_c2c_pmu_hw_events { + /* The events that are active. */ + struct perf_event *events[C2C_MAX_ACTIVE_EVENTS]; + + /* + * Each bit indicates a logical counter is being used (or not) for an + * event. + */ + DECLARE_BITMAP(used_ctrs, C2C_MAX_ACTIVE_EVENTS); +}; + +struct nv_c2c_pmu { + struct pmu pmu; + struct device *dev; + struct acpi_device *acpi_dev; + + const char *name; + const char *identifier; + + const struct nv_c2c_pmu_data *data; + unsigned int peer_type; + unsigned int socket; + unsigned int nr_peer; + unsigned long peer_insts[C2C_NR_PEER_MAX][BITS_TO_LONGS(C2C_NR_INST_MAX)]; + u32 filter_default; + + struct nv_c2c_pmu_hw_events hw_events; + + cpumask_t associated_cpus; + cpumask_t active_cpu; + + struct hlist_node cpuhp_node; + + const struct attribute_group **attr_groups; + + void __iomem *base_broadcast; + void __iomem *base[C2C_NR_INST_MAX]; +}; + +#define to_c2c_pmu(p) (container_of(p, struct nv_c2c_pmu, pmu)) + +/* Get event type from perf_event. 
*/ +static inline u32 get_event_type(struct perf_event *event) +{ + return (event->attr.config) & C2C_MASK_EVENT; +} + +static inline u32 get_filter_mask(struct perf_event *event) +{ + u32 filter; + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu); + + filter = ((u32)event->attr.config1) & c2c_pmu->filter_default; + if (filter == 0) + filter = c2c_pmu->filter_default; + + return filter; +} + +/* PMU operations. */ + +static int nv_c2c_pmu_get_event_idx(struct nv_c2c_pmu_hw_events *hw_events, + struct perf_event *event) +{ + u32 idx; + + idx = find_first_zero_bit(hw_events->used_ctrs, C2C_MAX_ACTIVE_EVENTS); + if (idx >= C2C_MAX_ACTIVE_EVENTS) + return -EAGAIN; + + set_bit(idx, hw_events->used_ctrs); + + return idx; +} + +static bool +nv_c2c_pmu_validate_event(struct pmu *pmu, + struct nv_c2c_pmu_hw_events *hw_events, + struct perf_event *event) +{ + if (is_software_event(event)) + return true; + + /* Reject groups spanning multiple HW PMUs. */ + if (event->pmu != pmu) + return false; + + return nv_c2c_pmu_get_event_idx(hw_events, event) >= 0; +} + +/* + * Make sure the group of events can be scheduled at once + * on the PMU. 
+ */ +static bool nv_c2c_pmu_validate_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + struct nv_c2c_pmu_hw_events fake_hw_events; + + if (event->group_leader == event) + return true; + + memset(&fake_hw_events, 0, sizeof(fake_hw_events)); + + if (!nv_c2c_pmu_validate_event(event->pmu, &fake_hw_events, leader)) + return false; + + for_each_sibling_event(sibling, leader) { + if (!nv_c2c_pmu_validate_event(event->pmu, &fake_hw_events, + sibling)) + return false; + } + + return nv_c2c_pmu_validate_event(event->pmu, &fake_hw_events, event); +} + +static int nv_c2c_pmu_event_init(struct perf_event *event) +{ + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + u32 event_type = get_event_type(event); + + if (event->attr.type != event->pmu->type || + event_type >= C2C_NUM_EVENTS) + return -ENOENT; + + /* + * Following other "uncore" PMUs, we do not support sampling mode or + * attach to a task (per-process mode). + */ + if (is_sampling_event(event)) { + dev_dbg(c2c_pmu->pmu.dev, "Can't support sampling events\n"); + return -EOPNOTSUPP; + } + + if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK) { + dev_dbg(c2c_pmu->pmu.dev, "Can't support per-task counters\n"); + return -EINVAL; + } + + /* + * Make sure the CPU assignment is on one of the CPUs associated with + * this PMU. + */ + if (!cpumask_test_cpu(event->cpu, &c2c_pmu->associated_cpus)) { + dev_dbg(c2c_pmu->pmu.dev, + "Requested cpu is not associated with the PMU\n"); + return -EINVAL; + } + + /* Enforce the current active CPU to handle the events in this PMU. */ + event->cpu = cpumask_first(&c2c_pmu->active_cpu); + if (event->cpu >= nr_cpu_ids) + return -EINVAL; + + if (!nv_c2c_pmu_validate_group(event)) + return -EINVAL; + + hwc->idx = -1; + hwc->config = event_type; + + return 0; +} + +/* + * Read 64-bit register as a pair of 32-bit registers using hi-lo-hi sequence. 
+ */
+static u64 read_reg64_hilohi(const void __iomem *addr, u32 max_poll_count)
+{
+	u32 val_lo, val_hi;
+	u64 val;
+
+	/* Use high-low-high sequence to avoid tearing */
+	do {
+		/* Give up (and report 0) once the poll budget is exhausted. */
+		if (max_poll_count-- == 0) {
+			pr_err("NV C2C PMU: timeout hi-low-high sequence\n");
+			return 0;
+		}
+
+		val_hi = readl(addr + 4);
+		val_lo = readl(addr);
+	} while (val_hi != readl(addr + 4));
+
+	val = (((u64)val_hi << 32) | val_lo);
+
+	return val;
+}
+
+/* Warn if either status register of the given instance is non-zero. */
+static void nv_c2c_pmu_check_status(struct nv_c2c_pmu *c2c_pmu, u32 instance)
+{
+	u32 in_status, out_status;
+
+	in_status = readl(c2c_pmu->base[instance] + C2C_IN_STATUS);
+	out_status = readl(c2c_pmu->base[instance] + C2C_OUT_STATUS);
+
+	if (in_status || out_status)
+		dev_warn(c2c_pmu->dev,
+			"C2C PMU overflow in: 0x%x, out: 0x%x\n",
+			in_status, out_status);
+}
+
+/* Counter register offset of each event id; read-only lookup table. */
+static const u32 nv_c2c_ctr_offset[C2C_NUM_EVENTS] = {
+	[C2C_EVENT_CYCLES] = C2C_CYCLE_CNTR,
+	[C2C_EVENT_IN_RD_CUM_OUTS] = C2C_IN_RD_CUM_OUTS_CNTR,
+	[C2C_EVENT_IN_RD_REQ] = C2C_IN_RD_REQ_CNTR,
+	[C2C_EVENT_IN_WR_CUM_OUTS] = C2C_IN_WR_CUM_OUTS_CNTR,
+	[C2C_EVENT_IN_WR_REQ] = C2C_IN_WR_REQ_CNTR,
+	[C2C_EVENT_OUT_RD_CUM_OUTS] = C2C_OUT_RD_CUM_OUTS_CNTR,
+	[C2C_EVENT_OUT_RD_REQ] = C2C_OUT_RD_REQ_CNTR,
+	[C2C_EVENT_OUT_WR_CUM_OUTS] = C2C_OUT_WR_CUM_OUTS_CNTR,
+	[C2C_EVENT_OUT_WR_REQ] = C2C_OUT_WR_REQ_CNTR,
+};
+
+/*
+ * Read the current hardware value of an event: the counter is read from
+ * every instance that belongs to a peer selected by the event's filter mask.
+ */
+static u64 nv_c2c_pmu_read_counter(struct perf_event *event)
+{
+	u32 ctr_id, ctr_offset, filter_mask, filter_idx, inst_idx;
+	unsigned long *inst_mask;
+	DECLARE_BITMAP(filter_bitmap, C2C_NR_PEER_MAX);
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu);
+	u64 val = 0;
+
+	filter_mask = get_filter_mask(event);
+	bitmap_from_arr32(filter_bitmap, &filter_mask, c2c_pmu->nr_peer);
+
+	ctr_id = event->hw.config;
+	ctr_offset = nv_c2c_ctr_offset[ctr_id];
+
+	for_each_set_bit(filter_idx, filter_bitmap, c2c_pmu->nr_peer) {
+		inst_mask = c2c_pmu->peer_insts[filter_idx];
+		for_each_set_bit(inst_idx, inst_mask, c2c_pmu->data->nr_inst) {
+			nv_c2c_pmu_check_status(c2c_pmu, inst_idx);
+
+			/*
+			 * All instances share the same clock and the driver
+			 * always enables all instances, so the count from one
+			 * instance can be used for the cycle counter.
+			 */
+			if (ctr_id == C2C_EVENT_CYCLES)
+				return read_reg64_hilohi(
+					c2c_pmu->base[inst_idx] + ctr_offset,
+					HILOHI_MAX_POLL);
+
+			/*
+			 * For other events, sum up the counts from all instances.
+			 */
+			val += read_reg64_hilohi(
+					c2c_pmu->base[inst_idx] + ctr_offset,
+					HILOHI_MAX_POLL);
+		}
+	}
+
+	return val;
+}
+
+/* Fold the delta since the last snapshot into the perf event count. */
+static void nv_c2c_pmu_event_update(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 prev, now;
+
+	/* Retry until prev_count is replaced without a concurrent update. */
+	do {
+		prev = local64_read(&hwc->prev_count);
+		now = nv_c2c_pmu_read_counter(event);
+	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
+
+	local64_add(now - prev, &event->count);
+}
+
+/* Only the software state changes; counters are gated in pmu_enable(). */
+static void nv_c2c_pmu_start(struct perf_event *event, int pmu_flags)
+{
+	event->hw.state = 0;
+}
+
+/* Only the software state changes; counters are gated in pmu_disable(). */
+static void nv_c2c_pmu_stop(struct perf_event *event, int pmu_flags)
+{
+	event->hw.state |= PERF_HES_STOPPED;
+}
+
+/* Reserve a logical counter for the event and optionally start it. */
+static int nv_c2c_pmu_add(struct perf_event *event, int flags)
+{
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu);
+	struct nv_c2c_pmu_hw_events *hw_events = &c2c_pmu->hw_events;
+	struct hw_perf_event *hwc = &event->hw;
+	int idx;
+
+	if (WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(),
+					   &c2c_pmu->associated_cpus)))
+		return -ENOENT;
+
+	idx = nv_c2c_pmu_get_event_idx(hw_events, event);
+	if (idx < 0)
+		return idx;
+
+	hw_events->events[idx] = event;
+	hwc->idx = idx;
+	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+	if (flags & PERF_EF_START)
+		nv_c2c_pmu_start(event, PERF_EF_RELOAD);
+
+	/* Propagate changes to the userspace mapping. */
+	perf_event_update_userpage(event);
+
+	return 0;
+}
+
+/* Stop the event and release its logical counter. */
+static void nv_c2c_pmu_del(struct perf_event *event, int flags)
+{
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu);
+	struct nv_c2c_pmu_hw_events *hw_events = &c2c_pmu->hw_events;
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	nv_c2c_pmu_stop(event, PERF_EF_UPDATE);
+
+	hw_events->events[idx] = NULL;
+
+	clear_bit(idx, hw_events->used_ctrs);
+
+	perf_event_update_userpage(event);
+}
+
+static void nv_c2c_pmu_read(struct perf_event *event)
+{
+	nv_c2c_pmu_event_update(event);
+}
+
+/* Turn the counters on through the broadcast control register. */
+static void nv_c2c_pmu_enable(struct pmu *pmu)
+{
+	void __iomem *bcast;
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(pmu);
+
+	/* Nothing to enable if no logical counter is in use. */
+	if (bitmap_empty(c2c_pmu->hw_events.used_ctrs, C2C_MAX_ACTIVE_EVENTS))
+		return;
+
+	/* Enable all the counters. */
+	bcast = c2c_pmu->base_broadcast;
+	writel(0x1UL, bcast + C2C_CTRL);
+}
+
+/* Turn the counters off and fold their values into the active events. */
+static void nv_c2c_pmu_disable(struct pmu *pmu)
+{
+	unsigned int idx;
+	void __iomem *bcast;
+	struct perf_event *event;
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(pmu);
+
+	/* Disable all the counters. */
+	bcast = c2c_pmu->base_broadcast;
+	writel(0x0UL, bcast + C2C_CTRL);
+
+	/*
+	 * The counters will start from 0 again on restart.
+	 * Update the events immediately to avoid losing the counts.
+	 */
+	for_each_set_bit(idx, c2c_pmu->hw_events.used_ctrs,
+			 C2C_MAX_ACTIVE_EVENTS) {
+		event = c2c_pmu->hw_events.events[idx];
+
+		if (!event)
+			continue;
+
+		nv_c2c_pmu_event_update(event);
+
+		/* Match the hardware, which counts from 0 after re-enable. */
+		local64_set(&event->hw.prev_count, 0ULL);
+	}
+}
+
+/* PMU identifier attribute.
*/
+
+/* sysfs "identifier": emit the identifier string stored for this PMU. */
+static ssize_t nv_c2c_pmu_identifier_show(struct device *dev,
+					  struct device_attribute *attr,
+					  char *page)
+{
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(page, "%s\n", c2c_pmu->identifier);
+}
+
+static struct device_attribute nv_c2c_pmu_identifier_attr =
+	__ATTR(identifier, 0444, nv_c2c_pmu_identifier_show, NULL);
+
+static struct attribute *nv_c2c_pmu_identifier_attrs[] = {
+	&nv_c2c_pmu_identifier_attr.attr,
+	NULL,
+};
+
+/* const like every other attribute group of this driver. */
+static const struct attribute_group nv_c2c_pmu_identifier_attr_group = {
+	.attrs = nv_c2c_pmu_identifier_attrs,
+};
+
+/* Peer attribute. */
+
+/* sysfs "peer": emit "nr_<peer type>=<number of peers>". */
+static ssize_t nv_c2c_pmu_peer_show(struct device *dev,
+				    struct device_attribute *attr,
+				    char *page)
+{
+	/* Built once at compile time instead of on the stack per read. */
+	static const char * const peer_type[C2C_PEER_TYPE_COUNT] = {
+		[C2C_PEER_TYPE_CPU] = "cpu",
+		[C2C_PEER_TYPE_GPU] = "gpu",
+		[C2C_PEER_TYPE_CXLMEM] = "cxlmem",
+	};
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(page, "nr_%s=%u\n", peer_type[c2c_pmu->peer_type],
+			  c2c_pmu->nr_peer);
+}
+
+static struct device_attribute nv_c2c_pmu_peer_attr =
+	__ATTR(peer, 0444, nv_c2c_pmu_peer_show, NULL);
+
+static struct attribute *nv_c2c_pmu_peer_attrs[] = {
+	&nv_c2c_pmu_peer_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group nv_c2c_pmu_peer_attr_group = {
+	.attrs = nv_c2c_pmu_peer_attrs,
+};
+
+/* Format attributes.
*/ + +#define NV_C2C_PMU_EXT_ATTR(_name, _func, _config) \ + (&((struct dev_ext_attribute[]){ \ + { \ + .attr = __ATTR(_name, 0444, _func, NULL), \ + .var = (void *)_config \ + } \ + })[0].attr.attr) + +#define NV_C2C_PMU_FORMAT_ATTR(_name, _config) \ + NV_C2C_PMU_EXT_ATTR(_name, device_show_string, _config) + +#define NV_C2C_PMU_FORMAT_EVENT_ATTR \ + NV_C2C_PMU_FORMAT_ATTR(event, "config:0-3") + +static struct attribute *nv_c2c_pmu_gpu_formats[] = { + NV_C2C_PMU_FORMAT_EVENT_ATTR, + NV_C2C_PMU_FORMAT_ATTR(gpu_mask, "config1:0-1"), + NULL, +}; + +static const struct attribute_group nv_c2c_pmu_gpu_format_group = { + .name = "format", + .attrs = nv_c2c_pmu_gpu_formats, +}; + +static struct attribute *nv_c2c_pmu_formats[] = { + NV_C2C_PMU_FORMAT_EVENT_ATTR, + NULL, +}; + +static const struct attribute_group nv_c2c_pmu_format_group = { + .name = "format", + .attrs = nv_c2c_pmu_formats, +}; + +/* Event attributes. */ + +static ssize_t nv_c2c_pmu_sysfs_event_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, typeof(*pmu_attr), attr); + return sysfs_emit(buf, "event=0x%llx\n", pmu_attr->id); +} + +#define NV_C2C_PMU_EVENT_ATTR(_name, _config) \ + PMU_EVENT_ATTR_ID(_name, nv_c2c_pmu_sysfs_event_show, _config) + +static struct attribute *nv_c2c_pmu_gpu_events[] = { + NV_C2C_PMU_EVENT_ATTR(cycles, C2C_EVENT_CYCLES), + NV_C2C_PMU_EVENT_ATTR(in_rd_cum_outs, C2C_EVENT_IN_RD_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(in_rd_req, C2C_EVENT_IN_RD_REQ), + NV_C2C_PMU_EVENT_ATTR(in_wr_cum_outs, C2C_EVENT_IN_WR_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(in_wr_req, C2C_EVENT_IN_WR_REQ), + NV_C2C_PMU_EVENT_ATTR(out_rd_cum_outs, C2C_EVENT_OUT_RD_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(out_rd_req, C2C_EVENT_OUT_RD_REQ), + NV_C2C_PMU_EVENT_ATTR(out_wr_cum_outs, C2C_EVENT_OUT_WR_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(out_wr_req, C2C_EVENT_OUT_WR_REQ), + NULL +}; + +static const struct attribute_group 
nv_c2c_pmu_gpu_events_group = { + .name = "events", + .attrs = nv_c2c_pmu_gpu_events, +}; + +static struct attribute *nv_c2c_pmu_cpu_events[] = { + NV_C2C_PMU_EVENT_ATTR(cycles, C2C_EVENT_CYCLES), + NV_C2C_PMU_EVENT_ATTR(in_rd_cum_outs, C2C_EVENT_IN_RD_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(in_rd_req, C2C_EVENT_IN_RD_REQ), + NV_C2C_PMU_EVENT_ATTR(out_rd_cum_outs, C2C_EVENT_OUT_RD_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(out_rd_req, C2C_EVENT_OUT_RD_REQ), + NULL +}; + +static const struct attribute_group nv_c2c_pmu_cpu_events_group = { + .name = "events", + .attrs = nv_c2c_pmu_cpu_events, +}; + +static struct attribute *nv_c2c_pmu_cxlmem_events[] = { + NV_C2C_PMU_EVENT_ATTR(cycles, C2C_EVENT_CYCLES), + NV_C2C_PMU_EVENT_ATTR(in_rd_cum_outs, C2C_EVENT_IN_RD_CUM_OUTS), + NV_C2C_PMU_EVENT_ATTR(in_rd_req, C2C_EVENT_IN_RD_REQ), + NULL +}; + +static const struct attribute_group nv_c2c_pmu_cxlmem_events_group = { + .name = "events", + .attrs = nv_c2c_pmu_cxlmem_events, +}; + +/* Cpumask attributes. */ + +static ssize_t nv_c2c_pmu_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(pmu); + struct dev_ext_attribute *eattr = + container_of(attr, struct dev_ext_attribute, attr); + unsigned long mask_id = (unsigned long)eattr->var; + const cpumask_t *cpumask; + + switch (mask_id) { + case C2C_ACTIVE_CPU_MASK: + cpumask = &c2c_pmu->active_cpu; + break; + case C2C_ASSOCIATED_CPU_MASK: + cpumask = &c2c_pmu->associated_cpus; + break; + default: + return 0; + } + return cpumap_print_to_pagebuf(true, buf, cpumask); +} + +#define NV_C2C_PMU_CPUMASK_ATTR(_name, _config) \ + NV_C2C_PMU_EXT_ATTR(_name, nv_c2c_pmu_cpumask_show, \ + (unsigned long)_config) + +static struct attribute *nv_c2c_pmu_cpumask_attrs[] = { + NV_C2C_PMU_CPUMASK_ATTR(cpumask, C2C_ACTIVE_CPU_MASK), + NV_C2C_PMU_CPUMASK_ATTR(associated_cpus, C2C_ASSOCIATED_CPU_MASK), + NULL, +}; + +static const struct attribute_group 
nv_c2c_pmu_cpumask_attr_group = { + .attrs = nv_c2c_pmu_cpumask_attrs, +}; + +/* Attribute groups for C2C PMU connecting SoC and GPU */ +static const struct attribute_group *nv_c2c_pmu_gpu_attr_groups[] = { + &nv_c2c_pmu_gpu_format_group, + &nv_c2c_pmu_gpu_events_group, + &nv_c2c_pmu_cpumask_attr_group, + &nv_c2c_pmu_identifier_attr_group, + &nv_c2c_pmu_peer_attr_group, + NULL +}; + +/* Attribute groups for C2C PMU connecting multiple SoCs */ +static const struct attribute_group *nv_c2c_pmu_cpu_attr_groups[] = { + &nv_c2c_pmu_format_group, + &nv_c2c_pmu_cpu_events_group, + &nv_c2c_pmu_cpumask_attr_group, + &nv_c2c_pmu_identifier_attr_group, + &nv_c2c_pmu_peer_attr_group, + NULL +}; + +/* Attribute groups for C2C PMU connecting SoC and CXLMEM */ +static const struct attribute_group *nv_c2c_pmu_cxlmem_attr_groups[] = { + &nv_c2c_pmu_format_group, + &nv_c2c_pmu_cxlmem_events_group, + &nv_c2c_pmu_cpumask_attr_group, + &nv_c2c_pmu_identifier_attr_group, + &nv_c2c_pmu_peer_attr_group, + NULL +}; + +static int nv_c2c_pmu_online_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct nv_c2c_pmu *c2c_pmu = + hlist_entry_safe(node, struct nv_c2c_pmu, cpuhp_node); + + if (!cpumask_test_cpu(cpu, &c2c_pmu->associated_cpus)) + return 0; + + /* If the PMU is already managed, there is nothing to do */ + if (!cpumask_empty(&c2c_pmu->active_cpu)) + return 0; + + /* Use this CPU for event counting */ + cpumask_set_cpu(cpu, &c2c_pmu->active_cpu); + + return 0; +} + +static int nv_c2c_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node) +{ + unsigned int dst; + + struct nv_c2c_pmu *c2c_pmu = + hlist_entry_safe(node, struct nv_c2c_pmu, cpuhp_node); + + /* Nothing to do if this CPU doesn't own the PMU */ + if (!cpumask_test_and_clear_cpu(cpu, &c2c_pmu->active_cpu)) + return 0; + + /* Choose a new CPU to migrate ownership of the PMU to */ + dst = cpumask_any_and_but(&c2c_pmu->associated_cpus, + cpu_online_mask, cpu); + if (dst >= nr_cpu_ids) + return 0; + + /* Use this CPU for 
event counting */ + perf_pmu_migrate_context(&c2c_pmu->pmu, cpu, dst); + cpumask_set_cpu(dst, &c2c_pmu->active_cpu); + + return 0; +} + +static int nv_c2c_pmu_get_cpus(struct nv_c2c_pmu *c2c_pmu) +{ + int socket = c2c_pmu->socket, cpu; + + for_each_possible_cpu(cpu) { + if (cpu_to_node(cpu) == socket) + cpumask_set_cpu(cpu, &c2c_pmu->associated_cpus); + } + + if (cpumask_empty(&c2c_pmu->associated_cpus)) { + dev_dbg(c2c_pmu->dev, + "No cpu associated with C2C PMU socket-%u\n", socket); + return -ENODEV; + } + + return 0; +} + +static int nv_c2c_pmu_init_socket(struct nv_c2c_pmu *c2c_pmu) +{ + const char *uid_str; + int ret, socket; + + uid_str = acpi_device_uid(c2c_pmu->acpi_dev); + if (!uid_str) { + dev_err(c2c_pmu->dev, "No ACPI device UID\n"); + return -ENODEV; + } + + ret = kstrtou32(uid_str, 0, &socket); + if (ret) { + dev_err(c2c_pmu->dev, "Failed to parse ACPI device UID\n"); + return ret; + } + + c2c_pmu->socket = socket; + return 0; +} + +static int nv_c2c_pmu_init_id(struct nv_c2c_pmu *c2c_pmu) +{ + char *name; + + name = devm_kasprintf(c2c_pmu->dev, GFP_KERNEL, c2c_pmu->data->name_fmt, + c2c_pmu->socket); + if (!name) + return -ENOMEM; + + c2c_pmu->name = name; + + c2c_pmu->identifier = acpi_device_hid(c2c_pmu->acpi_dev); + + return 0; +} + +static int nv_c2c_pmu_init_filter(struct nv_c2c_pmu *c2c_pmu) +{ + u32 cpu_en = 0; + struct device *dev = c2c_pmu->dev; + const struct nv_c2c_pmu_data *data = c2c_pmu->data; + + if (data->c2c_type == C2C_TYPE_NVDLINK) { + c2c_pmu->peer_type = C2C_PEER_TYPE_CXLMEM; + + c2c_pmu->peer_insts[0][0] = (1UL << data->nr_inst) - 1; + + c2c_pmu->nr_peer = C2C_NR_PEER_CXLMEM; + c2c_pmu->filter_default = (1 << c2c_pmu->nr_peer) - 1; + + c2c_pmu->attr_groups = nv_c2c_pmu_cxlmem_attr_groups; + + return 0; + } + + if (device_property_read_u32(dev, "cpu_en_mask", &cpu_en)) + dev_dbg(dev, "no cpu_en_mask property\n"); + + if (cpu_en) { + c2c_pmu->peer_type = C2C_PEER_TYPE_CPU; + + /* Fill peer_insts bitmap with instances connected to 
peer CPU. */ + bitmap_from_arr32(c2c_pmu->peer_insts[0], &cpu_en, data->nr_inst); + + c2c_pmu->nr_peer = 1; + c2c_pmu->attr_groups = nv_c2c_pmu_cpu_attr_groups; + } else { + u32 i; + const char *props[C2C_NR_PEER_MAX] = { + "gpu0_en_mask", "gpu1_en_mask" + }; + + for (i = 0; i < C2C_NR_PEER_MAX; i++) { + u32 gpu_en = 0; + + if (device_property_read_u32(dev, props[i], &gpu_en)) + dev_dbg(dev, "no %s property\n", props[i]); + + if (gpu_en) { + /* Fill peer_insts bitmap with instances connected to peer GPU. */ + bitmap_from_arr32(c2c_pmu->peer_insts[i], &gpu_en, + data->nr_inst); + + c2c_pmu->nr_peer++; + } + } + + if (c2c_pmu->nr_peer == 0) { + dev_err(dev, "No GPU is enabled\n"); + return -EINVAL; + } + + c2c_pmu->peer_type = C2C_PEER_TYPE_GPU; + c2c_pmu->attr_groups = nv_c2c_pmu_gpu_attr_groups; + } + + c2c_pmu->filter_default = (1 << c2c_pmu->nr_peer) - 1; + + return 0; +} + +static void *nv_c2c_pmu_init_pmu(struct platform_device *pdev) +{ + int ret; + struct nv_c2c_pmu *c2c_pmu; + struct acpi_device *acpi_dev; + struct device *dev = &pdev->dev; + + acpi_dev = ACPI_COMPANION(dev); + if (!acpi_dev) + return ERR_PTR(-ENODEV); + + c2c_pmu = devm_kzalloc(dev, sizeof(*c2c_pmu), GFP_KERNEL); + if (!c2c_pmu) + return ERR_PTR(-ENOMEM); + + c2c_pmu->dev = dev; + c2c_pmu->acpi_dev = acpi_dev; + c2c_pmu->data = (const struct nv_c2c_pmu_data *)device_get_match_data(dev); + if (!c2c_pmu->data) + return ERR_PTR(-EINVAL); + + platform_set_drvdata(pdev, c2c_pmu); + + ret = nv_c2c_pmu_init_socket(c2c_pmu); + if (ret) + return ERR_PTR(ret); + + ret = nv_c2c_pmu_init_id(c2c_pmu); + if (ret) + return ERR_PTR(ret); + + ret = nv_c2c_pmu_init_filter(c2c_pmu); + if (ret) + return ERR_PTR(ret); + + return c2c_pmu; +} + +static int nv_c2c_pmu_init_mmio(struct nv_c2c_pmu *c2c_pmu) +{ + int i; + struct device *dev = c2c_pmu->dev; + struct platform_device *pdev = to_platform_device(dev); + const struct nv_c2c_pmu_data *data = c2c_pmu->data; + + /* Map the address of all the instances. 
*/ + for (i = 0; i < data->nr_inst; i++) { + c2c_pmu->base[i] = devm_platform_ioremap_resource(pdev, i); + if (IS_ERR(c2c_pmu->base[i])) { + dev_err(dev, "Failed map address for instance %d\n", i); + return PTR_ERR(c2c_pmu->base[i]); + } + } + + /* Map broadcast address. */ + c2c_pmu->base_broadcast = devm_platform_ioremap_resource(pdev, + data->nr_inst); + if (IS_ERR(c2c_pmu->base_broadcast)) { + dev_err(dev, "Failed map broadcast address\n"); + return PTR_ERR(c2c_pmu->base_broadcast); + } + + return 0; +} + +static int nv_c2c_pmu_register_pmu(struct nv_c2c_pmu *c2c_pmu) +{ + int ret; + + ret = cpuhp_state_add_instance(nv_c2c_pmu_cpuhp_state, + &c2c_pmu->cpuhp_node); + if (ret) { + dev_err(c2c_pmu->dev, "Error %d registering hotplug\n", ret); + return ret; + } + + c2c_pmu->pmu = (struct pmu) { + .parent = c2c_pmu->dev, + .task_ctx_nr = perf_invalid_context, + .pmu_enable = nv_c2c_pmu_enable, + .pmu_disable = nv_c2c_pmu_disable, + .event_init = nv_c2c_pmu_event_init, + .add = nv_c2c_pmu_add, + .del = nv_c2c_pmu_del, + .start = nv_c2c_pmu_start, + .stop = nv_c2c_pmu_stop, + .read = nv_c2c_pmu_read, + .attr_groups = c2c_pmu->attr_groups, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE | + PERF_PMU_CAP_NO_INTERRUPT, + }; + + ret = perf_pmu_register(&c2c_pmu->pmu, c2c_pmu->name, -1); + if (ret) { + dev_err(c2c_pmu->dev, "Failed to register C2C PMU: %d\n", ret); + cpuhp_state_remove_instance(nv_c2c_pmu_cpuhp_state, + &c2c_pmu->cpuhp_node); + return ret; + } + + return 0; +} + +static int nv_c2c_pmu_probe(struct platform_device *pdev) +{ + int ret; + struct nv_c2c_pmu *c2c_pmu; + + c2c_pmu = nv_c2c_pmu_init_pmu(pdev); + if (IS_ERR(c2c_pmu)) + return PTR_ERR(c2c_pmu); + + ret = nv_c2c_pmu_init_mmio(c2c_pmu); + if (ret) + return ret; + + ret = nv_c2c_pmu_get_cpus(c2c_pmu); + if (ret) + return ret; + + ret = nv_c2c_pmu_register_pmu(c2c_pmu); + if (ret) + return ret; + + dev_dbg(c2c_pmu->dev, "Registered %s PMU\n", c2c_pmu->name); + + return 0; +} + +static void 
nv_c2c_pmu_device_remove(struct platform_device *pdev) +{ + struct nv_c2c_pmu *c2c_pmu = platform_get_drvdata(pdev); + + perf_pmu_unregister(&c2c_pmu->pmu); + cpuhp_state_remove_instance(nv_c2c_pmu_cpuhp_state, &c2c_pmu->cpuhp_node); +} + +static const struct acpi_device_id nv_c2c_pmu_acpi_match[] = { + { "NVDA2023", (kernel_ulong_t)&nv_c2c_pmu_data[C2C_TYPE_NVLINK] }, + { "NVDA2022", (kernel_ulong_t)&nv_c2c_pmu_data[C2C_TYPE_NVCLINK] }, + { "NVDA2020", (kernel_ulong_t)&nv_c2c_pmu_data[C2C_TYPE_NVDLINK] }, + { } +}; +MODULE_DEVICE_TABLE(acpi, nv_c2c_pmu_acpi_match); + +static struct platform_driver nv_c2c_pmu_driver = { + .driver = { + .name = "nvidia-t410-c2c-pmu", + .acpi_match_table = nv_c2c_pmu_acpi_match, + .suppress_bind_attrs = true, + }, + .probe = nv_c2c_pmu_probe, + .remove = nv_c2c_pmu_device_remove, +}; + +static int __init nv_c2c_pmu_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "perf/nvidia/c2c:online", + nv_c2c_pmu_online_cpu, + nv_c2c_pmu_cpu_teardown); + if (ret < 0) + return ret; + + nv_c2c_pmu_cpuhp_state = ret; + return platform_driver_register(&nv_c2c_pmu_driver); +} + +static void __exit nv_c2c_pmu_exit(void) +{ + platform_driver_unregister(&nv_c2c_pmu_driver); + cpuhp_remove_multi_state(nv_c2c_pmu_cpuhp_state); +} + +module_init(nv_c2c_pmu_init); +module_exit(nv_c2c_pmu_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("NVIDIA Tegra410 C2C PMU driver"); +MODULE_AUTHOR("Besar Wicaksono "); diff --git a/drivers/perf/nvidia_t410_cmem_latency_pmu.c b/drivers/perf/nvidia_t410_cmem_latency_pmu.c new file mode 100644 index 000000000000..acb8f5571522 --- /dev/null +++ b/drivers/perf/nvidia_t410_cmem_latency_pmu.c @@ -0,0 +1,736 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVIDIA Tegra410 CPU Memory (CMEM) Latency PMU driver. + * + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NUM_INSTANCES 14 + +/* Register offsets. */ +#define CMEM_LAT_CG_CTRL 0x800 +#define CMEM_LAT_CTRL 0x808 +#define CMEM_LAT_STATUS 0x810 +#define CMEM_LAT_CYCLE_CNTR 0x818 +#define CMEM_LAT_MC0_REQ_CNTR 0x820 +#define CMEM_LAT_MC0_AOR_CNTR 0x830 +#define CMEM_LAT_MC1_REQ_CNTR 0x838 +#define CMEM_LAT_MC1_AOR_CNTR 0x848 +#define CMEM_LAT_MC2_REQ_CNTR 0x850 +#define CMEM_LAT_MC2_AOR_CNTR 0x860 + +/* CMEM_LAT_CTRL values. */ +#define CMEM_LAT_CTRL_DISABLE 0x0ULL +#define CMEM_LAT_CTRL_ENABLE 0x1ULL +#define CMEM_LAT_CTRL_CLR 0x2ULL + +/* CMEM_LAT_CG_CTRL values. */ +#define CMEM_LAT_CG_CTRL_DISABLE 0x0ULL +#define CMEM_LAT_CG_CTRL_ENABLE 0x1ULL + +/* CMEM_LAT_STATUS register field. */ +#define CMEM_LAT_STATUS_CYCLE_OVF BIT(0) +#define CMEM_LAT_STATUS_MC0_AOR_OVF BIT(1) +#define CMEM_LAT_STATUS_MC0_REQ_OVF BIT(3) +#define CMEM_LAT_STATUS_MC1_AOR_OVF BIT(4) +#define CMEM_LAT_STATUS_MC1_REQ_OVF BIT(6) +#define CMEM_LAT_STATUS_MC2_AOR_OVF BIT(7) +#define CMEM_LAT_STATUS_MC2_REQ_OVF BIT(9) + +/* Events. 
*/ +#define CMEM_LAT_EVENT_CYCLES 0x0 +#define CMEM_LAT_EVENT_REQ 0x1 +#define CMEM_LAT_EVENT_AOR 0x2 + +#define CMEM_LAT_NUM_EVENTS 0x3 +#define CMEM_LAT_MASK_EVENT 0x3 +#define CMEM_LAT_MAX_ACTIVE_EVENTS 32 + +#define CMEM_LAT_ACTIVE_CPU_MASK 0x0 +#define CMEM_LAT_ASSOCIATED_CPU_MASK 0x1 + +static unsigned long cmem_lat_pmu_cpuhp_state; + +struct cmem_lat_pmu_hw_events { + struct perf_event *events[CMEM_LAT_MAX_ACTIVE_EVENTS]; + DECLARE_BITMAP(used_ctrs, CMEM_LAT_MAX_ACTIVE_EVENTS); +}; + +struct cmem_lat_pmu { + struct pmu pmu; + struct device *dev; + const char *name; + const char *identifier; + void __iomem *base_broadcast; + void __iomem *base[NUM_INSTANCES]; + cpumask_t associated_cpus; + cpumask_t active_cpu; + struct hlist_node node; + struct cmem_lat_pmu_hw_events hw_events; +}; + +#define to_cmem_lat_pmu(p) \ + container_of(p, struct cmem_lat_pmu, pmu) + + +/* Get event type from perf_event. */ +static inline u32 get_event_type(struct perf_event *event) +{ + return (event->attr.config) & CMEM_LAT_MASK_EVENT; +} + +/* PMU operations. */ +static int cmem_lat_pmu_get_event_idx(struct cmem_lat_pmu_hw_events *hw_events, + struct perf_event *event) +{ + unsigned int idx; + + idx = find_first_zero_bit(hw_events->used_ctrs, CMEM_LAT_MAX_ACTIVE_EVENTS); + if (idx >= CMEM_LAT_MAX_ACTIVE_EVENTS) + return -EAGAIN; + + set_bit(idx, hw_events->used_ctrs); + + return idx; +} + +static bool cmem_lat_pmu_validate_event(struct pmu *pmu, + struct cmem_lat_pmu_hw_events *hw_events, + struct perf_event *event) +{ + int ret; + + if (is_software_event(event)) + return true; + + /* Reject groups spanning multiple HW PMUs. */ + if (event->pmu != pmu) + return false; + + ret = cmem_lat_pmu_get_event_idx(hw_events, event); + if (ret < 0) + return false; + + return true; +} + +/* Make sure the group of events can be scheduled at once on the PMU. 
*/ +static bool cmem_lat_pmu_validate_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + struct cmem_lat_pmu_hw_events fake_hw_events; + + if (event->group_leader == event) + return true; + + memset(&fake_hw_events, 0, sizeof(fake_hw_events)); + + if (!cmem_lat_pmu_validate_event(event->pmu, &fake_hw_events, leader)) + return false; + + for_each_sibling_event(sibling, leader) { + if (!cmem_lat_pmu_validate_event(event->pmu, &fake_hw_events, sibling)) + return false; + } + + return cmem_lat_pmu_validate_event(event->pmu, &fake_hw_events, event); +} + +static int cmem_lat_pmu_event_init(struct perf_event *event) +{ + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + u32 event_type = get_event_type(event); + + if (event->attr.type != event->pmu->type || + event_type >= CMEM_LAT_NUM_EVENTS) + return -ENOENT; + + /* + * Sampling, per-process mode, and per-task counters are not supported + * since this PMU is shared across all CPUs. + */ + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) { + dev_dbg(cmem_lat_pmu->pmu.dev, + "Can't support sampling and per-process mode\n"); + return -EOPNOTSUPP; + } + + if (event->cpu < 0) { + dev_dbg(cmem_lat_pmu->pmu.dev, "Can't support per-task counters\n"); + return -EINVAL; + } + + /* + * Make sure the CPU assignment is on one of the CPUs associated with + * this PMU. + */ + if (!cpumask_test_cpu(event->cpu, &cmem_lat_pmu->associated_cpus)) { + dev_dbg(cmem_lat_pmu->pmu.dev, + "Requested cpu is not associated with the PMU\n"); + return -EINVAL; + } + + /* Enforce the current active CPU to handle the events in this PMU. 
*/ + event->cpu = cpumask_first(&cmem_lat_pmu->active_cpu); + if (event->cpu >= nr_cpu_ids) + return -EINVAL; + + if (!cmem_lat_pmu_validate_group(event)) + return -EINVAL; + + hwc->idx = -1; + hwc->config = event_type; + + return 0; +} + +static u64 cmem_lat_pmu_read_status(struct cmem_lat_pmu *cmem_lat_pmu, + unsigned int inst) +{ + return readq(cmem_lat_pmu->base[inst] + CMEM_LAT_STATUS); +} + +static u64 cmem_lat_pmu_read_cycle_counter(struct perf_event *event) +{ + const unsigned int instance = 0; + u64 status; + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); + struct device *dev = cmem_lat_pmu->dev; + + /* + * Use the reading from first instance since all instances are + * identical. + */ + status = cmem_lat_pmu_read_status(cmem_lat_pmu, instance); + if (status & CMEM_LAT_STATUS_CYCLE_OVF) + dev_warn(dev, "Cycle counter overflow\n"); + + return readq(cmem_lat_pmu->base[instance] + CMEM_LAT_CYCLE_CNTR); +} + +static u64 cmem_lat_pmu_read_req_counter(struct perf_event *event) +{ + unsigned int i; + u64 status, val = 0; + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); + struct device *dev = cmem_lat_pmu->dev; + + /* Sum up the counts from all instances. 
*/ + for (i = 0; i < NUM_INSTANCES; i++) { + status = cmem_lat_pmu_read_status(cmem_lat_pmu, i); + if (status & CMEM_LAT_STATUS_MC0_REQ_OVF) + dev_warn(dev, "MC0 request counter overflow\n"); + if (status & CMEM_LAT_STATUS_MC1_REQ_OVF) + dev_warn(dev, "MC1 request counter overflow\n"); + if (status & CMEM_LAT_STATUS_MC2_REQ_OVF) + dev_warn(dev, "MC2 request counter overflow\n"); + + val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC0_REQ_CNTR); + val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC1_REQ_CNTR); + val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC2_REQ_CNTR); + } + + return val; +} + +static u64 cmem_lat_pmu_read_aor_counter(struct perf_event *event) +{ + unsigned int i; + u64 status, val = 0; + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); + struct device *dev = cmem_lat_pmu->dev; + + /* Sum up the counts from all instances. */ + for (i = 0; i < NUM_INSTANCES; i++) { + status = cmem_lat_pmu_read_status(cmem_lat_pmu, i); + if (status & CMEM_LAT_STATUS_MC0_AOR_OVF) + dev_warn(dev, "MC0 AOR counter overflow\n"); + if (status & CMEM_LAT_STATUS_MC1_AOR_OVF) + dev_warn(dev, "MC1 AOR counter overflow\n"); + if (status & CMEM_LAT_STATUS_MC2_AOR_OVF) + dev_warn(dev, "MC2 AOR counter overflow\n"); + + val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC0_AOR_CNTR); + val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC1_AOR_CNTR); + val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC2_AOR_CNTR); + } + + return val; +} + +static u64 (*read_counter_fn[CMEM_LAT_NUM_EVENTS])(struct perf_event *) = { + [CMEM_LAT_EVENT_CYCLES] = cmem_lat_pmu_read_cycle_counter, + [CMEM_LAT_EVENT_REQ] = cmem_lat_pmu_read_req_counter, + [CMEM_LAT_EVENT_AOR] = cmem_lat_pmu_read_aor_counter, +}; + +static void cmem_lat_pmu_event_update(struct perf_event *event) +{ + u32 event_type; + u64 prev, now; + struct hw_perf_event *hwc = &event->hw; + + if (hwc->state & PERF_HES_STOPPED) + return; + + event_type = hwc->config; + + do { + prev = local64_read(&hwc->prev_count); + now = 
read_counter_fn[event_type](event); + } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev); + + local64_add(now - prev, &event->count); + + hwc->state |= PERF_HES_UPTODATE; +} + +static void cmem_lat_pmu_start(struct perf_event *event, int pmu_flags) +{ + event->hw.state = 0; +} + +static void cmem_lat_pmu_stop(struct perf_event *event, int pmu_flags) +{ + event->hw.state |= PERF_HES_STOPPED; +} + +static int cmem_lat_pmu_add(struct perf_event *event, int flags) +{ + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); + struct cmem_lat_pmu_hw_events *hw_events = &cmem_lat_pmu->hw_events; + struct hw_perf_event *hwc = &event->hw; + int idx; + + if (WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), + &cmem_lat_pmu->associated_cpus))) + return -ENOENT; + + idx = cmem_lat_pmu_get_event_idx(hw_events, event); + if (idx < 0) + return idx; + + hw_events->events[idx] = event; + hwc->idx = idx; + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + + if (flags & PERF_EF_START) + cmem_lat_pmu_start(event, PERF_EF_RELOAD); + + /* Propagate changes to the userspace mapping. 
*/ + perf_event_update_userpage(event); + + return 0; +} + +static void cmem_lat_pmu_del(struct perf_event *event, int flags) +{ + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu); + struct cmem_lat_pmu_hw_events *hw_events = &cmem_lat_pmu->hw_events; + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + cmem_lat_pmu_stop(event, PERF_EF_UPDATE); + + hw_events->events[idx] = NULL; + + clear_bit(idx, hw_events->used_ctrs); + + perf_event_update_userpage(event); +} + +static void cmem_lat_pmu_read(struct perf_event *event) +{ + cmem_lat_pmu_event_update(event); +} + +static inline void cmem_lat_pmu_cg_ctrl(struct cmem_lat_pmu *cmem_lat_pmu, + u64 val) +{ + writeq(val, cmem_lat_pmu->base_broadcast + CMEM_LAT_CG_CTRL); +} + +static inline void cmem_lat_pmu_ctrl(struct cmem_lat_pmu *cmem_lat_pmu, u64 val) +{ + writeq(val, cmem_lat_pmu->base_broadcast + CMEM_LAT_CTRL); +} + +static void cmem_lat_pmu_enable(struct pmu *pmu) +{ + bool disabled; + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu); + + disabled = bitmap_empty(cmem_lat_pmu->hw_events.used_ctrs, + CMEM_LAT_MAX_ACTIVE_EVENTS); + + if (disabled) + return; + + /* Enable all the counters. */ + cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_ENABLE); + cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_ENABLE); +} + +static void cmem_lat_pmu_disable(struct pmu *pmu) +{ + int idx; + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu); + + /* Disable all the counters. */ + cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_DISABLE); + + /* + * The counters will start from 0 again on restart. + * Update the events immediately to avoid losing the counts. 
+ */ + for_each_set_bit(idx, cmem_lat_pmu->hw_events.used_ctrs, + CMEM_LAT_MAX_ACTIVE_EVENTS) { + struct perf_event *event = cmem_lat_pmu->hw_events.events[idx]; + + if (!event) + continue; + + cmem_lat_pmu_event_update(event); + + local64_set(&event->hw.prev_count, 0ULL); + } + + cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_CLR); + cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_DISABLE); +} + +/* PMU identifier attribute. */ + +static ssize_t cmem_lat_pmu_identifier_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(dev_get_drvdata(dev)); + + return sysfs_emit(page, "%s\n", cmem_lat_pmu->identifier); +} + +static struct device_attribute cmem_lat_pmu_identifier_attr = + __ATTR(identifier, 0444, cmem_lat_pmu_identifier_show, NULL); + +static struct attribute *cmem_lat_pmu_identifier_attrs[] = { + &cmem_lat_pmu_identifier_attr.attr, + NULL +}; + +static struct attribute_group cmem_lat_pmu_identifier_attr_group = { + .attrs = cmem_lat_pmu_identifier_attrs, +}; + +/* Format attributes. */ + +#define NV_PMU_EXT_ATTR(_name, _func, _config) \ + (&((struct dev_ext_attribute[]){ \ + { \ + .attr = __ATTR(_name, 0444, _func, NULL), \ + .var = (void *)_config \ + } \ + })[0].attr.attr) + +static struct attribute *cmem_lat_pmu_formats[] = { + NV_PMU_EXT_ATTR(event, device_show_string, "config:0-1"), + NULL +}; + +static const struct attribute_group cmem_lat_pmu_format_group = { + .name = "format", + .attrs = cmem_lat_pmu_formats, +}; + +/* Event attributes. 
*/ + +static ssize_t cmem_lat_pmu_sysfs_event_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, typeof(*pmu_attr), attr); + return sysfs_emit(buf, "event=0x%llx\n", pmu_attr->id); +} + +#define NV_PMU_EVENT_ATTR(_name, _config) \ + PMU_EVENT_ATTR_ID(_name, cmem_lat_pmu_sysfs_event_show, _config) + +static struct attribute *cmem_lat_pmu_events[] = { + NV_PMU_EVENT_ATTR(cycles, CMEM_LAT_EVENT_CYCLES), + NV_PMU_EVENT_ATTR(rd_req, CMEM_LAT_EVENT_REQ), + NV_PMU_EVENT_ATTR(rd_cum_outs, CMEM_LAT_EVENT_AOR), + NULL +}; + +static const struct attribute_group cmem_lat_pmu_events_group = { + .name = "events", + .attrs = cmem_lat_pmu_events, +}; + +/* Cpumask attributes. */ + +static ssize_t cmem_lat_pmu_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu); + struct dev_ext_attribute *eattr = + container_of(attr, struct dev_ext_attribute, attr); + unsigned long mask_id = (unsigned long)eattr->var; + const cpumask_t *cpumask; + + switch (mask_id) { + case CMEM_LAT_ACTIVE_CPU_MASK: + cpumask = &cmem_lat_pmu->active_cpu; + break; + case CMEM_LAT_ASSOCIATED_CPU_MASK: + cpumask = &cmem_lat_pmu->associated_cpus; + break; + default: + return 0; + } + return cpumap_print_to_pagebuf(true, buf, cpumask); +} + +#define NV_PMU_CPUMASK_ATTR(_name, _config) \ + NV_PMU_EXT_ATTR(_name, cmem_lat_pmu_cpumask_show, \ + (unsigned long)_config) + +static struct attribute *cmem_lat_pmu_cpumask_attrs[] = { + NV_PMU_CPUMASK_ATTR(cpumask, CMEM_LAT_ACTIVE_CPU_MASK), + NV_PMU_CPUMASK_ATTR(associated_cpus, CMEM_LAT_ASSOCIATED_CPU_MASK), + NULL +}; + +static const struct attribute_group cmem_lat_pmu_cpumask_attr_group = { + .attrs = cmem_lat_pmu_cpumask_attrs, +}; + +/* Per PMU device attribute groups. 
*/ + +static const struct attribute_group *cmem_lat_pmu_attr_groups[] = { + &cmem_lat_pmu_identifier_attr_group, + &cmem_lat_pmu_format_group, + &cmem_lat_pmu_events_group, + &cmem_lat_pmu_cpumask_attr_group, + NULL +}; + +static int cmem_lat_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) +{ + struct cmem_lat_pmu *cmem_lat_pmu = + hlist_entry_safe(node, struct cmem_lat_pmu, node); + + if (!cpumask_test_cpu(cpu, &cmem_lat_pmu->associated_cpus)) + return 0; + + /* If the PMU is already managed, there is nothing to do */ + if (!cpumask_empty(&cmem_lat_pmu->active_cpu)) + return 0; + + /* Use this CPU for event counting */ + cpumask_set_cpu(cpu, &cmem_lat_pmu->active_cpu); + + return 0; +} + +static int cmem_lat_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node) +{ + unsigned int dst; + + struct cmem_lat_pmu *cmem_lat_pmu = + hlist_entry_safe(node, struct cmem_lat_pmu, node); + + /* Nothing to do if this CPU doesn't own the PMU */ + if (!cpumask_test_and_clear_cpu(cpu, &cmem_lat_pmu->active_cpu)) + return 0; + + /* Choose a new CPU to migrate ownership of the PMU to */ + dst = cpumask_any_and_but(&cmem_lat_pmu->associated_cpus, + cpu_online_mask, cpu); + if (dst >= nr_cpu_ids) + return 0; + + /* Use this CPU for event counting */ + perf_pmu_migrate_context(&cmem_lat_pmu->pmu, cpu, dst); + cpumask_set_cpu(dst, &cmem_lat_pmu->active_cpu); + + return 0; +} + +static int cmem_lat_pmu_get_cpus(struct cmem_lat_pmu *cmem_lat_pmu, + unsigned int socket) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (cpu_to_node(cpu) == socket) + cpumask_set_cpu(cpu, &cmem_lat_pmu->associated_cpus); + } + + if (cpumask_empty(&cmem_lat_pmu->associated_cpus)) { + dev_dbg(cmem_lat_pmu->dev, + "No cpu associated with PMU socket-%u\n", socket); + return -ENODEV; + } + + return 0; +} + +static int cmem_lat_pmu_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct acpi_device *acpi_dev; + struct cmem_lat_pmu *cmem_lat_pmu; + char *name, *uid_str; 
+ int ret, i; + u32 socket; + + acpi_dev = ACPI_COMPANION(dev); + if (!acpi_dev) + return -ENODEV; + + uid_str = acpi_device_uid(acpi_dev); + if (!uid_str) + return -ENODEV; + + ret = kstrtou32(uid_str, 0, &socket); + if (ret) + return ret; + + cmem_lat_pmu = devm_kzalloc(dev, sizeof(*cmem_lat_pmu), GFP_KERNEL); + name = devm_kasprintf(dev, GFP_KERNEL, "nvidia_cmem_latency_pmu_%u", socket); + if (!cmem_lat_pmu || !name) + return -ENOMEM; + + cmem_lat_pmu->dev = dev; + cmem_lat_pmu->name = name; + cmem_lat_pmu->identifier = acpi_device_hid(acpi_dev); + platform_set_drvdata(pdev, cmem_lat_pmu); + + cmem_lat_pmu->pmu = (struct pmu) { + /* Lets perf core hold a module reference while events exist. */ + .module = THIS_MODULE, + .parent = &pdev->dev, + .task_ctx_nr = perf_invalid_context, + .pmu_enable = cmem_lat_pmu_enable, + .pmu_disable = cmem_lat_pmu_disable, + .event_init = cmem_lat_pmu_event_init, + .add = cmem_lat_pmu_add, + .del = cmem_lat_pmu_del, + .start = cmem_lat_pmu_start, + .stop = cmem_lat_pmu_stop, + .read = cmem_lat_pmu_read, + .attr_groups = cmem_lat_pmu_attr_groups, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE | + PERF_PMU_CAP_NO_INTERRUPT, + }; + + /* Map the address of all the instances. */ + for (i = 0; i < NUM_INSTANCES; i++) { + cmem_lat_pmu->base[i] = devm_platform_ioremap_resource(pdev, i); + if (IS_ERR(cmem_lat_pmu->base[i])) { + dev_err(dev, "Failed map address for instance %d\n", i); + return PTR_ERR(cmem_lat_pmu->base[i]); + } + } + + /* Map broadcast address.
*/ + cmem_lat_pmu->base_broadcast = devm_platform_ioremap_resource(pdev, + NUM_INSTANCES); + if (IS_ERR(cmem_lat_pmu->base_broadcast)) { + dev_err(dev, "Failed map broadcast address\n"); + return PTR_ERR(cmem_lat_pmu->base_broadcast); + } + + ret = cmem_lat_pmu_get_cpus(cmem_lat_pmu, socket); + if (ret) + return ret; + + ret = cpuhp_state_add_instance(cmem_lat_pmu_cpuhp_state, + &cmem_lat_pmu->node); + if (ret) { + dev_err(&pdev->dev, "Error %d registering hotplug\n", ret); + return ret; + } + + cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_ENABLE); + cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_CLR); + cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_DISABLE); + + ret = perf_pmu_register(&cmem_lat_pmu->pmu, name, -1); + if (ret) { + dev_err(&pdev->dev, "Failed to register PMU: %d\n", ret); + cpuhp_state_remove_instance(cmem_lat_pmu_cpuhp_state, + &cmem_lat_pmu->node); + return ret; + } + + dev_dbg(&pdev->dev, "Registered %s PMU\n", name); + + return 0; +} + +static void cmem_lat_pmu_device_remove(struct platform_device *pdev) +{ + struct cmem_lat_pmu *cmem_lat_pmu = platform_get_drvdata(pdev); + + perf_pmu_unregister(&cmem_lat_pmu->pmu); + cpuhp_state_remove_instance(cmem_lat_pmu_cpuhp_state, + &cmem_lat_pmu->node); +} + +static const struct acpi_device_id cmem_lat_pmu_acpi_match[] = { + { "NVDA2021" }, + { } +}; +MODULE_DEVICE_TABLE(acpi, cmem_lat_pmu_acpi_match); + +static struct platform_driver cmem_lat_pmu_driver = { + .driver = { + .name = "nvidia-t410-cmem-latency-pmu", + .acpi_match_table = ACPI_PTR(cmem_lat_pmu_acpi_match), + .suppress_bind_attrs = true, + }, + .probe = cmem_lat_pmu_probe, + .remove = cmem_lat_pmu_device_remove, +}; + +static int __init cmem_lat_pmu_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "perf/nvidia/cmem_latency:online", + cmem_lat_pmu_cpu_online, + cmem_lat_pmu_cpu_teardown); + if (ret < 0) + return ret; + + cmem_lat_pmu_cpuhp_state = ret; + + return 
platform_driver_register(&cmem_lat_pmu_driver); +} + +static void __exit cmem_lat_pmu_exit(void) +{ + platform_driver_unregister(&cmem_lat_pmu_driver); + cpuhp_remove_multi_state(cmem_lat_pmu_cpuhp_state); +} + +module_init(cmem_lat_pmu_init); +module_exit(cmem_lat_pmu_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("NVIDIA Tegra410 CPU Memory Latency PMU driver"); +MODULE_AUTHOR("Besar Wicaksono "); diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig index c808e0470394..672abea3b03c 100644 --- a/drivers/resctrl/Kconfig +++ b/drivers/resctrl/Kconfig @@ -1,6 +1,7 @@ menuconfig ARM64_MPAM_DRIVER bool "MPAM driver" - depends on ARM64 && ARM64_MPAM && EXPERT + depends on ARM64 && ARM64_MPAM + select ACPI_MPAM if ACPI help Memory System Resource Partitioning and Monitoring (MPAM) driver for System IP, e.g. caches and memory controllers. @@ -22,3 +23,9 @@ config MPAM_KUNIT_TEST If unsure, say N. endif + +config ARM64_MPAM_RESCTRL_FS + bool + default y if ARM64_MPAM_DRIVER && RESCTRL_FS + select RESCTRL_RMID_DEPENDS_ON_CLOSID + select RESCTRL_ASSIGN_FIXED diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile index 898199dcf80d..4f6d0e81f9b8 100644 --- a/drivers/resctrl/Makefile +++ b/drivers/resctrl/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_ARM64_MPAM_DRIVER) += mpam.o mpam-y += mpam_devices.o +mpam-$(CONFIG_ARM64_MPAM_RESCTRL_FS) += mpam_resctrl.o ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 0666be6b0e88..41b14344b16f 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -29,7 +29,15 @@ #include "mpam_internal.h" -DEFINE_STATIC_KEY_FALSE(mpam_enabled); /* This moves to arch code */ +/* Values for the T241 errata workaround */ +#define T241_CHIPS_MAX 4 +#define T241_CHIP_NSLICES 12 +#define T241_SPARE_REG0_OFF 0x1b0000 +#define T241_SPARE_REG1_OFF 0x1c0000 +#define T241_CHIP_ID(phys) FIELD_GET(GENMASK_ULL(44, 43), phys) +#define 
T241_SHADOW_REG_OFF(sidx, pid) (0x360048 + (sidx) * 0x10000 + (pid) * 8) +#define SMCCC_SOC_ID_T241 0x036b0241 +static void __iomem *t241_scratch_regs[T241_CHIPS_MAX]; /* * mpam_list_lock protects the SRCU lists when writing. Once the @@ -75,6 +83,14 @@ static DECLARE_WORK(mpam_broken_work, &mpam_disable); /* When mpam is disabled, the printed reason to aid debugging */ static char *mpam_disable_reason; +/* + * Whether resctrl has been setup. Used by cpuhp in preference to + * mpam_is_enabled(). The disable call after an error interrupt makes + * mpam_is_enabled() false before the cpuhp callbacks are made. + * Reads/writes should hold mpam_cpuhp_state_lock, (or be cpuhp callbacks). + */ +static bool mpam_resctrl_enabled; + /* * An MSC is a physical container for controls and monitors, each identified by * their RIS index. These share a base-address, interrupts and some MMIO @@ -624,6 +640,86 @@ static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc, return ERR_PTR(-ENOENT); } +static int mpam_enable_quirk_nvidia_t241_1(struct mpam_msc *msc, + const struct mpam_quirk *quirk) +{ + s32 soc_id = arm_smccc_get_soc_id_version(); + struct resource *r; + phys_addr_t phys; + + /* + * A mapping to a device other than the MSC is needed, check + * SOC_ID is NVIDIA T241 chip (036b:0241) + */ + if (soc_id < 0 || soc_id != SMCCC_SOC_ID_T241) + return -EINVAL; + + r = platform_get_resource(msc->pdev, IORESOURCE_MEM, 0); + if (!r) + return -EINVAL; + + /* Find the internal registers base addr from the CHIP ID */ + msc->t241_id = T241_CHIP_ID(r->start); + phys = FIELD_PREP(GENMASK_ULL(45, 44), msc->t241_id) | 0x19000000ULL; + + t241_scratch_regs[msc->t241_id] = ioremap(phys, SZ_8M); + if (WARN_ON_ONCE(!t241_scratch_regs[msc->t241_id])) + return -EINVAL; + + pr_info_once("Enabled workaround for NVIDIA T241 erratum T241-MPAM-1\n"); + + return 0; +} + +static const struct mpam_quirk mpam_quirks[] = { + { + /* NVIDIA t241 erratum T241-MPAM-1 */ + .init = 
mpam_enable_quirk_nvidia_t241_1, + .iidr = MPAM_IIDR_NVIDIA_T241, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = T241_SCRUB_SHADOW_REGS, + }, + { + /* NVIDIA t241 erratum T241-MPAM-4 */ + .iidr = MPAM_IIDR_NVIDIA_T241, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = T241_FORCE_MBW_MIN_TO_ONE, + }, + { + /* NVIDIA t241 erratum T241-MPAM-6 */ + .iidr = MPAM_IIDR_NVIDIA_T241, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = T241_MBW_COUNTER_SCALE_64, + }, + { + /* ARM CMN-650 CSU erratum 3642720 */ + .iidr = MPAM_IIDR_ARM_CMN_650, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = IGNORE_CSU_NRDY, + }, + { NULL } /* Sentinel */ +}; + +static void mpam_enable_quirks(struct mpam_msc *msc) +{ + const struct mpam_quirk *quirk; + + for (quirk = &mpam_quirks[0]; quirk->iidr_mask; quirk++) { + int err = 0; + + if (quirk->iidr != (msc->iidr & quirk->iidr_mask)) + continue; + + if (quirk->init) + err = quirk->init(msc, quirk); + + if (err) + continue; + + mpam_set_quirk(quirk->workaround, msc); + } +} + /* * IHI009A.a has this nugget: "If a monitor does not support automatic behaviour * of NRDY, software can use this bit for any purpose" - so hardware might not @@ -715,6 +811,13 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) mpam_set_feature(mpam_feat_mbw_part, props); props->bwa_wd = FIELD_GET(MPAMF_MBW_IDR_BWA_WD, mbw_features); + + /* + * The BWA_WD field can represent 0-63, but the control fields it + * describes have a maximum of 16 bits. 
+ */ + props->bwa_wd = min(props->bwa_wd, 16); + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features)) mpam_set_feature(mpam_feat_mbw_max, props); @@ -851,8 +954,11 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc) /* Grab an IDR value to find out how many RIS there are */ mutex_lock(&msc->part_sel_lock); idr = mpam_msc_read_idr(msc); + msc->iidr = mpam_read_partsel_reg(msc, IIDR); mutex_unlock(&msc->part_sel_lock); + mpam_enable_quirks(msc); + msc->ris_max = FIELD_GET(MPAMF_IDR_RIS_MAX, idr); /* Use these values so partid/pmg always starts with a valid value */ @@ -903,6 +1009,7 @@ struct mon_read { enum mpam_device_features type; u64 *val; int err; + bool waited_timeout; }; static bool mpam_ris_has_mbwu_long_counter(struct mpam_msc_ris *ris) @@ -1052,7 +1159,7 @@ static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val, } } -static u64 mpam_msmon_overflow_val(enum mpam_device_features type) +static u64 __mpam_msmon_overflow_val(enum mpam_device_features type) { /* TODO: implement scaling counters */ switch (type) { @@ -1067,6 +1174,18 @@ static u64 mpam_msmon_overflow_val(enum mpam_device_features type) } } +static u64 mpam_msmon_overflow_val(enum mpam_device_features type, + struct mpam_msc *msc) +{ + u64 overflow_val = __mpam_msmon_overflow_val(type); + + if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc) && + type != mpam_feat_msmon_mbwu_63counter) + overflow_val *= 64; + + return overflow_val; +} + static void __ris_msmon_read(void *arg) { u64 now; @@ -1137,6 +1256,10 @@ static void __ris_msmon_read(void *arg) if (mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, rprops)) nrdy = now & MSMON___NRDY; now = FIELD_GET(MSMON___VALUE, now); + + if (mpam_has_quirk(IGNORE_CSU_NRDY, msc) && m->waited_timeout) + nrdy = false; + break; case mpam_feat_msmon_mbwu_31counter: case mpam_feat_msmon_mbwu_44counter: @@ -1157,13 +1280,17 @@ static void __ris_msmon_read(void *arg) now = FIELD_GET(MSMON___VALUE, now); } + if 
(mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc) && + m->type != mpam_feat_msmon_mbwu_63counter) + now *= 64; + if (nrdy) break; mbwu_state = &ris->mbwu_state[ctx->mon]; if (overflow) - mbwu_state->correction += mpam_msmon_overflow_val(m->type); + mbwu_state->correction += mpam_msmon_overflow_val(m->type, msc); /* * Include bandwidth consumed before the last hardware reset and @@ -1270,6 +1397,7 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, .ctx = ctx, .type = type, .val = val, + .waited_timeout = true, }; *val = 0; @@ -1338,6 +1466,75 @@ static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) __mpam_write_reg(msc, reg, bm); } +static void mpam_apply_t241_erratum(struct mpam_msc_ris *ris, u16 partid) +{ + int sidx, i, lcount = 1000; + void __iomem *regs; + u64 val0, val; + + regs = t241_scratch_regs[ris->vmsc->msc->t241_id]; + + for (i = 0; i < lcount; i++) { + /* Read the shadow register at index 0 */ + val0 = readq_relaxed(regs + T241_SHADOW_REG_OFF(0, partid)); + + /* Check if all the shadow registers have the same value */ + for (sidx = 1; sidx < T241_CHIP_NSLICES; sidx++) { + val = readq_relaxed(regs + + T241_SHADOW_REG_OFF(sidx, partid)); + if (val != val0) + break; + } + if (sidx == T241_CHIP_NSLICES) + break; + } + + if (i == lcount) + pr_warn_once("t241: inconsistent values in shadow regs"); + + /* Write a value zero to spare registers to take effect of MBW conf */ + writeq_relaxed(0, regs + T241_SPARE_REG0_OFF); + writeq_relaxed(0, regs + T241_SPARE_REG1_OFF); +} + +static void mpam_quirk_post_config_change(struct mpam_msc_ris *ris, u16 partid, + struct mpam_config *cfg) +{ + if (mpam_has_quirk(T241_SCRUB_SHADOW_REGS, ris->vmsc->msc)) + mpam_apply_t241_erratum(ris, partid); +} + +static u16 mpam_wa_t241_force_mbw_min_to_one(struct mpam_props *props) +{ + u16 max_hw_value, min_hw_granule, res0_bits; + + res0_bits = 16 - props->bwa_wd; + max_hw_value = ((1 << props->bwa_wd) - 1) << res0_bits; + min_hw_granule = 
~max_hw_value; + + return min_hw_granule + 1; +} + +static u16 mpam_wa_t241_calc_min_from_max(struct mpam_props *props, + struct mpam_config *cfg) +{ + u16 val = 0; + u16 max; + u16 delta = ((5 * MPAMCFG_MBW_MAX_MAX) / 100) - 1; + + if (mpam_has_feature(mpam_feat_mbw_max, cfg)) { + max = cfg->mbw_max; + } else { + /* Resetting. Hence, use the ris specific default. */ + max = GENMASK(15, 16 - props->bwa_wd); + } + + if (max > delta) + val = max - delta; + + return val; +} + /* Called via IPI. Call while holding an SRCU reference */ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, struct mpam_config *cfg) @@ -1364,36 +1561,41 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, __mpam_intpart_sel(ris->ris_idx, partid, msc); } - if (mpam_has_feature(mpam_feat_cpor_part, rprops) && - mpam_has_feature(mpam_feat_cpor_part, cfg)) { - if (cfg->reset_cpbm) - mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd); - else + if (mpam_has_feature(mpam_feat_cpor_part, rprops)) { + if (mpam_has_feature(mpam_feat_cpor_part, cfg)) mpam_write_partsel_reg(msc, CPBM, cfg->cpbm); + else + mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd); } - if (mpam_has_feature(mpam_feat_mbw_part, rprops) && - mpam_has_feature(mpam_feat_mbw_part, cfg)) { - if (cfg->reset_mbw_pbm) + if (mpam_has_feature(mpam_feat_mbw_part, rprops)) { + if (mpam_has_feature(mpam_feat_mbw_part, cfg)) mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, rprops->mbw_pbm_bits); else mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm); } - if (mpam_has_feature(mpam_feat_mbw_min, rprops) && - mpam_has_feature(mpam_feat_mbw_min, cfg)) - mpam_write_partsel_reg(msc, MBW_MIN, 0); + if (mpam_has_feature(mpam_feat_mbw_min, rprops)) { + u16 val = 0; - if (mpam_has_feature(mpam_feat_mbw_max, rprops) && - mpam_has_feature(mpam_feat_mbw_max, cfg)) { - if (cfg->reset_mbw_max) - mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX); - else - mpam_write_partsel_reg(msc, MBW_MAX, 
cfg->mbw_max); + if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, msc)) { + u16 min = mpam_wa_t241_force_mbw_min_to_one(rprops); + + val = mpam_wa_t241_calc_min_from_max(rprops, cfg); + val = max(val, min); + } + + mpam_write_partsel_reg(msc, MBW_MIN, val); } - if (mpam_has_feature(mpam_feat_mbw_prop, rprops) && - mpam_has_feature(mpam_feat_mbw_prop, cfg)) + if (mpam_has_feature(mpam_feat_mbw_max, rprops)) { + if (mpam_has_feature(mpam_feat_mbw_max, cfg)) + mpam_write_partsel_reg(msc, MBW_MAX, cfg->mbw_max); + else + mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX); + } + + if (mpam_has_feature(mpam_feat_mbw_prop, rprops)) mpam_write_partsel_reg(msc, MBW_PROP, 0); if (mpam_has_feature(mpam_feat_cmax_cmax, rprops)) @@ -1421,6 +1623,8 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, mpam_write_partsel_reg(msc, PRI, pri_val); } + mpam_quirk_post_config_change(ris, partid, cfg); + mutex_unlock(&msc->part_sel_lock); } @@ -1493,16 +1697,6 @@ static int mpam_save_mbwu_state(void *arg) return 0; } -static void mpam_init_reset_cfg(struct mpam_config *reset_cfg) -{ - *reset_cfg = (struct mpam_config) { - .reset_cpbm = true, - .reset_mbw_pbm = true, - .reset_mbw_max = true, - }; - bitmap_fill(reset_cfg->features, MPAM_FEATURE_LAST); -} - /* * Called via smp_call_on_cpu() to prevent migration, while still being * pre-emptible. Caller must hold mpam_srcu. 
@@ -1510,14 +1704,12 @@ static void mpam_init_reset_cfg(struct mpam_config *reset_cfg) static int mpam_reset_ris(void *arg) { u16 partid, partid_max; - struct mpam_config reset_cfg; + struct mpam_config reset_cfg = {}; struct mpam_msc_ris *ris = arg; if (ris->in_reset_state) return 0; - mpam_init_reset_cfg(&reset_cfg); - spin_lock(&partid_max_lock); partid_max = mpam_partid_max; spin_unlock(&partid_max_lock); @@ -1632,6 +1824,9 @@ static int mpam_cpu_online(unsigned int cpu) mpam_reprogram_msc(msc); } + if (mpam_resctrl_enabled) + return mpam_resctrl_online_cpu(cpu); + return 0; } @@ -1675,6 +1870,9 @@ static int mpam_cpu_offline(unsigned int cpu) { struct mpam_msc *msc; + if (mpam_resctrl_enabled) + mpam_resctrl_offline_cpu(cpu); + guard(srcu)(&mpam_srcu); list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, srcu_read_lock_held(&mpam_srcu)) { @@ -1971,6 +2169,7 @@ static bool mpam_has_cmax_wd_feature(struct mpam_props *props) * resulting safe value must be compatible with both. When merging values in * the tree, all the aliasing resources must be handled first. * On mismatch, parent is modified. + * Quirks on an MSC will apply to all MSC in that class. */ static void __props_mismatch(struct mpam_props *parent, struct mpam_props *child, bool alias) @@ -2090,6 +2289,7 @@ static void __props_mismatch(struct mpam_props *parent, * nobble the class feature, as we can't configure all the resources. * e.g. The L3 cache is composed of two resources with 13 and 17 portion * bitmaps respectively. + * Quirks on an MSC will apply to all MSC in that class. 
*/ static void __class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc) @@ -2103,6 +2303,9 @@ __class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc) dev_dbg(dev, "Merging features for class:0x%lx &= vmsc:0x%lx\n", (long)cprops->features, (long)vprops->features); + /* Merge quirks */ + class->quirks |= vmsc->msc->quirks; + /* Take the safe value for any common features */ __props_mismatch(cprops, vprops, false); } @@ -2167,6 +2370,9 @@ static void mpam_enable_merge_class_features(struct mpam_component *comp) list_for_each_entry(vmsc, &comp->vmsc, comp_list) __class_props_mismatch(class, vmsc); + + if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, class)) + mpam_clear_feature(mpam_feat_mbw_min, &class->props); } /* @@ -2520,6 +2726,12 @@ static void mpam_enable_once(void) mutex_unlock(&mpam_list_lock); cpus_read_unlock(); + if (!err) { + err = mpam_resctrl_setup(); + if (err) + pr_err("Failed to initialise resctrl: %d\n", err); + } + if (err) { mpam_disable_reason = "Failed to enable."; schedule_work(&mpam_broken_work); @@ -2527,6 +2739,7 @@ static void mpam_enable_once(void) } static_branch_enable(&mpam_enabled); + mpam_resctrl_enabled = true; mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline, "mpam:online"); @@ -2559,7 +2772,7 @@ static void mpam_reset_component_locked(struct mpam_component *comp) } } -static void mpam_reset_class_locked(struct mpam_class *class) +void mpam_reset_class_locked(struct mpam_class *class) { struct mpam_component *comp; @@ -2586,24 +2799,39 @@ static void mpam_reset_class(struct mpam_class *class) void mpam_disable(struct work_struct *ignored) { int idx; + bool do_resctrl_exit; struct mpam_class *class; struct mpam_msc *msc, *tmp; + if (mpam_is_enabled()) + static_branch_disable(&mpam_enabled); + mutex_lock(&mpam_cpuhp_state_lock); if (mpam_cpuhp_state) { cpuhp_remove_state(mpam_cpuhp_state); mpam_cpuhp_state = 0; } + + /* + * Removing the cpuhp state called mpam_cpu_offline() and told resctrl 
+ * all the CPUs are offline. + */ + do_resctrl_exit = mpam_resctrl_enabled; + mpam_resctrl_enabled = false; mutex_unlock(&mpam_cpuhp_state_lock); - static_branch_disable(&mpam_enabled); + if (do_resctrl_exit) + mpam_resctrl_exit(); mpam_unregister_irqs(); idx = srcu_read_lock(&mpam_srcu); list_for_each_entry_srcu(class, &mpam_classes, classes_list, - srcu_read_lock_held(&mpam_srcu)) + srcu_read_lock_held(&mpam_srcu)) { mpam_reset_class(class); + if (do_resctrl_exit) + mpam_resctrl_teardown_class(class); + } srcu_read_unlock(&mpam_srcu, idx); mutex_lock(&mpam_list_lock); @@ -2694,6 +2922,7 @@ int mpam_apply_config(struct mpam_component *comp, u16 partid, srcu_read_lock_held(&mpam_srcu)) { arg.ris = ris; mpam_touch_msc(msc, __write_config, &arg); + ris->in_reset_state = false; } mutex_unlock(&msc->cfg_lock); } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index e8971842b124..1914aefdcba9 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -12,22 +12,31 @@ #include #include #include +#include #include #include #include +#include + #define MPAM_MSC_MAX_NUM_RIS 16 struct platform_device; -DECLARE_STATIC_KEY_FALSE(mpam_enabled); - #ifdef CONFIG_MPAM_KUNIT_TEST #define PACKED_FOR_KUNIT __packed #else #define PACKED_FOR_KUNIT #endif +/* + * This 'mon' values must not alias an actual monitor, so must be larger than + * U16_MAX, but not be confused with an errno value, so smaller than + * (u32)-SZ_4K. + * USE_PRE_ALLOCATED is used to avoid confusion with an actual monitor. 
+ */ +#define USE_PRE_ALLOCATED (U16_MAX + 1) + static inline bool mpam_is_enabled(void) { return static_branch_likely(&mpam_enabled); @@ -76,6 +85,8 @@ struct mpam_msc { u8 pmg_max; unsigned long ris_idxs; u32 ris_max; + u32 iidr; + u16 quirks; /* * error_irq_lock is taken when registering/unregistering the error @@ -119,6 +130,9 @@ struct mpam_msc { void __iomem *mapped_hwpage; size_t mapped_hwpage_sz; + /* Values only used on some platforms for quirks */ + u32 t241_id; + struct mpam_garbage garbage; }; @@ -207,6 +221,42 @@ struct mpam_props { #define mpam_set_feature(_feat, x) __set_bit(_feat, (x)->features) #define mpam_clear_feature(_feat, x) __clear_bit(_feat, (x)->features) +/* Workaround bits for msc->quirks */ +enum mpam_device_quirks { + T241_SCRUB_SHADOW_REGS, + T241_FORCE_MBW_MIN_TO_ONE, + T241_MBW_COUNTER_SCALE_64, + IGNORE_CSU_NRDY, + MPAM_QUIRK_LAST +}; + +#define mpam_has_quirk(_quirk, x) ((1 << (_quirk) & (x)->quirks)) +#define mpam_set_quirk(_quirk, x) ((x)->quirks |= (1 << (_quirk))) + +struct mpam_quirk { + int (*init)(struct mpam_msc *msc, const struct mpam_quirk *quirk); + + u32 iidr; + u32 iidr_mask; + + enum mpam_device_quirks workaround; +}; + +#define MPAM_IIDR_MATCH_ONE (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0xfff) | \ + FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0xf) | \ + FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0xf) | \ + FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0xfff)) + +#define MPAM_IIDR_NVIDIA_T241 (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0x241) | \ + FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x36b)) + +#define MPAM_IIDR_ARM_CMN_650 (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x43b)) + /* The values for MSMON_CFG_MBWU_FLT.RWBW */ enum mon_filter_options { COUNT_BOTH = 0, @@ -215,7 +265,11 @@ enum 
mon_filter_options { }; struct mon_cfg { - u16 mon; + /* + * mon must be large enough to hold out of range values like + * USE_PRE_ALLOCATED + */ + u32 mon; u8 pmg; bool match_pmg; bool csu_exclude_clean; @@ -246,6 +300,7 @@ struct mpam_class { struct mpam_props props; u32 nrdy_usec; + u16 quirks; u8 level; enum mpam_class_types type; @@ -266,10 +321,6 @@ struct mpam_config { u32 mbw_pbm; u16 mbw_max; - bool reset_cpbm; - bool reset_mbw_pbm; - bool reset_mbw_max; - struct mpam_garbage garbage; }; @@ -337,6 +388,32 @@ struct mpam_msc_ris { struct mpam_garbage garbage; }; +struct mpam_resctrl_dom { + struct mpam_component *ctrl_comp; + + /* + * There is no single mon_comp because different events may be backed + * by different class/components. mon_comp is indexed by the event + * number. + */ + struct mpam_component *mon_comp[QOS_NUM_EVENTS]; + + struct rdt_ctrl_domain resctrl_ctrl_dom; + struct rdt_l3_mon_domain resctrl_mon_dom; +}; + +struct mpam_resctrl_res { + struct mpam_class *class; + struct rdt_resource resctrl_res; + bool cdp_enabled; +}; + +struct mpam_resctrl_mon { + struct mpam_class *class; + + /* per-class data that resctrl needs will live here */ +}; + static inline int mpam_alloc_csu_mon(struct mpam_class *class) { struct mpam_props *cprops = &class->props; @@ -381,6 +458,9 @@ extern u8 mpam_pmg_max; void mpam_enable(struct work_struct *work); void mpam_disable(struct work_struct *work); +/* Reset all the RIS in a class under cpus_read_lock() */ +void mpam_reset_class_locked(struct mpam_class *class); + int mpam_apply_config(struct mpam_component *comp, u16 partid, struct mpam_config *cfg); @@ -391,6 +471,20 @@ void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx); int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, cpumask_t *affinity); +#ifdef CONFIG_RESCTRL_FS +int mpam_resctrl_setup(void); +void mpam_resctrl_exit(void); +int mpam_resctrl_online_cpu(unsigned int cpu); +void 
mpam_resctrl_offline_cpu(unsigned int cpu); +void mpam_resctrl_teardown_class(struct mpam_class *class); +#else +static inline int mpam_resctrl_setup(void) { return 0; } +static inline void mpam_resctrl_exit(void) { } +static inline int mpam_resctrl_online_cpu(unsigned int cpu) { return 0; } +static inline void mpam_resctrl_offline_cpu(unsigned int cpu) { } +static inline void mpam_resctrl_teardown_class(struct mpam_class *class) { } +#endif /* CONFIG_RESCTRL_FS */ + /* * MPAM MSCs have the following register layout. See: * Arm Memory System Resource Partitioning and Monitoring (MPAM) System diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c new file mode 100644 index 000000000000..a9938006d0e6 --- /dev/null +++ b/drivers/resctrl/mpam_resctrl.c @@ -0,0 +1,1704 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "mpam_internal.h" + +DECLARE_WAIT_QUEUE_HEAD(resctrl_mon_ctx_waiters); + +/* + * The classes we've picked to map to resctrl resources, wrapped + * in with their resctrl structure. + * Class pointer may be NULL. + */ +static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; + +#define for_each_mpam_resctrl_control(res, rid) \ + for (rid = 0, res = &mpam_resctrl_controls[rid]; \ + rid < RDT_NUM_RESOURCES; \ + rid++, res = &mpam_resctrl_controls[rid]) + +/* + * The classes we've picked to map to resctrl events. + * Resctrl believes all the worlds a Xeon, and these are all on the L3. This + * array lets us find the actual class backing the event counters. e.g. + * the only memory bandwidth counters may be on the memory controller, but to + * make use of them, we pretend they are on L3. Restrict the events considered + * to those supported by MPAM. 
+ * Class pointer may be NULL. + */ +#define MPAM_MAX_EVENT QOS_L3_MBM_TOTAL_EVENT_ID +static struct mpam_resctrl_mon mpam_resctrl_counters[MPAM_MAX_EVENT + 1]; + +#define for_each_mpam_resctrl_mon(mon, eventid) \ + for (eventid = QOS_FIRST_EVENT, mon = &mpam_resctrl_counters[eventid]; \ + eventid <= MPAM_MAX_EVENT; \ + eventid++, mon = &mpam_resctrl_counters[eventid]) + +/* The lock for modifying resctrl's domain lists from cpuhp callbacks. */ +static DEFINE_MUTEX(domain_list_lock); + +/* + * MPAM emulates CDP by setting different PARTID in the I/D fields of MPAM0_EL1. + * This applies globally to all traffic the CPU generates. + */ +static bool cdp_enabled; + +/* + * We use cacheinfo to discover the size of the caches and their id. cacheinfo + * populates this from a device_initcall(). mpam_resctrl_setup() must wait. + */ +static bool cacheinfo_ready; +static DECLARE_WAIT_QUEUE_HEAD(wait_cacheinfo_ready); + +/* + * If resctrl_init() succeeded, resctrl_exit() can be used to remove support + * for the filesystem in the event of an error. 
+ */ +static bool resctrl_enabled; + +bool resctrl_arch_alloc_capable(void) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + for_each_mpam_resctrl_control(res, rid) { + if (res->resctrl_res.alloc_capable) + return true; + } + + return false; +} + +bool resctrl_arch_mon_capable(void) +{ + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; + + /* All monitors are presented as being on the L3 cache */ + return l3->mon_capable; +} + +bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) +{ + return false; +} + +void resctrl_arch_mon_event_config_read(void *info) +{ +} + +void resctrl_arch_mon_event_config_write(void *info) +{ +} + +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) +{ +} + +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 closid, u32 rmid, enum resctrl_event_id eventid) +{ +} + +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid) +{ +} + +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ +} + +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val) +{ + return -EOPNOTSUPP; +} + +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) +{ + return false; +} + +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) +{ + return -EINVAL; +} + +int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable) +{ + return -EOPNOTSUPP; +} + +bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r) +{ + return false; +} + +void resctrl_arch_pre_mount(void) +{ +} + +bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid) +{ + return 
mpam_resctrl_controls[rid].cdp_enabled; +} + +/** + * resctrl_reset_task_closids() - Reset the PARTID/PMG values for all tasks. + * + * At boot, all existing tasks use partid zero for D and I. + * To enable/disable CDP emulation, all these tasks need relabelling. + */ +static void resctrl_reset_task_closids(void) +{ + struct task_struct *p, *t; + + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { + resctrl_arch_set_closid_rmid(t, RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID); + } + read_unlock(&tasklist_lock); +} + +int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) +{ + u32 partid_i = RESCTRL_RESERVED_CLOSID, partid_d = RESCTRL_RESERVED_CLOSID; + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; + int cpu; + + if (!IS_ENABLED(CONFIG_EXPERT) && enable) { + /* + * If the resctrl fs is mounted more than once, sequentially, + * then CDP can lead to the use of out of range PARTIDs. + */ + pr_warn("CDP not supported\n"); + return -EOPNOTSUPP; + } + + if (enable) + pr_warn("CDP is an expert feature and may cause MPAM to malfunction.\n"); + + /* + * resctrl_arch_set_cdp_enabled() is only called with enable set to + * false on error and unmount. + */ + cdp_enabled = enable; + mpam_resctrl_controls[rid].cdp_enabled = enable; + + if (enable) + l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx() / 2; + else + l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx(); + + /* The mbw_max feature can't hide cdp as it's a per-partid maximum. 
*/ + if (cdp_enabled && !mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled) + mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = false; + + if (mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled && + mpam_resctrl_controls[RDT_RESOURCE_MBA].class) + mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = true; + + if (enable) { + if (mpam_partid_max < 1) + return -EINVAL; + + partid_d = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_DATA); + partid_i = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_CODE); + } + + mpam_set_task_partid_pmg(current, partid_d, partid_i, 0, 0); + WRITE_ONCE(arm64_mpam_global_default, mpam_get_regval(current)); + + resctrl_reset_task_closids(); + + for_each_possible_cpu(cpu) + mpam_set_cpu_defaults(cpu, partid_d, partid_i, 0, 0); + on_each_cpu(resctrl_arch_sync_cpu_closid_rmid, NULL, 1); + + return 0; +} + +static bool mpam_resctrl_hide_cdp(enum resctrl_res_level rid) +{ + return cdp_enabled && !resctrl_arch_get_cdp_enabled(rid); +} + +/* + * MSC may raise an error interrupt if it sees an out or range partid/pmg, + * and go on to truncate the value. Regardless of what the hardware supports, + * only the system wide safe value is safe to use. 
+ */ +u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored) +{ + return mpam_partid_max + 1; +} + +u32 resctrl_arch_system_num_rmid_idx(void) +{ + return (mpam_pmg_max + 1) * (mpam_partid_max + 1); +} + +u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid) +{ + return closid * (mpam_pmg_max + 1) + rmid; +} + +void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid) +{ + *closid = idx / (mpam_pmg_max + 1); + *rmid = idx % (mpam_pmg_max + 1); +} + +void resctrl_arch_sched_in(struct task_struct *tsk) +{ + lockdep_assert_preemption_disabled(); + + mpam_thread_switch(tsk); +} + +void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid) +{ + WARN_ON_ONCE(closid > U16_MAX); + WARN_ON_ONCE(rmid > U8_MAX); + + if (!cdp_enabled) { + mpam_set_cpu_defaults(cpu, closid, closid, rmid, rmid); + } else { + /* + * When CDP is enabled, resctrl halves the closid range and we + * use odd/even partid for one closid. + */ + u32 partid_d = resctrl_get_config_index(closid, CDP_DATA); + u32 partid_i = resctrl_get_config_index(closid, CDP_CODE); + + mpam_set_cpu_defaults(cpu, partid_d, partid_i, rmid, rmid); + } +} + +void resctrl_arch_sync_cpu_closid_rmid(void *info) +{ + struct resctrl_cpu_defaults *r = info; + + lockdep_assert_preemption_disabled(); + + if (r) { + resctrl_arch_set_cpu_default_closid_rmid(smp_processor_id(), + r->closid, r->rmid); + } + + resctrl_arch_sched_in(current); +} + +void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid) +{ + WARN_ON_ONCE(closid > U16_MAX); + WARN_ON_ONCE(rmid > U8_MAX); + + if (!cdp_enabled) { + mpam_set_task_partid_pmg(tsk, closid, closid, rmid, rmid); + } else { + u32 partid_d = resctrl_get_config_index(closid, CDP_DATA); + u32 partid_i = resctrl_get_config_index(closid, CDP_CODE); + + mpam_set_task_partid_pmg(tsk, partid_d, partid_i, rmid, rmid); + } +} + +bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid) +{ + u64 regval = mpam_get_regval(tsk); + u32 tsk_closid 
= FIELD_GET(MPAM0_EL1_PARTID_D, regval); + + if (cdp_enabled) + tsk_closid >>= 1; + + return tsk_closid == closid; +} + +/* The task's pmg is not unique, the partid must be considered too */ +bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid) +{ + u64 regval = mpam_get_regval(tsk); + u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval); + u32 tsk_rmid = FIELD_GET(MPAM0_EL1_PMG_D, regval); + + if (cdp_enabled) + tsk_closid >>= 1; + + return (tsk_closid == closid) && (tsk_rmid == rmid); +} + +struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) +{ + if (l >= RDT_NUM_RESOURCES) + return NULL; + + return &mpam_resctrl_controls[l].resctrl_res; +} + +static int resctrl_arch_mon_ctx_alloc_no_wait(enum resctrl_event_id evtid) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + + if (!mpam_is_enabled()) + return -EINVAL; + + if (!mon->class) + return -EINVAL; + + switch (evtid) { + case QOS_L3_OCCUP_EVENT_ID: + /* With CDP, one monitor gets used for both code/data reads */ + return mpam_alloc_csu_mon(mon->class); + case QOS_L3_MBM_LOCAL_EVENT_ID: + case QOS_L3_MBM_TOTAL_EVENT_ID: + return USE_PRE_ALLOCATED; + default: + return -EOPNOTSUPP; + } +} + +void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, + enum resctrl_event_id evtid) +{ + DEFINE_WAIT(wait); + int *ret; + + ret = kmalloc_obj(*ret); + if (!ret) + return ERR_PTR(-ENOMEM); + + do { + prepare_to_wait(&resctrl_mon_ctx_waiters, &wait, + TASK_INTERRUPTIBLE); + *ret = resctrl_arch_mon_ctx_alloc_no_wait(evtid); + if (*ret == -ENOSPC) + schedule(); + } while (*ret == -ENOSPC && !signal_pending(current)); + finish_wait(&resctrl_mon_ctx_waiters, &wait); + + return ret; +} + +static void resctrl_arch_mon_ctx_free_no_wait(enum resctrl_event_id evtid, + u32 mon_idx) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + + if (!mpam_is_enabled()) + return; + + if (!mon->class) + return; + + if (evtid == QOS_L3_OCCUP_EVENT_ID) + 
mpam_free_csu_mon(mon->class, mon_idx); + + wake_up(&resctrl_mon_ctx_waiters); +} + +void resctrl_arch_mon_ctx_free(struct rdt_resource *r, + enum resctrl_event_id evtid, void *arch_mon_ctx) +{ + u32 mon_idx = *(u32 *)arch_mon_ctx; + + kfree(arch_mon_ctx); + + resctrl_arch_mon_ctx_free_no_wait(evtid, mon_idx); +} + +static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + enum mpam_device_features mon_type, + int mon_idx, + enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, u64 *val) +{ + struct mon_cfg cfg; + + if (!mpam_is_enabled()) + return -EINVAL; + + /* Shift closid to account for CDP */ + closid = resctrl_get_config_index(closid, cdp_type); + + if (irqs_disabled()) { + /* Check if we can access this domain without an IPI */ + return -EIO; + } + + cfg = (struct mon_cfg) { + .mon = mon_idx, + .match_pmg = true, + .partid = closid, + .pmg = rmid, + }; + + return mpam_msmon_read(mon_comp, &cfg, mon_type, val); +} + +static int read_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + enum mpam_device_features mon_type, + int mon_idx, u32 closid, u32 rmid, u64 *val) +{ + if (cdp_enabled) { + u64 code_val = 0, data_val = 0; + int err; + + err = __read_mon(mon, mon_comp, mon_type, mon_idx, + CDP_CODE, closid, rmid, &code_val); + if (err) + return err; + + err = __read_mon(mon, mon_comp, mon_type, mon_idx, + CDP_DATA, closid, rmid, &data_val); + if (err) + return err; + + *val += code_val + data_val; + return 0; + } + + return __read_mon(mon, mon_comp, mon_type, mon_idx, + CDP_NONE, closid, rmid, val); +} + +/* MBWU when not in ABMC mode (not supported), and CSU counters. 
*/ +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, + u32 closid, u32 rmid, enum resctrl_event_id eventid, + void *arch_priv, u64 *val, void *arch_mon_ctx) +{ + struct mpam_resctrl_dom *l3_dom; + struct mpam_component *mon_comp; + u32 mon_idx = *(u32 *)arch_mon_ctx; + enum mpam_device_features mon_type; + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; + + resctrl_arch_rmid_read_context_check(); + + if (!mpam_is_enabled()) + return -EINVAL; + + if (eventid >= QOS_NUM_EVENTS || !mon->class) + return -EINVAL; + + l3_dom = container_of(hdr, struct mpam_resctrl_dom, resctrl_mon_dom.hdr); + mon_comp = l3_dom->mon_comp[eventid]; + + if (eventid != QOS_L3_OCCUP_EVENT_ID) + return -EINVAL; + + mon_type = mpam_feat_msmon_csu; + + return read_mon_cdp_safe(mon, mon_comp, mon_type, mon_idx, + closid, rmid, val); +} + +/* + * The rmid realloc threshold should be for the smallest cache exposed to + * resctrl. + */ +static int update_rmid_limits(struct mpam_class *class) +{ + u32 num_unique_pmg = resctrl_arch_system_num_rmid_idx(); + struct mpam_props *cprops = &class->props; + struct cacheinfo *ci; + + lockdep_assert_cpus_held(); + + if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) + return 0; + + /* + * Assume cache levels are the same size for all CPUs... + * The check just requires any online CPU and it can't go offline as we + * hold the cpu lock. 
+ */ + ci = get_cpu_cacheinfo_level(raw_smp_processor_id(), class->level); + if (!ci || ci->size == 0) { + pr_debug("Could not read cache size for class %u\n", + class->level); + return -EINVAL; + } + + if (!resctrl_rmid_realloc_limit || + ci->size < resctrl_rmid_realloc_limit) { + resctrl_rmid_realloc_limit = ci->size; + resctrl_rmid_realloc_threshold = ci->size / num_unique_pmg; + } + + return 0; +} + +static bool cache_has_usable_cpor(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_cpor_part, cprops)) + return false; + + /* resctrl uses u32 for all bitmap configurations */ + return class->props.cpbm_wd <= 32; +} + +static bool mba_class_use_mbw_max(struct mpam_props *cprops) +{ + return (mpam_has_feature(mpam_feat_mbw_max, cprops) && + cprops->bwa_wd); +} + +static bool class_has_usable_mba(struct mpam_props *cprops) +{ + return mba_class_use_mbw_max(cprops); +} + +static bool cache_has_usable_csu(struct mpam_class *class) +{ + struct mpam_props *cprops; + + if (!class) + return false; + + cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) + return false; + + /* + * CSU counters settle on the value, so we can get away with + * having only one. + */ + if (!cprops->num_csu_mon) + return false; + + return true; +} + +/* + * Calculate the worst-case percentage change from each implemented step + * in the control. + */ +static u32 get_mba_granularity(struct mpam_props *cprops) +{ + if (!mba_class_use_mbw_max(cprops)) + return 0; + + /* + * bwa_wd is the number of bits implemented in the 0.xxx + * fixed point fraction. 1 bit is 50%, 2 is 25% etc. + */ + return DIV_ROUND_UP(MAX_MBA_BW, 1 << cprops->bwa_wd); +} + +/* + * Each fixed-point hardware value architecturally represents a range + * of values: the full range 0% - 100% is split contiguously into + * (1 << cprops->bwa_wd) equal bands. 
+ * + * Although the bwa_bwd fields have 6 bits the maximum valid value is 16 + * as it reports the width of fields that are at most 16 bits. When + * fewer than 16 bits are valid the least significant bits are + * ignored. The implied binary point is kept between bits 15 and 16 and + * so the valid bits are leftmost. + * + * See ARM IHI0099B.a "MPAM system component specification", Section 9.3, + * "The fixed-point fractional format" for more information. + * + * Find the nearest percentage value to the upper bound of the selected band: + */ +static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops) +{ + u32 val = mbw_max; + + val >>= 16 - cprops->bwa_wd; + val += 1; + val *= MAX_MBA_BW; + val = DIV_ROUND_CLOSEST(val, 1 << cprops->bwa_wd); + + return val; +} + +/* + * Find the band whose upper bound is closest to the specified percentage. + * + * A round-to-nearest policy is followed here as a balanced compromise + * between unexpected under-commit of the resource (where the total of + * a set of resource allocations after conversion is less than the + * expected total, due to rounding of the individual converted + * percentages) and over-commit (where the total of the converted + * allocations is greater than expected). 
+ */ +static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) +{ + u32 val = pc; + + val <<= cprops->bwa_wd; + val = DIV_ROUND_CLOSEST(val, MAX_MBA_BW); + val = max(val, 1) - 1; + val <<= 16 - cprops->bwa_wd; + + return val; +} + +static u32 get_mba_min(struct mpam_props *cprops) +{ + if (!mba_class_use_mbw_max(cprops)) { + WARN_ON_ONCE(1); + return 0; + } + + return mbw_max_to_percent(0, cprops); +} + +/* Find the L3 cache that has affinity with this CPU */ +static int find_l3_equivalent_bitmask(int cpu, cpumask_var_t tmp_cpumask) +{ + u32 cache_id = get_cpu_cacheinfo_id(cpu, 3); + + lockdep_assert_cpus_held(); + + return mpam_get_cpumask_from_cache_id(cache_id, 3, tmp_cpumask); +} + +/* + * topology_matches_l3() - Is the provided class the same shape as L3 + * @victim: The class we'd like to pretend is L3. + * + * resctrl expects all the world's a Xeon, and all counters are on the + * L3. We allow some mapping counters on other classes. This requires + * that the CPU->domain mapping is the same kind of shape. + * + * Using cacheinfo directly would make this work even if resctrl can't + * use the L3 - but cacheinfo can't tell us anything about offline CPUs. + * Using the L3 resctrl domain list also depends on CPUs being online. + * Using the mpam_class we picked for L3 so we can use its domain list + * assumes that there are MPAM controls on the L3. + * Instead, this path eventually uses the mpam_get_cpumask_from_cache_id() + * helper which can tell us about offline CPUs ... but getting the cache_id + * to start with relies on at least one CPU per L3 cache being online at + * boot. + * + * Walk the victim component list and compare the affinity mask with the + * corresponding L3. The topology matches if each victim:component's affinity + * mask is the same as the CPU's corresponding L3's. These lists/masks are + * computed from firmware tables so don't change at runtime. 
+ */ +static bool topology_matches_l3(struct mpam_class *victim) +{ + int cpu, err; + struct mpam_component *victim_iter; + + lockdep_assert_cpus_held(); + + cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL; + if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) + return false; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(victim_iter, &victim->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_empty(&victim_iter->affinity)) { + pr_debug("class %u has CPU-less component %u - can't match L3!\n", + victim->level, victim_iter->comp_id); + return false; + } + + cpu = cpumask_any_and(&victim_iter->affinity, cpu_online_mask); + if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) + return false; + + cpumask_clear(tmp_cpumask); + err = find_l3_equivalent_bitmask(cpu, tmp_cpumask); + if (err) { + pr_debug("Failed to find L3's equivalent component to class %u component %u\n", + victim->level, victim_iter->comp_id); + return false; + } + + /* Any differing bits in the affinity mask? */ + if (!cpumask_equal(tmp_cpumask, &victim_iter->affinity)) { + pr_debug("class %u component %u has Mismatched CPU mask with L3 equivalent\n" + "L3:%*pbl != victim:%*pbl\n", + victim->level, victim_iter->comp_id, + cpumask_pr_args(tmp_cpumask), + cpumask_pr_args(&victim_iter->affinity)); + + return false; + } + } + + return true; +} + +/* + * Test if the traffic for a class matches that at egress from the L3. For + * MSC at memory controllers this is only possible if there is a single L3 + * as otherwise the counters at the memory can include bandwidth from the + * non-local L3. 
+ */ +static bool traffic_matches_l3(struct mpam_class *class) +{ + int err, cpu; + + lockdep_assert_cpus_held(); + + if (class->type == MPAM_CLASS_CACHE && class->level == 3) + return true; + + if (class->type == MPAM_CLASS_CACHE && class->level != 3) { + pr_debug("class %u is a different cache from L3\n", class->level); + return false; + } + + if (class->type != MPAM_CLASS_MEMORY) { + pr_debug("class %u is neither of type cache or memory\n", class->level); + return false; + } + + cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL; + if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) { + pr_debug("cpumask allocation failed\n"); + return false; + } + + cpu = cpumask_any_and(&class->affinity, cpu_online_mask); + err = find_l3_equivalent_bitmask(cpu, tmp_cpumask); + if (err) { + pr_debug("Failed to find L3 downstream to cpu %d\n", cpu); + return false; + } + + if (!cpumask_equal(tmp_cpumask, cpu_possible_mask)) { + pr_debug("There is more than one L3\n"); + return false; + } + + /* Be strict; the traffic might stop in the intermediate cache. */ + if (get_cpu_cacheinfo_id(cpu, 4) != -1) { + pr_debug("L3 isn't the last level of cache\n"); + return false; + } + + if (num_possible_nodes() > 1) { + pr_debug("There is more than one numa node\n"); + return false; + } + +#ifdef CONFIG_HMEM_REPORTING + if (node_devices[cpu_to_node(cpu)]->cache_dev) { + pr_debug("There is a memory side cache\n"); + return false; + } +#endif + + return true; +} + +/* Test whether we can export MPAM_CLASS_CACHE:{2,3}? 
*/ +static void mpam_resctrl_pick_caches(void) +{ + struct mpam_class *class; + struct mpam_resctrl_res *res; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + if (class->type != MPAM_CLASS_CACHE) { + pr_debug("class %u is not a cache\n", class->level); + continue; + } + + if (class->level != 2 && class->level != 3) { + pr_debug("class %u is not L2 or L3\n", class->level); + continue; + } + + if (!cache_has_usable_cpor(class)) { + pr_debug("class %u cache misses CPOR\n", class->level); + continue; + } + + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { + pr_debug("class %u has missing CPUs, mask %*pb != %*pb\n", class->level, + cpumask_pr_args(&class->affinity), + cpumask_pr_args(cpu_possible_mask)); + continue; + } + + if (class->level == 2) + res = &mpam_resctrl_controls[RDT_RESOURCE_L2]; + else + res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + res->class = class; + } +} + +static void mpam_resctrl_pick_mba(void) +{ + struct mpam_class *class, *candidate_class = NULL; + struct mpam_resctrl_res *res; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + struct mpam_props *cprops = &class->props; + + if (class->level != 3 && class->type == MPAM_CLASS_CACHE) { + pr_debug("class %u is a cache but not the L3\n", class->level); + continue; + } + + if (!class_has_usable_mba(cprops)) { + pr_debug("class %u has no bandwidth control\n", + class->level); + continue; + } + + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { + pr_debug("class %u has missing CPUs\n", class->level); + continue; + } + + if (!topology_matches_l3(class)) { + pr_debug("class %u topology doesn't match L3\n", + class->level); + continue; + } + + if (!traffic_matches_l3(class)) { + pr_debug("class %u traffic doesn't match L3 egress\n", + class->level); + 
continue; + } + + /* + * Pick a resource to be MBA that as close as possible to + * the L3. mbm_total counts the bandwidth leaving the L3 + * cache and MBA should correspond as closely as possible + * for proper operation of mba_sc. + */ + if (!candidate_class || class->level < candidate_class->level) + candidate_class = class; + } + + if (candidate_class) { + pr_debug("selected class %u to back MBA\n", + candidate_class->level); + res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; + res->class = candidate_class; + } +} + +static void counter_update_class(enum resctrl_event_id evt_id, + struct mpam_class *class) +{ + struct mpam_class *existing_class = mpam_resctrl_counters[evt_id].class; + + if (existing_class) { + if (class->level == 3) { + pr_debug("Existing class is L3 - L3 wins\n"); + return; + } + + if (existing_class->level < class->level) { + pr_debug("Existing class is closer to L3, %u versus %u - closer is better\n", + existing_class->level, class->level); + return; + } + } + + mpam_resctrl_counters[evt_id].class = class; +} + +static void mpam_resctrl_pick_counters(void) +{ + struct mpam_class *class; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + /* The name of the resource is L3... */ + if (class->type == MPAM_CLASS_CACHE && class->level != 3) { + pr_debug("class %u is a cache but not the L3", class->level); + continue; + } + + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { + pr_debug("class %u does not cover all CPUs", + class->level); + continue; + } + + if (cache_has_usable_csu(class)) { + pr_debug("class %u has usable CSU", + class->level); + + /* CSU counters only make sense on a cache. 
*/ + switch (class->type) { + case MPAM_CLASS_CACHE: + if (update_rmid_limits(class)) + break; + + counter_update_class(QOS_L3_OCCUP_EVENT_ID, class); + break; + default: + break; + } + } + } +} + +static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) +{ + struct mpam_class *class = res->class; + struct mpam_props *cprops = &class->props; + struct rdt_resource *r = &res->resctrl_res; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + r->schema_fmt = RESCTRL_SCHEMA_BITMAP; + r->cache.arch_has_sparse_bitmasks = true; + + r->cache.cbm_len = class->props.cpbm_wd; + /* mpam_devices will reject empty bitmaps */ + r->cache.min_cbm_bits = 1; + + if (r->rid == RDT_RESOURCE_L2) { + r->name = "L2"; + r->ctrl_scope = RESCTRL_L2_CACHE; + r->cdp_capable = true; + } else { + r->name = "L3"; + r->ctrl_scope = RESCTRL_L3_CACHE; + r->cdp_capable = true; + } + + /* + * Which bits are shared with other ...things... Unknown + * devices use partid-0 which uses all the bitmap fields. Until + * we have configured the SMMU and GIC not to do this 'all the + * bits' is the correct answer here. 
+ */ + r->cache.shareable_bits = resctrl_get_default_ctrl(r); + r->alloc_capable = true; + break; + case RDT_RESOURCE_MBA: + r->schema_fmt = RESCTRL_SCHEMA_RANGE; + r->ctrl_scope = RESCTRL_L3_CACHE; + + r->membw.delay_linear = true; + r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; + r->membw.min_bw = get_mba_min(cprops); + r->membw.max_bw = MAX_MBA_BW; + r->membw.bw_gran = get_mba_granularity(cprops); + + r->name = "MB"; + r->alloc_capable = true; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) +{ + struct mpam_class *class = comp->class; + + if (class->type == MPAM_CLASS_CACHE) + return comp->comp_id; + + if (topology_matches_l3(class)) { + /* Use the corresponding L3 component ID as the domain ID */ + int id = get_cpu_cacheinfo_id(cpu, 3); + + /* Implies topology_matches_l3() made a mistake */ + if (WARN_ON_ONCE(id == -1)) + return comp->comp_id; + + return id; + } + + /* Otherwise, expose the ID used by the firmware table code. */ + return comp->comp_id; +} + +static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, + enum resctrl_event_id type) +{ + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; + + lockdep_assert_cpus_held(); + + /* + * There also needs to be an L3 cache present. + * The check just requires any online CPU and it can't go offline as we + * hold the cpu lock. + */ + if (get_cpu_cacheinfo_id(raw_smp_processor_id(), 3) == -1) + return 0; + + /* + * If there are no MPAM resources on L3, force it into existence. + * topology_matches_l3() already ensures this looks like the L3. + * The domain-ids will be fixed up by mpam_resctrl_domain_hdr_init(). + */ + if (!res->class) { + pr_warn_once("Faking L3 MSC to enable counters.\n"); + res->class = mpam_resctrl_counters[type].class; + } + + /* + * Called multiple times!, once per event type that has a + * monitoring class. 
+ * Setting name is necessary on monitor only platforms. + */ + l3->name = "L3"; + l3->mon_scope = RESCTRL_L3_CACHE; + + /* + * num-rmid is the upper bound for the number of monitoring groups that + * can exist simultaneously, including the default monitoring group for + * each control group. Hence, advertise the whole rmid_idx space even + * though each control group has its own pmg/rmid space. Unfortunately, + * this does mean userspace needs to know the architecture to correctly + * interpret this value. + */ + l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx(); + + if (resctrl_enable_mon_event(type, false, 0, NULL)) + l3->mon_capable = true; + + return 0; +} + +u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, + u32 closid, enum resctrl_conf_type type) +{ + u32 partid; + struct mpam_config *cfg; + struct mpam_props *cprops; + struct mpam_resctrl_res *res; + struct mpam_resctrl_dom *dom; + enum mpam_device_features configured_by; + + lockdep_assert_cpus_held(); + + if (!mpam_is_enabled()) + return resctrl_get_default_ctrl(r); + + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); + cprops = &res->class->props; + + /* + * When CDP is enabled, but the resource doesn't support it, + * the control is cloned across both partids. 
+ * Pick one at random to read: + */ + if (mpam_resctrl_hide_cdp(r->rid)) + type = CDP_DATA; + + partid = resctrl_get_config_index(closid, type); + cfg = &dom->ctrl_comp->cfg[partid]; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + configured_by = mpam_feat_cpor_part; + break; + case RDT_RESOURCE_MBA: + if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { + configured_by = mpam_feat_mbw_max; + break; + } + fallthrough; + default: + return resctrl_get_default_ctrl(r); + } + + if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r) || + !mpam_has_feature(configured_by, cfg)) + return resctrl_get_default_ctrl(r); + + switch (configured_by) { + case mpam_feat_cpor_part: + return cfg->cpbm; + case mpam_feat_mbw_max: + return mbw_max_to_percent(cfg->mbw_max, cprops); + default: + return resctrl_get_default_ctrl(r); + } +} + +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, + u32 closid, enum resctrl_conf_type t, u32 cfg_val) +{ + int err; + u32 partid; + struct mpam_config cfg; + struct mpam_props *cprops; + struct mpam_resctrl_res *res; + struct mpam_resctrl_dom *dom; + + lockdep_assert_cpus_held(); + lockdep_assert_irqs_enabled(); + + if (!mpam_is_enabled()) + return -EINVAL; + + /* + * No need to check the CPU as mpam_apply_config() doesn't care, and + * resctrl_arch_update_domains() relies on this. + */ + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); + cprops = &res->class->props; + + if (mpam_resctrl_hide_cdp(r->rid)) + t = CDP_DATA; + + partid = resctrl_get_config_index(closid, t); + if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r)) { + pr_debug("Not alloc capable or computed PARTID out of range\n"); + return -EINVAL; + } + + /* + * Copy the current config to avoid clearing other resources when the + * same component is exposed multiple times through resctrl. 
+ */ + cfg = dom->ctrl_comp->cfg[partid]; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + cfg.cpbm = cfg_val; + mpam_set_feature(mpam_feat_cpor_part, &cfg); + break; + case RDT_RESOURCE_MBA: + if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { + cfg.mbw_max = percent_to_mbw_max(cfg_val, cprops); + mpam_set_feature(mpam_feat_mbw_max, &cfg); + break; + } + fallthrough; + default: + return -EINVAL; + } + + /* + * When CDP is enabled, but the resource doesn't support it, we need to + * apply the same configuration to the other partid. + */ + if (mpam_resctrl_hide_cdp(r->rid)) { + partid = resctrl_get_config_index(closid, CDP_CODE); + err = mpam_apply_config(dom->ctrl_comp, partid, &cfg); + if (err) + return err; + + partid = resctrl_get_config_index(closid, CDP_DATA); + return mpam_apply_config(dom->ctrl_comp, partid, &cfg); + } + + return mpam_apply_config(dom->ctrl_comp, partid, &cfg); +} + +int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) +{ + int err; + struct rdt_ctrl_domain *d; + + lockdep_assert_cpus_held(); + lockdep_assert_irqs_enabled(); + + if (!mpam_is_enabled()) + return -EINVAL; + + list_for_each_entry_rcu(d, &r->ctrl_domains, hdr.list) { + for (enum resctrl_conf_type t = 0; t < CDP_NUM_TYPES; t++) { + struct resctrl_staged_config *cfg = &d->staged_config[t]; + + if (!cfg->have_new_ctrl) + continue; + + err = resctrl_arch_update_one(r, d, closid, t, + cfg->new_ctrl); + if (err) + return err; + } + } + + return 0; +} + +void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) +{ + struct mpam_resctrl_res *res; + + lockdep_assert_cpus_held(); + + if (!mpam_is_enabled()) + return; + + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + mpam_reset_class_locked(res->class); +} + +static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp, + enum resctrl_res_level rid, + struct rdt_domain_hdr *hdr) +{ + lockdep_assert_cpus_held(); + + INIT_LIST_HEAD(&hdr->list); + hdr->id = 
mpam_resctrl_pick_domain_id(cpu, comp); + hdr->rid = rid; + cpumask_set_cpu(cpu, &hdr->cpu_mask); +} + +static void mpam_resctrl_online_domain_hdr(unsigned int cpu, + struct rdt_domain_hdr *hdr) +{ + lockdep_assert_cpus_held(); + + cpumask_set_cpu(cpu, &hdr->cpu_mask); +} + +/** + * mpam_resctrl_offline_domain_hdr() - Update the domain header to remove a CPU. + * @cpu: The CPU to remove from the domain. + * @hdr: The domain's header. + * + * Removes @cpu from the header mask. If this was the last CPU in the domain, + * the domain header is removed from its parent list and true is returned, + * indicating the parent structure can be freed. + * If there are other CPUs in the domain, returns false. + */ +static bool mpam_resctrl_offline_domain_hdr(unsigned int cpu, + struct rdt_domain_hdr *hdr) +{ + lockdep_assert_held(&domain_list_lock); + + cpumask_clear_cpu(cpu, &hdr->cpu_mask); + if (cpumask_empty(&hdr->cpu_mask)) { + list_del_rcu(&hdr->list); + synchronize_rcu(); + return true; + } + + return false; +} + +static void mpam_resctrl_domain_insert(struct list_head *list, + struct rdt_domain_hdr *new) +{ + struct rdt_domain_hdr *err; + struct list_head *pos = NULL; + + lockdep_assert_held(&domain_list_lock); + + err = resctrl_find_domain(list, new->id, &pos); + if (WARN_ON_ONCE(err)) + return; + + list_add_tail_rcu(&new->list, pos); +} + +static struct mpam_component *find_component(struct mpam_class *class, int cpu) +{ + struct mpam_component *comp; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp, &class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_test_cpu(cpu, &comp->affinity)) + return comp; + } + + return NULL; +} + +static struct mpam_resctrl_dom * +mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) +{ + int err; + struct mpam_resctrl_dom *dom; + struct rdt_l3_mon_domain *mon_d; + struct rdt_ctrl_domain *ctrl_d; + struct mpam_class *class = res->class; + struct mpam_component *comp_iter, 
*ctrl_comp; + struct rdt_resource *r = &res->resctrl_res; + + lockdep_assert_held(&domain_list_lock); + + ctrl_comp = NULL; + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp_iter, &class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_test_cpu(cpu, &comp_iter->affinity)) { + ctrl_comp = comp_iter; + break; + } + } + + /* class has no component for this CPU */ + if (WARN_ON_ONCE(!ctrl_comp)) + return ERR_PTR(-EINVAL); + + dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, cpu_to_node(cpu)); + if (!dom) + return ERR_PTR(-ENOMEM); + + if (r->alloc_capable) { + dom->ctrl_comp = ctrl_comp; + + ctrl_d = &dom->resctrl_ctrl_dom; + mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, r->rid, &ctrl_d->hdr); + ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN; + err = resctrl_online_ctrl_domain(r, ctrl_d); + if (err) + goto free_domain; + + mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr); + } else { + pr_debug("Skipped control domain online - no controls\n"); + } + + if (r->mon_capable) { + struct mpam_component *any_mon_comp; + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; + + /* + * Even if the monitor domain is backed by a different + * component, the L3 component IDs need to be used... only + * there may be no ctrl_comp for the L3. + * Search each event's class list for a component with + * overlapping CPUs and set up the dom->mon_comp array. 
+ */ + + for_each_mpam_resctrl_mon(mon, eventid) { + struct mpam_component *mon_comp; + + if (!mon->class) + continue; // dummy resource + + mon_comp = find_component(mon->class, cpu); + dom->mon_comp[eventid] = mon_comp; + if (mon_comp) + any_mon_comp = mon_comp; + } + if (!any_mon_comp) { + WARN_ON_ONCE(0); + err = -EFAULT; + goto offline_ctrl_domain; + } + + mon_d = &dom->resctrl_mon_dom; + mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, r->rid, &mon_d->hdr); + mon_d->hdr.type = RESCTRL_MON_DOMAIN; + err = resctrl_online_mon_domain(r, &mon_d->hdr); + if (err) + goto offline_ctrl_domain; + + mpam_resctrl_domain_insert(&r->mon_domains, &mon_d->hdr); + } else { + pr_debug("Skipped monitor domain online - no monitors\n"); + } + + return dom; + +offline_ctrl_domain: + if (r->alloc_capable) { + mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); + resctrl_offline_ctrl_domain(r, ctrl_d); + } +free_domain: + kfree(dom); + dom = ERR_PTR(err); + + return dom; +} + +/* + * We know all the monitors are associated with the L3, even if there are no + * controls and therefore no control component. Find the cache-id for the CPU + * and use that to search for existing resctrl domains. + * This relies on mpam_resctrl_pick_domain_id() using the L3 cache-id + * for anything that is not a cache. 
+ */ +static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) +{ + int cache_id; + struct mpam_resctrl_dom *dom; + struct mpam_resctrl_res *l3 = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + + lockdep_assert_cpus_held(); + + if (!l3->class) + return NULL; + cache_id = get_cpu_cacheinfo_id(cpu, 3); + if (cache_id < 0) + return NULL; + + list_for_each_entry_rcu(dom, &l3->resctrl_res.mon_domains, resctrl_mon_dom.hdr.list) { + if (dom->resctrl_mon_dom.hdr.id == cache_id) + return dom; + } + + return NULL; +} + +static struct mpam_resctrl_dom * +mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) +{ + struct mpam_resctrl_dom *dom; + struct rdt_resource *r = &res->resctrl_res; + + lockdep_assert_cpus_held(); + + list_for_each_entry_rcu(dom, &r->ctrl_domains, resctrl_ctrl_dom.hdr.list) { + if (cpumask_test_cpu(cpu, &dom->ctrl_comp->affinity)) + return dom; + } + + if (r->rid != RDT_RESOURCE_L3) + return NULL; + + /* Search the mon domain list too - needed on monitor only platforms. 
*/ + return mpam_resctrl_get_mon_domain_from_cpu(cpu); +} + +int mpam_resctrl_online_cpu(unsigned int cpu) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + guard(mutex)(&domain_list_lock); + for_each_mpam_resctrl_control(res, rid) { + struct mpam_resctrl_dom *dom; + struct rdt_resource *r = &res->resctrl_res; + + if (!res->class) + continue; // dummy_resource; + + dom = mpam_resctrl_get_domain_from_cpu(cpu, res); + if (!dom) { + dom = mpam_resctrl_alloc_domain(cpu, res); + if (IS_ERR(dom)) + return PTR_ERR(dom); + } else { + if (r->alloc_capable) { + struct rdt_ctrl_domain *ctrl_d = &dom->resctrl_ctrl_dom; + + mpam_resctrl_online_domain_hdr(cpu, &ctrl_d->hdr); + } + if (r->mon_capable) { + struct rdt_l3_mon_domain *mon_d = &dom->resctrl_mon_dom; + + mpam_resctrl_online_domain_hdr(cpu, &mon_d->hdr); + } + } + } + + resctrl_online_cpu(cpu); + + return 0; +} + +void mpam_resctrl_offline_cpu(unsigned int cpu) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + resctrl_offline_cpu(cpu); + + guard(mutex)(&domain_list_lock); + for_each_mpam_resctrl_control(res, rid) { + struct mpam_resctrl_dom *dom; + struct rdt_l3_mon_domain *mon_d; + struct rdt_ctrl_domain *ctrl_d; + bool ctrl_dom_empty, mon_dom_empty; + struct rdt_resource *r = &res->resctrl_res; + + if (!res->class) + continue; // dummy resource + + dom = mpam_resctrl_get_domain_from_cpu(cpu, res); + if (WARN_ON_ONCE(!dom)) + continue; + + if (r->alloc_capable) { + ctrl_d = &dom->resctrl_ctrl_dom; + ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); + if (ctrl_dom_empty) + resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); + } else { + ctrl_dom_empty = true; + } + + if (r->mon_capable) { + mon_d = &dom->resctrl_mon_dom; + mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); + if (mon_dom_empty) + resctrl_offline_mon_domain(&res->resctrl_res, &mon_d->hdr); + } else { + mon_dom_empty = true; + } + + if (ctrl_dom_empty && mon_dom_empty) + 
kfree(dom); + } +} + +int mpam_resctrl_setup(void) +{ + int err = 0; + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; + + wait_event(wait_cacheinfo_ready, cacheinfo_ready); + + cpus_read_lock(); + for_each_mpam_resctrl_control(res, rid) { + INIT_LIST_HEAD_RCU(&res->resctrl_res.ctrl_domains); + INIT_LIST_HEAD_RCU(&res->resctrl_res.mon_domains); + res->resctrl_res.rid = rid; + } + + /* Find some classes to use for controls */ + mpam_resctrl_pick_caches(); + mpam_resctrl_pick_mba(); + + /* Initialise the resctrl structures from the classes */ + for_each_mpam_resctrl_control(res, rid) { + if (!res->class) + continue; // dummy resource + + err = mpam_resctrl_control_init(res); + if (err) { + pr_debug("Failed to initialise rid %u\n", rid); + goto internal_error; + } + } + + /* Find some classes to use for monitors */ + mpam_resctrl_pick_counters(); + + for_each_mpam_resctrl_mon(mon, eventid) { + if (!mon->class) + continue; // dummy resource + + err = mpam_resctrl_monitor_init(mon, eventid); + if (err) { + pr_debug("Failed to initialise event %u\n", eventid); + goto internal_error; + } + } + + cpus_read_unlock(); + + if (!resctrl_arch_alloc_capable() && !resctrl_arch_mon_capable()) { + pr_debug("No alloc(%u) or monitor(%u) found - resctrl not supported\n", + resctrl_arch_alloc_capable(), resctrl_arch_mon_capable()); + return -EOPNOTSUPP; + } + + err = resctrl_init(); + if (err) + return err; + + WRITE_ONCE(resctrl_enabled, true); + + return 0; + +internal_error: + cpus_read_unlock(); + pr_debug("Internal error %d - resctrl not supported\n", err); + return err; +} + +void mpam_resctrl_exit(void) +{ + if (!READ_ONCE(resctrl_enabled)) + return; + + WRITE_ONCE(resctrl_enabled, false); + resctrl_exit(); +} + +/* + * The driver is detaching an MSC from this class, if resctrl was using it, + * pull on resctrl_exit(). 
+ */ +void mpam_resctrl_teardown_class(struct mpam_class *class) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; + + might_sleep(); + + for_each_mpam_resctrl_control(res, rid) { + if (res->class == class) { + res->class = NULL; + break; + } + } + for_each_mpam_resctrl_mon(mon, eventid) { + if (mon->class == class) { + mon->class = NULL; + break; + } + } +} + +static int __init __cacheinfo_ready(void) +{ + cacheinfo_ready = true; + wake_up(&wait_cacheinfo_ready); + + return 0; +} +device_initcall_sync(__cacheinfo_ready); + +#ifdef CONFIG_MPAM_KUNIT_TEST +#include "test_mpam_resctrl.c" +#endif diff --git a/drivers/resctrl/test_mpam_resctrl.c b/drivers/resctrl/test_mpam_resctrl.c new file mode 100644 index 000000000000..b93d6ad87e43 --- /dev/null +++ b/drivers/resctrl/test_mpam_resctrl.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. +/* This file is intended to be included into mpam_resctrl.c */ + +#include +#include +#include +#include +#include + +struct percent_value_case { + u8 pc; + u8 width; + u16 value; +}; + +/* + * Mysterious inscriptions taken from the union of ARM DDI 0598D.b, + * "Arm Architecture Reference Manual Supplement - Memory System + * Resource Partitioning and Monitoring (MPAM), for A-profile + * architecture", Section 9.8, "About the fixed-point fractional + * format" (exact percentage entries only) and ARM IHI0099B.a + * "MPAM system component specification", Section 9.3, + * "The fixed-point fractional format": + */ +static const struct percent_value_case percent_value_cases[] = { + /* Architectural cases: */ + { 1, 8, 1 }, { 1, 12, 0x27 }, { 1, 16, 0x28e }, + { 25, 8, 0x3f }, { 25, 12, 0x3ff }, { 25, 16, 0x3fff }, + { 33, 8, 0x53 }, { 33, 12, 0x546 }, { 33, 16, 0x5479 }, + { 35, 8, 0x58 }, { 35, 12, 0x598 }, { 35, 16, 0x5998 }, + { 45, 8, 0x72 }, { 45, 12, 0x732 }, { 45, 16, 0x7332 }, + { 50, 8, 0x7f }, { 50, 12, 0x7ff }, 
{ 50, 16, 0x7fff }, + { 52, 8, 0x84 }, { 52, 12, 0x850 }, { 52, 16, 0x851d }, + { 55, 8, 0x8b }, { 55, 12, 0x8cb }, { 55, 16, 0x8ccb }, + { 58, 8, 0x93 }, { 58, 12, 0x946 }, { 58, 16, 0x9479 }, + { 75, 8, 0xbf }, { 75, 12, 0xbff }, { 75, 16, 0xbfff }, + { 80, 8, 0xcb }, { 80, 12, 0xccb }, { 80, 16, 0xcccb }, + { 88, 8, 0xe0 }, { 88, 12, 0xe13 }, { 88, 16, 0xe146 }, + { 95, 8, 0xf2 }, { 95, 12, 0xf32 }, { 95, 16, 0xf332 }, + { 100, 8, 0xff }, { 100, 12, 0xfff }, { 100, 16, 0xffff }, +}; + +static void test_percent_value_desc(const struct percent_value_case *param, + char *desc) +{ + snprintf(desc, KUNIT_PARAM_DESC_SIZE, + "pc=%d, width=%d, value=0x%.*x\n", + param->pc, param->width, + DIV_ROUND_UP(param->width, 4), param->value); +} + +KUNIT_ARRAY_PARAM(test_percent_value, percent_value_cases, + test_percent_value_desc); + +struct percent_value_test_info { + u32 pc; /* result of value-to-percent conversion */ + u32 value; /* result of percent-to-value conversion */ + u32 max_value; /* maximum raw value allowed by test params */ + unsigned int shift; /* promotes raw testcase value to 16 bits */ +}; + +/* + * Convert a reference percentage to a fixed-point MAX value and + * vice-versa, based on param (not test->param_value!) 
+ */ +static void __prepare_percent_value_test(struct kunit *test, + struct percent_value_test_info *res, + const struct percent_value_case *param) +{ + struct mpam_props fake_props = { }; + + /* Reject bogus test parameters that would break the tests: */ + KUNIT_ASSERT_GE(test, param->width, 1); + KUNIT_ASSERT_LE(test, param->width, 16); + KUNIT_ASSERT_LT(test, param->value, 1 << param->width); + + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + fake_props.bwa_wd = param->width; + + res->shift = 16 - param->width; + res->max_value = GENMASK_U32(param->width - 1, 0); + res->value = percent_to_mbw_max(param->pc, &fake_props); + res->pc = mbw_max_to_percent(param->value << res->shift, &fake_props); +} + +static void test_get_mba_granularity(struct kunit *test) +{ + int ret; + struct mpam_props fake_props = { }; + + /* Use MBW_MAX */ + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + + fake_props.bwa_wd = 0; + KUNIT_EXPECT_FALSE(test, mba_class_use_mbw_max(&fake_props)); + + fake_props.bwa_wd = 1; + KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props)); + + /* Architectural maximum: */ + fake_props.bwa_wd = 16; + KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props)); + + /* No usable control... 
*/ + fake_props.bwa_wd = 0; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 0); + + fake_props.bwa_wd = 1; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 50); /* DIV_ROUND_UP(100, 1 << 1)% = 50% */ + + fake_props.bwa_wd = 2; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 25); /* DIV_ROUND_UP(100, 1 << 2)% = 25% */ + + fake_props.bwa_wd = 3; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 13); /* DIV_ROUND_UP(100, 1 << 3)% = 13% */ + + fake_props.bwa_wd = 6; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 2); /* DIV_ROUND_UP(100, 1 << 6)% = 2% */ + + fake_props.bwa_wd = 7; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 7)% = 1% */ + + /* Granularity saturates at 1% */ + fake_props.bwa_wd = 16; /* architectural maximum */ + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 16)% = 1% */ +} + +static void test_mbw_max_to_percent(struct kunit *test) +{ + const struct percent_value_case *param = test->param_value; + struct percent_value_test_info res; + + /* + * Since the reference values in percent_value_cases[] all + * correspond to exact percentages, round-to-nearest will + * always give the exact percentage back when the MPAM max + * value has precision of 0.5% or finer. (Always true for the + * reference data, since they all specify 8 bits or more of + * precision.)
+ * + * So, keep it simple and demand an exact match: + */ + __prepare_percent_value_test(test, &res, param); + KUNIT_EXPECT_EQ(test, res.pc, param->pc); +} + +static void test_percent_to_mbw_max(struct kunit *test) +{ + const struct percent_value_case *param = test->param_value; + struct percent_value_test_info res; + + __prepare_percent_value_test(test, &res, param); + + KUNIT_EXPECT_GE(test, res.value, param->value << res.shift); + KUNIT_EXPECT_LE(test, res.value, (param->value + 1) << res.shift); + KUNIT_EXPECT_LE(test, res.value, res.max_value << res.shift); + + /* No flexibility allowed for 0% and 100%! */ + + if (param->pc == 0) + KUNIT_EXPECT_EQ(test, res.value, 0); + + if (param->pc == 100) + KUNIT_EXPECT_EQ(test, res.value, res.max_value << res.shift); +} + +static const void *test_all_bwa_wd_gen_params(struct kunit *test, const void *prev, + char *desc) +{ + uintptr_t param = (uintptr_t)prev; + + if (param > 15) + return NULL; + + param++; + + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "wd=%u\n", (unsigned int)param); + + return (void *)param; +} + +static unsigned int test_get_bwa_wd(struct kunit *test) +{ + uintptr_t param = (uintptr_t)test->param_value; + + KUNIT_ASSERT_GE(test, param, 1); + KUNIT_ASSERT_LE(test, param, 16); + + return param; +} + +static void test_mbw_max_to_percent_limits(struct kunit *test) +{ + struct mpam_props fake_props = {0}; + u32 max_value; + + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + fake_props.bwa_wd = test_get_bwa_wd(test); + max_value = GENMASK(15, 16 - fake_props.bwa_wd); + + KUNIT_EXPECT_EQ(test, mbw_max_to_percent(max_value, &fake_props), + MAX_MBA_BW); + KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props), + get_mba_min(&fake_props)); + + /* + * Rounding policy dependent 0% sanity-check: + * With round-to-nearest, the minimum mbw_max value really + * should map to 0% if there are at least 200 steps. + * (100 steps may be enough for some other rounding policies.) 
+ */ + if (fake_props.bwa_wd >= 8) + KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props), 0); + + if (fake_props.bwa_wd < 8 && + mbw_max_to_percent(0, &fake_props) == 0) + kunit_warn(test, "wd=%d: Testsuite/driver Rounding policy mismatch?", + fake_props.bwa_wd); +} + +/* + * Check that converting a percentage to mbw_max and back again (or, as + * appropriate, vice-versa) always restores the original value: + */ +static void test_percent_max_roundtrip_stability(struct kunit *test) +{ + struct mpam_props fake_props = {0}; + unsigned int shift; + u32 pc, max, pc2, max2; + + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + fake_props.bwa_wd = test_get_bwa_wd(test); + shift = 16 - fake_props.bwa_wd; + + /* + * Converting a valid value from the coarser scale to the finer + * scale and back again must yield the original value: + */ + if (fake_props.bwa_wd >= 7) { + /* More than 100 steps: only test exact pc values: */ + for (pc = get_mba_min(&fake_props); pc <= MAX_MBA_BW; pc++) { + max = percent_to_mbw_max(pc, &fake_props); + pc2 = mbw_max_to_percent(max, &fake_props); + KUNIT_EXPECT_EQ(test, pc2, pc); + } + } else { + /* Fewer than 100 steps: only test exact mbw_max values: */ + for (max = 0; max < 1 << 16; max += 1 << shift) { + pc = mbw_max_to_percent(max, &fake_props); + max2 = percent_to_mbw_max(pc, &fake_props); + KUNIT_EXPECT_EQ(test, max2, max); + } + } +} + +static void test_percent_to_max_rounding(struct kunit *test) +{ + const struct percent_value_case *param = test->param_value; + unsigned int num_rounded_up = 0, total = 0; + struct percent_value_test_info res; + + for (param = percent_value_cases, total = 0; + param < &percent_value_cases[ARRAY_SIZE(percent_value_cases)]; + param++, total++) { + __prepare_percent_value_test(test, &res, param); + if (res.value > param->value << res.shift) + num_rounded_up++; + } + + /* + * The MPAM driver applies a round-to-nearest policy, whereas a + * round-down policy seems to have been applied in the + * 
reference table from which the test vectors were selected. + * + * For a large and well-distributed suite of test vectors, + * about half should be rounded up and half down compared with + * the reference table. The actual test vectors are few in + * number and probably not very well distributed however, so + * tolerate a round-up rate of between 1/4 and 3/4 before + * crying foul: + */ + + kunit_info(test, "Round-up rate: %u%% (%u/%u)\n", + DIV_ROUND_CLOSEST(num_rounded_up * 100, total), + num_rounded_up, total); + + KUNIT_EXPECT_GE(test, 4 * num_rounded_up, 1 * total); + KUNIT_EXPECT_LE(test, 4 * num_rounded_up, 3 * total); +} + +static struct kunit_case mpam_resctrl_test_cases[] = { + KUNIT_CASE(test_get_mba_granularity), + KUNIT_CASE_PARAM(test_mbw_max_to_percent, test_percent_value_gen_params), + KUNIT_CASE_PARAM(test_percent_to_mbw_max, test_percent_value_gen_params), + KUNIT_CASE_PARAM(test_mbw_max_to_percent_limits, test_all_bwa_wd_gen_params), + KUNIT_CASE(test_percent_to_max_rounding), + KUNIT_CASE_PARAM(test_percent_max_roundtrip_stability, + test_all_bwa_wd_gen_params), + {} +}; + +static struct kunit_suite mpam_resctrl_test_suite = { + .name = "mpam_resctrl_test_suite", + .test_cases = mpam_resctrl_test_cases, +}; + +kunit_test_suites(&mpam_resctrl_test_suite); diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 7f00c5285a32..f92a36187a52 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -5,6 +5,7 @@ #define __LINUX_ARM_MPAM_H #include +#include #include struct mpam_msc; @@ -49,6 +50,37 @@ static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, } #endif +bool resctrl_arch_alloc_capable(void); +bool resctrl_arch_mon_capable(void); + +void resctrl_arch_set_cpu_default_closid(int cpu, u32 closid); +void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid); +void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid); +void resctrl_arch_sched_in(struct task_struct 
*tsk); +bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid); +bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid); +u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid); +void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid); +u32 resctrl_arch_system_num_rmid_idx(void); + +struct rdt_resource; +void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); +void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); + +/* + * The CPU configuration for MPAM is cheap to write, and is only written if it + * has changed. No need for fine grained enables. + */ +static inline void resctrl_arch_enable_mon(void) { } +static inline void resctrl_arch_disable_mon(void) { } +static inline void resctrl_arch_enable_alloc(void) { } +static inline void resctrl_arch_disable_alloc(void) { } + +static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) +{ + return val; +} + /** * mpam_register_requestor() - Register a requestor with the MPAM driver * @partid_max: The maximum PARTID value the requestor can generate. 
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index d223246401bc..e04d67e999a1 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -324,7 +324,7 @@ static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs) { instrumentation_begin(); syscall_exit_to_user_mode_work(regs); - local_irq_disable_exit_to_user(); + local_irq_disable(); syscall_exit_to_user_mode_prepare(regs); instrumentation_end(); exit_to_user_mode(); diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index b976946b3cdb..7ab41eec549f 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -109,37 +109,6 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs) instrumentation_end(); } -/** - * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable() - * @ti_work: Cached TIF flags gathered with interrupts disabled - * - * Defaults to local_irq_enable(). Can be supplied by architecture specific - * code. - */ -static inline void local_irq_enable_exit_to_user(unsigned long ti_work); - -#ifndef local_irq_enable_exit_to_user -static __always_inline void local_irq_enable_exit_to_user(unsigned long ti_work) -{ - local_irq_enable(); -} -#endif - -/** - * local_irq_disable_exit_to_user - Exit to user variant of local_irq_disable() - * - * Defaults to local_irq_disable(). Can be supplied by architecture specific - * code. - */ -static inline void local_irq_disable_exit_to_user(void); - -#ifndef local_irq_disable_exit_to_user -static __always_inline void local_irq_disable_exit_to_user(void) -{ - local_irq_disable(); -} -#endif - /** * arch_exit_to_user_mode_work - Architecture specific TIF work for exit * to user mode. 
@@ -348,6 +317,8 @@ static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs) */ static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs) { + lockdep_assert_irqs_disabled(); + instrumentation_begin(); irqentry_exit_to_user_mode_prepare(regs); instrumentation_end(); @@ -378,6 +349,207 @@ typedef struct irqentry_state { } irqentry_state_t; #endif +/** + * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt + * + * Conditional reschedule with additional sanity checks. + */ +void raw_irqentry_exit_cond_resched(void); + +#ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched +#define irqentry_exit_cond_resched_dynamic_disabled NULL +DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched); +#define irqentry_exit_cond_resched() static_call(irqentry_exit_cond_resched)() +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); +void dynamic_irqentry_exit_cond_resched(void); +#define irqentry_exit_cond_resched() dynamic_irqentry_exit_cond_resched() +#endif +#else /* CONFIG_PREEMPT_DYNAMIC */ +#define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched() +#endif /* CONFIG_PREEMPT_DYNAMIC */ + +/** + * irqentry_enter_from_kernel_mode - Establish state before invoking the irq handler + * @regs: Pointer to current's pt_regs + * + * Invoked from architecture specific entry code with interrupts disabled. + * Can only be called when the interrupt entry came from kernel mode. The + * calling code must be non-instrumentable. When the function returns all + * state is correct and the subsequent functions can be instrumented.
+ * + * The function establishes state (lockdep, RCU (context tracking), tracing) and + * is provided for architectures which require a strict split between entry from + * kernel and user mode and therefore cannot use irqentry_enter() which handles + * both entry modes. + * + * Returns: An opaque object that must be passed to irqentry_exit_to_kernel_mode(). + */ +static __always_inline irqentry_state_t irqentry_enter_from_kernel_mode(struct pt_regs *regs) +{ + irqentry_state_t ret = { + .exit_rcu = false, + }; + + /* + * If this entry hit the idle task invoke ct_irq_enter() whether + * RCU is watching or not. + * + * Interrupts can nest when the first interrupt invokes softirq + * processing on return which enables interrupts. + * + * Scheduler ticks in the idle task can mark quiescent state and + * terminate a grace period, if and only if the timer interrupt is + * not nested into another interrupt. + * + * Checking for rcu_is_watching() here would prevent the nesting + * interrupt to invoke ct_irq_enter(). If that nested interrupt is + * the tick then rcu_flavor_sched_clock_irq() would wrongfully + * assume that it is the first interrupt and eventually claim + * quiescent state and end grace periods prematurely. + * + * Unconditionally invoke ct_irq_enter() so RCU state stays + * consistent. + * + * TINY_RCU does not support EQS, so let the compiler eliminate + * this part when enabled. + */ + if (!IS_ENABLED(CONFIG_TINY_RCU) && + (is_idle_task(current) || arch_in_rcu_eqs())) { + /* + * If RCU is not watching then the same careful + * sequence vs. lockdep and tracing is required + * as in irqentry_enter_from_user_mode(). + */ + lockdep_hardirqs_off(CALLER_ADDR0); + ct_irq_enter(); + instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); + trace_hardirqs_off_finish(); + instrumentation_end(); + + ret.exit_rcu = true; + return ret; + } + + /* + * If RCU is watching then RCU only wants to check whether it needs + * to restart the tick in NOHZ mode. 
rcu_irq_enter_check_tick() + * already contains a warning when RCU is not watching, so no point + * in having another one here. + */ + lockdep_hardirqs_off(CALLER_ADDR0); + instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); + rcu_irq_enter_check_tick(); + trace_hardirqs_off_finish(); + instrumentation_end(); + + return ret; +} + +/** + * irqentry_exit_to_kernel_mode_preempt - Run preempt checks on return to kernel mode + * @regs: Pointer to current's pt_regs + * @state: Return value from matching call to irqentry_enter_from_kernel_mode() + * + * This is to be invoked before irqentry_exit_to_kernel_mode_after_preempt() to + * allow kernel preemption on return from interrupt. + * + * Must be invoked with interrupts disabled and CPU state which allows kernel + * preemption. + * + * After returning from this function, the caller can modify CPU state before + * invoking irqentry_exit_to_kernel_mode_after_preempt(), which is required to + * re-establish the tracing, lockdep and RCU state for returning to the + * interrupted context. + */ +static inline void irqentry_exit_to_kernel_mode_preempt(struct pt_regs *regs, + irqentry_state_t state) +{ + if (regs_irqs_disabled(regs) || state.exit_rcu) + return; + + if (IS_ENABLED(CONFIG_PREEMPTION)) + irqentry_exit_cond_resched(); + + hrtimer_rearm_deferred(); +} + +/** + * irqentry_exit_to_kernel_mode_after_preempt - Establish trace, lockdep and RCU state + * @regs: Pointer to current's pt_regs + * @state: Return value from matching call to irqentry_enter_from_kernel_mode() + * + * This is to be invoked after irqentry_exit_to_kernel_mode_preempt() and before + * actually returning to the interrupted context. + * + * There are no requirements for the CPU state other than being able to complete + * the tracing, lockdep and RCU state transitions. After this function returns + * the caller must return directly to the interrupted context. 
+ */ +static __always_inline void +irqentry_exit_to_kernel_mode_after_preempt(struct pt_regs *regs, irqentry_state_t state) +{ + if (!regs_irqs_disabled(regs)) { + /* + * If RCU was not watching on entry this needs to be done + * carefully and needs the same ordering of lockdep/tracing + * and RCU as the return to user mode path. + */ + if (state.exit_rcu) { + instrumentation_begin(); + /* Tell the tracer that IRET will enable interrupts */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + instrumentation_end(); + ct_irq_exit(); + lockdep_hardirqs_on(CALLER_ADDR0); + return; + } + + instrumentation_begin(); + /* Covers both tracing and lockdep */ + trace_hardirqs_on(); + instrumentation_end(); + } else { + /* + * IRQ flags state is correct already. Just tell RCU if it + * was not watching on entry. + */ + if (state.exit_rcu) + ct_irq_exit(); + } +} + +/** + * irqentry_exit_to_kernel_mode - Run preempt checks and establish state after + * invoking the interrupt handler + * @regs: Pointer to current's pt_regs + * @state: Return value from matching call to irqentry_enter_from_kernel_mode() + * + * This is the counterpart of irqentry_enter_from_kernel_mode() and combines + * the calls to irqentry_exit_to_kernel_mode_preempt() and + * irqentry_exit_to_kernel_mode_after_preempt(). + * + * The requirement for the CPU state is that it can schedule. After the function + * returns the tracing, lockdep and RCU state transitions are completed and the + * caller must return directly to the interrupted context. 
+ */ +static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs, + irqentry_state_t state) +{ + lockdep_assert_irqs_disabled(); + + instrumentation_begin(); + irqentry_exit_to_kernel_mode_preempt(regs, state); + instrumentation_end(); + + irqentry_exit_to_kernel_mode_after_preempt(regs, state); +} + /** * irqentry_enter - Handle state tracking on ordinary interrupt entries * @regs: Pointer to pt_regs of interrupted context @@ -407,32 +579,10 @@ typedef struct irqentry_state { * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit * would not be possible. * - * Returns: An opaque object that must be passed to idtentry_exit() + * Returns: An opaque object that must be passed to irqentry_exit() */ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs); -/** - * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt - * - * Conditional reschedule with additional sanity checks. - */ -void raw_irqentry_exit_cond_resched(void); - -#ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched -#define irqentry_exit_cond_resched_dynamic_disabled NULL -DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched); -#define irqentry_exit_cond_resched() static_call(irqentry_exit_cond_resched)() -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); -void dynamic_irqentry_exit_cond_resched(void); -#define irqentry_exit_cond_resched() dynamic_irqentry_exit_cond_resched() -#endif -#else /* CONFIG_PREEMPT_DYNAMIC */ -#define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched() -#endif /* CONFIG_PREEMPT_DYNAMIC */ - /** * irqentry_exit - Handle return from exception that used irqentry_enter() * @regs: Pointer to pt_regs (exception entry regs) diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 9e1a6afb07f2..19d2244a9fef 
100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -47,7 +47,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re */ while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) { - local_irq_enable_exit_to_user(ti_work); + local_irq_enable(); if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) { if (!rseq_grant_slice_extension(ti_work, TIF_SLICE_EXT_DENY)) @@ -74,7 +74,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re * might have changed while interrupts and preemption was * enabled above. */ - local_irq_disable_exit_to_user(); + local_irq_disable(); /* Check if any of the above work has queued a deferred wakeup */ tick_nohz_user_enter_prepare(); @@ -105,70 +105,16 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) { - irqentry_state_t ret = { - .exit_rcu = false, - }; - if (user_mode(regs)) { + irqentry_state_t ret = { + .exit_rcu = false, + }; + irqentry_enter_from_user_mode(regs); return ret; } - /* - * If this entry hit the idle task invoke ct_irq_enter() whether - * RCU is watching or not. - * - * Interrupts can nest when the first interrupt invokes softirq - * processing on return which enables interrupts. - * - * Scheduler ticks in the idle task can mark quiescent state and - * terminate a grace period, if and only if the timer interrupt is - * not nested into another interrupt. - * - * Checking for rcu_is_watching() here would prevent the nesting - * interrupt to invoke ct_irq_enter(). If that nested interrupt is - * the tick then rcu_flavor_sched_clock_irq() would wrongfully - * assume that it is the first interrupt and eventually claim - * quiescent state and end grace periods prematurely. - * - * Unconditionally invoke ct_irq_enter() so RCU state stays - * consistent. - * - * TINY_RCU does not support EQS, so let the compiler eliminate - * this part when enabled. 
- */ - if (!IS_ENABLED(CONFIG_TINY_RCU) && - (is_idle_task(current) || arch_in_rcu_eqs())) { - /* - * If RCU is not watching then the same careful - * sequence vs. lockdep and tracing is required - * as in irqentry_enter_from_user_mode(). - */ - lockdep_hardirqs_off(CALLER_ADDR0); - ct_irq_enter(); - instrumentation_begin(); - kmsan_unpoison_entry_regs(regs); - trace_hardirqs_off_finish(); - instrumentation_end(); - - ret.exit_rcu = true; - return ret; - } - - /* - * If RCU is watching then RCU only wants to check whether it needs - * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick() - * already contains a warning when RCU is not watching, so no point - * in having another one here. - */ - lockdep_hardirqs_off(CALLER_ADDR0); - instrumentation_begin(); - kmsan_unpoison_entry_regs(regs); - rcu_irq_enter_check_tick(); - trace_hardirqs_off_finish(); - instrumentation_end(); - - return ret; + return irqentry_enter_from_kernel_mode(regs); } /** @@ -212,45 +158,10 @@ void dynamic_irqentry_exit_cond_resched(void) noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) { - lockdep_assert_irqs_disabled(); - - /* Check whether this returns to user mode */ - if (user_mode(regs)) { + if (user_mode(regs)) irqentry_exit_to_user_mode(regs); - } else if (!regs_irqs_disabled(regs)) { - /* - * If RCU was not watching on entry this needs to be done - * carefully and needs the same ordering of lockdep/tracing - * and RCU as the return to user mode path. 
- */ - if (state.exit_rcu) { - instrumentation_begin(); - hrtimer_rearm_deferred(); - /* Tell the tracer that IRET will enable interrupts */ - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - instrumentation_end(); - ct_irq_exit(); - lockdep_hardirqs_on(CALLER_ADDR0); - return; - } - - instrumentation_begin(); - if (IS_ENABLED(CONFIG_PREEMPTION)) - irqentry_exit_cond_resched(); - - hrtimer_rearm_deferred(); - /* Covers both tracing and lockdep */ - trace_hardirqs_on(); - instrumentation_end(); - } else { - /* - * IRQ flags state is correct already. Just tell RCU if it - * was not watching on entry. - */ - if (state.exit_rcu) - ct_irq_exit(); - } + else + irqentry_exit_to_kernel_mode(regs, state); } irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index c2661a312fc9..e22703d6b97c 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -56,7 +56,8 @@ static void atomics_sigill(void) static void cmpbr_sigill(void) { - /* Not implemented, too complicated and unreliable anyway */ + asm volatile(".inst 0x74C00040\n" /* CBEQ w0, w0, +8 */ + "udf #0" : : : "cc"); /* UDF #0 */ } static void crc32_sigill(void) diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c index 73de5be58bab..fa3478a6c914 100644 --- a/tools/testing/selftests/kvm/arm64/set_id_regs.c +++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c @@ -124,6 +124,7 @@ static const struct reg_ftr_bits ftr_id_aa64isar2_el1[] = { static const struct reg_ftr_bits ftr_id_aa64isar3_el1[] = { REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, FPRCVT, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, LSUI, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, LSFE, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, FAMINMAX, 0), REG_FTR_END,