From 404ee89b4008cf2130554dac2c64cd8412601356 Mon Sep 17 00:00:00 2001 From: Louis-Alexis Eyraud Date: Fri, 3 Oct 2025 16:00:28 +0200 Subject: [PATCH 001/543] pinctrl: mediatek: mt8196: align register base names to dt-bindings ones The mt8196-pinctrl driver requires to probe that a device tree uses in the device node the same names than mt8196_pinctrl_register_base_names array. But they are not matching the required ones in the "mediatek,mt8196-pinctrl" dt-bindings, leading to possible dtbs check issues. So, align all mt8196_pinctrl_register_base_names entries on dt-bindings ones. Fixes: f7a29377c253 ("pinctrl: mediatek: Add pinctrl driver on mt8196") Signed-off-by: Louis-Alexis Eyraud Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Linus Walleij --- drivers/pinctrl/mediatek/pinctrl-mt8196.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/pinctrl/mediatek/pinctrl-mt8196.c b/drivers/pinctrl/mediatek/pinctrl-mt8196.c index 82a73929c7a0..dec957c1724b 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt8196.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt8196.c @@ -1801,10 +1801,8 @@ static const struct mtk_pin_reg_calc mt8196_reg_cals[PINCTRL_PIN_REG_MAX] = { }; static const char * const mt8196_pinctrl_register_base_names[] = { - "iocfg0", "iocfg_rt", "iocfg_rm1", "iocfg_rm2", - "iocfg_rb", "iocfg_bm1", "iocfg_bm2", "iocfg_bm3", - "iocfg_lt", "iocfg_lm1", "iocfg_lm2", "iocfg_lb1", - "iocfg_lb2", "iocfg_tm1", "iocfg_tm2", "iocfg_tm3", + "base", "rt", "rm1", "rm2", "rb", "bm1", "bm2", "bm3", + "lt", "lm1", "lm2", "lb1", "lb2", "tm1", "tm2", "tm3", }; static const struct mtk_eint_hw mt8196_eint_hw = { From 518919276c4119e34e24334003af70ab12477f00 Mon Sep 17 00:00:00 2001 From: Louis-Alexis Eyraud Date: Fri, 3 Oct 2025 15:48:49 +0200 Subject: [PATCH 002/543] pinctrl: mediatek: mt8189: align register base names to dt-bindings ones The mt8189-pinctrl driver requires to probe that a device tree uses in the device node the same names than 
mt8189_pinctrl_register_base_names array. But they are not matching the required ones in the "mediatek,mt8189-pinctrl" dt-bindings, leading to possible dtbs check issues. The mt8189_pinctrl_register_base_names entry order is also different. So, align all mt8189_pinctrl_register_base_names entry names and order on dt-bindings. Fixes: a3fe1324c3c5 ("pinctrl: mediatek: Add pinctrl driver for mt8189") Signed-off-by: Louis-Alexis Eyraud Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Linus Walleij --- drivers/pinctrl/mediatek/pinctrl-mt8189.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/pinctrl/mediatek/pinctrl-mt8189.c b/drivers/pinctrl/mediatek/pinctrl-mt8189.c index 7028aff55ae5..f6a3e584588b 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt8189.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt8189.c @@ -1642,9 +1642,7 @@ static const struct mtk_pin_reg_calc mt8189_reg_cals[PINCTRL_PIN_REG_MAX] = { }; static const char * const mt8189_pinctrl_register_base_names[] = { - "gpio_base", "iocfg_bm0_base", "iocfg_bm1_base", "iocfg_bm2_base", "iocfg_lm_base", - "iocfg_lt0_base", "iocfg_lt1_base", "iocfg_rb0_base", "iocfg_rb1_base", - "iocfg_rt_base" + "base", "lm", "rb0", "rb1", "bm0", "bm1", "bm2", "lt0", "lt1", "rt", }; static const struct mtk_eint_hw mt8189_eint_hw = { From c6d99e488117201c63efd747ce17b80687c3f5a9 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 13 Oct 2025 09:15:25 -0700 Subject: [PATCH 003/543] Input: goodix - add support for ACPI ID GDIX1003 Some newer devices use an ACPI hardware ID of GDIX1003 for their Goodix touchscreen controller, instead of GDIX1001 / GDIX1002. Add GDIX1003 to the goodix_acpi_match[] table. 
Reported-by: Weikang Guo Closes: https://lore.kernel.org/linux-input/20250225024409.1467040-1-guoweikang.kernel@gmail.com/ Tested-by: Weikang Guo Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20251013121022.44333-1-hansg@kernel.org Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/goodix.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/touchscreen/goodix.c b/drivers/input/touchscreen/goodix.c index 252dcae039f8..5551decb8d22 100644 --- a/drivers/input/touchscreen/goodix.c +++ b/drivers/input/touchscreen/goodix.c @@ -1557,6 +1557,7 @@ MODULE_DEVICE_TABLE(i2c, goodix_ts_id); static const struct acpi_device_id goodix_acpi_match[] = { { "GDIX1001", 0 }, { "GDIX1002", 0 }, + { "GDIX1003", 0 }, { "GDX9110", 0 }, { } }; From 7363096a5a08f8740c9075ecfc51945375c304bc Mon Sep 17 00:00:00 2001 From: Martyn Welch Date: Thu, 9 Oct 2025 14:41:32 +0100 Subject: [PATCH 004/543] Input: goodix - remove setting of RST pin to input The reset line is being set to input on non-ACPI devices apparently to save power. This isn't being done on ACPI devices as it's been found that some ACPI devices don't have a pull-up resistor fitted. This can also be the case for non-ACPI devices, resulting in: [ 941.672207] Goodix-TS 1-0014: Error reading 10 bytes from 0x814e: -110 [ 942.696168] Goodix-TS 1-0014: Error reading 10 bytes from 0x814e: -110 [ 945.832208] Goodix-TS 1-0014: Error reading 10 bytes from 0x814e: -110 This behaviour appears to have been initially introduced in ec6e1b4082d9. This doesn't seem to be based on information in either the GT911 or GT9271 datasheets cited as sources of information for this change. Thus it seems likely that it is based on functionality in the Android driver which it also lists. This behaviour may be viable in very specific instances where the hardware is well known, but seems unwise in the upstream kernel where such hardware requirements can't be guaranteed. 
Remove this over optimisation to improve reliability on non-ACPI devices. Signed-off-by: Martyn Welch Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20251009134138.686215-1-martyn.welch@collabora.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/goodix.c | 27 +-------------------------- drivers/input/touchscreen/goodix.h | 1 - 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/drivers/input/touchscreen/goodix.c b/drivers/input/touchscreen/goodix.c index 5551decb8d22..f8798d11ec03 100644 --- a/drivers/input/touchscreen/goodix.c +++ b/drivers/input/touchscreen/goodix.c @@ -796,17 +796,6 @@ int goodix_reset_no_int_sync(struct goodix_ts_data *ts) usleep_range(6000, 10000); /* T4: > 5ms */ - /* - * Put the reset pin back in to input / high-impedance mode to save - * power. Only do this in the non ACPI case since some ACPI boards - * don't have a pull-up, so there the reset pin must stay active-high. - */ - if (ts->irq_pin_access_method == IRQ_PIN_ACCESS_GPIO) { - error = gpiod_direction_input(ts->gpiod_rst); - if (error) - goto error; - } - return 0; error: @@ -957,14 +946,6 @@ static int goodix_add_acpi_gpio_mappings(struct goodix_ts_data *ts) return -EINVAL; } - /* - * Normally we put the reset pin in input / high-impedance mode to save - * power. But some x86/ACPI boards don't have a pull-up, so for the ACPI - * case, leave the pin as is. This results in the pin not being touched - * at all on x86/ACPI boards, except when needed for error-recover. - */ - ts->gpiod_rst_flags = GPIOD_ASIS; - return devm_acpi_dev_add_driver_gpios(dev, gpio_mapping); } #else @@ -989,12 +970,6 @@ static int goodix_get_gpio_config(struct goodix_ts_data *ts) return -EINVAL; dev = &ts->client->dev; - /* - * By default we request the reset pin as input, leaving it in - * high-impedance when not resetting the controller to save power. 
- */ - ts->gpiod_rst_flags = GPIOD_IN; - ts->avdd28 = devm_regulator_get(dev, "AVDD28"); if (IS_ERR(ts->avdd28)) return dev_err_probe(dev, PTR_ERR(ts->avdd28), "Failed to get AVDD28 regulator\n"); @@ -1019,7 +994,7 @@ static int goodix_get_gpio_config(struct goodix_ts_data *ts) ts->gpiod_int = gpiod; /* Get the reset line GPIO pin number */ - gpiod = devm_gpiod_get_optional(dev, GOODIX_GPIO_RST_NAME, ts->gpiod_rst_flags); + gpiod = devm_gpiod_get_optional(dev, GOODIX_GPIO_RST_NAME, GPIOD_ASIS); if (IS_ERR(gpiod)) return dev_err_probe(dev, PTR_ERR(gpiod), "Failed to get %s GPIO\n", GOODIX_GPIO_RST_NAME); diff --git a/drivers/input/touchscreen/goodix.h b/drivers/input/touchscreen/goodix.h index 87797cc88b32..0d1e8a8d2cba 100644 --- a/drivers/input/touchscreen/goodix.h +++ b/drivers/input/touchscreen/goodix.h @@ -88,7 +88,6 @@ struct goodix_ts_data { struct gpio_desc *gpiod_rst; int gpio_count; int gpio_int_idx; - enum gpiod_flags gpiod_rst_flags; char id[GOODIX_ID_MAX_LEN + 1]; char cfg_name[64]; u16 version; From d425aef66e62221fa6bb0ccb94296df29e4cc107 Mon Sep 17 00:00:00 2001 From: Anand Moon Date: Mon, 13 Oct 2025 20:50:03 +0530 Subject: [PATCH 005/543] arm64: dts: rockchip: Set correct pinctrl for I2S1 8ch TX on odroid-m1 Enable proper pin multiplexing for the I2S1 8-channel transmit interface by adding the default pinctrl configuration which ensures correct signal routing and avoids pinmux conflicts during audio playback. 
Changes fix the error [ 116.856643] [ T782] rockchip-pinctrl pinctrl: pin gpio1-10 already requested by affinity_hint; cannot claim for fe410000.i2s [ 116.857567] [ T782] rockchip-pinctrl pinctrl: error -EINVAL: pin-42 (fe410000.i2s) [ 116.857618] [ T782] rockchip-pinctrl pinctrl: error -EINVAL: could not request pin 42 (gpio1-10) from group i2s1m0-sdi1 on device rockchip-pinctrl [ 116.857659] [ T782] rockchip-i2s-tdm fe410000.i2s: Error applying setting, reverse things back I2S1 on the M1 to the codec in the RK809 only uses the SCLK, LRCK, SDI0 and SDO0 signals, so limit the claimed pins to those. With this change audio output works as expected: $ aplay -l **** List of PLAYBACK Hardware Devices **** card 0: HDMI [HDMI], device 0: fe400000.i2s-i2s-hifi i2s-hifi-0 [fe400000.i2s-i2s-hifi i2s-hifi-0] Subdevices: 1/1 Subdevice #0: subdevice #0 card 1: RK817 [Analog RK817], device 0: fe410000.i2s-rk817-hifi rk817-hifi-0 [fe410000.i2s-rk817-hifi rk817-hifi-0] Subdevices: 1/1 Subdevice #0: subdevice #0 Fixes: 78f858447cb7 ("arm64: dts: rockchip: Add analog audio on ODROID-M1") Cc: Aurelien Jarno Signed-off-by: Anand Moon [adapted the commit message a bit] Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts b/arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts index 0f844806ec54..442a2bc43ba8 100644 --- a/arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts +++ b/arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts @@ -482,6 +482,8 @@ &i2s0_8ch { }; &i2s1_8ch { + pinctrl-names = "default"; + pinctrl-0 = <&i2s1m0_sclktx &i2s1m0_lrcktx &i2s1m0_sdi0 &i2s1m0_sdo0>; rockchip,trcm-sync-tx-only; status = "okay"; }; From e179de737d13ad99bd19ea0fafab759d4074a425 Mon Sep 17 00:00:00 2001 From: Andrey Leonchikov Date: Sun, 12 Oct 2025 14:33:36 +0200 Subject: [PATCH 006/543] arm64: dts: rockchip: Fix PCIe power enable pin for BigTreeTech CB2 and Pi2 Fix typo 
into regulator GPIO definition. With current definition, PCIe doesn't start up. Valid definition is already used in "pinctrl" section, "pcie_drv" (gpio4, RK_PB1). Fixes: bfbc663d2733a ("arm64: dts: rockchip: Add BigTreeTech CB2 and Pi2") Signed-off-by: Andrey Leonchikov Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi b/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi index 7f578c50b4ad..f74590af7e33 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi @@ -120,7 +120,7 @@ vcc3v3_pcie: regulator-vcc3v3-pcie { compatible = "regulator-fixed"; regulator-name = "vcc3v3_pcie"; enable-active-high; - gpios = <&gpio0 RK_PB1 GPIO_ACTIVE_HIGH>; + gpios = <&gpio4 RK_PB1 GPIO_ACTIVE_HIGH>; pinctrl-names = "default"; pinctrl-0 = <&pcie_drv>; regulator-always-on; From 05b80cd1f37db042e074ecc7ee0d39869fed2f52 Mon Sep 17 00:00:00 2001 From: Alexey Charkov Date: Thu, 9 Oct 2025 16:34:01 +0400 Subject: [PATCH 007/543] arm64: dts: rockchip: Remove non-functioning CPU OPPs from RK3576 Drop the top-frequency OPPs from both the LITTLE and big CPU clusters on RK3576, as neither the opensource TF-A [1] nor the recent (after v1.08) binary BL31 images provided by Rockchip expose those. This fixes the problem [2] when the cpufreq governor tries to jump directly to the highest-frequency OPP, which results in a failed SCMI call leaving the system stuck at the previous OPP before the attempted change. 
[1] https://github.com/ARM-software/arm-trusted-firmware/blob/master/plat/rockchip/rk3576/scmi/rk3576_clk.c#L264-L304 [2] https://lore.kernel.org/linux-rockchip/CABjd4Yz4NbqzZH4Qsed3ias56gcga9K6CmYA+BLDBxtbG915Ag@mail.gmail.com/ Fixes: 57b1ce903966 ("arm64: dts: rockchip: Add rk3576 SoC base DT") Cc: stable@vger.kernel.org Signed-off-by: Alexey Charkov Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3576.dtsi | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3576.dtsi b/arch/arm64/boot/dts/rockchip/rk3576.dtsi index fc4e9e07f1cf..f0c3ab00a7f3 100644 --- a/arch/arm64/boot/dts/rockchip/rk3576.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3576.dtsi @@ -276,12 +276,6 @@ opp-2016000000 { opp-microvolt = <900000 900000 950000>; clock-latency-ns = <40000>; }; - - opp-2208000000 { - opp-hz = /bits/ 64 <2208000000>; - opp-microvolt = <950000 950000 950000>; - clock-latency-ns = <40000>; - }; }; cluster1_opp_table: opp-table-cluster1 { @@ -348,12 +342,6 @@ opp-2208000000 { opp-microvolt = <925000 925000 950000>; clock-latency-ns = <40000>; }; - - opp-2304000000 { - opp-hz = /bits/ 64 <2304000000>; - opp-microvolt = <950000 950000 950000>; - clock-latency-ns = <40000>; - }; }; gpu_opp_table: opp-table-gpu { From afb5f84b216d14a71e2962ed569edcea30cf9763 Mon Sep 17 00:00:00 2001 From: Diederik de Haas Date: Wed, 8 Oct 2025 10:21:22 +0200 Subject: [PATCH 008/543] arm64: dts: rockchip: Drop 'rockchip,grf' prop from tsadc on rk3328 The 'rockchip,grf' property for tsadc in rk3328 wasn't actually used in the driver and is no longer allowed in the DT since commit e881662aa06a ("dt-bindings: thermal: rockchip: Tighten grf requirements") So remove that property which fixes the following DT validation issue tsadc@ff250000 (rockchip,rk3328-tsadc): rockchip,grf: False schema does not allow [[58]] Signed-off-by: Diederik de Haas Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3328.dtsi | 1 - 1 file changed, 1 
deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi index 283d9cbc4368..03b7c4313750 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi @@ -598,7 +598,6 @@ tsadc: tsadc@ff250000 { pinctrl-2 = <&otp_pin>; resets = <&cru SRST_TSADC>; reset-names = "tsadc-apb"; - rockchip,grf = <&grf>; rockchip,hw-tshut-temp = <100000>; #thermal-sensor-cells = <1>; status = "disabled"; From b3fd04e23f6e4496f5a2279466a33fbdc83500f0 Mon Sep 17 00:00:00 2001 From: Dragan Simic Date: Sat, 6 Sep 2025 12:01:22 +0200 Subject: [PATCH 009/543] arm64: dts: rockchip: Make RK3588 GPU OPP table naming less generic Unify the naming of the existing GPU OPP table nodes found in the RK3588 and RK3588J SoC dtsi files with the other SoC's GPU OPP nodes, following the more "modern" node naming scheme. Fixes: a7b2070505a2 ("arm64: dts: rockchip: Split GPU OPPs of RK3588 and RK3588j") Signed-off-by: Dragan Simic [opp-table also is way too generic on systems with like 4-5 opp-tables] Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-opp.dtsi | 2 +- arch/arm64/boot/dts/rockchip/rk3588j.dtsi | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3588-opp.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-opp.dtsi index 0f1a77697351..b5d630d2c879 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-opp.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-opp.dtsi @@ -115,7 +115,7 @@ opp-2400000000 { }; }; - gpu_opp_table: opp-table { + gpu_opp_table: opp-table-gpu { compatible = "operating-points-v2"; opp-300000000 { diff --git a/arch/arm64/boot/dts/rockchip/rk3588j.dtsi b/arch/arm64/boot/dts/rockchip/rk3588j.dtsi index 9884a5df47df..e1e0e3fc0ca7 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588j.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588j.dtsi @@ -66,7 +66,7 @@ opp-1608000000 { }; }; - gpu_opp_table: opp-table { + gpu_opp_table: opp-table-gpu { compatible 
= "operating-points-v2"; opp-300000000 { From ce121914f38aaa59504e20a1a625e5988fc6ead4 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 15 Oct 2025 16:52:26 +0100 Subject: [PATCH 010/543] arm64: tegra: Mark Jetson Xavier NX's PHY as a wakeup source Mark the RTL8211F PHY as a wakeup source for the Jetson Xavier NX. This allows the reworked RTL8211F driver to know that the PHY is wired to wakeup capable hardware, and thus to expose WoL capabilities. Signed-off-by: Russell King (Oracle) Acked-by: Jon Hunter Signed-off-by: Thierry Reding --- arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi b/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi index a410fc335fa3..c0f17f8189fa 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi @@ -42,6 +42,7 @@ phy: ethernet-phy@0 { interrupt-parent = <&gpio>; interrupts = ; #phy-cells = <0>; + wakeup-source; }; }; }; From ea138a607709bf72c162f62d2a670fe899d73daa Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Wed, 15 Oct 2025 17:17:09 -0700 Subject: [PATCH 011/543] RISC-V: KVM: Fix check for local interrupts on riscv32 To set all 64 bits in the mask on a 32-bit system, the constant must have type `unsigned long long`. 
Fixes: 6b1e8ba4bac4 ("RISC-V: KVM: Use bitmap for irqs_pending and irqs_pending_mask") Signed-off-by: Samuel Holland Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20251016001714.3889380-1-samuel.holland@sifive.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index bccb919ca615..5ce35aba6069 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -212,7 +212,7 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - return (kvm_riscv_vcpu_has_interrupts(vcpu, -1UL) && + return (kvm_riscv_vcpu_has_interrupts(vcpu, -1ULL) && !kvm_riscv_vcpu_stopped(vcpu) && !vcpu->arch.pause); } From 85893094535cced32b33766e283240164a5b11f8 Mon Sep 17 00:00:00 2001 From: Tao Ren Date: Wed, 15 Oct 2025 13:48:37 -0700 Subject: [PATCH 012/543] ARM: dts: aspeed: fuji-data64: Enable mac3 controller "mac3" controller was removed from the initial version of fuji-data64 dts because the rgmii setting is incorrect, but dropping mac3 leads to regression in the existing fuji platform, because fuji.dts simply includes fuji-data64.dts. This patch adds mac3 back to fuji-data64.dts to fix the fuji regression[1], and rgmii settings need to be fixed later. 
Fixes: b0f294fdfc3e ("ARM: dts: aspeed: facebook-fuji: Include facebook-fuji-data64.dts") Link: https://lore.kernel.org/all/79ddc7b9-ef26-4959-9a16-aa4e006eb145@roeck-us.net/ [1] Signed-off-by: Tao Ren Reviewed-by: Andrew Lunn Signed-off-by: Andrew Jeffery --- .../dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts index aa9576d8ab56..48ca25f57ef6 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts @@ -1254,3 +1254,17 @@ &emmc { max-frequency = <25000000>; bus-width = <4>; }; + +/* + * FIXME: rgmii delay is introduced by MAC (configured in u-boot now) + * instead of PCB on fuji board, so the "phy-mode" should be updated to + * "rgmii-[tx|rx]id" when the aspeed-mac driver can handle the delay + * properly. + */ +&mac3 { + status = "okay"; + phy-mode = "rgmii"; + phy-handle = <&ethphy3>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_rgmii4_default>; +}; From 873f10cf8e4d59605bc38fa1051dea8ee56fe3be Mon Sep 17 00:00:00 2001 From: Fangyu Yu Date: Thu, 16 Oct 2025 09:26:59 +0800 Subject: [PATCH 013/543] RISC-V: KVM: Read HGEIP CSR on the correct cpu When executing kvm_riscv_vcpu_aia_has_interrupts, the vCPU may have migrated and the IMSIC VS-file have not been updated yet, currently the HGEIP CSR should be read from the imsic->vsfile_cpu ( the pCPU before migration ) via on_each_cpu_mask, but this will trigger an IPI call and repeated IPI within a period of time is expensive in a many-core systems. Just let the vCPU execute and update the correct IMSIC VS-file via kvm_riscv_vcpu_aia_imsic_update may be a simple solution. 
Fixes: 4cec89db80ba ("RISC-V: KVM: Move HGEI[E|P] CSR access to IMSIC virtualization") Signed-off-by: Fangyu Yu Reviewed-by: Guo Ren Reviewed-by: Anup Patel Tested-by: Anup Patel Link: https://lore.kernel.org/r/20251016012659.82998-1-fangyu.yu@linux.alibaba.com Signed-off-by: Anup Patel --- arch/riscv/kvm/aia_imsic.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c index fda0346f0ea1..11422cb95a64 100644 --- a/arch/riscv/kvm/aia_imsic.c +++ b/arch/riscv/kvm/aia_imsic.c @@ -689,8 +689,20 @@ bool kvm_riscv_vcpu_aia_imsic_has_interrupt(struct kvm_vcpu *vcpu) */ read_lock_irqsave(&imsic->vsfile_lock, flags); - if (imsic->vsfile_cpu > -1) - ret = !!(csr_read(CSR_HGEIP) & BIT(imsic->vsfile_hgei)); + if (imsic->vsfile_cpu > -1) { + /* + * This function is typically called from kvm_vcpu_block() via + * kvm_arch_vcpu_runnable() upon WFI trap. The kvm_vcpu_block() + * can be preempted and the blocking VCPU might resume on a + * different CPU. This means it is possible that current CPU + * does not match the imsic->vsfile_cpu hence this function + * must check imsic->vsfile_cpu before accessing HGEIP CSR. + */ + if (imsic->vsfile_cpu != vcpu->cpu) + ret = true; + else + ret = !!(csr_read(CSR_HGEIP) & BIT(imsic->vsfile_hgei)); + } read_unlock_irqrestore(&imsic->vsfile_lock, flags); return ret; From 69aeb507312306f73495598a055293fa749d454e Mon Sep 17 00:00:00 2001 From: Seungjin Bae Date: Fri, 17 Oct 2025 15:36:31 -0700 Subject: [PATCH 014/543] Input: pegasus-notetaker - fix potential out-of-bounds access In the pegasus_notetaker driver, the pegasus_probe() function allocates the URB transfer buffer using the wMaxPacketSize value from the endpoint descriptor. An attacker can use a malicious USB descriptor to force the allocation of a very small buffer. 
Subsequently, if the device sends an interrupt packet with a specific pattern (e.g., where the first byte is 0x80 or 0x42), the pegasus_parse_packet() function parses the packet without checking the allocated buffer size. This leads to an out-of-bounds memory access. Fixes: 1afca2b66aac ("Input: add Pegasus Notetaker tablet driver") Signed-off-by: Seungjin Bae Link: https://lore.kernel.org/r/20251007214131.3737115-2-eeodqql09@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/tablet/pegasus_notetaker.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/input/tablet/pegasus_notetaker.c b/drivers/input/tablet/pegasus_notetaker.c index 8d6b71d59793..eabb4a0b8a0d 100644 --- a/drivers/input/tablet/pegasus_notetaker.c +++ b/drivers/input/tablet/pegasus_notetaker.c @@ -63,6 +63,9 @@ #define BUTTON_PRESSED 0xb5 #define COMMAND_VERSION 0xa9 +/* 1 Status + 1 Color + 2 X + 2 Y = 6 bytes */ +#define NOTETAKER_PACKET_SIZE 6 + /* in xy data packet */ #define BATTERY_NO_REPORT 0x40 #define BATTERY_LOW 0x41 @@ -311,6 +314,12 @@ static int pegasus_probe(struct usb_interface *intf, } pegasus->data_len = usb_maxpacket(dev, pipe); + if (pegasus->data_len < NOTETAKER_PACKET_SIZE) { + dev_err(&intf->dev, "packet size is too small (%d)\n", + pegasus->data_len); + error = -EINVAL; + goto err_free_mem; + } pegasus->data = usb_alloc_coherent(dev, pegasus->data_len, GFP_KERNEL, &pegasus->data_dma); From 62bf7708fe80ec0db14b9179c25eeeda9f81e9d0 Mon Sep 17 00:00:00 2001 From: Dario Binacchi Date: Sat, 13 Sep 2025 11:16:31 +0200 Subject: [PATCH 015/543] ARM: dts: imx6ull-engicam-microgea-rmm: fix report-rate-hz value The 'report-rate-hz' property for the edt-ft5x06 driver was added and handled in the Linux kernel by me with patches [1] and [2] for this specific board. The v1 upstream version, which was the one applied to the customer's kernel, used the 'report-rate' property, which was written directly to the controller register. 
During review, the 'hz' suffix was added, changing its handling so that writing the value directly to the register was no longer possible for the M06 controller. Once the patches were accepted in mainline, I did not reapply them to the customer's kernel, and when upstreaming the DTS for this board, I forgot to correct the 'report-rate-hz' property value. The property must be set to 60 because this board uses the M06 controller, which expects the report rate in units of 10 Hz, meaning the actual value written to the register is 6. [1] 625f829586ea ("dt-bindings: input: touchscreen: edt-ft5x06: add report-rate-hz") [2] 5bcee83a406c ("Input: edt-ft5x06 - set report rate by dts property") Fixes: ffea3cac94ba ("ARM: dts: imx6ul: support Engicam MicroGEA RMM board") Co-developed-by: Michael Trimarchi Signed-off-by: Michael Trimarchi Signed-off-by: Dario Binacchi Signed-off-by: Shawn Guo --- arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts b/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts index 107b00b9a939..540642e99a41 100644 --- a/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts +++ b/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts @@ -136,7 +136,7 @@ touchscreen: touchscreen@38 { interrupt-parent = <&gpio2>; interrupts = <8 IRQ_TYPE_EDGE_FALLING>; reset-gpios = <&gpio2 14 GPIO_ACTIVE_LOW>; - report-rate-hz = <6>; + report-rate-hz = <60>; /* settings valid only for Hycon touchscreen */ touchscreen-size-x = <1280>; touchscreen-size-y = <800>; From f31e261712a0d107f09fb1d3dc8f094806149c83 Mon Sep 17 00:00:00 2001 From: Jihed Chaibi Date: Tue, 16 Sep 2025 00:06:55 +0200 Subject: [PATCH 016/543] ARM: dts: imx51-zii-rdu1: Fix audmux node names Rename the 'ssi2' and 'aud3' nodes to 'mux-ssi2' and 'mux-aud3' in the audmux configuration of imx51-zii-rdu1.dts to comply with the naming convention in 
imx-audmux.yaml. This fixes the following dt-schema warning: imx51-zii-rdu1.dtb: audmux@83fd0000 (fsl,imx51-audmux): 'aud3', 'ssi2' do not match any of the regexes: '^mux-[0-9a-z]*$', '^pinctrl-[0-9]+$' Fixes: ceef0396f367f ("ARM: dts: imx: add ZII RDU1 board") Signed-off-by: Jihed Chaibi Signed-off-by: Shawn Guo --- arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts b/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts index 06545a6052f7..43ff5eafb2bb 100644 --- a/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts +++ b/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts @@ -259,7 +259,7 @@ &audmux { pinctrl-0 = <&pinctrl_audmux>; status = "okay"; - ssi2 { + mux-ssi2 { fsl,audmux-port = <1>; fsl,port-config = < (IMX_AUDMUX_V2_PTCR_SYN | @@ -271,7 +271,7 @@ IMX_AUDMUX_V2_PDCR_RXDSEL(2) >; }; - aud3 { + mux-aud3 { fsl,audmux-port = <2>; fsl,port-config = < IMX_AUDMUX_V2_PTCR_SYN From 9d7dfb95da2cb5c1287df2f3468bcb70d8b31087 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 16 Oct 2025 11:21:47 -0700 Subject: [PATCH 017/543] KVM: VMX: Inject #UD if guest tries to execute SEAMCALL or TDCALL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add VMX exit handlers for SEAMCALL and TDCALL to inject a #UD if a non-TD guest attempts to execute SEAMCALL or TDCALL. Neither SEAMCALL nor TDCALL is gated by any software enablement other than VMXON, and so will generate a VM-Exit instead of e.g. a native #UD when executed from the guest kernel. Note! No unprivileged DoS of the L1 kernel is possible as TDCALL and SEAMCALL #GP at CPL > 0, and the CPL check is performed prior to the VMX non-root (VM-Exit) check, i.e. userspace can't crash the VM. And for a nested guest, KVM forwards unknown exits to L1, i.e. an L2 kernel can crash itself, but not L1. Note #2! 
The Intel® Trust Domain CPU Architectural Extensions spec's pseudocode shows the CPL > 0 check for SEAMCALL coming _after_ the VM-Exit, but that appears to be a documentation bug (likely because the CPL > 0 check was incorrectly bundled with other lower-priority #GP checks). Testing on SPR and EMR shows that the CPL > 0 check is performed before the VMX non-root check, i.e. SEAMCALL #GPs when executed in usermode. Note #3! The aforementioned Trust Domain spec uses confusing pseudocode that says that SEAMCALL will #UD if executed "inSEAM", but "inSEAM" specifically means in SEAM Root Mode, i.e. in the TDX-Module. The long- form description explicitly states that SEAMCALL generates an exit when executed in "SEAM VMX non-root operation". But that's a moot point as the TDX-Module injects #UD if the guest attempts to execute SEAMCALL, as documented in the "Unconditionally Blocked Instructions" section of the TDX-Module base specification. Cc: stable@vger.kernel.org Cc: Kai Huang Cc: Xiaoyao Li Cc: Rick Edgecombe Cc: Dan Williams Cc: Binbin Wu Reviewed-by: Kai Huang Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20251016182148.69085-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/uapi/asm/vmx.h | 1 + arch/x86/kvm/vmx/nested.c | 8 ++++++++ arch/x86/kvm/vmx/vmx.c | 8 ++++++++ 3 files changed, 17 insertions(+) diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 9792e329343e..1baa86dfe029 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -93,6 +93,7 @@ #define EXIT_REASON_TPAUSE 68 #define EXIT_REASON_BUS_LOCK 74 #define EXIT_REASON_NOTIFY 75 +#define EXIT_REASON_SEAMCALL 76 #define EXIT_REASON_TDCALL 77 #define EXIT_REASON_MSR_READ_IMM 84 #define EXIT_REASON_MSR_WRITE_IMM 85 diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 76271962cb70..bcea087b642f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ 
-6728,6 +6728,14 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, case EXIT_REASON_NOTIFY: /* Notify VM exit is not exposed to L1 */ return false; + case EXIT_REASON_SEAMCALL: + case EXIT_REASON_TDCALL: + /* + * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't + * virtualized by KVM for L1 hypervisors, i.e. L1 should + * never want or expect such an exit. + */ + return false; default: return true; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f87c216d976d..91b6f2f3edc2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6032,6 +6032,12 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu) return 1; } +static int handle_tdx_instruction(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + #ifndef CONFIG_X86_SGX_KVM static int handle_encls(struct kvm_vcpu *vcpu) { @@ -6157,6 +6163,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_ENCLS] = handle_encls, [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, [EXIT_REASON_NOTIFY] = handle_notify, + [EXIT_REASON_SEAMCALL] = handle_tdx_instruction, + [EXIT_REASON_TDCALL] = handle_tdx_instruction, [EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm, [EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm, }; From 26f0f122f92f2e8c384c08a05956417bfb5f6fbe Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Mon, 20 Oct 2025 11:11:39 +0200 Subject: [PATCH 018/543] arm64: dts: rockchip: Fix indentation on rk3399 haikou demo dtso The regulator-cam-dovdd-1v8 uses spaces for indentation, where it should use tabs. Fix this. 
Fixes: 066a69db9db3 ("arm64: dts: rockchip: add overlay for RK3399 Puma Haikou Video Demo adapter") Signed-off-by: Heiko Stuebner Reviewed-by: Quentin Schulz Link: https://patch.msgid.link/20251020091139.3652738-1-heiko@sntech.de Signed-off-by: Heiko Stuebner --- .../dts/rockchip/rk3399-puma-haikou-video-demo.dtso | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou-video-demo.dtso b/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou-video-demo.dtso index 5e8f729c2cf2..141a921a06e4 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou-video-demo.dtso +++ b/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou-video-demo.dtso @@ -45,11 +45,11 @@ cam_avdd_2v8: regulator-cam-avdd-2v8 { cam_dovdd_1v8: regulator-cam-dovdd-1v8 { compatible = "regulator-fixed"; - gpio = <&pca9670 3 GPIO_ACTIVE_LOW>; - regulator-max-microvolt = <1800000>; - regulator-min-microvolt = <1800000>; - regulator-name = "cam-dovdd-1v8"; - vin-supply = <&vcc1v8_video>; + gpio = <&pca9670 3 GPIO_ACTIVE_LOW>; + regulator-max-microvolt = <1800000>; + regulator-min-microvolt = <1800000>; + regulator-name = "cam-dovdd-1v8"; + vin-supply = <&vcc1v8_video>; }; cam_dvdd_1v2: regulator-cam-dvdd-1v2 { From 8d2a2a49c30f67a480fa9ed25e08436a446f057e Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 16 Oct 2025 12:39:12 +0200 Subject: [PATCH 019/543] xfrm: drop SA reference in xfrm_state_update if dir doesn't match We're not updating x1, but we still need to put() it. 
Fixes: a4a87fa4e96c ("xfrm: Add Direction to the SA in or out") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index d213ca3653a8..e4736d1ebb44 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2191,14 +2191,18 @@ int xfrm_state_update(struct xfrm_state *x) } if (x1->km.state == XFRM_STATE_ACQ) { - if (x->dir && x1->dir != x->dir) + if (x->dir && x1->dir != x->dir) { + to_put = x1; goto out; + } __xfrm_state_insert(x); x = NULL; } else { - if (x1->dir != x->dir) + if (x1->dir != x->dir) { + to_put = x1; goto out; + } } err = 0; From 10deb69864840ccf96b00ac2ab3a2055c0c04721 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 16 Oct 2025 12:39:13 +0200 Subject: [PATCH 020/543] xfrm: also call xfrm_state_delete_tunnel at destroy time for states that were never added In commit b441cf3f8c4b ("xfrm: delete x->tunnel as we delete x"), I missed the case where state creation fails between full initialization (->init_state has been called) and being inserted on the lists. In this situation, ->init_state has been called, so for IPcomp tunnels, the fallback tunnel has been created and added onto the lists, but the user state never gets added, because we fail before that. The user state doesn't go through __xfrm_state_delete, so we don't call xfrm_state_delete_tunnel for those states, and we end up leaking the FB tunnel. There are several codepaths affected by this: the add/update paths, in both net/key and xfrm, and the migrate code (xfrm_migrate, xfrm_state_migrate). A "proper" rollback of the init_state work would probably be doable in the add/update code, but for migrate it gets more complicated as multiple states may be involved. At some point, the new (not-inserted) state will be destroyed, so call xfrm_state_delete_tunnel during xfrm_state_gc_destroy. 
Most states will have their fallback tunnel cleaned up during __xfrm_state_delete, which solves the issue that b441cf3f8c4b (and other patches before it) aimed at. All states (including FB tunnels) will be removed from the lists once xfrm_state_fini has called flush_work(&xfrm_state_gc_work). Reported-by: syzbot+999eb23467f83f9bf9bf@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=999eb23467f83f9bf9bf Fixes: b441cf3f8c4b ("xfrm: delete x->tunnel as we delete x") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index e4736d1ebb44..721ef0f409b5 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -592,6 +592,7 @@ void xfrm_state_free(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_free); +static void xfrm_state_delete_tunnel(struct xfrm_state *x); static void xfrm_state_gc_destroy(struct xfrm_state *x) { if (x->mode_cbs && x->mode_cbs->destroy_state) @@ -607,6 +608,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) kfree(x->replay_esn); kfree(x->preplay_esn); xfrm_unset_type_offload(x); + xfrm_state_delete_tunnel(x); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); @@ -806,7 +808,6 @@ void __xfrm_state_destroy(struct xfrm_state *x) } EXPORT_SYMBOL(__xfrm_state_destroy); -static void xfrm_state_delete_tunnel(struct xfrm_state *x); int __xfrm_state_delete(struct xfrm_state *x) { struct net *net = xs_net(x); From 5502bc4746e86bfe91ecbe0ed1ad53cb17673920 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 16 Oct 2025 12:39:14 +0200 Subject: [PATCH 021/543] xfrm: make state as DEAD before final put when migrate fails xfrm_state_migrate/xfrm_state_clone_and_setup create a new state, and call xfrm_state_put to destroy it in case of failure. __xfrm_state_destroy expects the state to be in XFRM_STATE_DEAD, but we currently don't do that. 
Reported-by: syzbot+5cd6299ede4d4f70987b@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=5cd6299ede4d4f70987b Fixes: 78347c8c6b2d ("xfrm: Fix xfrm_state_migrate leak") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 721ef0f409b5..1ab19ca007de 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2074,6 +2074,7 @@ static struct xfrm_state *xfrm_state_clone_and_setup(struct xfrm_state *orig, return x; error: + x->km.state = XFRM_STATE_DEAD; xfrm_state_put(x); out: return NULL; @@ -2163,6 +2164,7 @@ struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, return xc; error: + xc->km.state = XFRM_STATE_DEAD; xfrm_state_put(xc); return NULL; } From 7f02285764790e0ff1a731b4187fa3e389ed02c7 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 16 Oct 2025 12:39:15 +0200 Subject: [PATCH 022/543] xfrm: call xfrm_dev_state_delete when xfrm_state_migrate fails to add the state In case xfrm_state_migrate fails after calling xfrm_dev_state_add, we directly release the last reference and destroy the new state, without calling xfrm_dev_state_delete (this only happens in __xfrm_state_delete, which we're not calling on this path, since the state was never added). Call xfrm_dev_state_delete on error when an offload configuration was provided. 
Fixes: ab244a394c7f ("xfrm: Migrate offload configuration") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 1ab19ca007de..c3518d1498cd 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2159,10 +2159,13 @@ struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, xfrm_state_insert(xc); } else { if (xfrm_state_add(xc) < 0) - goto error; + goto error_add; } return xc; +error_add: + if (xuo) + xfrm_dev_state_delete(xc); error: xc->km.state = XFRM_STATE_DEAD; xfrm_state_put(xc); From 1dcf617bec5cb85f68ca19969e7537ef6f6931d3 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 16 Oct 2025 12:39:16 +0200 Subject: [PATCH 023/543] xfrm: set err and extack on failure to create pcpu SA xfrm_state_construct can fail without setting an error if the requested pcpu_num value is too big. Set err and add an extack message to avoid confusing userspace. 
Fixes: 1ddf9916ac09 ("xfrm: Add support for per cpu xfrm state handling.") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 010c9e6638c0..9d98cc9daa37 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -947,8 +947,11 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, if (attrs[XFRMA_SA_PCPU]) { x->pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]); - if (x->pcpu_num >= num_possible_cpus()) + if (x->pcpu_num >= num_possible_cpus()) { + err = -ERANGE; + NL_SET_ERR_MSG(extack, "pCPU number too big"); goto error; + } } err = __xfrm_init_state(x, extack); From f2bc8231fd43a02f9d97252b3435869727054d60 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 16 Oct 2025 12:39:17 +0200 Subject: [PATCH 024/543] xfrm: check all hash buckets for leftover states during netns deletion The current hlist_empty checks only test the first bucket of each hashtable, ignoring any other bucket. They should be caught by the WARN_ON for state_all, but better to make all the checks accurate. 
Fixes: 73d189dce486 ("netns xfrm: per-netns xfrm_state_bydst hash") Fixes: d320bbb306f2 ("netns xfrm: per-netns xfrm_state_bysrc hash") Fixes: b754a4fd8f58 ("netns xfrm: per-netns xfrm_state_byspi hash") Fixes: fe9f1d8779cb ("xfrm: add state hashtable keyed by seq") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index c3518d1498cd..9e14e453b55c 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -3308,6 +3308,7 @@ int __net_init xfrm_state_init(struct net *net) void xfrm_state_fini(struct net *net) { unsigned int sz; + int i; flush_work(&net->xfrm.state_hash_work); xfrm_state_flush(net, 0, false); @@ -3315,14 +3316,17 @@ void xfrm_state_fini(struct net *net) WARN_ON(!list_empty(&net->xfrm.state_all)); + for (i = 0; i <= net->xfrm.state_hmask; i++) { + WARN_ON(!hlist_empty(net->xfrm.state_byseq + i)); + WARN_ON(!hlist_empty(net->xfrm.state_byspi + i)); + WARN_ON(!hlist_empty(net->xfrm.state_bysrc + i)); + WARN_ON(!hlist_empty(net->xfrm.state_bydst + i)); + } + sz = (net->xfrm.state_hmask + 1) * sizeof(struct hlist_head); - WARN_ON(!hlist_empty(net->xfrm.state_byseq)); xfrm_hash_free(net->xfrm.state_byseq, sz); - WARN_ON(!hlist_empty(net->xfrm.state_byspi)); xfrm_hash_free(net->xfrm.state_byspi, sz); - WARN_ON(!hlist_empty(net->xfrm.state_bysrc)); xfrm_hash_free(net->xfrm.state_bysrc, sz); - WARN_ON(!hlist_empty(net->xfrm.state_bydst)); xfrm_hash_free(net->xfrm.state_bydst, sz); free_percpu(net->xfrm.state_cache_input); } From a7b17ece4032dd86bb411297f2169dda395cdc3c Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 10 Oct 2025 15:38:56 +0200 Subject: [PATCH 025/543] mmc: wmt-sdmmc: fix compile test default Enabling compile testing should not enable every individual driver (we have "allyesconfig" for that). 
Fixes: 7cd8db0fb0b2 ("mmc: add COMPILE_TEST to multiple drivers") Cc: Mikko Rapeli Signed-off-by: Johan Hovold Signed-off-by: Ulf Hansson --- drivers/mmc/host/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 2c963cb6724b..10d0ef58ef49 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -950,7 +950,7 @@ config MMC_USHC config MMC_WMT tristate "Wondermedia SD/MMC Host Controller support" depends on ARCH_VT8500 || COMPILE_TEST - default y + default ARCH_VT8500 help This selects support for the SD/MMC Host Controller on Wondermedia WM8505/WM8650 based SoCs. From 0778ac7df5137d5041783fadfc201f8fd55a1d9b Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Mon, 13 Oct 2025 19:41:51 +0800 Subject: [PATCH 026/543] fs: Fix uninitialized 'offp' in statmount_string() In statmount_string(), most flags assign an output offset pointer (offp) which is later updated with the string offset. However, the STATMOUNT_MNT_UIDMAP and STATMOUNT_MNT_GIDMAP cases directly set the struct fields instead of using offp. This leaves offp uninitialized, leading to a possible uninitialized dereference when *offp is updated. Fix it by assigning offp for UIDMAP and GIDMAP as well, keeping the code path consistent. 
Fixes: 37c4a9590e1e ("statmount: allow to retrieve idmappings") Fixes: e52e97f09fb6 ("statmount: let unset strings be empty") Cc: stable@vger.kernel.org Signed-off-by: Zhen Ni Link: https://patch.msgid.link/20251013114151.664341-1-zhen.ni@easystack.cn Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index d82910f33dc4..5b5ab2ae238b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5454,11 +5454,11 @@ static int statmount_string(struct kstatmount *s, u64 flag) ret = statmount_sb_source(s, seq); break; case STATMOUNT_MNT_UIDMAP: - sm->mnt_uidmap = start; + offp = &sm->mnt_uidmap; ret = statmount_mnt_uidmap(s, seq); break; case STATMOUNT_MNT_GIDMAP: - sm->mnt_gidmap = start; + offp = &sm->mnt_gidmap; ret = statmount_mnt_gidmap(s, seq); break; default: From 2c2b67af5f5f77fc68261a137ad65dcfb8e52506 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Sat, 11 Oct 2025 09:22:35 +0000 Subject: [PATCH 027/543] hostfs: Fix only passing host root in boot stage with new mount In the old mount procedure, hostfs could only pass root directory during boot. This is because it constructed the root directory using the @root_ino event without any mount options. However, when using it with the new mount API, this step is no longer triggered. As a result, if users mount without specifying any mount options, the @host_root_path remains uninitialized. To prevent this issue, the @host_root_path should be initialized at the time of allocation.
Reported-by: Geoffrey Thorpe Closes: https://lore.kernel.org/all/643333a0-f434-42fb-82ac-d25a0b56f3b7@geoffthorpe.net/ Fixes: cd140ce9f611 ("hostfs: convert hostfs to use the new mount API") Signed-off-by: Hongbo Li Link: https://patch.msgid.link/20251011092235.29880-1-lihongbo22@huawei.com Signed-off-by: Christian Brauner --- fs/hostfs/hostfs_kern.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 1e1acf5775ab..86455eebbf6c 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -979,7 +979,7 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hostfs_fs_info *fsi = fc->s_fs_info; struct fs_parse_result result; - char *host_root; + char *host_root, *tmp_root; int opt; opt = fs_parse(fc, hostfs_param_specs, param, &result); @@ -990,11 +990,13 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_hostfs: host_root = param->string; if (!*host_root) - host_root = ""; - fsi->host_root_path = - kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); - if (fsi->host_root_path == NULL) + break; + tmp_root = kasprintf(GFP_KERNEL, "%s%s", + fsi->host_root_path, host_root); + if (!tmp_root) return -ENOMEM; + kfree(fsi->host_root_path); + fsi->host_root_path = tmp_root; break; } @@ -1004,17 +1006,17 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param) static int hostfs_parse_monolithic(struct fs_context *fc, void *data) { struct hostfs_fs_info *fsi = fc->s_fs_info; - char *host_root = (char *)data; + char *tmp_root, *host_root = (char *)data; /* NULL is printed as '(null)' by printf(): avoid that. 
*/ if (host_root == NULL) - host_root = ""; + return 0; - fsi->host_root_path = - kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); - if (fsi->host_root_path == NULL) + tmp_root = kasprintf(GFP_KERNEL, "%s%s", fsi->host_root_path, host_root); + if (!tmp_root) return -ENOMEM; - + kfree(fsi->host_root_path); + fsi->host_root_path = tmp_root; return 0; } @@ -1049,6 +1051,11 @@ static int hostfs_init_fs_context(struct fs_context *fc) if (!fsi) return -ENOMEM; + fsi->host_root_path = kasprintf(GFP_KERNEL, "%s/", root_ino); + if (!fsi->host_root_path) { + kfree(fsi); + return -ENOMEM; + } fc->s_fs_info = fsi; fc->ops = &hostfs_context_ops; return 0; From 90c82941adf1986364e0f82c35cf59f2bf5f6a1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Thu, 16 Oct 2025 16:58:37 +0100 Subject: [PATCH 028/543] pmdomain: samsung: plug potential memleak during probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit of_genpd_add_provider_simple() could fail, in which case this code leaks the domain name, pd->pd.name. Use devm_kstrdup_const() to plug this leak. As a side-effect, we can simplify existing error handling. 
Fixes: c09a3e6c97f0 ("soc: samsung: pm_domains: Convert to regular platform driver") Cc: stable@vger.kernel.org Reviewed-by: Peter Griffin Reviewed-by: Krzysztof Kozlowski Signed-off-by: André Draszik Tested-by: Marek Szyprowski Signed-off-by: Ulf Hansson --- drivers/pmdomain/samsung/exynos-pm-domains.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/pmdomain/samsung/exynos-pm-domains.c b/drivers/pmdomain/samsung/exynos-pm-domains.c index 5d478bb37ad6..f53e1bd24798 100644 --- a/drivers/pmdomain/samsung/exynos-pm-domains.c +++ b/drivers/pmdomain/samsung/exynos-pm-domains.c @@ -92,13 +92,14 @@ static const struct of_device_id exynos_pm_domain_of_match[] = { { }, }; -static const char *exynos_get_domain_name(struct device_node *node) +static const char *exynos_get_domain_name(struct device *dev, + struct device_node *node) { const char *name; if (of_property_read_string(node, "label", &name) < 0) name = kbasename(node->full_name); - return kstrdup_const(name, GFP_KERNEL); + return devm_kstrdup_const(dev, name, GFP_KERNEL); } static int exynos_pd_probe(struct platform_device *pdev) @@ -115,15 +116,13 @@ static int exynos_pd_probe(struct platform_device *pdev) if (!pd) return -ENOMEM; - pd->pd.name = exynos_get_domain_name(np); + pd->pd.name = exynos_get_domain_name(dev, np); if (!pd->pd.name) return -ENOMEM; pd->base = of_iomap(np, 0); - if (!pd->base) { - kfree_const(pd->pd.name); + if (!pd->base) return -ENODEV; - } pd->pd.power_off = exynos_pd_power_off; pd->pd.power_on = exynos_pd_power_on; From 7c3643f204edf1c5edb12b36b34838683ee5f8dc Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Sat, 13 Sep 2025 10:32:24 +0800 Subject: [PATCH 029/543] acpi,srat: Fix incorrect device handle check for Generic Initiator The Generic Initiator Affinity Structure in SRAT table uses device handle type field to indicate the device type. According to ACPI specification, the device handle type value of 1 represents PCI device, not 0. 
Fixes: 894c26a1c274 ("ACPI: Support Generic Initiator only domains") Reported-by: Wu Zongyong Signed-off-by: Shuai Xue Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250913023224.39281-1-xueshuai@linux.alibaba.com Signed-off-by: Dave Jiang --- drivers/acpi/numa/srat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 53816dfab645..aa87ee1583a4 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -237,7 +237,7 @@ acpi_table_print_srat_entry(struct acpi_subtable_header *header) struct acpi_srat_generic_affinity *p = (struct acpi_srat_generic_affinity *)header; - if (p->device_handle_type == 0) { + if (p->device_handle_type == 1) { /* * For pci devices this may be the only place they * are assigned a proximity domain From 1dba74abf3e2fa4484b924d8ba6e54e64ebb8c82 Mon Sep 17 00:00:00 2001 From: Jernej Skrabec Date: Mon, 20 Oct 2025 17:27:04 +0200 Subject: [PATCH 030/543] clk: sunxi-ng: Mark A523 bus-r-cpucfg clock as critical bus-r-cpucfg clock is important for peripheral which takes care of powering CPU cores on and off. Since this operation is done by firmware (TF-A), mark it as critical. That way Linux won't interfere with that clock. 
Fixes: 8cea339cfb81 ("clk: sunxi-ng: add support for the A523/T527 PRCM CCU") Signed-off-by: Jernej Skrabec Reviewed-by: Andre Przywara Tested-by: Andre Przywara Link: https://patch.msgid.link/20251020152704.4804-1-jernej.skrabec@gmail.com Signed-off-by: Chen-Yu Tsai --- drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c b/drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c index 70ce0ca0cb7d..c5b0d4a2e397 100644 --- a/drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c +++ b/drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c @@ -125,7 +125,7 @@ static SUNXI_CCU_GATE_HW(bus_r_dma_clk, "bus-r-dma", static SUNXI_CCU_GATE_HW(bus_r_rtc_clk, "bus-r-rtc", &r_apb0_clk.common.hw, 0x20c, BIT(0), 0); static SUNXI_CCU_GATE_HW(bus_r_cpucfg_clk, "bus-r-cpucfg", - &r_apb0_clk.common.hw, 0x22c, BIT(0), 0); + &r_apb0_clk.common.hw, 0x22c, BIT(0), CLK_IS_CRITICAL); static struct ccu_common *sun55i_a523_r_ccu_clks[] = { &r_ahb_clk.common, From a28352cf2d2f8380e7aca8cb61682396dca7a991 Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Mon, 20 Oct 2025 09:49:41 +0800 Subject: [PATCH 031/543] mmc: sdhci-of-dwcmshc: Change DLL_STRBIN_TAPNUM_DEFAULT to 0x4 strbin signal delay under 0x8 configuration is not stable after massive test. The recommended value is 0x4.
Signed-off-by: Shawn Lin Tested-by: Alexey Charkov Tested-by: Hugh Cole-Baker Fixes: 08f3dff799d4 ("mmc: sdhci-of-dwcmshc: add rockchip platform support") Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-of-dwcmshc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-of-dwcmshc.c b/drivers/mmc/host/sdhci-of-dwcmshc.c index eebd45389956..5b61401a7f3d 100644 --- a/drivers/mmc/host/sdhci-of-dwcmshc.c +++ b/drivers/mmc/host/sdhci-of-dwcmshc.c @@ -94,7 +94,7 @@ #define DLL_TXCLK_TAPNUM_DEFAULT 0x10 #define DLL_TXCLK_TAPNUM_90_DEGREES 0xA #define DLL_TXCLK_TAPNUM_FROM_SW BIT(24) -#define DLL_STRBIN_TAPNUM_DEFAULT 0x8 +#define DLL_STRBIN_TAPNUM_DEFAULT 0x4 #define DLL_STRBIN_TAPNUM_FROM_SW BIT(24) #define DLL_STRBIN_DELAY_NUM_SEL BIT(26) #define DLL_STRBIN_DELAY_NUM_OFFSET 16 From e4185bed738da755b191aa3f2e16e8b48450e1b8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 30 Sep 2025 15:32:34 +0300 Subject: [PATCH 032/543] mtdchar: fix integer overflow in read/write ioctls The "req.start" and "req.len" variables are u64 values that come from the user at the start of the function. We mask away the high 32 bits of "req.len" so that's capped at U32_MAX but the "req.start" variable can go up to U64_MAX which means that the addition can still integer overflow. Use check_add_overflow() to fix this bug. 
Fixes: 095bb6e44eb1 ("mtdchar: add MEMREAD ioctl") Fixes: 6420ac0af95d ("mtdchar: prevent unbounded allocation in MEMWRITE ioctl") Cc: stable@vger.kernel.org Signed-off-by: Dan Carpenter Signed-off-by: Miquel Raynal --- drivers/mtd/mtdchar.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 8dc4f5c493fc..335c702633ff 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -599,6 +599,7 @@ mtdchar_write_ioctl(struct mtd_info *mtd, struct mtd_write_req __user *argp) uint8_t *datbuf = NULL, *oobbuf = NULL; size_t datbuf_len, oobbuf_len; int ret = 0; + u64 end; if (copy_from_user(&req, argp, sizeof(req))) return -EFAULT; @@ -618,7 +619,7 @@ mtdchar_write_ioctl(struct mtd_info *mtd, struct mtd_write_req __user *argp) req.len &= 0xffffffff; req.ooblen &= 0xffffffff; - if (req.start + req.len > mtd->size) + if (check_add_overflow(req.start, req.len, &end) || end > mtd->size) return -EINVAL; datbuf_len = min_t(size_t, req.len, mtd->erasesize); @@ -698,6 +699,7 @@ mtdchar_read_ioctl(struct mtd_info *mtd, struct mtd_read_req __user *argp) size_t datbuf_len, oobbuf_len; size_t orig_len, orig_ooblen; int ret = 0; + u64 end; if (copy_from_user(&req, argp, sizeof(req))) return -EFAULT; @@ -724,7 +726,7 @@ mtdchar_read_ioctl(struct mtd_info *mtd, struct mtd_read_req __user *argp) req.len &= 0xffffffff; req.ooblen &= 0xffffffff; - if (req.start + req.len > mtd->size) { + if (check_add_overflow(req.start, req.len, &end) || end > mtd->size) { ret = -EINVAL; goto out; } From 9225f02ff201837e1443076f37a3c008140d1835 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 3 Oct 2025 12:30:10 +0300 Subject: [PATCH 033/543] mtd: nand: realtek-ecc: Fix a IS_ERR() vs NULL bug in probe The dma_alloc_noncoherent() function doesn't return error pointers, it returns NULL on error. Fix the error checking to match. 
Fixes: 3148d0e5b1c5 ("mtd: nand: realtek-ecc: Add Realtek external ECC engine support") Signed-off-by: Dan Carpenter Reviewed-by: Geert Uytterhoeven Signed-off-by: Miquel Raynal --- drivers/mtd/nand/ecc-realtek.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/nand/ecc-realtek.c b/drivers/mtd/nand/ecc-realtek.c index 7d718934c909..7c275f1eb4a7 100644 --- a/drivers/mtd/nand/ecc-realtek.c +++ b/drivers/mtd/nand/ecc-realtek.c @@ -418,8 +418,8 @@ static int rtl_ecc_probe(struct platform_device *pdev) rtlc->buf = dma_alloc_noncoherent(dev, RTL_ECC_DMA_SIZE, &rtlc->buf_dma, DMA_BIDIRECTIONAL, GFP_KERNEL); - if (IS_ERR(rtlc->buf)) - return PTR_ERR(rtlc->buf); + if (!rtlc->buf) + return -ENOMEM; rtlc->dev = dev; rtlc->engine.dev = dev; From 0d9c80aa572182d4b1464826cd77aa8973213216 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 8 Oct 2025 11:47:15 +0200 Subject: [PATCH 034/543] mtd: nand: MTD_NAND_ECC_REALTEK should depend on HAS_DMA If CONFIG_NO_DMA=y: ERROR: modpost: "dma_free_pages" [drivers/mtd/nand/ecc-realtek.ko] undefined! ERROR: modpost: "dma_alloc_pages" [drivers/mtd/nand/ecc-realtek.ko] undefined! The driver cannot function without DMA, hence fix this by adding a dependency on HAS_DMA. 
Fixes: 3148d0e5b1c5733d ("mtd: nand: realtek-ecc: Add Realtek external ECC engine support") Signed-off-by: Geert Uytterhoeven Signed-off-by: Miquel Raynal --- drivers/mtd/nand/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig index 4a17271076bc..1e57c8de8578 100644 --- a/drivers/mtd/nand/Kconfig +++ b/drivers/mtd/nand/Kconfig @@ -63,7 +63,7 @@ config MTD_NAND_ECC_MEDIATEK config MTD_NAND_ECC_REALTEK tristate "Realtek RTL93xx hardware ECC engine" - depends on HAS_IOMEM + depends on HAS_IOMEM && HAS_DMA depends on MACH_REALTEK_RTL || COMPILE_TEST select MTD_NAND_ECC help From 9631350885929819d4e46c6521df35960b472ef3 Mon Sep 17 00:00:00 2001 From: Li Qiang Date: Mon, 20 Oct 2025 20:53:33 +0800 Subject: [PATCH 035/543] mtd: rawnand: realtek: Make rtl_ecc_engine_ops const The rtl_ecc_engine_ops structure is only used to provide a set of callback functions and is never modified after initialization. Mark it as const so it can be placed in the read-only section, which improves safety and allows better compiler optimization. 
Signed-off-by: Li Qiang Signed-off-by: Miquel Raynal --- drivers/mtd/nand/ecc-realtek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/ecc-realtek.c b/drivers/mtd/nand/ecc-realtek.c index 7c275f1eb4a7..0046da37ea3e 100644 --- a/drivers/mtd/nand/ecc-realtek.c +++ b/drivers/mtd/nand/ecc-realtek.c @@ -380,7 +380,7 @@ static void rtl_ecc_cleanup_ctx(struct nand_device *nand) nand_ecc_cleanup_req_tweaking(&ctx->req_ctx); } -static struct nand_ecc_engine_ops rtl_ecc_engine_ops = { +static const struct nand_ecc_engine_ops rtl_ecc_engine_ops = { .init_ctx = rtl_ecc_init_ctx, .cleanup_ctx = rtl_ecc_cleanup_ctx, .prepare_io_req = rtl_ecc_prepare_io_req, From 7458f72cc28f9eb0de811effcb5376d0ec19094a Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Fri, 17 Oct 2025 12:03:20 +0100 Subject: [PATCH 036/543] pmdomain: arm: scmi: Fix genpd leak on provider registration failure If of_genpd_add_provider_onecell() fails during probe, the previously created generic power domains are not removed, leading to a memory leak and potential kernel crash later in genpd_debug_add(). Add proper error handling to unwind the initialized domains before returning from probe to ensure all resources are correctly released on failure. 
Example crash trace observed without this fix: | Unable to handle kernel paging request at virtual address fffffffffffffc70 | CPU: 1 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.18.0-rc1 #405 PREEMPT | Hardware name: ARM LTD ARM Juno Development Platform/ARM Juno Development Platform | pstate: 00000005 (nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) | pc : genpd_debug_add+0x2c/0x160 | lr : genpd_debug_init+0x74/0x98 | Call trace: | genpd_debug_add+0x2c/0x160 (P) | genpd_debug_init+0x74/0x98 | do_one_initcall+0xd0/0x2d8 | do_initcall_level+0xa0/0x140 | do_initcalls+0x60/0xa8 | do_basic_setup+0x28/0x40 | kernel_init_freeable+0xe8/0x170 | kernel_init+0x2c/0x140 | ret_from_fork+0x10/0x20 Fixes: 898216c97ed2 ("firmware: arm_scmi: add device power domain support using genpd") Signed-off-by: Sudeep Holla Reviewed-by: Peng Fan Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/pmdomain/arm/scmi_pm_domain.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/pmdomain/arm/scmi_pm_domain.c b/drivers/pmdomain/arm/scmi_pm_domain.c index 8fe1c0a501c9..b5e2ffd5ea64 100644 --- a/drivers/pmdomain/arm/scmi_pm_domain.c +++ b/drivers/pmdomain/arm/scmi_pm_domain.c @@ -41,7 +41,7 @@ static int scmi_pd_power_off(struct generic_pm_domain *domain) static int scmi_pm_domain_probe(struct scmi_device *sdev) { - int num_domains, i; + int num_domains, i, ret; struct device *dev = &sdev->dev; struct device_node *np = dev->of_node; struct scmi_pm_domain *scmi_pd; @@ -108,9 +108,18 @@ static int scmi_pm_domain_probe(struct scmi_device *sdev) scmi_pd_data->domains = domains; scmi_pd_data->num_domains = num_domains; + ret = of_genpd_add_provider_onecell(np, scmi_pd_data); + if (ret) + goto err_rm_genpds; + dev_set_drvdata(dev, scmi_pd_data); - return of_genpd_add_provider_onecell(np, scmi_pd_data); + return 0; +err_rm_genpds: + for (i = num_domains - 1; i >= 0; i--) + pm_genpd_remove(domains[i]); + + return ret; } static void 
scmi_pm_domain_remove(struct scmi_device *sdev) From 5888533c6011de319c5f23ae147f1f291ce81582 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Tue, 21 Oct 2025 01:10:51 +0800 Subject: [PATCH 037/543] clk: sunxi-ng: sun55i-a523-r-ccu: Mark bus-r-dma as critical The "bus-r-dma" clock in the A523's PRCM clock controller is also referred to as "DMA_CLKEN_SW" or "DMA ADB400 gating". It is unclear how this ties into the DMA controller MBUS clock gate; however if the clock is not enabled, the DMA controller in the MCU block will fail to access DRAM, even failing to retrieve the DMA descriptors. Mark this clock as critical. This sort of mirrors what is done for the main DMA controller's MBUS clock, which has a separate toggle that is currently left out of the main clock controller driver. Fixes: 8cea339cfb81 ("clk: sunxi-ng: add support for the A523/T527 PRCM CCU") Acked-by: Jernej Skrabec Link: https://patch.msgid.link/20251020171059.2786070-6-wens@kernel.org Signed-off-by: Chen-Yu Tsai --- drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c b/drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c index c5b0d4a2e397..0339c4af0fe5 100644 --- a/drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c +++ b/drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c @@ -121,7 +121,7 @@ static SUNXI_CCU_GATE_HW(bus_r_ir_rx_clk, "bus-r-ir-rx", &r_apb0_clk.common.hw, 0x1cc, BIT(0), 0); static SUNXI_CCU_GATE_HW(bus_r_dma_clk, "bus-r-dma", - &r_apb0_clk.common.hw, 0x1dc, BIT(0), 0); + &r_apb0_clk.common.hw, 0x1dc, BIT(0), CLK_IS_CRITICAL); static SUNXI_CCU_GATE_HW(bus_r_rtc_clk, "bus-r-rtc", &r_apb0_clk.common.hw, 0x20c, BIT(0), 0); static SUNXI_CCU_GATE_HW(bus_r_cpucfg_clk, "bus-r-cpucfg", From 2050280a4bb660b47f8cccf75a69293ae7cbb087 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Tue, 21 Oct 2025 01:10:52 +0800 Subject: [PATCH 038/543] clk: sunxi-ng: sun55i-a523-ccu: Lower audio0 pll minimum rate While the user manual states 
that the PLL's rate should be between 180 MHz and 3 GHz in the register definition section, it also says the actual operating frequency is 22.5792*4 MHz in the PLL features table. 22.5792*4 MHz is one of the actual clock rates that we want and is available in the SDM table. Lower the minimum clock rate to 90 MHz so that both rates in the SDM table can be used. Fixes: 7cae1e2b5544 ("clk: sunxi-ng: Add support for the A523/T527 CCU PLLs") Reviewed-by: Jernej Skrabec Link: https://patch.msgid.link/20251020171059.2786070-7-wens@kernel.org Signed-off-by: Chen-Yu Tsai --- drivers/clk/sunxi-ng/ccu-sun55i-a523.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/sunxi-ng/ccu-sun55i-a523.c b/drivers/clk/sunxi-ng/ccu-sun55i-a523.c index acb532f8361b..20dad06b37ca 100644 --- a/drivers/clk/sunxi-ng/ccu-sun55i-a523.c +++ b/drivers/clk/sunxi-ng/ccu-sun55i-a523.c @@ -300,7 +300,7 @@ static struct ccu_nm pll_audio0_4x_clk = { .m = _SUNXI_CCU_DIV(16, 6), .sdm = _SUNXI_CCU_SDM(pll_audio0_sdm_table, BIT(24), 0x178, BIT(31)), - .min_rate = 180000000U, + .min_rate = 90000000U, .max_rate = 3000000000U, .common = { .reg = 0x078, From 5c56bf214af85ca042bf97f8584aab2151035840 Mon Sep 17 00:00:00 2001 From: Niravkumar L Rabara Date: Thu, 23 Oct 2025 11:32:01 +0800 Subject: [PATCH 039/543] mtd: rawnand: cadence: fix DMA device NULL pointer dereference The DMA device pointer `dma_dev` was being dereferenced before ensuring that `cdns_ctrl->dmac` is properly initialized. Move the assignment of `dma_dev` after successfully acquiring the DMA channel to ensure the pointer is valid before use.
Fixes: d76d22b5096c ("mtd: rawnand: cadence: use dma_map_resource for sdma address") Cc: stable@vger.kernel.org Signed-off-by: Niravkumar L Rabara Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/cadence-nand-controller.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/nand/raw/cadence-nand-controller.c b/drivers/mtd/nand/raw/cadence-nand-controller.c index 6667eea95597..32ed38b89394 100644 --- a/drivers/mtd/nand/raw/cadence-nand-controller.c +++ b/drivers/mtd/nand/raw/cadence-nand-controller.c @@ -2871,7 +2871,7 @@ cadence_nand_irq_cleanup(int irqnum, struct cdns_nand_ctrl *cdns_ctrl) static int cadence_nand_init(struct cdns_nand_ctrl *cdns_ctrl) { dma_cap_mask_t mask; - struct dma_device *dma_dev = cdns_ctrl->dmac->device; + struct dma_device *dma_dev; int ret; cdns_ctrl->cdma_desc = dma_alloc_coherent(cdns_ctrl->dev, @@ -2915,6 +2915,7 @@ static int cadence_nand_init(struct cdns_nand_ctrl *cdns_ctrl) } } + dma_dev = cdns_ctrl->dmac->device; cdns_ctrl->io.iova_dma = dma_map_resource(dma_dev->dev, cdns_ctrl->io.dma, cdns_ctrl->io.size, DMA_BIDIRECTIONAL, 0); From 369f772299821f93f872bf1b4d7d7ed2fc50243b Mon Sep 17 00:00:00 2001 From: Yu-Chun Lin Date: Thu, 23 Oct 2025 15:55:29 +0800 Subject: [PATCH 040/543] pinctrl: realtek: Select REGMAP_MMIO for RTD driver The pinctrl-rtd driver uses 'devm_regmap_init_mmio', which requires 'REGMAP_MMIO' to be enabled. Without this selection, the build fails with an undefined reference: aarch64-none-linux-gnu-ld: drivers/pinctrl/realtek/pinctrl-rtd.o: in function rtd_pinctrl_probe': pinctrl-rtd.c:(.text+0x5a0): undefined reference to __devm_regmap_init_mmio_clk' Fix this by selecting 'REGMAP_MMIO' in the Kconfig. 
Fixes: e99ce78030db ("pinctrl: realtek: Add common pinctrl driver for Realtek DHC RTD SoCs") Signed-off-by: Yu-Chun Lin Signed-off-by: Linus Walleij --- drivers/pinctrl/realtek/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pinctrl/realtek/Kconfig b/drivers/pinctrl/realtek/Kconfig index 0fc6bd4fcb7e..400c9e5b16ad 100644 --- a/drivers/pinctrl/realtek/Kconfig +++ b/drivers/pinctrl/realtek/Kconfig @@ -6,6 +6,7 @@ config PINCTRL_RTD default y select PINMUX select GENERIC_PINCONF + select REGMAP_MMIO config PINCTRL_RTD1619B tristate "Realtek DHC 1619B pin controller driver" From 316e361b5d2cdeb8d778983794a1c6eadcb26814 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 22 Oct 2025 15:34:26 +0200 Subject: [PATCH 041/543] dt-bindings: pinctrl: toshiba,visconti: Fix number of items in groups The "groups" property can hold multiple entries (e.g. toshiba/tmpv7708-rm-mbrc.dts file), so allow that by dropping incorrect type (pinmux-node.yaml schema already defines that as string-array) and adding constraints for items. 
This fixes dtbs_check warnings like: toshiba/tmpv7708-rm-mbrc.dtb: pinctrl@24190000 (toshiba,tmpv7708-pinctrl): pwm-pins:groups: ['pwm0_gpio16_grp', 'pwm1_gpio17_grp', 'pwm2_gpio18_grp', 'pwm3_gpio19_grp'] is too long Fixes: 1825c1fe0057 ("pinctrl: Add DT bindings for Toshiba Visconti TMPV7700 SoC") Cc: stable@vger.kernel.org Signed-off-by: Krzysztof Kozlowski Acked-by: Conor Dooley Signed-off-by: Linus Walleij --- .../pinctrl/toshiba,visconti-pinctrl.yaml | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml index 19d47fd414bc..ce04d2eadec9 100644 --- a/Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml @@ -50,18 +50,20 @@ patternProperties: groups: description: Name of the pin group to use for the functions. - $ref: /schemas/types.yaml#/definitions/string - enum: [i2c0_grp, i2c1_grp, i2c2_grp, i2c3_grp, i2c4_grp, - i2c5_grp, i2c6_grp, i2c7_grp, i2c8_grp, - spi0_grp, spi0_cs0_grp, spi0_cs1_grp, spi0_cs2_grp, - spi1_grp, spi2_grp, spi3_grp, spi4_grp, spi5_grp, spi6_grp, - uart0_grp, uart1_grp, uart2_grp, uart3_grp, - pwm0_gpio4_grp, pwm0_gpio8_grp, pwm0_gpio12_grp, - pwm0_gpio16_grp, pwm1_gpio5_grp, pwm1_gpio9_grp, - pwm1_gpio13_grp, pwm1_gpio17_grp, pwm2_gpio6_grp, - pwm2_gpio10_grp, pwm2_gpio14_grp, pwm2_gpio18_grp, - pwm3_gpio7_grp, pwm3_gpio11_grp, pwm3_gpio15_grp, - pwm3_gpio19_grp, pcmif_out_grp, pcmif_in_grp] + items: + enum: [i2c0_grp, i2c1_grp, i2c2_grp, i2c3_grp, i2c4_grp, + i2c5_grp, i2c6_grp, i2c7_grp, i2c8_grp, + spi0_grp, spi0_cs0_grp, spi0_cs1_grp, spi0_cs2_grp, + spi1_grp, spi2_grp, spi3_grp, spi4_grp, spi5_grp, spi6_grp, + uart0_grp, uart1_grp, uart2_grp, uart3_grp, + pwm0_gpio4_grp, pwm0_gpio8_grp, pwm0_gpio12_grp, + pwm0_gpio16_grp, pwm1_gpio5_grp, pwm1_gpio9_grp, + 
pwm1_gpio13_grp, pwm1_gpio17_grp, pwm2_gpio6_grp, + pwm2_gpio10_grp, pwm2_gpio14_grp, pwm2_gpio18_grp, + pwm3_gpio7_grp, pwm3_gpio11_grp, pwm3_gpio15_grp, + pwm3_gpio19_grp, pcmif_out_grp, pcmif_in_grp] + minItems: 1 + maxItems: 8 drive-strength: enum: [2, 4, 6, 8, 16, 24, 32] From 6f37469a933030692741710db809722076f71973 Mon Sep 17 00:00:00 2001 From: Aaron Kling Date: Tue, 21 Oct 2025 14:47:06 -0500 Subject: [PATCH 042/543] memory: tegra210: Fix incorrect client ids The original commit had typos for two of the memory client ids. Fix them to reference the correct bindings. Fixes: 3804cef4c597 ("memory: tegra210: Use bindings for client ids") Signed-off-by: Aaron Kling Link: https://patch.msgid.link/20251021-t210-mem-clientid-fixup-v1-1-5094946faa31@gmail.com Signed-off-by: Krzysztof Kozlowski --- drivers/memory/tegra/tegra210.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/memory/tegra/tegra210.c b/drivers/memory/tegra/tegra210.c index cfa61dd88557..3c2949c16fde 100644 --- a/drivers/memory/tegra/tegra210.c +++ b/drivers/memory/tegra/tegra210.c @@ -1015,7 +1015,7 @@ static const struct tegra_mc_client tegra210_mc_clients[] = { }, }, }, { - .id = TEGRA210_MC_SESRD, + .id = TEGRA210_MC_SESWR, .name = "seswr", .swgroup = TEGRA_SWGROUP_SE, .regs = { @@ -1079,7 +1079,7 @@ static const struct tegra_mc_client tegra210_mc_clients[] = { }, }, }, { - .id = TEGRA210_MC_ETRR, + .id = TEGRA210_MC_ETRW, .name = "etrw", .swgroup = TEGRA_SWGROUP_ETR, .regs = { From 8c5fa3764facaad6d38336e90f406c2c11d69733 Mon Sep 17 00:00:00 2001 From: Fangyu Yu Date: Tue, 21 Oct 2025 22:21:31 +0800 Subject: [PATCH 043/543] RISC-V: KVM: Remove automatic I/O mapping for VM_PFNMAP As of commit aac6db75a9fc ("vfio/pci: Use unmap_mapping_range()"), vm_pgoff may no longer be guaranteed to hold the PFN for VM_PFNMAP regions. Using vma->vm_pgoff to derive the HPA here may therefore produce incorrect mappings.
Instead, I/O mappings for such regions can be established on-demand during g-stage page faults, making the upfront ioremap in this path unnecessary. Fixes: aac6db75a9fc ("vfio/pci: Use unmap_mapping_range()") Signed-off-by: Fangyu Yu Tested-by: Daniel Henrique Barboza Reviewed-by: Guo Ren Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20251021142131.78796-1-fangyu.yu@linux.alibaba.com Signed-off-by: Anup Patel --- arch/riscv/kvm/mmu.c | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 525fb5a330c0..58f5f3536ffd 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -171,7 +171,6 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, enum kvm_mr_change change) { hva_t hva, reg_end, size; - gpa_t base_gpa; bool writable; int ret = 0; @@ -190,15 +189,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, hva = new->userspace_addr; size = new->npages << PAGE_SHIFT; reg_end = hva + size; - base_gpa = new->base_gfn << PAGE_SHIFT; writable = !(new->flags & KVM_MEM_READONLY); mmap_read_lock(current->mm); /* * A memory region could potentially cover multiple VMAs, and - * any holes between them, so iterate over all of them to find - * out if we can map any of them right now. + * any holes between them, so iterate over all of them.
* * +--------------------------------------------+ * +---------------+----------------+ +----------------+ @@ -209,7 +206,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, */ do { struct vm_area_struct *vma; - hva_t vm_start, vm_end; + hva_t vm_end; vma = find_vma_intersection(current->mm, hva, reg_end); if (!vma) @@ -225,36 +222,18 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, } /* Take the intersection of this VMA with the memory region */ - vm_start = max(hva, vma->vm_start); vm_end = min(reg_end, vma->vm_end); if (vma->vm_flags & VM_PFNMAP) { - gpa_t gpa = base_gpa + (vm_start - hva); - phys_addr_t pa; - - pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT; - pa += vm_start - vma->vm_start; - /* IO region dirty page logging not allowed */ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { ret = -EINVAL; goto out; } - - ret = kvm_riscv_mmu_ioremap(kvm, gpa, pa, vm_end - vm_start, - writable, false); - if (ret) - break; } hva = vm_end; } while (hva < reg_end); - if (change == KVM_MR_FLAGS_ONLY) - goto out; - - if (ret) - kvm_riscv_mmu_iounmap(kvm, base_gpa, size); - out: mmap_read_unlock(current->mm); return ret; From ff7b5a27438275e4fd8c4809d815638c829fe520 Mon Sep 17 00:00:00 2001 From: Andreas Kemnade Date: Mon, 13 Oct 2025 14:17:09 +0200 Subject: [PATCH 044/543] arm: imx_v6_v7_defconfig: enable ext4 directly In former times, ext4 was enabled implicitly by enabling ext3 but with the ext3 fs gone, it does not get enabled, which lets devices fail to mount root on non-initrd based boots with an ext4 root.
Fixes: d6ace46c82fd ("ext4: remove obsolete EXT3 config options") Signed-off-by: Andreas Kemnade Reviewed-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm/configs/imx_v6_v7_defconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig index 9a57763a8d38..0d55056c6f82 100644 --- a/arch/arm/configs/imx_v6_v7_defconfig +++ b/arch/arm/configs/imx_v6_v7_defconfig @@ -436,9 +436,9 @@ CONFIG_EXT2_FS=y CONFIG_EXT2_FS_XATTR=y CONFIG_EXT2_FS_POSIX_ACL=y CONFIG_EXT2_FS_SECURITY=y -CONFIG_EXT3_FS=y -CONFIG_EXT3_FS_POSIX_ACL=y -CONFIG_EXT3_FS_SECURITY=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y CONFIG_QUOTA=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=y From ec4daace64a44b53df76f0629e82684ef09ce869 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Paulo=20Gon=C3=A7alves?= Date: Tue, 14 Oct 2025 09:56:43 -0300 Subject: [PATCH 045/543] arm64: dts: imx8-ss-img: Avoid gpio0_mipi_csi GPIOs being deferred MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gpio0_mipi_csi DT nodes are enabled by default, but they are dependent on the irqsteer_csi nodes, which are not enabled. This causes the gpio0_mipi_csi GPIOs to be probe deferred. Since these GPIOs can be used independently of the CSI controller, enable irqsteer_csi by default too to prevent them from being deferred and to ensure they work out of the box. 
Fixes: 2217f8243714 ("arm64: dts: imx8: add capture controller for i.MX8's img subsystem") Signed-off-by: João Paulo Gonçalves Reviewed-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8-ss-img.dtsi | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-img.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-img.dtsi index 2cf0f7208350..a72b2f1c4a1b 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-img.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-img.dtsi @@ -67,7 +67,6 @@ irqsteer_csi0: irqsteer@58220000 { power-domains = <&pd IMX_SC_R_CSI_0>; fsl,channel = <0>; fsl,num-irqs = <32>; - status = "disabled"; }; gpio0_mipi_csi0: gpio@58222000 { @@ -144,7 +143,6 @@ irqsteer_csi1: irqsteer@58240000 { power-domains = <&pd IMX_SC_R_CSI_1>; fsl,channel = <0>; fsl,num-irqs = <32>; - status = "disabled"; }; gpio0_mipi_csi1: gpio@58242000 { From 1eb42bacd7cebede5d317569e4b874b54e5c41d6 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Fri, 17 Oct 2025 16:39:46 -0400 Subject: [PATCH 046/543] arm64: dts: imx95: Fix MSI mapping for PCIe endpoint nodes The msi-map property was incorrectly applied to pcie0-ep instead of pcie1-ep. Correct the msi-map for both pcie0-ep and pcie1-ep nodes. 
Fixes: bbe4b2f7d6533 ("arm64: dts: imx95: Add msi-map for pci-ep device") Signed-off-by: Frank Li Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx95.dtsi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/imx95.dtsi b/arch/arm64/boot/dts/freescale/imx95.dtsi index 1292677cbe4e..6da961eb3fe5 100644 --- a/arch/arm64/boot/dts/freescale/imx95.dtsi +++ b/arch/arm64/boot/dts/freescale/imx95.dtsi @@ -1886,7 +1886,7 @@ pcie0_ep: pcie-ep@4c300000 { assigned-clock-rates = <3600000000>, <100000000>, <10000000>; assigned-clock-parents = <0>, <0>, <&scmi_clk IMX95_CLK_SYSPLL1_PFD1_DIV2>; - msi-map = <0x0 &its 0x98 0x1>; + msi-map = <0x0 &its 0x10 0x1>; power-domains = <&scmi_devpd IMX95_PD_HSIO_TOP>; status = "disabled"; }; @@ -1963,6 +1963,7 @@ pcie1_ep: pcie-ep@4c380000 { assigned-clock-rates = <3600000000>, <100000000>, <10000000>; assigned-clock-parents = <0>, <0>, <&scmi_clk IMX95_CLK_SYSPLL1_PFD1_DIV2>; + msi-map = <0x0 &its 0x98 0x1>; power-domains = <&scmi_devpd IMX95_PD_HSIO_TOP>; status = "disabled"; }; From 6504297872c7a5d0d06247970d32940eba26b8b3 Mon Sep 17 00:00:00 2001 From: Frieder Schrempf Date: Mon, 20 Oct 2025 15:21:51 +0200 Subject: [PATCH 047/543] arm64: dts: imx8mp-kontron: Fix USB OTG role switching The VBUS supply regulator is currently assigned to the PHY node. This causes the VBUS to be always on, even when the controller needs to be switched to peripheral mode. Fix the OTG role switching by adding a connector node and moving the VBUS supply regulator to that node. This way the VBUS gets correctly switched according to the current role. 
Fixes: 946ab10e3f40 ("arm64: dts: Add support for Kontron OSM-S i.MX8MP SoM and BL carrier board") Signed-off-by: Frieder Schrempf Signed-off-by: Shawn Guo --- .../dts/freescale/imx8mp-kontron-bl-osm-s.dts | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mp-kontron-bl-osm-s.dts b/arch/arm64/boot/dts/freescale/imx8mp-kontron-bl-osm-s.dts index 614b4ce330b1..0924ac50fd2d 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-kontron-bl-osm-s.dts +++ b/arch/arm64/boot/dts/freescale/imx8mp-kontron-bl-osm-s.dts @@ -16,11 +16,20 @@ aliases { ethernet1 = &eqos; }; - extcon_usbc: usbc { - compatible = "linux,extcon-usb-gpio"; + connector { + compatible = "gpio-usb-b-connector", "usb-b-connector"; + id-gpios = <&gpio1 10 GPIO_ACTIVE_HIGH>; + label = "Type-C"; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_usb1_id>; - id-gpios = <&gpio1 10 GPIO_ACTIVE_HIGH>; + type = "micro"; + vbus-supply = <&reg_usb1_vbus>; + + port { + usb_dr_connector: endpoint { + remote-endpoint = <&usb3_dwc>; + }; + }; }; leds { @@ -244,9 +253,15 @@ &usb_dwc3_0 { hnp-disable; srp-disable; dr_mode = "otg"; - extcon = <&extcon_usbc>; usb-role-switch; + role-switch-default-mode = "peripheral"; status = "okay"; + + port { + usb3_dwc: endpoint { + remote-endpoint = <&usb_dr_connector>; + }; + }; }; &usb_dwc3_1 { @@ -273,7 +288,6 @@ &usb3_1 { }; &usb3_phy0 { - vbus-supply = <&reg_usb1_vbus>; status = "okay"; }; From 23ee8a2563a0f24cf4964685ced23c32be444ab8 Mon Sep 17 00:00:00 2001 From: Qinxin Xia Date: Tue, 28 Oct 2025 20:08:59 +0800 Subject: [PATCH 048/543] dma-mapping: benchmark: Restore padding to ensure uABI remained consistent The padding field in the structure was previously reserved to maintain a stable interface for potential new fields, ensuring compatibility with user-space shared data structures. However, it was accidentally removed by tiantao in a prior commit, which may lead to incompatibility between user space and the kernel.
This patch reinstates the padding to restore the original structure layout and preserve compatibility. Fixes: 8ddde07a3d28 ("dma-mapping: benchmark: extract a common header file for map_benchmark definition") Cc: stable@vger.kernel.org Acked-by: Barry Song Signed-off-by: Qinxin Xia Reported-by: Barry Song Closes: https://lore.kernel.org/lkml/CAGsJ_4waiZ2+NBJG+SCnbNk+nQ_ZF13_Q5FHJqZyxyJTcEop2A@mail.gmail.com/ Reviewed-by: Jonathan Cameron Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20251028120900.2265511-2-xiaqinxin@huawei.com --- include/linux/map_benchmark.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/map_benchmark.h b/include/linux/map_benchmark.h index 62674c83bde4..48e2ff95332f 100644 --- a/include/linux/map_benchmark.h +++ b/include/linux/map_benchmark.h @@ -27,5 +27,6 @@ struct map_benchmark { __u32 dma_dir; /* DMA data direction */ __u32 dma_trans_ns; /* time for DMA transmission in ns */ __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ + __u8 expansion[76]; /* For future use */ }; #endif /* _KERNEL_DMA_BENCHMARK_H */ From 330e2c514823008b22e6afd2055715bc46dd8d55 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 22 Oct 2025 19:48:32 +0100 Subject: [PATCH 049/543] afs: Fix dynamic lookup to fail on cell lookup failure When a process tries to access an entry in /afs, normally what happens is that an automount dentry is created by ->lookup() and then triggered, which jumps through the ->d_automount() op. Currently, afs_dynroot_lookup() does not do cell DNS lookup, leaving that to afs_d_automount() to perform - however, it is possible to use access() or stat() on the automount point, which will always return successfully, having briefly created an afs_cell record if one did not already exist. This means that something like: test -d "/afs/.west" && echo Directory exists will print "Directory exists" even though no such cell is configured.
This breaks the "west" python module available on PIP as it expects this access to fail. Now, it could be possible to make afs_dynroot_lookup() perform the DNS[*] lookup, but that would make "ls --color /afs" do this for each cell in /afs that is listed but not yet probed. kafs-client, probably wrongly, preloads the entire cell database and all the known cells are then listed in /afs - and doing ls /afs would be very, very slow, especially if any cell supplied addresses but was wholly inaccessible. [*] When I say "DNS", actually read getaddrinfo(), which could use any one of a host of mechanisms. Could also use static configuration. To fix this, make the following changes: (1) Create an enum to specify the origination point of a call to afs_lookup_cell() and pass this value into that function in place of the "excl" parameter (which can be derived from it). There are six points of origination: - Cell preload through /proc/net/afs/cells - Root cell config through /proc/net/afs/rootcell - Lookup in dynamic root - Automount trigger - Direct mount with mount() syscall - Alias check where YFS tells us the cell name is different (2) Add an extra state into the afs_cell state machine to indicate a cell that's been initialised, but not yet looked up. This is separate from one that can be considered active and has been looked up at least once. (3) Make afs_lookup_cell() vary its behaviour more, depending on where it was called from: If called from preload or root cell config, DNS lookup will not happen until we definitely want to use the cell (dynroot mount, automount, direct mount or alias check). The cell will appear in /afs but stat() won't trigger DNS lookup. If the cell already exists, dynroot will not wait for the DNS lookup to complete. If the cell did not already exist, dynroot will wait. If called from automount, direct mount or alias check, it will wait for the DNS lookup to complete. 
(4) Make afs_lookup_cell() return an error if lookup failed in one way or another. We try to return -ENOENT if the DNS says the cell does not exist and -EDESTADDRREQ if we couldn't access the DNS. Reported-by: Markus Suvanto Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220685 Signed-off-by: David Howells Link: https://patch.msgid.link/1784747.1761158912@warthog.procyon.org.uk Fixes: 1d0b929fc070 ("afs: Change dynroot to create contents on demand") Tested-by: Markus Suvanto cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- fs/afs/cell.c | 78 +++++++++++++++++++++++++++++++++++++++-------- fs/afs/dynroot.c | 3 +- fs/afs/internal.h | 12 +++++++- fs/afs/mntpt.c | 3 +- fs/afs/proc.c | 3 +- fs/afs/super.c | 2 +- fs/afs/vl_alias.c | 3 +- 7 files changed, 86 insertions(+), 18 deletions(-) diff --git a/fs/afs/cell.c b/fs/afs/cell.c index f31359922e98..d9b6fa1088b7 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -229,7 +229,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, * @name: The name of the cell. * @namesz: The strlen of the cell name. * @vllist: A colon/comma separated list of numeric IP addresses or NULL. - * @excl: T if an error should be given if the cell name already exists. + * @reason: The reason we're doing the lookup * @trace: The reason to be logged if the lookup is successful. 
* * Look up a cell record by name and query the DNS for VL server addresses if @@ -239,7 +239,8 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, */ struct afs_cell *afs_lookup_cell(struct afs_net *net, const char *name, unsigned int namesz, - const char *vllist, bool excl, + const char *vllist, + enum afs_lookup_cell_for reason, enum afs_cell_trace trace) { struct afs_cell *cell, *candidate, *cursor; @@ -247,12 +248,18 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, enum afs_cell_state state; int ret, n; - _enter("%s,%s", name, vllist); + _enter("%s,%s,%u", name, vllist, reason); - if (!excl) { + if (reason != AFS_LOOKUP_CELL_PRELOAD) { cell = afs_find_cell(net, name, namesz, trace); - if (!IS_ERR(cell)) + if (!IS_ERR(cell)) { + if (reason == AFS_LOOKUP_CELL_DYNROOT) + goto no_wait; + if (cell->state == AFS_CELL_SETTING_UP || + cell->state == AFS_CELL_UNLOOKED) + goto lookup_cell; goto wait_for_cell; + } } /* Assume we're probably going to create a cell and preallocate and @@ -298,26 +305,69 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, rb_insert_color(&cell->net_node, &net->cells); up_write(&net->cells_lock); - afs_queue_cell(cell, afs_cell_trace_queue_new); +lookup_cell: + if (reason != AFS_LOOKUP_CELL_PRELOAD && + reason != AFS_LOOKUP_CELL_ROOTCELL) { + set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags); + afs_queue_cell(cell, afs_cell_trace_queue_new); + } wait_for_cell: - _debug("wait_for_cell"); state = smp_load_acquire(&cell->state); /* vs error */ - if (state != AFS_CELL_ACTIVE && - state != AFS_CELL_DEAD) { + switch (state) { + case AFS_CELL_ACTIVE: + case AFS_CELL_DEAD: + break; + case AFS_CELL_UNLOOKED: + default: + if (reason == AFS_LOOKUP_CELL_PRELOAD || + reason == AFS_LOOKUP_CELL_ROOTCELL) + break; + _debug("wait_for_cell"); afs_see_cell(cell, afs_cell_trace_wait); wait_var_event(&cell->state, ({ state = smp_load_acquire(&cell->state); /* vs error */ state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD; })); + 
_debug("waited_for_cell %d %d", cell->state, cell->error); } +no_wait: /* Check the state obtained from the wait check. */ + state = smp_load_acquire(&cell->state); /* vs error */ if (state == AFS_CELL_DEAD) { ret = cell->error; goto error; } + if (state == AFS_CELL_ACTIVE) { + switch (cell->dns_status) { + case DNS_LOOKUP_NOT_DONE: + if (cell->dns_source == DNS_RECORD_FROM_CONFIG) { + ret = 0; + break; + } + fallthrough; + default: + ret = -EIO; + goto error; + case DNS_LOOKUP_GOOD: + case DNS_LOOKUP_GOOD_WITH_BAD: + ret = 0; + break; + case DNS_LOOKUP_GOT_NOT_FOUND: + ret = -ENOENT; + goto error; + case DNS_LOOKUP_BAD: + ret = -EREMOTEIO; + goto error; + case DNS_LOOKUP_GOT_LOCAL_FAILURE: + case DNS_LOOKUP_GOT_TEMP_FAILURE: + case DNS_LOOKUP_GOT_NS_FAILURE: + ret = -EDESTADDRREQ; + goto error; + } + } _leave(" = %p [cell]", cell); return cell; @@ -325,7 +375,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, cell_already_exists: _debug("cell exists"); cell = cursor; - if (excl) { + if (reason == AFS_LOOKUP_CELL_PRELOAD) { ret = -EEXIST; } else { afs_use_cell(cursor, trace); @@ -384,7 +434,8 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) return -EINVAL; /* allocate a cell record for the root/workstation cell */ - new_root = afs_lookup_cell(net, rootcell, len, vllist, false, + new_root = afs_lookup_cell(net, rootcell, len, vllist, + AFS_LOOKUP_CELL_ROOTCELL, afs_cell_trace_use_lookup_ws); if (IS_ERR(new_root)) { _leave(" = %ld", PTR_ERR(new_root)); @@ -777,6 +828,7 @@ static bool afs_manage_cell(struct afs_cell *cell) switch (cell->state) { case AFS_CELL_SETTING_UP: goto set_up_cell; + case AFS_CELL_UNLOOKED: case AFS_CELL_ACTIVE: goto cell_is_active; case AFS_CELL_REMOVING: @@ -797,7 +849,7 @@ static bool afs_manage_cell(struct afs_cell *cell) goto remove_cell; } - afs_set_cell_state(cell, AFS_CELL_ACTIVE); + afs_set_cell_state(cell, AFS_CELL_UNLOOKED); cell_is_active: if (afs_has_cell_expired(cell, &next_manage)) @@ -807,6 +859,8 @@ 
static bool afs_manage_cell(struct afs_cell *cell) ret = afs_update_cell(cell); if (ret < 0) cell->error = ret; + if (cell->state == AFS_CELL_UNLOOKED) + afs_set_cell_state(cell, AFS_CELL_ACTIVE); } if (next_manage < TIME64_MAX && cell->net->live) { diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 8c6130789fde..dc9d29e3739e 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -108,7 +108,8 @@ static struct dentry *afs_dynroot_lookup_cell(struct inode *dir, struct dentry * dotted = true; } - cell = afs_lookup_cell(net, name, len, NULL, false, + cell = afs_lookup_cell(net, name, len, NULL, + AFS_LOOKUP_CELL_DYNROOT, afs_cell_trace_use_lookup_dynroot); if (IS_ERR(cell)) { ret = PTR_ERR(cell); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a45ae5c2ef8a..b92f96f56767 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -343,6 +343,7 @@ extern const char afs_init_sysname[]; enum afs_cell_state { AFS_CELL_SETTING_UP, + AFS_CELL_UNLOOKED, AFS_CELL_ACTIVE, AFS_CELL_REMOVING, AFS_CELL_DEAD, @@ -1049,9 +1050,18 @@ static inline bool afs_cb_is_broken(unsigned int cb_break, extern int afs_cell_init(struct afs_net *, const char *); extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned, enum afs_cell_trace); +enum afs_lookup_cell_for { + AFS_LOOKUP_CELL_DYNROOT, + AFS_LOOKUP_CELL_MOUNTPOINT, + AFS_LOOKUP_CELL_DIRECT_MOUNT, + AFS_LOOKUP_CELL_PRELOAD, + AFS_LOOKUP_CELL_ROOTCELL, + AFS_LOOKUP_CELL_ALIAS_CHECK, +}; struct afs_cell *afs_lookup_cell(struct afs_net *net, const char *name, unsigned int namesz, - const char *vllist, bool excl, + const char *vllist, + enum afs_lookup_cell_for reason, enum afs_cell_trace trace); extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace); void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 1ad048e6e164..57c204a3c04e 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -107,7 +107,8 @@ static int 
afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) if (size > AFS_MAXCELLNAME) return -ENAMETOOLONG; - cell = afs_lookup_cell(ctx->net, p, size, NULL, false, + cell = afs_lookup_cell(ctx->net, p, size, NULL, + AFS_LOOKUP_CELL_MOUNTPOINT, afs_cell_trace_use_lookup_mntpt); if (IS_ERR(cell)) { pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 40e879c8ca77..44520549b509 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -122,7 +122,8 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size) if (strcmp(buf, "add") == 0) { struct afs_cell *cell; - cell = afs_lookup_cell(net, name, strlen(name), args, true, + cell = afs_lookup_cell(net, name, strlen(name), args, + AFS_LOOKUP_CELL_PRELOAD, afs_cell_trace_use_lookup_add); if (IS_ERR(cell)) { ret = PTR_ERR(cell); diff --git a/fs/afs/super.c b/fs/afs/super.c index da407f2d6f0d..d672b7ab57ae 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -290,7 +290,7 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param) /* lookup the cell record */ if (cellname) { cell = afs_lookup_cell(ctx->net, cellname, cellnamesz, - NULL, false, + NULL, AFS_LOOKUP_CELL_DIRECT_MOUNT, afs_cell_trace_use_lookup_mount); if (IS_ERR(cell)) { pr_err("kAFS: unable to lookup cell '%*.*s'\n", diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index 709b4cdb723e..fc9676abd252 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -269,7 +269,8 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key) if (!name_len || name_len > AFS_MAXCELLNAME) master = ERR_PTR(-EOPNOTSUPP); else - master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false, + master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, + AFS_LOOKUP_CELL_ALIAS_CHECK, afs_cell_trace_use_lookup_canonical); kfree(cell_name); if (IS_ERR(master)) From 34ab4c75588c07cca12884f2bf6b0347c7a13872 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 
23 Oct 2025 22:25:49 +0900 Subject: [PATCH 050/543] bfs: Reconstruct file type when loading from disk syzbot is reporting that S_IFMT bits of inode->i_mode can become bogus when the S_IFMT bits of the 32bits "mode" field loaded from disk are corrupted or when the 32bits "attributes" field loaded from disk are corrupted. A documentation says that BFS uses only lower 9 bits of the "mode" field. But I can't find an explicit explanation that the unused upper 23 bits (especially, the S_IFMT bits) are initialized with 0. Therefore, ignore the S_IFMT bits of the "mode" field loaded from disk. Also, verify that the value of the "attributes" field loaded from disk is either BFS_VREG or BFS_VDIR (because BFS supports only regular files and the root directory). Reported-by: syzbot+895c23f6917da440ed0d@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d Signed-off-by: Tetsuo Handa Link: https://patch.msgid.link/fabce673-d5b9-4038-8287-0fd65d80203b@I-love.SAKURA.ne.jp Reviewed-by: Tigran Aivazian Signed-off-by: Christian Brauner --- fs/bfs/inode.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 1d41ce477df5..984b365df046 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -61,7 +61,19 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; di = (struct bfs_inode *)bh->b_data + off; - inode->i_mode = 0x0000FFFF & le32_to_cpu(di->i_mode); + /* + * https://martin.hinner.info/fs/bfs/bfs-structure.html explains that + * BFS in SCO UnixWare environment used only lower 9 bits of di->i_mode + * value. This means that, although bfs_write_inode() saves whole + * inode->i_mode bits (which include S_IFMT bits and S_IS{UID,GID,VTX} + * bits), middle 7 bits of di->i_mode value can be garbage when these + * bits were not saved by bfs_write_inode(). 
+ * Since we can't tell whether middle 7 bits are garbage, use only + * lower 12 bits (i.e. tolerate S_IS{UID,GID,VTX} bits possibly being + * garbage) and reconstruct S_IFMT bits for Linux environment from + * di->i_vtype value. + */ + inode->i_mode = 0x00000FFF & le32_to_cpu(di->i_mode); if (le32_to_cpu(di->i_vtype) == BFS_VDIR) { inode->i_mode |= S_IFDIR; inode->i_op = &bfs_dir_inops; @@ -71,6 +83,11 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; inode->i_mapping->a_ops = &bfs_aops; + } else { + brelse(bh); + printf("Unknown vtype=%u %s:%08lx\n", + le32_to_cpu(di->i_vtype), inode->i_sb->s_id, ino); + goto error; } BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock); From 9db8d46712d274a27d1d22c38e70211f20d508c2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 24 Oct 2025 15:23:36 +0200 Subject: [PATCH 051/543] mnt: Remove dead code which might prevent from building Clang, in particular, is not happy about dead code: fs/namespace.c:135:37: error: unused function 'node_to_mnt_ns' [-Werror,-Wunused-function] 135 | static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) | ^~~~~~~~~~~~~~ 1 error generated. Remove a leftover from the previous cleanup. 
Fixes: 7d7d16498958 ("mnt: support ns lookup") Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20251024132336.1666382-1-andriy.shevchenko@linux.intel.com Signed-off-by: Christian Brauner --- fs/namespace.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 5b5ab2ae238b..cc6e00e72437 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -132,16 +132,6 @@ EXPORT_SYMBOL_GPL(fs_kobj); */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); -static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) -{ - struct ns_common *ns; - - if (!node) - return NULL; - ns = rb_entry(node, struct ns_common, ns_tree_node); - return container_of(ns, struct mnt_namespace, ns); -} - static void mnt_ns_release(struct mnt_namespace *ns) { /* keep alive for {list,stat}mount() */ From f4fa7c25f632cd925352b4d46f245653a23b1d1a Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 29 Oct 2025 14:08:43 +0100 Subject: [PATCH 052/543] sched_ext: Fix use of uninitialized variable in scx_bpf_cpuperf_set() scx_bpf_cpuperf_set() has a typo where it dereferences the local variable @sch, instead of the global @scx_root pointer. Fix by dereferencing the correct variable. 
Fixes: 956f2b11a8a4f ("sched_ext: Drop kf_cpu_valid()") Signed-off-by: Andrea Righi Reviewed-by: Christian Loehle Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index ecb251e883ea..1a019a7728fb 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6401,7 +6401,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) guard(rcu)(); - sch = rcu_dereference(sch); + sch = rcu_dereference(scx_root); if (unlikely(!sch)) return; From beab067dbcff642243291fd528355d64c41dc3b2 Mon Sep 17 00:00:00 2001 From: Zhang Heng Date: Fri, 12 Sep 2025 20:38:18 +0800 Subject: [PATCH 053/543] HID: quirks: work around VID/PID conflict for 0x4c4a/0x4155 Based on available evidence, the USB ID 4c4a:4155 used by multiple devices has been attributed to Jieli. The commit 1a8953f4f774 ("HID: Add IGNORE quirk for SMARTLINKTECHNOLOGY") affected touchscreen functionality. Added checks for manufacturer and serial number to maintain microphone compatibility, enabling both devices to function properly. 
[jkosina@suse.com: edit shortlog] Fixes: 1a8953f4f774 ("HID: Add IGNORE quirk for SMARTLINKTECHNOLOGY") Cc: stable@vger.kernel.org Tested-by: staffan.melin@oscillator.se Reviewed-by: Terry Junge Signed-off-by: Zhang Heng Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 4 ++-- drivers/hid/hid-quirks.c | 13 ++++++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 0723b4b1c9ec..52ae7c29f9e0 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -1543,7 +1543,7 @@ #define USB_VENDOR_ID_SIGNOTEC 0x2133 #define USB_DEVICE_ID_SIGNOTEC_VIEWSONIC_PD1011 0x0018 -#define USB_VENDOR_ID_SMARTLINKTECHNOLOGY 0x4c4a -#define USB_DEVICE_ID_SMARTLINKTECHNOLOGY_4155 0x4155 +#define USB_VENDOR_ID_JIELI_SDK_DEFAULT 0x4c4a +#define USB_DEVICE_ID_JIELI_SDK_4155 0x4155 #endif diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index bcd4bccf1a7c..22760ac50f2d 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -915,7 +915,6 @@ static const struct hid_device_id hid_ignore_list[] = { #endif { HID_USB_DEVICE(USB_VENDOR_ID_YEALINK, USB_DEVICE_ID_YEALINK_P1K_P4K_B2K) }, { HID_USB_DEVICE(USB_VENDOR_ID_QUANTA, USB_DEVICE_ID_QUANTA_HP_5MP_CAMERA_5473) }, - { HID_USB_DEVICE(USB_VENDOR_ID_SMARTLINKTECHNOLOGY, USB_DEVICE_ID_SMARTLINKTECHNOLOGY_4155) }, { } }; @@ -1064,6 +1063,18 @@ bool hid_ignore(struct hid_device *hdev) strlen(elan_acpi_id[i].id))) return true; break; + case USB_VENDOR_ID_JIELI_SDK_DEFAULT: + /* + * Multiple USB devices with identical IDs (mic & touchscreen). + * The touch screen requires hid core processing, but the + * microphone does not. They can be distinguished by manufacturer + * and serial number. 
+ */ + if (hdev->product == USB_DEVICE_ID_JIELI_SDK_4155 && + strncmp(hdev->name, "SmartlinkTechnology", 19) == 0 && + strncmp(hdev->uniq, "20201111000001", 14) == 0) + return true; + break; } if (hdev->type == HID_TYPE_USBMOUSE && From a45f15808fb753a14c6041fd1e5bef5d552bd2e3 Mon Sep 17 00:00:00 2001 From: Lauri Tirkkonen Date: Sat, 18 Oct 2025 15:35:15 +0900 Subject: [PATCH 054/543] HID: lenovo: fixup Lenovo Yoga Slim 7x Keyboard rdesc The keyboard of this device has the following in its report description for Usage (Keyboard) in Collection (Application): # 0x15, 0x00, // Logical Minimum (0) 52 # 0x25, 0x65, // Logical Maximum (101) 54 # 0x05, 0x07, // Usage Page (Keyboard) 56 # 0x19, 0x00, // Usage Minimum (0) 58 # 0x29, 0xdd, // Usage Maximum (221) 60 # 0x81, 0x00, // Input (Data,Arr,Abs) 62 Since the Usage Min/Max range exceeds the Logical Min/Max range, keypresses outside the Logical range are not recognized. This includes, for example, the Japanese language keyboard variant's keys for |, _ and \. Fixup the report description to make the Logical range match the Usage range, fixing the interpretation of keypresses above 101 on this device. 
Signed-off-by: Lauri Tirkkonen Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 1 + drivers/hid/hid-lenovo.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 52ae7c29f9e0..85db279baa72 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -718,6 +718,7 @@ #define USB_DEVICE_ID_ITE_LENOVO_YOGA2 0x8350 #define I2C_DEVICE_ID_ITE_LENOVO_LEGION_Y720 0x837a #define USB_DEVICE_ID_ITE_LENOVO_YOGA900 0x8396 +#define I2C_DEVICE_ID_ITE_LENOVO_YOGA_SLIM_7X_KEYBOARD 0x8987 #define USB_DEVICE_ID_ITE8595 0x8595 #define USB_DEVICE_ID_ITE_MEDION_E1239T 0xce50 diff --git a/drivers/hid/hid-lenovo.c b/drivers/hid/hid-lenovo.c index 654879814f97..9cc3e029e9f6 100644 --- a/drivers/hid/hid-lenovo.c +++ b/drivers/hid/hid-lenovo.c @@ -148,6 +148,14 @@ static const __u8 lenovo_tpIIbtkbd_need_fixup_collection[] = { 0x81, 0x01, /* Input (Const,Array,Abs,No Wrap,Linear,Preferred State,No Null Position) */ }; +static const __u8 lenovo_yoga7x_kbd_need_fixup_collection[] = { + 0x15, 0x00, // Logical Minimum (0) + 0x25, 0x65, // Logical Maximum (101) + 0x05, 0x07, // Usage Page (Keyboard) + 0x19, 0x00, // Usage Minimum (0) + 0x29, 0xDD, // Usage Maximum (221) +}; + static const __u8 *lenovo_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { @@ -177,6 +185,13 @@ static const __u8 *lenovo_report_fixup(struct hid_device *hdev, __u8 *rdesc, rdesc[260] = 0x01; /* report count (2) = 0x01 */ } break; + case I2C_DEVICE_ID_ITE_LENOVO_YOGA_SLIM_7X_KEYBOARD: + if (*rsize == 176 && + memcmp(&rdesc[52], lenovo_yoga7x_kbd_need_fixup_collection, + sizeof(lenovo_yoga7x_kbd_need_fixup_collection)) == 0) { + rdesc[55] = rdesc[61]; // logical maximum = usage maximum + } + break; } return rdesc; } @@ -1538,6 +1553,8 @@ static const struct hid_device_id lenovo_devices[] = { USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_X12_TAB) }, { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, USB_VENDOR_ID_LENOVO, 
USB_DEVICE_ID_LENOVO_X12_TAB2) }, + { HID_DEVICE(BUS_I2C, HID_GROUP_GENERIC, + USB_VENDOR_ID_ITE, I2C_DEVICE_ID_ITE_LENOVO_YOGA_SLIM_7X_KEYBOARD) }, { } }; From 082ef944e55da8a9a8df92e3842ca82a626d359a Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Tue, 28 Oct 2025 04:22:47 +0200 Subject: [PATCH 055/543] xfrm: Check inner packet family directly from skb_dst In the output path, xfrm_dev_offload_ok and xfrm_get_inner_ipproto need to determine the protocol family of the inner packet (skb) before it gets encapsulated. In xfrm_dev_offload_ok, the code checked x->inner_mode.family. This is unreliable because, for states handling both IPv4 and IPv6, the relevant inner family could be either x->inner_mode.family or x->inner_mode_iaf.family. Checking only the former can lead to a mismatch with the actual packet being processed. In xfrm_get_inner_ipproto, the code checked x->outer_mode.family. This is also incorrect for tunnel mode, as the inner packet's family can be different from the outer header's family. At both of these call sites, the skb variable holds the original inner packet. The most direct and reliable source of truth for its protocol family is its destination entry. This patch fixes the issue by using skb_dst(skb)->ops->family to ensure protocol-specific headers are only accessed for the correct packet type. 
Fixes: 91d8a53db219 ("xfrm: fix offloading of cross-family tunnels") Fixes: 45a98ef4922d ("net/xfrm: IPsec tunnel mode fix inner_ipproto setting in sec_path") Signed-off-by: Jianbo Liu Reviewed-by: Cosmin Ratiu Reviewed-by: Zhu Yanjun Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 2 +- net/xfrm/xfrm_output.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 44b9de6e4e77..52ae0e034d29 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -438,7 +438,7 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET && x->props.mode == XFRM_MODE_TUNNEL; - switch (x->inner_mode.family) { + switch (skb_dst(skb)->ops->family) { case AF_INET: /* Check for IPv4 options */ if (ip_hdr(skb)->ihl != 5) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 9077730ff7d0..a98b5bf55ac3 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -698,7 +698,7 @@ static void xfrm_get_inner_ipproto(struct sk_buff *skb, struct xfrm_state *x) return; if (x->outer_mode.encap == XFRM_MODE_TUNNEL) { - switch (x->outer_mode.family) { + switch (skb_dst(skb)->ops->family) { case AF_INET: xo->inner_ipproto = ip_hdr(skb)->protocol; break; From 61fafbee6cfed283c02a320896089f658fa67e56 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Tue, 28 Oct 2025 04:22:48 +0200 Subject: [PATCH 056/543] xfrm: Determine inner GSO type from packet inner protocol The GSO segmentation functions for ESP tunnel mode (xfrm4_tunnel_gso_segment and xfrm6_tunnel_gso_segment) were determining the inner packet's L2 protocol type by checking the static x->inner_mode.family field from the xfrm state. This is unreliable. In tunnel mode, the state's actual inner family could be defined by x->inner_mode.family or by x->inner_mode_iaf.family. 
Checking only the former can lead to a mismatch with the actual packet being processed, causing GSO to create segments with the wrong L2 header type. This patch fixes the bug by deriving the inner mode directly from the packet's inner protocol stored in XFRM_MODE_SKB_CB(skb)->protocol. Instead of replicating the code, this patch modifies the xfrm_ip2inner_mode helper function. It now correctly returns &x->inner_mode if the selector family (x->sel.family) is already specified, thereby handling both specific and AF_UNSPEC cases appropriately. With this change, ESP GSO can use xfrm_ip2inner_mode to get the correct inner mode. It doesn't affect existing callers, as the updated logic now mirrors the checks they were already performing externally. Fixes: 26dbd66eab80 ("esp: choose the correct inner protocol for GSO on inter address family tunnels") Signed-off-by: Jianbo Liu Reviewed-by: Cosmin Ratiu Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 ++- net/ipv4/esp4_offload.c | 6 ++++-- net/ipv6/esp6_offload.c | 6 ++++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index f3014e4f54fc..0a14daaa5dd4 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -536,7 +536,8 @@ static inline int xfrm_af2proto(unsigned int family) static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto) { - if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || + if ((x->sel.family != AF_UNSPEC) || + (ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6)) return &x->inner_mode; else diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index e0d94270da28..05828d4cb6cd 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -122,8 +122,10 @@ static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - __be16 type = 
x->inner_mode.family == AF_INET6 ? htons(ETH_P_IPV6) - : htons(ETH_P_IP); + const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x, + XFRM_MODE_SKB_CB(skb)->protocol); + __be16 type = inner_mode->family == AF_INET6 ? htons(ETH_P_IPV6) + : htons(ETH_P_IP); return skb_eth_gso_segment(skb, features, type); } diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 7b41fb4f00b5..22410243ebe8 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -158,8 +158,10 @@ static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - __be16 type = x->inner_mode.family == AF_INET ? htons(ETH_P_IP) - : htons(ETH_P_IPV6); + const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x, + XFRM_MODE_SKB_CB(skb)->protocol); + __be16 type = inner_mode->family == AF_INET ? htons(ETH_P_IP) + : htons(ETH_P_IPV6); return skb_eth_gso_segment(skb, features, type); } From 59630e2ccd728703cc826e3a3515d70f8c7a766c Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Wed, 29 Oct 2025 11:50:25 +0200 Subject: [PATCH 057/543] xfrm: Prevent locally generated packets from direct output in tunnel mode Add a check to ensure locally generated packets (skb->sk != NULL) do not use direct output in tunnel mode, as these packets require proper L2 header setup that is handled by the normal XFRM processing path. Fixes: 5eddd76ec2fd ("xfrm: fix tunnel mode TX datapath in packet offload mode") Signed-off-by: Jianbo Liu Reviewed-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_output.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index a98b5bf55ac3..54222fcbd7fd 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -772,8 +772,12 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) /* Exclusive direct xmit for tunnel mode, as * some filtering or matching rules may apply * in transport mode. 
+ * Locally generated packets also require + * the normal XFRM path for L2 header setup, + * as the hardware needs the L2 header to match + * for encryption, so skip direct output as well. */ - if (x->props.mode == XFRM_MODE_TUNNEL) + if (x->props.mode == XFRM_MODE_TUNNEL && !skb->sk) return xfrm_dev_direct_output(sk, x, skb); return xfrm_output_resume(sk, skb, 0); From 743c81cdc98fd4fef62a89eb70efff994112c2d9 Mon Sep 17 00:00:00 2001 From: April Grimoire Date: Thu, 23 Oct 2025 00:37:26 +0800 Subject: [PATCH 058/543] HID: apple: Add SONiX AK870 PRO to non_apple_keyboards quirk list SONiX AK870 PRO keyboard pretends to be an apple keyboard by VID:PID, rendering function keys not treated properly. Despite being a SONiX USB DEVICE, it uses a different name, so adding it to the list. Signed-off-by: April Grimoire Signed-off-by: Jiri Kosina --- drivers/hid/hid-apple.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 61404d7a43ee..57da4f86a9fa 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -355,6 +355,7 @@ static const struct apple_key_translation swapped_fn_leftctrl_keys[] = { static const struct apple_non_apple_keyboard non_apple_keyboards[] = { { "SONiX USB DEVICE" }, + { "SONiX AK870 PRO" }, { "Keychron" }, { "AONE" }, { "GANSS" }, From 4d3a13afa8b64dc49293b3eab3e7beac11072c12 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Mon, 20 Oct 2025 10:50:42 -0500 Subject: [PATCH 059/543] HID: amd_sfh: Stop sensor before starting Titas reports that the accelerometer sensor on their laptop only works after a warm boot or unloading/reloading the amd-sfh kernel module. Presumably the sensor is in a bad state on cold boot and failing to start, so explicitly stop it before starting. 
Cc: stable@vger.kernel.org Fixes: 93ce5e0231d79 ("HID: amd_sfh: Implement SFH1.1 functionality") Reported-by: Titas Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220670 Tested-by: Titas Signed-off-by: Mario Limonciello (AMD) Signed-off-by: Jiri Kosina --- drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c index 0a9b44ce4904..b0bab2a1ddcc 100644 --- a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c +++ b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c @@ -194,6 +194,8 @@ static int amd_sfh1_1_hid_client_init(struct amd_mp2_dev *privdata) if (rc) goto cleanup; + mp2_ops->stop(privdata, cl_data->sensor_idx[i]); + amd_sfh_wait_for_response(privdata, cl_data->sensor_idx[i], DISABLE_SENSOR); writel(0, privdata->mmio + amd_get_p2c_val(privdata, 0)); mp2_ops->start(privdata, info); status = amd_sfh_wait_for_response From da888524c393b4a14727e1a821bdd51313d0a2d3 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Tue, 21 Oct 2025 09:44:09 +0000 Subject: [PATCH 060/543] KVM: arm64: vgic-v3: Trap all if no in-kernel irqchip If there is no in-kernel irqchip for a GICv3 host, set all of the trap bits to block all accesses. This fixes the no-vgic-v3 selftest again. 
Fixes: 3193287ddffb ("KVM: arm64: gic-v3: Only set ICH_HCR traps for v2-on-v3 or v3 guests") Reported-by: Mark Brown Closes: https://lore.kernel.org/all/23072856-6b8c-41e2-93d1-ea8a240a7079@sirena.org.uk Signed-off-by: Sascha Bischoff Reviewed-by: Sebastian Ott Tested-by: Mark Brown Link: https://patch.msgid.link/20251021094358.1963807-1-sascha.bischoff@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-v3.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 6fbb4b099855..2f75ef14d339 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -301,7 +301,8 @@ void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) return; /* Hide GICv3 sysreg if necessary */ - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) { + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2 || + !irqchip_in_kernel(vcpu->kvm)) { vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 | ICH_HCR_EL2_TC); return; From a24f7afce048e724be072bd063ed864f124daf81 Mon Sep 17 00:00:00 2001 From: Maximilian Dittgen Date: Mon, 20 Oct 2025 16:59:46 +0200 Subject: [PATCH 061/543] KVM: selftests: fix MAPC RDbase target formatting in vgic_lpi_stress Since GITS_TYPER.PTA == 0, the ITS MAPC command demands a CPU ID, rather than a physical redistributor address, for its RDbase command argument. As such, when MAPC-ing guest ITS collections, vgic_lpi_stress iterates over CPU IDs in the range [0, nr_cpus), passing them as the RDbase vcpu_id argument to its_send_mapc_cmd(). However, its_encode_target() in the its_send_mapc_cmd() selftest handler expects RDbase arguments to be formatted with a 16-bit offset, as shown by the 16-bit target_addr right shift in its implementation: its_mask_encode(&cmd->raw_cmd[2], target_addr >> 16, 51, 16) At the moment, all CPU IDs passed into its_send_mapc_cmd() have no offset, therefore becoming 0x0 after the bit shift. 
Thus, when vgic_its_cmd_handle_mapc() receives the ITS command in vgic-its.c, it always interprets the RDbase target CPU as CPU 0. All interrupts sent to collections will be processed by vCPU 0, which defeats the purpose of this multi-vCPU test. Fix by creating procnum_to_rdbase() helper function, which left-shifts the vCPU parameter received by its_send_mapc_cmd 16 bits before passing it to its_encode_target for encoding. Signed-off-by: Maximilian Dittgen Link: https://patch.msgid.link/20251020145946.48288-1-mdittgen@amazon.de Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c index 09f270545646..0e2f8ed90f30 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c @@ -15,6 +15,8 @@ #include "gic_v3.h" #include "processor.h" +#define GITS_COLLECTION_TARGET_SHIFT 16 + static u64 its_read_u64(unsigned long offset) { return readq_relaxed(GITS_BASE_GVA + offset); @@ -163,6 +165,11 @@ static void its_encode_collection(struct its_cmd_block *cmd, u16 col) its_mask_encode(&cmd->raw_cmd[2], col, 15, 0); } +static u64 procnum_to_rdbase(u32 vcpu_id) +{ + return vcpu_id << GITS_COLLECTION_TARGET_SHIFT; +} + #define GITS_CMDQ_POLL_ITERATIONS 0 static void its_send_cmd(void *cmdq_base, struct its_cmd_block *cmd) @@ -217,7 +224,7 @@ void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool val its_encode_cmd(&cmd, GITS_CMD_MAPC); its_encode_collection(&cmd, collection_id); - its_encode_target(&cmd, vcpu_id); + its_encode_target(&cmd, procnum_to_rdbase(vcpu_id)); its_encode_valid(&cmd, valid); its_send_cmd(cmdq_base, &cmd); From 92e781c93ebe75e39ecdf78fb8ef1fdf1b63a9f8 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 23 Oct 2025 22:19:29 +0100 Subject: [PATCH 062/543] KVM: arm64: 
selftests: Add SCTLR2_EL2 to get-reg-list We recently added support for SCTLR2_EL2 to the kernel but did not add it to get-reg-list, resulting in it reporting the missing register when it is available. Add it. Signed-off-by: Mark Brown Link: https://patch.msgid.link/20251023-b4-kvm-arm64-get-reg-list-sctlr-el2-v1-1-088f88ff992a@kernel.org Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/arm64/get-reg-list.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/kvm/arm64/get-reg-list.c b/tools/testing/selftests/kvm/arm64/get-reg-list.c index c9b84eeaab6b..2abef0a86d46 100644 --- a/tools/testing/selftests/kvm/arm64/get-reg-list.c +++ b/tools/testing/selftests/kvm/arm64/get-reg-list.c @@ -63,6 +63,7 @@ static struct feature_id_reg feat_id_regs[] = { REG_FEAT(HDFGWTR2_EL2, ID_AA64MMFR0_EL1, FGT, FGT2), REG_FEAT(ZCR_EL2, ID_AA64PFR0_EL1, SVE, IMP), REG_FEAT(SCTLR2_EL1, ID_AA64MMFR3_EL1, SCTLRX, IMP), + REG_FEAT(SCTLR2_EL2, ID_AA64MMFR3_EL1, SCTLRX, IMP), REG_FEAT(VDISR_EL2, ID_AA64PFR0_EL1, RAS, IMP), REG_FEAT(VSESR_EL2, ID_AA64PFR0_EL1, RAS, IMP), REG_FEAT(VNCR_EL2, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY), @@ -718,6 +719,7 @@ static __u64 el2_regs[] = { SYS_REG(VMPIDR_EL2), SYS_REG(SCTLR_EL2), SYS_REG(ACTLR_EL2), + SYS_REG(SCTLR2_EL2), SYS_REG(HCR_EL2), SYS_REG(MDCR_EL2), SYS_REG(CPTR_EL2), From a186fbcfd845699d51809f7c7e54cf997fe32820 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 24 Oct 2025 00:43:39 +0100 Subject: [PATCH 063/543] KVM: arm64: selftests: Filter ZCR_EL2 in get-reg-list get-reg-list includes ZCR_EL2 in the list of EL2 registers that it looks for when NV is enabled but does not have any feature gate for this register, meaning that testing any combination of features that includes EL2 but does not include SVE will result in a test failure due to a missing register being reported: | The following lines are missing registers: | | ARM64_SYS_REG(3, 4, 1, 2, 0), Add ZCR_EL2 to feat_id_regs so that the test knows not to 
expect to see it without SVE being enabled. Fixes: 3a90b6f27964 ("KVM: arm64: selftests: get-reg-list: Add base EL2 registers") Signed-off-by: Mark Brown Link: https://patch.msgid.link/20251024-kvm-arm64-get-reg-list-zcr-el2-v1-1-0cd0ff75e22f@kernel.org Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/arm64/get-reg-list.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/arm64/get-reg-list.c b/tools/testing/selftests/kvm/arm64/get-reg-list.c index 2abef0a86d46..0a3a94c4cca1 100644 --- a/tools/testing/selftests/kvm/arm64/get-reg-list.c +++ b/tools/testing/selftests/kvm/arm64/get-reg-list.c @@ -69,6 +69,7 @@ static struct feature_id_reg feat_id_regs[] = { REG_FEAT(VNCR_EL2, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY), REG_FEAT(CNTHV_CTL_EL2, ID_AA64MMFR1_EL1, VH, IMP), REG_FEAT(CNTHV_CVAL_EL2,ID_AA64MMFR1_EL1, VH, IMP), + REG_FEAT(ZCR_EL2, ID_AA64PFR0_EL1, SVE, IMP), }; bool filter_reg(__u64 reg) From f71f7afd0a0cd3f044cd2f8aba71a1a7229df762 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Thu, 16 Oct 2025 17:45:41 +0100 Subject: [PATCH 064/543] KVM: arm64: Check range args for pKVM mem transitions There's currently no verification for host issued ranges in most of the pKVM memory transitions. The end boundary might therefore be subject to overflow and later checks could be evaded. Close this loophole with an additional pfn_range_is_valid() check on a per public function basis. Once this check has passed, it is safe to convert pfn and nr_pages into a phys_addr_t and a size. host_unshare_guest transition is already protected via __check_host_shared_guest(), while assert_host_shared_guest() callers are already ignoring host checks. 
Signed-off-by: Vincent Donnefort Link: https://patch.msgid.link/20251016164541.3771235-1-vdonnefort@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 28 +++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index ddc8beb55eee..49db32f3ddf7 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -367,6 +367,19 @@ static int host_stage2_unmap_dev_all(void) return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr); } +/* + * Ensure the PFN range is contained within PA-range. + * + * This check is also robust to overflows and is therefore a requirement before + * using a pfn/nr_pages pair from an untrusted source. + */ +static bool pfn_range_is_valid(u64 pfn, u64 nr_pages) +{ + u64 limit = BIT(kvm_phys_shift(&host_mmu.arch.mmu) - PAGE_SHIFT); + + return pfn < limit && ((limit - pfn) >= nr_pages); +} + struct kvm_mem_range { u64 start; u64 end; @@ -776,6 +789,9 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) void *virt = __hyp_va(phys); int ret; + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + host_lock_component(); hyp_lock_component(); @@ -804,6 +820,9 @@ int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) u64 virt = (u64)__hyp_va(phys); int ret; + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + host_lock_component(); hyp_lock_component(); @@ -887,6 +906,9 @@ int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages) u64 size = PAGE_SIZE * nr_pages; int ret; + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + host_lock_component(); ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); if (!ret) @@ -902,6 +924,9 @@ int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) u64 size = PAGE_SIZE * nr_pages; int ret; + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + host_lock_component(); ret = __host_check_page_state_range(phys, size, 
PKVM_PAGE_SHARED_OWNED); if (!ret) @@ -945,6 +970,9 @@ int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu if (prot & ~KVM_PGTABLE_PROT_RWX) return -EINVAL; + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + ret = __guest_check_transition_size(phys, ipa, nr_pages, &size); if (ret) return ret; From 103e17aac09cdd358133f9e00998b75d6c1f1518 Mon Sep 17 00:00:00 2001 From: Sebastian Ene Date: Fri, 17 Oct 2025 07:57:10 +0000 Subject: [PATCH 065/543] KVM: arm64: Check the untrusted offset in FF-A memory share Verify the offset to prevent OOB access in the hypervisor FF-A buffer in case an untrusted large enough value [U32_MAX - sizeof(struct ffa_composite_mem_region) + 1, U32_MAX] is set from the host kernel. Signed-off-by: Sebastian Ene Acked-by: Will Deacon Link: https://patch.msgid.link/20251017075710.2605118-1-sebastianene@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/ffa.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 4e16f9b96f63..58b7d0c477d7 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -479,7 +479,7 @@ static void __do_ffa_mem_xfer(const u64 func_id, struct ffa_mem_region_attributes *ep_mem_access; struct ffa_composite_mem_region *reg; struct ffa_mem_region *buf; - u32 offset, nr_ranges; + u32 offset, nr_ranges, checked_offset; int ret = 0; if (addr_mbz || npages_mbz || fraglen > len || @@ -516,7 +516,12 @@ static void __do_ffa_mem_xfer(const u64 func_id, goto out_unlock; } - if (fraglen < offset + sizeof(struct ffa_composite_mem_region)) { + if (check_add_overflow(offset, sizeof(struct ffa_composite_mem_region), &checked_offset)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + if (fraglen < checked_offset) { ret = FFA_RET_INVALID_PARAMETERS; goto out_unlock; } From 53f731f5bba0cf03b751ccceb98b82fadc9ccd1e Mon Sep 17 00:00:00 2001 From: Masami Ichikawa 
Date: Sun, 21 Sep 2025 14:31:02 +0900 Subject: [PATCH 066/543] HID: hid-ntrig: Prevent memory leak in ntrig_report_version() Use a scope-based cleanup helper for the buffer allocated with kmalloc() in ntrig_report_version() to simplify the cleanup logic and prevent memory leaks (specifically the !hid_is_usb()-case one). [jkosina@suse.com: elaborate on the actual existing leak] Fixes: 185c926283da ("HID: hid-ntrig: fix unable to handle page fault in ntrig_report_version()") Signed-off-by: Masami Ichikawa Signed-off-by: Jiri Kosina --- drivers/hid/hid-ntrig.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/hid/hid-ntrig.c b/drivers/hid/hid-ntrig.c index 0f76e241e0af..a7f10c45f62b 100644 --- a/drivers/hid/hid-ntrig.c +++ b/drivers/hid/hid-ntrig.c @@ -142,13 +142,13 @@ static void ntrig_report_version(struct hid_device *hdev) int ret; char buf[20]; struct usb_device *usb_dev = hid_to_usb_dev(hdev); - unsigned char *data = kmalloc(8, GFP_KERNEL); + unsigned char *data __free(kfree) = kmalloc(8, GFP_KERNEL); if (!hid_is_usb(hdev)) return; if (!data) - goto err_free; + return; ret = usb_control_msg(usb_dev, usb_rcvctrlpipe(usb_dev, 0), USB_REQ_CLEAR_FEATURE, @@ -163,9 +163,6 @@ static void ntrig_report_version(struct hid_device *hdev) hid_info(hdev, "Firmware version: %s (%02x%02x %02x%02x)\n", buf, data[2], data[3], data[4], data[5]); } - -err_free: - kfree(data); } static ssize_t show_phys_width(struct device *dev, From 534ca75e8e3b713514b3f2da85dab96831cf5b2a Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Thu, 30 Oct 2025 11:06:25 -0500 Subject: [PATCH 067/543] HID: hid-input: Extend Elan ignore battery quirk to USB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit USB Elan devices have the same problem as the I2C ones with a fake battery device showing up. 
Reviewed-by: Hans de Goede Reported-by: André Barata Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220722 Signed-off-by: Mario Limonciello (AMD) Signed-off-by: Jiri Kosina --- drivers/hid/hid-input.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index e56e7de53279..2bbb645c2ff4 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -399,10 +399,11 @@ static const struct hid_device_id hid_battery_quirks[] = { { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_CHROMEBOOK_TROGDOR_POMPOM), HID_BATTERY_QUIRK_AVOID_QUERY }, /* - * Elan I2C-HID touchscreens seem to all report a non present battery, - * set HID_BATTERY_QUIRK_IGNORE for all Elan I2C-HID devices. + * Elan HID touchscreens seem to all report a non present battery, + * set HID_BATTERY_QUIRK_IGNORE for all Elan I2C and USB HID devices. */ { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, HID_ANY_ID), HID_BATTERY_QUIRK_IGNORE }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELAN, HID_ANY_ID), HID_BATTERY_QUIRK_IGNORE }, {} }; From 08d70143e3033d267507deb98a5fd187df3e6640 Mon Sep 17 00:00:00 2001 From: Quentin Schulz Date: Wed, 29 Oct 2025 14:50:59 +0100 Subject: [PATCH 068/543] arm64: dts: rockchip: include rk3399-base instead of rk3399 in rk3399-op1 In commit 296602b8e5f7 ("arm64: dts: rockchip: Move RK3399 OPPs to dtsi files for SoC variants"), everything shared between variants of RK3399 was put into rk3399-base.dtsi and the rest in variant-specific DTSI, such as rk3399-t, rk3399-op1, rk3399, etc. Therefore, the variant-specific DTSI should include rk3399-base.dtsi and not another variant's DTSI. rk3399-op1 wrongly includes rk3399 (a variant) DTSI instead of rk3399-base DTSI, let's fix this oversight by including the intended DTSI. Fortunately, this had no impact on the resulting DTB since all nodes were named the same and all node properties were overridden in rk3399-op1.dtsi. 
This was checked by doing a checksum of rk3399-op1 DTBs before and after this commit. No intended change in behavior. Fixes: 296602b8e5f7 ("arm64: dts: rockchip: Move RK3399 OPPs to dtsi files for SoC variants") Cc: stable@vger.kernel.org Signed-off-by: Quentin Schulz Reviewed-by: Dragan Simic Link: https://patch.msgid.link/20251029-rk3399-op1-include-v1-1-2472ee60e7f8@cherry.de Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi index c4f4f1ff6117..9da6fd82e46b 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi @@ -3,7 +3,7 @@ * Copyright (c) 2016-2017 Fuzhou Rockchip Electronics Co., Ltd */ -#include "rk3399.dtsi" +#include "rk3399-base.dtsi" / { cluster0_opp: opp-table-0 { From 03c7e964a02e388ee168c804add7404eda23908c Mon Sep 17 00:00:00 2001 From: Diederik de Haas Date: Mon, 27 Oct 2025 16:54:28 +0100 Subject: [PATCH 069/543] arm64: dts: rockchip: Fix vccio4-supply on rk3566-pinetab2 Page 13 of the PineTab2 v2 schematic dd 20230417 shows VCCIO4's power source is VCCIO_WL. Page 19 shows that VCCIO_WL is connected to VCCA1V8_PMU, so fix the PineTab2 dtsi to reflect that. 
Fixes: 1b7e19448f8f ("arm64: dts: rockchip: Add devicetree for Pine64 PineTab2") Cc: stable@vger.kernel.org Reviewed-by: Dragan Simic Signed-off-by: Diederik de Haas Link: https://patch.msgid.link/20251027155724.138096-1-diederik@cknow-tech.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3566-pinetab2.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3566-pinetab2.dtsi b/arch/arm64/boot/dts/rockchip/rk3566-pinetab2.dtsi index d0e38412d56a..08bf40de17ea 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-pinetab2.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3566-pinetab2.dtsi @@ -789,7 +789,7 @@ &pmu_io_domains { vccio1-supply = <&vccio_acodec>; vccio2-supply = <&vcc_1v8>; vccio3-supply = <&vccio_sd>; - vccio4-supply = <&vcc_1v8>; + vccio4-supply = <&vcca1v8_pmu>; vccio5-supply = <&vcc_1v8>; vccio6-supply = <&vcc1v8_dvp>; vccio7-supply = <&vcc_3v3>; From 083d7af3350e04c428256a3bd10003f63151b6b1 Mon Sep 17 00:00:00 2001 From: Chunhai Guo Date: Mon, 27 Oct 2025 10:52:06 +0800 Subject: [PATCH 070/543] MAINTAINERS: erofs: add myself as reviewer In the past two years, I have focused on EROFS and contributed features including the reserved buffer pool, configurable global buffer pool, and the ongoing direct I/O support for compressed data. I would like to continue contributing to EROFS and help with code reviews. Please CC me on EROFS-related changes. 
Signed-off-by: Chunhai Guo Acked-by: Gao Xiang Acked-by: Chao Yu Acked-by: Hongbo Li Signed-off-by: Gao Xiang --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 46bd8e033042..f2665f23ad5e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9207,6 +9207,7 @@ R: Yue Hu R: Jeffle Xu R: Sandeep Dhavale R: Hongbo Li +R: Chunhai Guo L: linux-erofs@lists.ozlabs.org S: Maintained W: https://erofs.docs.kernel.org From a1d3bc606bf5c3b3ea811cc2019df6285d75b00f Mon Sep 17 00:00:00 2001 From: Mikhail Kshevetskiy Date: Mon, 3 Nov 2025 04:01:48 +0300 Subject: [PATCH 071/543] mtd: spinand: fmsh: remove QE bit for FM25S01A flash According to datasheet (http://eng.fmsh.com/nvm/FM25S01A_ds_eng.pdf) there is no QE (Quad Enable) bit for FM25S01A flash, so remove it. Fixes: 5f284dc15ca86 ("mtd: spinand: add support for FudanMicro FM25S01A") Signed-off-by: Mikhail Kshevetskiy Tested-by: Tianling Shen Signed-off-by: Miquel Raynal --- drivers/mtd/nand/spi/fmsh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/spi/fmsh.c b/drivers/mtd/nand/spi/fmsh.c index 8b2097bfc771..c2b9a8c113cb 100644 --- a/drivers/mtd/nand/spi/fmsh.c +++ b/drivers/mtd/nand/spi/fmsh.c @@ -58,7 +58,7 @@ static const struct spinand_info fmsh_spinand_table[] = { SPINAND_INFO_OP_VARIANTS(&read_cache_variants, &write_cache_variants, &update_cache_variants), - SPINAND_HAS_QE_BIT, + 0, SPINAND_ECCINFO(&fm25s01a_ooblayout, NULL)), }; From 97315e7c901a1de60e8ca9b11e0e96d0f9253e18 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 1 Nov 2025 16:25:48 +0300 Subject: [PATCH 072/543] mtd: onenand: Pass correct pointer to IRQ handler This was supposed to pass "onenand" instead of "&onenand" with the ampersand. Passing a random stack address which will be gone when the function ends makes no sense. However the good thing is that the pointer is never used, so this doesn't cause a problem at run time. 
Fixes: e23abf4b7743 ("mtd: OneNAND: S5PC110: Implement DMA interrupt method") Signed-off-by: Dan Carpenter Signed-off-by: Miquel Raynal --- drivers/mtd/nand/onenand/onenand_samsung.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/onenand/onenand_samsung.c b/drivers/mtd/nand/onenand/onenand_samsung.c index f37a6138e461..6d6aa709a21f 100644 --- a/drivers/mtd/nand/onenand/onenand_samsung.c +++ b/drivers/mtd/nand/onenand/onenand_samsung.c @@ -906,7 +906,7 @@ static int s3c_onenand_probe(struct platform_device *pdev) err = devm_request_irq(&pdev->dev, r->start, s5pc110_onenand_irq, IRQF_SHARED, "onenand", - &onenand); + onenand); if (err) { dev_err(&pdev->dev, "failed to get irq\n"); return err; From 8da0efc3da9312b65f5cbf06e57d284f69222b2e Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 3 Nov 2025 11:58:09 +0000 Subject: [PATCH 073/543] ASoC: doc: cs35l56: Update firmware filename description for B0 silicon Update the text for firmware file naming to show that the l?u? suffix is supported on CS35L56 B0 silicon and ampN was only used on early firmware. The previous version of this text only said that B0 silicon used the ampN suffix. Since kernel 6.16 the driver supports both the old ampN and new l?u? suffix for B0 silicon. New firmwares will use the l?u? suffix. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20251103115809.33953-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- Documentation/sound/codecs/cs35l56.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Documentation/sound/codecs/cs35l56.rst b/Documentation/sound/codecs/cs35l56.rst index 57d1964453e1..d5363b08f515 100644 --- a/Documentation/sound/codecs/cs35l56.rst +++ b/Documentation/sound/codecs/cs35l56.rst @@ -105,10 +105,10 @@ In this example the SSID is 10280c63. The format of the firmware file names is: -SoundWire (except CS35L56 Rev B0): +SoundWire: cs35lxx-b0-dsp1-misc-SSID[-spkidX]-l?u? 
-SoundWire CS35L56 Rev B0: +SoundWire CS35L56 Rev B0 firmware released before kernel version 6.16: cs35lxx-b0-dsp1-misc-SSID[-spkidX]-ampN Non-SoundWire (HDA and I2S): @@ -127,9 +127,8 @@ Where: * spkidX is an optional part, used for laptops that have firmware configurations for different makes and models of internal speakers. -The CS35L56 Rev B0 continues to use the old filename scheme because a -large number of firmware files have already been published with these -names. +Early firmware for CS35L56 Rev B0 used the ALSA prefix (ampN) as the +filename qualifier. Support for the l?u? qualifier was added in kernel 6.16. Sound Open Firmware and ALSA topology files ------------------------------------------- From 8f05967b022d255412640670915475ac4cdc10e9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Nov 2025 08:49:32 -0700 Subject: [PATCH 074/543] MAINTAINERS: correct git location for block layer tree As part of a recent move to exclusively listing git.kernel.org trees for the block and io_uring development, the "BLOCK LAYER" entry wasn't updated as it already used git.kernel.org. However, outside of just moving from git.kernel.dk to git.kernel.org, the "block" part of the trees was also dropped, as the tree serves both block and io_uring development trees. Fix up the "BLOCK LAYER" entry so they all use the same tree.
Reported-by: John Garry Reviewed-by: John Garry Signed-off-by: Jens Axboe --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 0554bf05b426..b986f4635b7d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4362,7 +4362,7 @@ BLOCK LAYER M: Jens Axboe L: linux-block@vger.kernel.org S: Maintained -T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git F: Documentation/ABI/stable/sysfs-block F: Documentation/block/ F: block/ From 56b3c85e153b84f27e6cff39623ba40a1ad299d3 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 27 Oct 2025 10:50:21 -0700 Subject: [PATCH 075/543] ftrace: Fix BPF fexit with livepatch When livepatch is attached to the same function as bpf trampoline with a fexit program, bpf trampoline code calls register_ftrace_direct() twice. The first time will fail with -EAGAIN, and the second time it will succeed. This requires register_ftrace_direct() to unregister the address on the first attempt. Otherwise, the bpf trampoline cannot attach. Here is an easy way to reproduce this issue: insmod samples/livepatch/livepatch-sample.ko bpftrace -e 'fexit:cmdline_proc_show {}' ERROR: Unable to attach probe: fexit:vmlinux:cmdline_proc_show... Fix this by cleaning up the hash when register_ftrace_function_nolock hits errors. Also, move the code that resets ops->func and ops->trampoline to the error path of register_ftrace_direct(); and add a helper function reset_direct() in register_ftrace_direct() and unregister_ftrace_direct(). 
Fixes: d05cb470663a ("ftrace: Fix modification of direct_function hash while in use") Cc: stable@vger.kernel.org # v6.6+ Reported-by: Andrey Grodzovsky Closes: https://lore.kernel.org/live-patching/c5058315a39d4615b333e485893345be@crowdstrike.com/ Cc: Steven Rostedt (Google) Cc: Masami Hiramatsu (Google) Acked-and-tested-by: Andrey Grodzovsky Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Link: https://lore.kernel.org/r/20251027175023.1521602-2-song@kernel.org Signed-off-by: Alexei Starovoitov Acked-by: Steven Rostedt (Google) --- kernel/bpf/trampoline.c | 5 ----- kernel/trace/ftrace.c | 20 ++++++++++++++------ 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 5949095e51c3..f2cb0b097093 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -479,11 +479,6 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the * trampoline again, and retry register. 
*/ - /* reset fops->func and fops->trampoline for re-register */ - tr->fops->func = NULL; - tr->fops->trampoline = 0; - - /* free im memory and reallocate later */ bpf_tramp_image_free(im); goto again; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 42bd2ba68a82..cbeb7e833131 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5953,6 +5953,17 @@ static void register_ftrace_direct_cb(struct rcu_head *rhp) free_ftrace_hash(fhp); } +static void reset_direct(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + + remove_direct_functions_hash(hash, addr); + + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; +} + /** * register_ftrace_direct - Call a custom trampoline directly * for multiple functions registered in @ops @@ -6048,6 +6059,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) ops->direct_call = addr; err = register_ftrace_function_nolock(ops); + if (err) + reset_direct(ops, addr); out_unlock: mutex_unlock(&direct_mutex); @@ -6080,7 +6093,6 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct); int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, bool free_filters) { - struct ftrace_hash *hash = ops->func_hash->filter_hash; int err; if (check_direct_multi(ops)) @@ -6090,13 +6102,9 @@ int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, mutex_lock(&direct_mutex); err = unregister_ftrace_function(ops); - remove_direct_functions_hash(hash, addr); + reset_direct(ops, addr); mutex_unlock(&direct_mutex); - /* cleanup for possible another register call */ - ops->func = NULL; - ops->trampoline = 0; - if (free_filters) ftrace_free_filter(ops); return err; From 3e9a18e1c3e931abecf501cbb23d28d69f85bb56 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 27 Oct 2025 10:50:22 -0700 Subject: [PATCH 076/543] ftrace: bpf: Fix IPMODIFY + DIRECT in modify_ftrace_direct() 
ftrace_hash_ipmodify_enable() checks IPMODIFY and DIRECT ftrace_ops on the same kernel function. When needed, ftrace_hash_ipmodify_enable() calls ops->ops_func() to prepare the direct ftrace (BPF trampoline) to share the same function as the IPMODIFY ftrace (livepatch). ftrace_hash_ipmodify_enable() is called in register_ftrace_direct() path, but not called in modify_ftrace_direct() path. As a result, the following operations will break livepatch: 1. Load livepatch to a kernel function; 2. Attach fentry program to the kernel function; 3. Attach fexit program to the kernel function. After 3, the kernel function being used will not be the livepatched version, but the original version. Fix this by adding __ftrace_hash_update_ipmodify() to __modify_ftrace_direct() and adjust some logic around the call. Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Link: https://lore.kernel.org/r/20251027175023.1521602-3-song@kernel.org Signed-off-by: Alexei Starovoitov Acked-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cbeb7e833131..59cfacb8a5bb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1971,7 +1971,8 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops) */ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, struct ftrace_hash *old_hash, - struct ftrace_hash *new_hash) + struct ftrace_hash *new_hash, + bool update_target) { struct ftrace_page *pg; struct dyn_ftrace *rec, *end = NULL; @@ -2006,10 +2007,13 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, if (rec->flags & FTRACE_FL_DISABLED) continue; - /* We need to update only differences of filter_hash */ + /* + * Unless we are updating the target of a direct function, + * we only need to update differences of filter_hash + */ in_old = !!ftrace_lookup_ip(old_hash, rec->ip); in_new = 
!!ftrace_lookup_ip(new_hash, rec->ip); - if (in_old == in_new) + if (!update_target && (in_old == in_new)) continue; if (in_new) { @@ -2020,7 +2024,16 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, if (is_ipmodify) goto rollback; - FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT); + /* + * If this is called by __modify_ftrace_direct() + * then it is only changing where the direct + * pointer is jumping to, and the record already + * points to a direct trampoline. If it isn't, + * then it is a bug to update ipmodify on a direct + * caller. + */ + FTRACE_WARN_ON(!update_target && + (rec->flags & FTRACE_FL_DIRECT)); /* * Another ops with IPMODIFY is already @@ -2076,7 +2089,7 @@ static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops) if (ftrace_hash_empty(hash)) hash = NULL; - return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash); + return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash, false); } /* Disabling always succeeds */ @@ -2087,7 +2100,7 @@ static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops) if (ftrace_hash_empty(hash)) hash = NULL; - __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH); + __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH, false); } static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, @@ -2101,7 +2114,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, if (ftrace_hash_empty(new_hash)) new_hash = NULL; - return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); + return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash, false); } static void print_ip_ins(const char *fmt, const unsigned char *p) @@ -6114,7 +6127,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_direct); static int __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) { - struct ftrace_hash *hash; + struct ftrace_hash *hash = ops->func_hash->filter_hash; struct ftrace_func_entry *entry, *iter; static struct ftrace_ops tmp_ops = { .func = ftrace_stub, @@ -6134,13 +6147,21 @@ 
__modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) if (err) return err; + /* + * Call __ftrace_hash_update_ipmodify() here, so that we can call + * ops->ops_func for the ops. This is needed because the above + * register_ftrace_function_nolock() worked on tmp_ops. + */ + err = __ftrace_hash_update_ipmodify(ops, hash, hash, true); + if (err) + goto out; + /* * Now the ftrace_ops_list_func() is called to do the direct callers. * We can safely change the direct functions attached to each entry. */ mutex_lock(&ftrace_lock); - hash = ops->func_hash->filter_hash; size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(iter, &hash->buckets[i], hlist) { @@ -6155,6 +6176,7 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) mutex_unlock(&ftrace_lock); +out: /* Removing the tmp_ops will add the updated direct callers to the functions */ unregister_ftrace_function(&tmp_ops); From 62d2d0a33839c28173909616db2ef16e1a4a5071 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 27 Oct 2025 10:50:23 -0700 Subject: [PATCH 077/543] selftests/bpf: Add tests for livepatch + bpf trampoline Both livepatch and BPF trampoline use ftrace. Special attention is needed when livepatch and fexit program touch the same function at the same time, because livepatch updates a kernel function and the BPF trampoline needs to call into the right version of the kernel function. Use samples/livepatch/livepatch-sample.ko for the test. The test covers two cases: 1) When a fentry program is loaded first. This exercises the modify_ftrace_direct code path. 2) When a fexit program is loaded first. This exercises the register_ftrace_direct code path.
Signed-off-by: Song Liu Reviewed-by: Jiri Olsa Link: https://lore.kernel.org/r/20251027175023.1521602-4-song@kernel.org Signed-off-by: Alexei Starovoitov Acked-by: Steven Rostedt (Google) --- tools/testing/selftests/bpf/config | 3 + .../bpf/prog_tests/livepatch_trampoline.c | 107 ++++++++++++++++++ .../bpf/progs/livepatch_trampoline.c | 30 +++++ 3 files changed, 140 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c create mode 100644 tools/testing/selftests/bpf/progs/livepatch_trampoline.c diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 70b28c1e653e..f2a2fd236ca8 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -50,6 +50,7 @@ CONFIG_IPV6_SIT=y CONFIG_IPV6_TUNNEL=y CONFIG_KEYS=y CONFIG_LIRC=y +CONFIG_LIVEPATCH=y CONFIG_LWTUNNEL=y CONFIG_MODULE_SIG=y CONFIG_MODULE_SRCVERSION_ALL=y @@ -111,6 +112,8 @@ CONFIG_IP6_NF_FILTER=y CONFIG_NF_NAT=y CONFIG_PACKET=y CONFIG_RC_CORE=y +CONFIG_SAMPLES=y +CONFIG_SAMPLE_LIVEPATCH=m CONFIG_SECURITY=y CONFIG_SECURITYFS=y CONFIG_SYN_COOKIES=y diff --git a/tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c b/tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c new file mode 100644 index 000000000000..72aa5376c30e --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include +#include "testing_helpers.h" +#include "livepatch_trampoline.skel.h" + +static int load_livepatch(void) +{ + char path[4096]; + + /* CI will set KBUILD_OUTPUT */ + snprintf(path, sizeof(path), "%s/samples/livepatch/livepatch-sample.ko", + getenv("KBUILD_OUTPUT") ? 
: "../../../.."); + + return load_module(path, env_verbosity > VERBOSE_NONE); +} + +static void unload_livepatch(void) +{ + /* Disable the livepatch before unloading the module */ + system("echo 0 > /sys/kernel/livepatch/livepatch_sample/enabled"); + + unload_module("livepatch_sample", env_verbosity > VERBOSE_NONE); +} + +static void read_proc_cmdline(void) +{ + char buf[4096]; + int fd, ret; + + fd = open("/proc/cmdline", O_RDONLY); + if (!ASSERT_OK_FD(fd, "open /proc/cmdline")) + return; + + ret = read(fd, buf, sizeof(buf)); + if (!ASSERT_GT(ret, 0, "read /proc/cmdline")) + goto out; + + ASSERT_OK(strncmp(buf, "this has been live patched", 26), "strncmp"); + +out: + close(fd); +} + +static void __test_livepatch_trampoline(bool fexit_first) +{ + struct livepatch_trampoline *skel = NULL; + int err; + + skel = livepatch_trampoline__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + goto out; + + skel->bss->my_pid = getpid(); + + if (!fexit_first) { + /* fentry program is loaded first by default */ + err = livepatch_trampoline__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + } else { + /* Manually load fexit program first. */ + skel->links.fexit_cmdline = bpf_program__attach(skel->progs.fexit_cmdline); + if (!ASSERT_OK_PTR(skel->links.fexit_cmdline, "attach_fexit")) + goto out; + + skel->links.fentry_cmdline = bpf_program__attach(skel->progs.fentry_cmdline); + if (!ASSERT_OK_PTR(skel->links.fentry_cmdline, "attach_fentry")) + goto out; + } + + read_proc_cmdline(); + + ASSERT_EQ(skel->bss->fentry_hit, 1, "fentry_hit"); + ASSERT_EQ(skel->bss->fexit_hit, 1, "fexit_hit"); +out: + livepatch_trampoline__destroy(skel); +} + +void test_livepatch_trampoline(void) +{ + int retry_cnt = 0; + +retry: + if (load_livepatch()) { + if (retry_cnt) { + ASSERT_OK(1, "load_livepatch"); + goto out; + } + /* + * Something else (previous run of the same test?) loaded + * the KLP module. Unload the KLP module and retry. 
+ */ + unload_livepatch(); + retry_cnt++; + goto retry; + } + + if (test__start_subtest("fentry_first")) + __test_livepatch_trampoline(false); + + if (test__start_subtest("fexit_first")) + __test_livepatch_trampoline(true); +out: + unload_livepatch(); +} diff --git a/tools/testing/selftests/bpf/progs/livepatch_trampoline.c b/tools/testing/selftests/bpf/progs/livepatch_trampoline.c new file mode 100644 index 000000000000..15579d5bcd91 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/livepatch_trampoline.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include + +int fentry_hit; +int fexit_hit; +int my_pid; + +SEC("fentry/cmdline_proc_show") +int BPF_PROG(fentry_cmdline) +{ + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + fentry_hit = 1; + return 0; +} + +SEC("fexit/cmdline_proc_show") +int BPF_PROG(fexit_cmdline) +{ + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + fexit_hit = 1; + return 0; +} From b98b69c38512c3a8277c83b2d07674fd1ff59625 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sun, 2 Nov 2025 21:10:15 +0200 Subject: [PATCH 078/543] ALSA: usb-audio: add min_mute quirk for SteelSeries Arctis ID 1038:1294 SteelSeries ApS Arctis Pro Wireless is reported to have muted min playback volume. Apply quirk for that. 
Link: https://gitlab.freedesktop.org/pipewire/pipewire/-/issues/4229#note_3174448 Signed-off-by: Pauli Virtanen Link: https://patch.msgid.link/a83f2694b1f8c37e4667a3cf057ffdc408b0f70d.1762108507.git.pav@iki.fi Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 71638e6dfb20..e5b857129caf 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2267,6 +2267,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_FIXED_RATE), DEVICE_FLG(0x0fd9, 0x0008, /* Hauppauge HVR-950Q */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), + DEVICE_FLG(0x1038, 0x1294, /* SteelSeries Arctis Pro Wireless */ + QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE), DEVICE_FLG(0x1101, 0x0003, /* Audioengine D1 */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x12d1, 0x3a07, /* Huawei Technologies Co., Ltd. */ From 249d96b492efb7a773296ab2c62179918301c146 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Tue, 4 Nov 2025 13:49:14 +0200 Subject: [PATCH 079/543] ASoC: da7213: Use component driver suspend/resume Since snd_soc_suspend() is invoked through snd_soc_pm_ops->suspend(), and snd_soc_pm_ops is associated with the soc_driver (defined in sound/soc/soc-core.c), and there is no parent-child relationship between the soc_driver and the DA7213 codec driver, the power management subsystem does not enforce a specific suspend/resume order between the DA7213 driver and the soc_driver. Because of this, the different codec component functionalities, called from snd_soc_resume() to reconfigure various functions, can race with the DA7213 struct dev_pm_ops::resume function, leading to misapplied configuration. This occasionally results in clipped sound. Fix this by dropping the struct dev_pm_ops::{suspend, resume} and use instead struct snd_soc_component_driver::{suspend, resume}. This ensures the proper configuration sequence is handled by the ASoC subsystem. 
Cc: stable@vger.kernel.org Fixes: 431e040065c8 ("ASoC: da7213: Add suspend to RAM support") Signed-off-by: Claudiu Beznea Link: https://patch.msgid.link/20251104114914.2060603-1-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Mark Brown --- sound/soc/codecs/da7213.c | 69 +++++++++++++++++++++++++-------------- sound/soc/codecs/da7213.h | 1 + 2 files changed, 45 insertions(+), 25 deletions(-) diff --git a/sound/soc/codecs/da7213.c b/sound/soc/codecs/da7213.c index ae89260ca215..3420011da444 100644 --- a/sound/soc/codecs/da7213.c +++ b/sound/soc/codecs/da7213.c @@ -2124,11 +2124,50 @@ static int da7213_probe(struct snd_soc_component *component) return 0; } +static int da7213_runtime_suspend(struct device *dev) +{ + struct da7213_priv *da7213 = dev_get_drvdata(dev); + + regcache_cache_only(da7213->regmap, true); + regcache_mark_dirty(da7213->regmap); + regulator_bulk_disable(DA7213_NUM_SUPPLIES, da7213->supplies); + + return 0; +} + +static int da7213_runtime_resume(struct device *dev) +{ + struct da7213_priv *da7213 = dev_get_drvdata(dev); + int ret; + + ret = regulator_bulk_enable(DA7213_NUM_SUPPLIES, da7213->supplies); + if (ret < 0) + return ret; + regcache_cache_only(da7213->regmap, false); + return regcache_sync(da7213->regmap); +} + +static int da7213_suspend(struct snd_soc_component *component) +{ + struct da7213_priv *da7213 = snd_soc_component_get_drvdata(component); + + return da7213_runtime_suspend(da7213->dev); +} + +static int da7213_resume(struct snd_soc_component *component) +{ + struct da7213_priv *da7213 = snd_soc_component_get_drvdata(component); + + return da7213_runtime_resume(da7213->dev); +} + static const struct snd_soc_component_driver soc_component_dev_da7213 = { .probe = da7213_probe, .set_bias_level = da7213_set_bias_level, .controls = da7213_snd_controls, .num_controls = ARRAY_SIZE(da7213_snd_controls), + .suspend = da7213_suspend, + .resume = da7213_resume, .dapm_widgets = da7213_dapm_widgets, .num_dapm_widgets = 
ARRAY_SIZE(da7213_dapm_widgets), .dapm_routes = da7213_audio_map, @@ -2175,6 +2214,8 @@ static int da7213_i2c_probe(struct i2c_client *i2c) if (!da7213->fin_min_rate) return -EINVAL; + da7213->dev = &i2c->dev; + i2c_set_clientdata(i2c, da7213); /* Get required supplies */ @@ -2224,31 +2265,9 @@ static void da7213_i2c_remove(struct i2c_client *i2c) pm_runtime_disable(&i2c->dev); } -static int da7213_runtime_suspend(struct device *dev) -{ - struct da7213_priv *da7213 = dev_get_drvdata(dev); - - regcache_cache_only(da7213->regmap, true); - regcache_mark_dirty(da7213->regmap); - regulator_bulk_disable(DA7213_NUM_SUPPLIES, da7213->supplies); - - return 0; -} - -static int da7213_runtime_resume(struct device *dev) -{ - struct da7213_priv *da7213 = dev_get_drvdata(dev); - int ret; - - ret = regulator_bulk_enable(DA7213_NUM_SUPPLIES, da7213->supplies); - if (ret < 0) - return ret; - regcache_cache_only(da7213->regmap, false); - return regcache_sync(da7213->regmap); -} - -static DEFINE_RUNTIME_DEV_PM_OPS(da7213_pm, da7213_runtime_suspend, - da7213_runtime_resume, NULL); +static const struct dev_pm_ops da7213_pm = { + RUNTIME_PM_OPS(da7213_runtime_suspend, da7213_runtime_resume, NULL) +}; static const struct i2c_device_id da7213_i2c_id[] = { { "da7213" }, diff --git a/sound/soc/codecs/da7213.h b/sound/soc/codecs/da7213.h index b9ab791d6b88..29cbf0eb6124 100644 --- a/sound/soc/codecs/da7213.h +++ b/sound/soc/codecs/da7213.h @@ -595,6 +595,7 @@ enum da7213_supplies { /* Codec private data */ struct da7213_priv { struct regmap *regmap; + struct device *dev; struct mutex ctrl_lock; struct regulator_bulk_data supplies[DA7213_NUM_SUPPLIES]; struct clk *mclk; From 8a7348a9ed70bda1c1f51d3f1815bcbdf9f3b38c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 8 Oct 2025 09:52:25 -0400 Subject: [PATCH 080/543] nfsd: fix refcount leak in nfsd_set_fh_dentry() nfsd exports a "pseudo root filesystem" which is used by NFSv4 to find the various exported filesystems using LOOKUP requests from 
a known root filehandle. NFSv3 uses the MOUNT protocol to find those exported filesystems and so is not given access to the pseudo root filesystem. If a v3 (or v2) client uses a filehandle from that filesystem, nfsd_set_fh_dentry() will report an error, but still stores the export in "struct svc_fh" even though it also drops the reference (exp_put()). This means that when fh_put() is called an extra reference will be dropped which can lead to use-after-free and possible denial of service. Normal NFS usage will not provide a pseudo-root filehandle to a v3 client. This bug can only be triggered by the client synthesising an incorrect filehandle. To fix this we move the assignments to the svc_fh later, after all possible error cases have been detected. Reported-and-tested-by: tianshuo han Fixes: ef7f6c4904d0 ("nfsd: move V4ROOT version check to nfsd_set_fh_dentry()") Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Cc: stable@vger.kernel.org Signed-off-by: Chuck Lever --- fs/nfsd/nfsfh.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 3edccc38db42..bd9acfdc7b01 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -269,9 +269,6 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net, dentry); } - fhp->fh_dentry = dentry; - fhp->fh_export = exp; - switch (fhp->fh_maxsize) { case NFS4_FHSIZE: if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOATOMIC_ATTR) @@ -293,6 +290,9 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net, goto out; } + fhp->fh_dentry = dentry; + fhp->fh_export = exp; + return 0; out: exp_put(exp); From 4d3dbc2386fe051e44efad663e0ec828b98ab53f Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 9 Oct 2025 16:37:59 -0400 Subject: [PATCH 081/543] nfsd: add missing FATTR4_WORD2_CLONE_BLKSIZE from supported attributes RFC 7862 Section 4.1.2 says that if the server supports CLONE it MUST support clone_blksize attribute. 
Fixes: d6ca7d2643ee ("NFSD: Implement FATTR4_CLONE_BLKSIZE attribute") Cc: stable@vger.kernel.org Signed-off-by: Olga Kornievskaia Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsd.h | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index f19320018639..b752433c3c2c 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -458,6 +458,7 @@ enum { #define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ FATTR4_WORD2_MODE_UMASK | \ + FATTR4_WORD2_CLONE_BLKSIZE | \ NFSD4_2_SECURITY_ATTRS | \ FATTR4_WORD2_XATTR_SUPPORT | \ FATTR4_WORD2_TIME_DELEG_ACCESS | \ From fccac54b0d3d0602f177bb79f203ae6fbea0e32a Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Mon, 27 Oct 2025 13:55:15 +0100 Subject: [PATCH 082/543] pmdomain: samsung: Rework legacy splash-screen handover workaround Limit the workaround for the lack of the proper splash-screen handover handling to the legacy ARM 32bit systems and replace forcing a sync_state by an explicit power domain shutdown. This approach lets the compiler optimize it out on newer ARM 64bit systems. 
Suggested-by: Ulf Hansson Fixes: 0745658aebbe ("pmdomain: samsung: Fix splash-screen handover by enforcing a sync_state") Signed-off-by: Marek Szyprowski Acked-by: Krzysztof Kozlowski Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/pmdomain/samsung/exynos-pm-domains.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/pmdomain/samsung/exynos-pm-domains.c b/drivers/pmdomain/samsung/exynos-pm-domains.c index f53e1bd24798..5c3aa8983087 100644 --- a/drivers/pmdomain/samsung/exynos-pm-domains.c +++ b/drivers/pmdomain/samsung/exynos-pm-domains.c @@ -128,6 +128,15 @@ static int exynos_pd_probe(struct platform_device *pdev) pd->pd.power_on = exynos_pd_power_on; pd->local_pwr_cfg = pm_domain_cfg->local_pwr_cfg; + /* + * Some Samsung platforms with bootloaders turning on the splash-screen + * and handing it over to the kernel, requires the power-domains to be + * reset during boot. + */ + if (IS_ENABLED(CONFIG_ARM) && + of_device_is_compatible(np, "samsung,exynos4210-pd")) + exynos_pd_power_off(&pd->pd); + on = readl_relaxed(pd->base + 0x4) & pd->local_pwr_cfg; pm_genpd_init(&pd->pd, NULL, !on); @@ -146,15 +155,6 @@ static int exynos_pd_probe(struct platform_device *pdev) parent.np, child.np); } - /* - * Some Samsung platforms with bootloaders turning on the splash-screen - * and handing it over to the kernel, requires the power-domains to be - * reset during boot. As a temporary hack to manage this, let's enforce - * a sync_state. - */ - if (!ret) - of_genpd_sync_state(np); - pm_runtime_enable(dev); return ret; } From bbde14682eba21d86f5f3d6fe2d371b1f97f1e61 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 28 Oct 2025 11:16:20 +0800 Subject: [PATCH 083/543] pmdomain: imx: Fix reference count leak in imx_gpc_remove of_get_child_by_name() returns a node pointer with refcount incremented, we should use of_node_put() on it when not needed anymore. Add the missing of_node_put() to avoid refcount leak. 
Fixes: 721cabf6c660 ("soc: imx: move PGC handling to a new GPC driver") Cc: stable@vger.kernel.org Signed-off-by: Miaoqian Lin Signed-off-by: Ulf Hansson --- drivers/pmdomain/imx/gpc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pmdomain/imx/gpc.c b/drivers/pmdomain/imx/gpc.c index 33991f3c6b55..a34b260274f7 100644 --- a/drivers/pmdomain/imx/gpc.c +++ b/drivers/pmdomain/imx/gpc.c @@ -536,6 +536,8 @@ static void imx_gpc_remove(struct platform_device *pdev) return; } } + + of_node_put(pgc_node); } static struct platform_driver imx_gpc_driver = { From 8819a49f9ff8953475ba09d978d66b50368c095b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 11:58:01 -0700 Subject: [PATCH 084/543] KVM: x86: Unload "FPU" state on INIT if and only if its currently in-use Replace the hack added by commit f958bd2314d1 ("KVM: x86: Fix potential put_fpu() w/o load_fpu() on MPX platform") with a more robust approach of unloading+reloading guest FPU state based on whether or not the vCPU's FPU is currently in-use, i.e. currently loaded. This fixes a bug on hosts that support CET but not MPX, where kvm_arch_vcpu_ioctl_get_mpstate() neglects to load FPU state (it only checks for MPX support) and leads to KVM attempting to put FPU state due to kvm_apic_accept_events() triggering INIT emulation. E.g. 
on a host with CET but not MPX, syzkaller+KASAN generates: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000004: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000020-0x0000000000000027] CPU: 211 UID: 0 PID: 20451 Comm: syz.9.26 Tainted: G S 6.18.0-smp-DEV #7 NONE Tainted: [S]=CPU_OUT_OF_SPEC Hardware name: Google Izumi/izumi, BIOS 0.20250729.1-0 07/29/2025 RIP: 0010:fpu_swap_kvm_fpstate+0x3ce/0x610 ../arch/x86/kernel/fpu/core.c:377 RSP: 0018:ff1100410c167cc0 EFLAGS: 00010202 RAX: 0000000000000004 RBX: 0000000000000020 RCX: 00000000000001aa RDX: 00000000000001ab RSI: ffffffff817bb960 RDI: 0000000022600000 RBP: dffffc0000000000 R08: ff110040d23c8007 R09: 1fe220081a479000 R10: dffffc0000000000 R11: ffe21c081a479001 R12: ff110040d23c8d98 R13: 00000000fffdc578 R14: 0000000000000000 R15: ff110040d23c8d90 FS: 00007f86dd1876c0(0000) GS:ff11007fc969b000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f86dd186fa8 CR3: 00000040d1dfa003 CR4: 0000000000f73ef0 PKRU: 80000000 Call Trace: kvm_vcpu_reset+0x80d/0x12c0 ../arch/x86/kvm/x86.c:11818 kvm_apic_accept_events+0x1cb/0x500 ../arch/x86/kvm/lapic.c:3489 kvm_arch_vcpu_ioctl_get_mpstate+0xd0/0x4e0 ../arch/x86/kvm/x86.c:12145 kvm_vcpu_ioctl+0x5e2/0xed0 ../virt/kvm/kvm_main.c:4539 __se_sys_ioctl+0x11d/0x1b0 ../fs/ioctl.c:51 do_syscall_x64 ../arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x6e/0x940 ../arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f86de71d9c9 with a very simple reproducer: r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x80b00, 0x0) r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0) ioctl$KVM_CREATE_IRQCHIP(r1, 0xae60) r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0) ioctl$KVM_SET_IRQCHIP(r1, 0x8208ae63, ...) 
ioctl$KVM_GET_MP_STATE(r2, 0x8004ae98, &(0x7f00000000c0)) Alternatively, the MPX hack in GET_MP_STATE could be extended to cover CET, but from a "don't break existing functionality" perspective, that isn't any less risky than peeking at the state of in_use, and it's far less robust for a long term solution (as evidenced by this bug). Reported-by: Alexander Potapenko Fixes: 69cc3e886582 ("KVM: x86: Add XSS support for CET_KERNEL and CET_USER") Reviewed-by: Yao Yuan Reviewed-by: Chao Gao Link: https://patch.msgid.link/20251030185802.3375059-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b4b5d2d09634..d1e048d14e88 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12137,9 +12137,6 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, int r; vcpu_load(vcpu); - if (kvm_mpx_supported()) - kvm_load_guest_fpu(vcpu); - kvm_vcpu_srcu_read_lock(vcpu); r = kvm_apic_accept_events(vcpu); @@ -12156,9 +12153,6 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, out: kvm_vcpu_srcu_read_unlock(vcpu); - - if (kvm_mpx_supported()) - kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); return r; } @@ -12788,6 +12782,7 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) { struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; u64 xfeatures_mask; + bool fpu_in_use; int i; /* @@ -12811,13 +12806,23 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) BUILD_BUG_ON(sizeof(xfeatures_mask) * BITS_PER_BYTE <= XFEATURE_MAX); /* - * All paths that lead to INIT are required to load the guest's FPU - * state (because most paths are buried in KVM_RUN). + * Unload guest FPU state (if necessary) before zeroing XSTATE fields + * as the kernel can only modify the state when its resident in memory, + * i.e. when it's not loaded into hardware. 
+ * + * WARN if the vCPU's desire to run, i.e. whether or not its in KVM_RUN, + * doesn't match the loaded/in-use state of the FPU, as KVM_RUN is the + * only path that can trigger INIT emulation _and_ loads FPU state, and + * KVM_RUN should _always_ load FPU state. */ - kvm_put_guest_fpu(vcpu); + WARN_ON_ONCE(vcpu->wants_to_run != fpstate->in_use); + fpu_in_use = fpstate->in_use; + if (fpu_in_use) + kvm_put_guest_fpu(vcpu); for_each_set_bit(i, (unsigned long *)&xfeatures_mask, XFEATURE_MAX) fpstate_clear_xstate_component(fpstate, i); - kvm_load_guest_fpu(vcpu); + if (fpu_in_use) + kvm_load_guest_fpu(vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) From 9bc610b6a2a71d1a6acac27e82a0bc8ca861c7ac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 11:58:02 -0700 Subject: [PATCH 085/543] KVM: x86: Harden KVM against imbalanced load/put of guest FPU state Assert, via KVM_BUG_ON(), that guest FPU state isn't/is in use when loading/putting the FPU to help detect KVM bugs without needing an assist from KASAN. If an imbalanced load/put is detected, skip the redundant load/put to avoid clobbering guest state and/or crashing the host. Note, kvm_access_xstate_msr() already provides a similar assertion. Reviewed-by: Yao Yuan Reviewed-by: Chao Gao Link: https://patch.msgid.link/20251030185802.3375059-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d1e048d14e88..67e5f735adf2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11807,6 +11807,9 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { + if (KVM_BUG_ON(vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm)) + return; + /* Exclude PKRU, it's restored separately immediately after VM-Exit. 
*/ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); trace_kvm_fpu(1); @@ -11815,6 +11818,9 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) /* When vcpu_run ends, restore user space FPU context. */ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { + if (KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm)) + return; + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false); ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); From cab4098be41826a91b55cdc851196d73d7057f9c Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Mon, 27 Oct 2025 23:01:41 -0700 Subject: [PATCH 086/543] KVM: x86: Call out MSR_IA32_S_CET is not handled by XSAVES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the comment above is_xstate_managed_msr() to note that MSR_IA32_S_CET isn't saved/restored by XSAVES/XRSTORS. MSR_IA32_S_CET isn't part of CET_U/S state as the SDM states: The register state used by Control-Flow Enforcement Technology (CET) comprises the two 64-bit MSRs (IA32_U_CET and IA32_PL3_SSP) that manage CET when CPL = 3 (CET_U state); and the three 64-bit MSRs (IA32_PL0_SSP–IA32_PL2_SSP) that manage CET when CPL < 3 (CET_S state). Opportunistically shift the snippet about the safety of loading certain MSRs to the function comment for kvm_access_xstate_msr(), which is where the MSRs are actually loaded into hardware. 
Fixes: e44eb58334bb ("KVM: x86: Load guest FPU state when access XSAVE-managed MSRs") Signed-off-by: Chao Gao Link: https://patch.msgid.link/20251028060142.29830-1-chao.gao@intel.com [sean: shift snippet about safety to kvm_access_xstate_msr()] Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 67e5f735adf2..c9c2aa6f4705 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3874,15 +3874,9 @@ static void record_steal_time(struct kvm_vcpu *vcpu) /* * Returns true if the MSR in question is managed via XSTATE, i.e. is context - * switched with the rest of guest FPU state. Note! S_CET is _not_ context - * switched via XSTATE even though it _is_ saved/restored via XSAVES/XRSTORS. - * Because S_CET is loaded on VM-Enter and VM-Exit via dedicated VMCS fields, - * the value saved/restored via XSTATE is always the host's value. That detail - * is _extremely_ important, as the guest's S_CET must _never_ be resident in - * hardware while executing in the host. Loading guest values for U_CET and - * PL[0-3]_SSP while executing in the kernel is safe, as U_CET is specific to - * userspace, and PL[0-3]_SSP are only consumed when transitioning to lower - * privilege levels, i.e. are effectively only consumed by userspace as well. + * switched with the rest of guest FPU state. + * + * Note, S_CET is _not_ saved/restored via XSAVES/XRSTORS. */ static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr) { @@ -3905,6 +3899,11 @@ static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr) * MSR that is managed via XSTATE. Note, the caller is responsible for doing * the initial FPU load, this helper only ensures that guest state is resident * in hardware (the kernel can load its FPU state in IRQ context). 
+ * + * Note, loading guest values for U_CET and PL[0-3]_SSP while executing in the + * kernel is safe, as U_CET is specific to userspace, and PL[0-3]_SSP are only + * consumed when transitioning to lower privilege levels, i.e. are effectively + * only consumed by userspace as well. */ static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info, From 59a217ced3e7af849cc84fce36d8bfe225976e27 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 16 Oct 2025 12:06:41 -0700 Subject: [PATCH 087/543] KVM: SVM: Initialize per-CPU svm_data at the end of hardware setup Setup the per-CPU SVM data structures at the very end of hardware setup so that svm_hardware_unsetup() can be used in svm_hardware_setup() to unwind AVIC setup (for the GALog notifier). Alternatively, the error path could do an explicit, manual unwind, e.g. by adding a helper to free the per-CPU structures. But the per-CPU allocations have no interactions or dependencies, i.e. can comfortably live at the end, and so converting to a manual unwind would introduce churn and code without providing any immediate advantage. 
Link: https://patch.msgid.link/20251016190643.80529-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 153c12dbf3eb..efc3a7adebef 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5386,12 +5386,6 @@ static __init int svm_hardware_setup(void) svm_hv_hardware_setup(); - for_each_possible_cpu(cpu) { - r = svm_cpu_init(cpu); - if (r) - goto err; - } - enable_apicv = avic_hardware_setup(); if (!enable_apicv) { enable_ipiv = false; @@ -5435,6 +5429,13 @@ static __init int svm_hardware_setup(void) svm_set_cpu_caps(); kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; + + for_each_possible_cpu(cpu) { + r = svm_cpu_init(cpu); + if (r) + goto err; + } + return 0; err: From adc6ae9729719be5e74219aaafb95e60a9e9950e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 16 Oct 2025 12:06:42 -0700 Subject: [PATCH 088/543] KVM: SVM: Unregister KVM's GALog notifier on kvm-amd.ko exit Unregister the GALog notifier (used to get notified of wake events for blocking vCPUs) on kvm-amd.ko exit so that a KVM or IOMMU driver bug that results in a spurious GALog event "only" results in a spurious IRQ, and doesn't trigger a use-after-free due to executing unloaded module code. 
Fixes: 5881f73757cc ("svm: Introduce AMD IOMMU avic_ga_log_notifier") Reported-by: Hou Wenlong Closes: https://lore.kernel.org/all/20250918130320.GA119526@k08j02272.eu95sqa Link: https://patch.msgid.link/20251016190643.80529-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 6 ++++++ arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/svm/svm.h | 1 + 3 files changed, 9 insertions(+) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index f286b5706d7c..3ab74f2bd584 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -1243,3 +1243,9 @@ bool __init avic_hardware_setup(void) return true; } + +void avic_hardware_unsetup(void) +{ + if (avic) + amd_iommu_register_ga_log_notifier(NULL); +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index efc3a7adebef..76055c0ba177 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -921,6 +921,8 @@ static void svm_hardware_unsetup(void) { int cpu; + avic_hardware_unsetup(); + sev_hardware_unsetup(); for_each_possible_cpu(cpu) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index e4b04f435b3d..b0fe40c21728 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -805,6 +805,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; ) bool __init avic_hardware_setup(void); +void avic_hardware_unsetup(void); int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); From aaac099459f932b9dbaf85ca2a7251633cc213d0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 16 Oct 2025 12:06:43 -0700 Subject: [PATCH 089/543] KVM: SVM: Make avic_ga_log_notifier() local to avic.c Make avic_ga_log_notifier() a local symbol now that it's defined and used purely within avic.c. No functional change intended. 
Fixes: 4bdec12aa8d6 ("KVM: SVM: Detect X2APIC virtualization (x2AVIC) support") Link: https://patch.msgid.link/20251016190643.80529-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 2 +- arch/x86/kvm/svm/svm.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 3ab74f2bd584..89864fee6e83 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -216,7 +216,7 @@ static void avic_deactivate_vmcb(struct vcpu_svm *svm) * This function is called from IOMMU driver to notify * SVM to schedule in a particular vCPU of a particular VM. */ -int avic_ga_log_notifier(u32 ga_tag) +static int avic_ga_log_notifier(u32 ga_tag) { unsigned long flags; struct kvm_svm *kvm_svm; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index b0fe40c21728..8c36ee0d67ef 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -806,7 +806,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops; bool __init avic_hardware_setup(void); void avic_hardware_unsetup(void); -int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb); From fd92bd3b4445342e55f2c541c577796e0c3f1b8a Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 30 Oct 2025 15:41:30 -0400 Subject: [PATCH 090/543] KVM: SVM: switch to raw spinlock for svm->ir_list_lock Use a raw spinlock for vcpu_svm.ir_list_lock as the lock can be taken during schedule() via kvm_sched_out() => __avic_vcpu_put(), and "normal" spinlocks are sleepable locks when PREEMPT_RT=y. 
This fixes the following lockdep warning: ============================= [ BUG: Invalid wait context ] 6.12.0-146.1640_2124176644.el10.x86_64+debug #1 Not tainted ----------------------------- qemu-kvm/38299 is trying to lock: ff11000239725600 (&svm->ir_list_lock){....}-{3:3}, at: __avic_vcpu_put+0xfd/0x300 [kvm_amd] other info that might help us debug this: context-{5:5} 2 locks held by qemu-kvm/38299: #0: ff11000239723ba8 (&vcpu->mutex){+.+.}-{4:4}, at: kvm_vcpu_ioctl+0x240/0xe00 [kvm] #1: ff11000b906056d8 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x2e/0x130 stack backtrace: CPU: 1 UID: 0 PID: 38299 Comm: qemu-kvm Kdump: loaded Not tainted 6.12.0-146.1640_2124176644.el10.x86_64+debug #1 PREEMPT(voluntary) Hardware name: AMD Corporation QUARTZ/QUARTZ, BIOS RQZ100AB 09/14/2023 Call Trace: dump_stack_lvl+0x6f/0xb0 __lock_acquire+0x921/0xb80 lock_acquire.part.0+0xbe/0x270 _raw_spin_lock_irqsave+0x46/0x90 __avic_vcpu_put+0xfd/0x300 [kvm_amd] svm_vcpu_put+0xfa/0x130 [kvm_amd] kvm_arch_vcpu_put+0x48c/0x790 [kvm] kvm_sched_out+0x161/0x1c0 [kvm] prepare_task_switch+0x36b/0xf60 __schedule+0x4f7/0x1890 schedule+0xd4/0x260 xfer_to_guest_mode_handle_work+0x54/0xc0 vcpu_run+0x69a/0xa70 [kvm] kvm_arch_vcpu_ioctl_run+0xdc0/0x17e0 [kvm] kvm_vcpu_ioctl+0x39f/0xe00 [kvm] Signed-off-by: Maxim Levitsky Link: https://patch.msgid.link/20251030194130.307900-1-mlevitsk@redhat.com [sean: massage changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 16 ++++++++-------- arch/x86/kvm/svm/svm.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 89864fee6e83..fef00546c885 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -788,7 +788,7 @@ int avic_init_vcpu(struct vcpu_svm *svm) struct kvm_vcpu *vcpu = &svm->vcpu; INIT_LIST_HEAD(&svm->ir_list); - spin_lock_init(&svm->ir_list_lock); + raw_spin_lock_init(&svm->ir_list_lock); if (!enable_apicv || 
!irqchip_in_kernel(vcpu->kvm)) return 0; @@ -816,9 +816,9 @@ static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd) if (!vcpu) return; - spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags); + raw_spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags); list_del(&irqfd->vcpu_list); - spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags); + raw_spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags); } int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, @@ -855,7 +855,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, * list of IRQs being posted to the vCPU, to ensure the IRTE * isn't programmed with stale pCPU/IsRunning information. */ - guard(spinlock_irqsave)(&svm->ir_list_lock); + guard(raw_spinlock_irqsave)(&svm->ir_list_lock); /* * Update the target pCPU for IOMMU doorbells if the vCPU is @@ -972,7 +972,7 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu, * up-to-date entry information, or that this task will wait until * svm_ir_list_add() completes to set the new target pCPU. */ - spin_lock_irqsave(&svm->ir_list_lock, flags); + raw_spin_lock_irqsave(&svm->ir_list_lock, flags); entry = svm->avic_physical_id_entry; WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); @@ -997,7 +997,7 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu, avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action); - spin_unlock_irqrestore(&svm->ir_list_lock, flags); + raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags); } void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -1035,7 +1035,7 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action) * or that this task will wait until svm_ir_list_add() completes to * mark the vCPU as not running. 
*/ - spin_lock_irqsave(&svm->ir_list_lock, flags); + raw_spin_lock_irqsave(&svm->ir_list_lock, flags); avic_update_iommu_vcpu_affinity(vcpu, -1, action); @@ -1059,7 +1059,7 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action) svm->avic_physical_id_entry = entry; - spin_unlock_irqrestore(&svm->ir_list_lock, flags); + raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags); } void avic_vcpu_put(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 8c36ee0d67ef..c856d8e0f95e 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -329,7 +329,7 @@ struct vcpu_svm { * back into remapped mode). */ struct list_head ir_list; - spinlock_t ir_list_lock; + raw_spinlock_t ir_list_lock; struct vcpu_sev_es_state sev_es; From ae431059e75d36170a5ae6b44cc4d06d43613215 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Nov 2025 17:12:05 -0800 Subject: [PATCH 091/543] KVM: guest_memfd: Remove bindings on memslot deletion when gmem is dying When unbinding a memslot from a guest_memfd instance, remove the bindings even if the guest_memfd file is dying, i.e. even if its file refcount has gone to zero. 
If the memslot is freed before the file is fully released, nullifying the memslot side of the binding in kvm_gmem_release() will write to freed memory, as detected by syzbot+KASAN: ================================================================== BUG: KASAN: slab-use-after-free in kvm_gmem_release+0x176/0x440 virt/kvm/guest_memfd.c:353 Write of size 8 at addr ffff88807befa508 by task syz.0.17/6022 CPU: 0 UID: 0 PID: 6022 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/02/2025 Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xca/0x240 mm/kasan/report.c:482 kasan_report+0x118/0x150 mm/kasan/report.c:595 kvm_gmem_release+0x176/0x440 virt/kvm/guest_memfd.c:353 __fput+0x44c/0xa70 fs/file_table.c:468 task_work_run+0x1d4/0x260 kernel/task_work.c:227 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline] exit_to_user_mode_loop+0xe9/0x130 kernel/entry/common.c:43 exit_to_user_mode_prepare include/linux/irq-entry-common.h:225 [inline] syscall_exit_to_user_mode_work include/linux/entry-common.h:175 [inline] syscall_exit_to_user_mode include/linux/entry-common.h:210 [inline] do_syscall_64+0x2bd/0xfa0 arch/x86/entry/syscall_64.c:100 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fbeeff8efc9 Allocated by task 6023: kasan_save_stack mm/kasan/common.c:56 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:77 poison_kmalloc_redzone mm/kasan/common.c:397 [inline] __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:414 kasan_kmalloc include/linux/kasan.h:262 [inline] __kmalloc_cache_noprof+0x3e2/0x700 mm/slub.c:5758 kmalloc_noprof include/linux/slab.h:957 [inline] kzalloc_noprof include/linux/slab.h:1094 [inline] kvm_set_memory_region+0x747/0xb90 virt/kvm/kvm_main.c:2104 kvm_vm_ioctl_set_memory_region+0x6f/0xd0 virt/kvm/kvm_main.c:2154 kvm_vm_ioctl+0x957/0xc60 virt/kvm/kvm_main.c:5201 vfs_ioctl 
fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:597 [inline] __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:583 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0xfa0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 6023: kasan_save_stack mm/kasan/common.c:56 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:77 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:584 poison_slab_object mm/kasan/common.c:252 [inline] __kasan_slab_free+0x5c/0x80 mm/kasan/common.c:284 kasan_slab_free include/linux/kasan.h:234 [inline] slab_free_hook mm/slub.c:2533 [inline] slab_free mm/slub.c:6622 [inline] kfree+0x19a/0x6d0 mm/slub.c:6829 kvm_set_memory_region+0x9c4/0xb90 virt/kvm/kvm_main.c:2130 kvm_vm_ioctl_set_memory_region+0x6f/0xd0 virt/kvm/kvm_main.c:2154 kvm_vm_ioctl+0x957/0xc60 virt/kvm/kvm_main.c:5201 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:597 [inline] __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:583 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0xfa0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Deliberately don't acquire filemap invalid lock when the file is dying as the lifecycle of f_mapping is outside the purview of KVM. Dereferencing the mapping is *probably* fine, but there's no need to invalidate anything as memslot deletion is responsible for zapping SPTEs, and the only code that can access the dying file is kvm_gmem_release(), whose core code is mutually exclusive with unbinding. Note, the mutual exclusivity is also what makes it safe to access the bindings on a dying gmem instance. Unbinding either runs with slots_lock held, or after the last reference to the owning "struct kvm" is put, and kvm_gmem_release() nullifies the slot pointer under slots_lock, and puts its reference to the VM after that is done. 
Reported-by: syzbot+2479e53d0db9b32ae2aa@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/68fa7a22.a70a0220.3bf6c6.008b.GAE@google.com Tested-by: syzbot+2479e53d0db9b32ae2aa@syzkaller.appspotmail.com Fixes: a7800aa80ea4 ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") Cc: stable@vger.kernel.org Cc: Hillf Danton Reviewed-By: Vishal Annapurve Link: https://patch.msgid.link/20251104011205.3853541-1-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 47 +++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index fbca8c0972da..ffadc5ee8e04 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -623,24 +623,11 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, return r; } -void kvm_gmem_unbind(struct kvm_memory_slot *slot) +static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct kvm_gmem *gmem) { unsigned long start = slot->gmem.pgoff; unsigned long end = start + slot->npages; - struct kvm_gmem *gmem; - struct file *file; - /* - * Nothing to do if the underlying file was already closed (or is being - * closed right now), kvm_gmem_release() invalidates all bindings. - */ - file = kvm_gmem_get_file(slot); - if (!file) - return; - - gmem = file->private_data; - - filemap_invalidate_lock(file->f_mapping); xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL); /* @@ -648,6 +635,38 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) * cannot see this memslot. */ WRITE_ONCE(slot->gmem.file, NULL); +} + +void kvm_gmem_unbind(struct kvm_memory_slot *slot) +{ + struct file *file; + + /* + * Nothing to do if the underlying file was _already_ closed, as + * kvm_gmem_release() invalidates and nullifies all bindings. 
+ */ + if (!slot->gmem.file) + return; + + file = kvm_gmem_get_file(slot); + + /* + * However, if the file is _being_ closed, then the bindings need to be + * removed as kvm_gmem_release() might not run until after the memslot + * is freed. Note, modifying the bindings is safe even though the file + * is dying as kvm_gmem_release() nullifies slot->gmem.file under + * slots_lock, and only puts its reference to KVM after destroying all + * bindings. I.e. reaching this point means kvm_gmem_release() hasn't + * yet destroyed the bindings or freed the gmem_file, and can't do so + * until the caller drops slots_lock. + */ + if (!file) { + __kvm_gmem_unbind(slot, slot->gmem.file->private_data); + return; + } + + filemap_invalidate_lock(file->f_mapping); + __kvm_gmem_unbind(slot, file->private_data); filemap_invalidate_unlock(file->f_mapping); fput(file); From d83f1512758f4ef6fc5e83219fe7eeeb6b428ea4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 1 Nov 2025 16:25:27 +0300 Subject: [PATCH 092/543] Input: imx_sc_key - fix memory corruption on unload This is supposed to be "priv" but we accidentally pass "&priv" which is an address in the stack and so it will lead to memory corruption when the imx_sc_key_action() function is called. Remove the &. 
Fixes: 768062fd1284 ("Input: imx_sc_key - use devm_add_action_or_reset() to handle all cleanups") Signed-off-by: Dan Carpenter Reviewed-by: Peng Fan Reviewed-by: Frank Li Link: https://patch.msgid.link/aQYKR75r2VMFJutT@stanley.mountain Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/imx_sc_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/keyboard/imx_sc_key.c b/drivers/input/keyboard/imx_sc_key.c index d18839f1f4f6..b620cd310cdb 100644 --- a/drivers/input/keyboard/imx_sc_key.c +++ b/drivers/input/keyboard/imx_sc_key.c @@ -158,7 +158,7 @@ static int imx_sc_key_probe(struct platform_device *pdev) return error; } - error = devm_add_action_or_reset(&pdev->dev, imx_sc_key_action, &priv); + error = devm_add_action_or_reset(&pdev->dev, imx_sc_key_action, priv); if (error) return error; From 9c16e4d216d8103a8178bbe070eb21f779f190c0 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Tue, 4 Nov 2025 18:45:18 +0100 Subject: [PATCH 093/543] arm64: defconfig: Fix V3D deferred probe timeout The commit 4adc20ba95d4 ("ARM: dts: broadcom: rpi: Switch to V3D firmware clock") causes a regression in arm64 developer setups, which store the kernel modules via NFS. Before this change the involved V3D clock provider was builtin, but after this DT change the clk-raspberrypi is responsible for V3D and for arm64/defconfig this driver is built as a kernel module. In case these kernel modules are provided via NFS this takes too long and the PM domain core gives up before the clock driver could be loaded: v3d fec00000.gpu: deferred probe timeout, ignoring dependency So resolve this issue by making this critical driver builtin.
Reported-by: Mark Brown Closes: https://lore.kernel.org/linux-arm-kernel/9ebda74e-e700-4fbe-bca5-382f92417a9c@sirena.org.uk/ Fixes: 4adc20ba95d4 ("ARM: dts: broadcom: rpi: Switch to V3D firmware clock") Signed-off-by: Stefan Wahren Link: https://lore.kernel.org/r/20251104174518.11783-1-wahrenst@gmx.net Signed-off-by: Florian Fainelli --- arch/arm64/configs/defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index e3a2d37bd104..1a48faad2473 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1341,7 +1341,7 @@ CONFIG_COMMON_CLK_RS9_PCIE=y CONFIG_COMMON_CLK_VC3=y CONFIG_COMMON_CLK_VC5=y CONFIG_COMMON_CLK_BD718XX=m -CONFIG_CLK_RASPBERRYPI=m +CONFIG_CLK_RASPBERRYPI=y CONFIG_CLK_IMX8MM=y CONFIG_CLK_IMX8MN=y CONFIG_CLK_IMX8MP=y From 3d1c795bdef43363ed1ff71e3f476d86c22e059b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Thu, 2 Oct 2025 21:48:52 +0200 Subject: [PATCH 094/543] ARM: dts: BCM53573: Fix address of Luxul XAP-1440's Ethernet PHY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Luxul XAP-1440 has BCM54210E PHY at address 25. 
Fixes: 44ad82078069 ("ARM: dts: BCM53573: Fix Ethernet info for Luxul devices") Signed-off-by: Rafał Miłecki Link: https://lore.kernel.org/r/20251002194852.13929-1-zajec5@gmail.com Signed-off-by: Florian Fainelli --- arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts b/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts index ac44c745bdf8..a39a021a3910 100644 --- a/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts +++ b/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts @@ -55,8 +55,8 @@ &gmac0 { mdio { /delete-node/ switch@1e; - bcm54210e: ethernet-phy@0 { - reg = <0>; + bcm54210e: ethernet-phy@25 { + reg = <25>; }; }; }; From e08969c4d65ac31297fcb4d31d4808c789152f68 Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Tue, 4 Nov 2025 07:03:10 +0000 Subject: [PATCH 095/543] Input: cros_ec_keyb - fix an invalid memory access If cros_ec_keyb_register_matrix() isn't called (due to `buttons_switches_only`) in cros_ec_keyb_probe(), `ckdev->idev` remains NULL. An invalid memory access is observed in cros_ec_keyb_process() when receiving an EC_MKBP_EVENT_KEY_MATRIX event in cros_ec_keyb_work() in such a case. Unable to handle kernel read from unreadable memory at virtual address 0000000000000028 ... x3 : 0000000000000000 x2 : 0000000000000000 x1 : 0000000000000000 x0 : 0000000000000000 Call trace: input_event cros_ec_keyb_work blocking_notifier_call_chain ec_irq_thread It's still unknown why the kernel receives such a malformed event; in any case, the kernel shouldn't access `ckdev->idev` and friends if the driver doesn't intend to initialize them.
Signed-off-by: Tzung-Bi Shih Link: https://patch.msgid.link/20251104070310.3212712-1-tzungbi@kernel.org Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/cros_ec_keyb.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/input/keyboard/cros_ec_keyb.c b/drivers/input/keyboard/cros_ec_keyb.c index c1e53d87c8a7..067f2cd57e04 100644 --- a/drivers/input/keyboard/cros_ec_keyb.c +++ b/drivers/input/keyboard/cros_ec_keyb.c @@ -261,6 +261,12 @@ static int cros_ec_keyb_work(struct notifier_block *nb, case EC_MKBP_EVENT_KEY_MATRIX: pm_wakeup_event(ckdev->dev, 0); + if (!ckdev->idev) { + dev_warn_once(ckdev->dev, + "Unexpected key matrix event\n"); + return NOTIFY_OK; + } + if (ckdev->ec->event_size != ckdev->cols) { dev_err(ckdev->dev, "Discarded incomplete key matrix event.\n"); From ea0714d61dea6e00b853a0116d0afe2b2fe70ef3 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 4 Nov 2025 22:54:25 +0000 Subject: [PATCH 096/543] bpf:add _impl suffix for bpf_task_work_schedule* kfuncs Rename: bpf_task_work_schedule_resume()->bpf_task_work_schedule_resume_impl() bpf_task_work_schedule_signal()->bpf_task_work_schedule_signal_impl() This aligns task work scheduling kfuncs with the established naming scheme for kfuncs with the bpf_prog_aux argument provided by the verifier implicitly. This convention will be taken advantage of with the upcoming KF_IMPLICIT_ARGS feature to preserve backwards compatibility to BPF programs. 
Acked-by: Andrii Nakryiko Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20251104-implv2-v3-1-4772b9ae0e06@meta.com Signed-off-by: Alexei Starovoitov Acked-by: Ihor Solodrai --- kernel/bpf/helpers.c | 24 +++++++++++-------- kernel/bpf/verifier.c | 12 +++++----- tools/testing/selftests/bpf/progs/task_work.c | 6 ++--- .../selftests/bpf/progs/task_work_fail.c | 8 +++---- .../selftests/bpf/progs/task_work_stress.c | 4 ++-- 5 files changed, 29 insertions(+), 25 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index eb25e70e0bdc..33173b027ccf 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4169,7 +4169,8 @@ static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work } /** - * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode + * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL + * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values @@ -4178,15 +4179,17 @@ static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, - void *map__map, bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task, + struct bpf_task_work *tw, void *map__map, + bpf_task_work_callback_t callback, + void *aux__prog) { return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); } /** - * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode + * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with 
TWA_RESUME + * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values @@ -4195,9 +4198,10 @@ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct b * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, - void *map__map, bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task, + struct bpf_task_work *tw, void *map__map, + bpf_task_work_callback_t callback, + void *aux__prog) { return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); } @@ -4377,8 +4381,8 @@ BTF_ID_FLAGS(func, bpf_strnstr); BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ff40e5e65c43..8314518c8d93 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12259,8 +12259,8 @@ enum special_kfunc_type { KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, KF___bpf_trap, - KF_bpf_task_work_schedule_signal, - KF_bpf_task_work_schedule_resume, + KF_bpf_task_work_schedule_signal_impl, + KF_bpf_task_work_schedule_resume_impl, }; BTF_ID_LIST(special_kfunc_list) @@ -12331,13 +12331,13 @@ BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, 
bpf_res_spin_unlock_irqrestore) BTF_ID(func, __bpf_trap) -BTF_ID(func, bpf_task_work_schedule_signal) -BTF_ID(func, bpf_task_work_schedule_resume) +BTF_ID(func, bpf_task_work_schedule_signal_impl) +BTF_ID(func, bpf_task_work_schedule_resume_impl) static bool is_task_work_add_kfunc(u32 func_id) { - return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || - func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl]; } static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) diff --git a/tools/testing/selftests/bpf/progs/task_work.c b/tools/testing/selftests/bpf/progs/task_work.c index 23217f06a3ec..663a80990f8f 100644 --- a/tools/testing/selftests/bpf/progs/task_work.c +++ b/tools/testing/selftests/bpf/progs/task_work.c @@ -66,7 +66,7 @@ int oncpu_hash_map(struct pt_regs *args) if (!work) return 0; - bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL); return 0; } @@ -80,7 +80,7 @@ int oncpu_array_map(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_signal(task, &work->tw, &arrmap, process_work, NULL); + bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, process_work, NULL); return 0; } @@ -102,6 +102,6 @@ int oncpu_lru_map(struct pt_regs *args) work = bpf_map_lookup_elem(&lrumap, &key); if (!work || work->data[0]) return 0; - bpf_task_work_schedule_resume(task, &work->tw, &lrumap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &work->tw, &lrumap, process_work, NULL); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c index 77fe8f28facd..1270953fd092 100644 --- a/tools/testing/selftests/bpf/progs/task_work_fail.c +++ 
b/tools/testing/selftests/bpf/progs/task_work_fail.c @@ -53,7 +53,7 @@ int mismatch_map(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL); return 0; } @@ -65,7 +65,7 @@ int no_map_task_work(struct pt_regs *args) struct bpf_task_work tw; task = bpf_get_current_task_btf(); - bpf_task_work_schedule_resume(task, &tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &tw, &hmap, process_work, NULL); return 0; } @@ -76,7 +76,7 @@ int task_work_null(struct pt_regs *args) struct task_struct *task; task = bpf_get_current_task_btf(); - bpf_task_work_schedule_resume(task, NULL, &hmap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, NULL, &hmap, process_work, NULL); return 0; } @@ -91,6 +91,6 @@ int map_null(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_resume(task, &work->tw, NULL, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &work->tw, NULL, process_work, NULL); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work_stress.c b/tools/testing/selftests/bpf/progs/task_work_stress.c index 90fca06fff56..55e555f7f41b 100644 --- a/tools/testing/selftests/bpf/progs/task_work_stress.c +++ b/tools/testing/selftests/bpf/progs/task_work_stress.c @@ -51,8 +51,8 @@ int schedule_task_work(void *ctx) if (!work) return 0; } - err = bpf_task_work_schedule_signal(bpf_get_current_task_btf(), &work->tw, &hmap, - process_work, NULL); + err = bpf_task_work_schedule_signal_impl(bpf_get_current_task_btf(), &work->tw, &hmap, + process_work, NULL); if (err) __sync_fetch_and_add(&schedule_error, 1); else From 137cc92ffe2e71705fce112656a460d924934ebe Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 4 Nov 2025 22:54:26 +0000 Subject: [PATCH 097/543] bpf: add _impl suffix 
for bpf_stream_vprintk() kfunc Rename bpf_stream_vprintk() to bpf_stream_vprintk_impl(). This makes bpf_stream_vprintk() follow the already established "_impl" suffix-based naming convention for kfuncs with the bpf_prog_aux argument provided by the verifier implicitly. This convention will be taken advantage of with the upcoming KF_IMPLICIT_ARGS feature to preserve backwards compatibility to BPF programs. Acked-by: Andrii Nakryiko Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20251104-implv2-v3-2-4772b9ae0e06@meta.com Signed-off-by: Alexei Starovoitov Acked-by: Ihor Solodrai --- kernel/bpf/helpers.c | 2 +- kernel/bpf/stream.c | 3 ++- .../bpftool/Documentation/bpftool-prog.rst | 2 +- tools/lib/bpf/bpf_helpers.h | 26 +++++++++---------- .../testing/selftests/bpf/progs/stream_fail.c | 6 ++--- 5 files changed, 20 insertions(+), 19 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 33173b027ccf..e4007fea4909 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4380,7 +4380,7 @@ BTF_ID_FLAGS(func, bpf_strnstr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif -BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS) BTF_KFUNCS_END(common_btf_ids) diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index eb6c5a21c2ef..ff16c631951b 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -355,7 +355,8 @@ __bpf_kfunc_start_defs(); * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the * enum in headers. 
*/ -__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, void *aux__prog) +__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, + u32 len__sz, void *aux__prog) { struct bpf_bprintf_data data = { .get_bin_args = true, diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 009633294b09..35aeeaf5f711 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -182,7 +182,7 @@ bpftool prog tracelog bpftool prog tracelog { stdout | stderr } *PROG* Dump the BPF stream of the program. BPF programs can write to these streams - at runtime with the **bpf_stream_vprintk**\ () kfunc. The kernel may write + at runtime with the **bpf_stream_vprintk_impl**\ () kfunc. The kernel may write error messages to the standard error stream. This facility should be used only for debugging purposes. diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 80c028540656..d4e4e388e625 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -315,20 +315,20 @@ enum libbpf_tristate { ___param, sizeof(___param)); \ }) -extern int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, - __u32 len__sz, void *aux__prog) __weak __ksym; +extern int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, + __u32 len__sz, void *aux__prog) __weak __ksym; -#define bpf_stream_printk(stream_id, fmt, args...) \ -({ \ - static const char ___fmt[] = fmt; \ - unsigned long long ___param[___bpf_narg(args)]; \ - \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ - ___bpf_fill(___param, args); \ - _Pragma("GCC diagnostic pop") \ - \ - bpf_stream_vprintk(stream_id, ___fmt, ___param, sizeof(___param), NULL);\ +#define bpf_stream_printk(stream_id, fmt, args...) 
\ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_stream_vprintk_impl(stream_id, ___fmt, ___param, sizeof(___param), NULL); \ }) /* Use __bpf_printk when bpf_printk call has 3 or fewer fmt args diff --git a/tools/testing/selftests/bpf/progs/stream_fail.c b/tools/testing/selftests/bpf/progs/stream_fail.c index b4a0d0cc8ec8..3662515f0107 100644 --- a/tools/testing/selftests/bpf/progs/stream_fail.c +++ b/tools/testing/selftests/bpf/progs/stream_fail.c @@ -10,7 +10,7 @@ SEC("syscall") __failure __msg("Possibly NULL pointer passed") int stream_vprintk_null_arg(void *ctx) { - bpf_stream_vprintk(BPF_STDOUT, "", NULL, 0, NULL); + bpf_stream_vprintk_impl(BPF_STDOUT, "", NULL, 0, NULL); return 0; } @@ -18,7 +18,7 @@ SEC("syscall") __failure __msg("R3 type=scalar expected=") int stream_vprintk_scalar_arg(void *ctx) { - bpf_stream_vprintk(BPF_STDOUT, "", (void *)46, 0, NULL); + bpf_stream_vprintk_impl(BPF_STDOUT, "", (void *)46, 0, NULL); return 0; } @@ -26,7 +26,7 @@ SEC("syscall") __failure __msg("arg#1 doesn't point to a const string") int stream_vprintk_string_arg(void *ctx) { - bpf_stream_vprintk(BPF_STDOUT, ctx, NULL, 0, NULL); + bpf_stream_vprintk_impl(BPF_STDOUT, ctx, NULL, 0, NULL); return 0; } From 636f4618b1cd96f6b5a2b8c7c4f665c8533ecf13 Mon Sep 17 00:00:00 2001 From: Haotian Zhang Date: Wed, 29 Oct 2025 01:28:28 +0800 Subject: [PATCH 098/543] regulator: fixed: fix GPIO descriptor leak on register failure In the commit referenced by the Fixes tag, devm_gpiod_get_optional() was replaced by manual GPIO management, relying on the regulator core to release the GPIO descriptor. However, this approach does not account for the error path: when regulator registration fails, the core never takes over the GPIO, resulting in a resource leak. 
Add gpiod_put() before returning on regulator registration failure. Fixes: 5e6f3ae5c13b ("regulator: fixed: Let core handle GPIO descriptor") Signed-off-by: Haotian Zhang Link: https://patch.msgid.link/20251028172828.625-1-vulab@iscas.ac.cn Signed-off-by: Mark Brown --- drivers/regulator/fixed.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/regulator/fixed.c b/drivers/regulator/fixed.c index 1cb647ed70c6..a2d16e9abfb5 100644 --- a/drivers/regulator/fixed.c +++ b/drivers/regulator/fixed.c @@ -334,6 +334,7 @@ static int reg_fixed_voltage_probe(struct platform_device *pdev) ret = dev_err_probe(&pdev->dev, PTR_ERR(drvdata->dev), "Failed to register regulator: %ld\n", PTR_ERR(drvdata->dev)); + gpiod_put(cfg.ena_gpiod); return ret; } From a50f7456f853ec3a6f07cbe1d16ad8a8b2501320 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 30 Oct 2025 14:05:27 +0000 Subject: [PATCH 099/543] dma-mapping: Allow use of DMA_BIT_MASK(64) in global scope Clang doesn't like that (1ULL<<(64)) overflows when initializing a global scope variable, even if that part of the ternary isn't used when n = 64. The same initialization can be done without warnings in function scopes, and GCC doesn't mind either way. The build failure that highlighted this was already fixed in a different way [1], which also has detailed links to the Clang issues. However it's not going to be long before the same thing happens again, so it's better to fix the root cause. Fix it by using GENMASK_ULL() which does exactly the same thing, is much more readable anyway, and doesn't have a shift that overflows. 
[1]: https://lore.kernel.org/all/20250918-mmp-pdma-simplify-dma-addressing-v1-1-5c2be2b85696@riscstar.com/ Signed-off-by: James Clark Reviewed-by: Nathan Chancellor Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20251030-james-fix-dma_bit_mask-v1-1-ad1ce7cfab6e@linaro.org --- include/linux/dma-mapping.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 8248ff9363ee..2ceda49c609f 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -90,7 +90,7 @@ */ #define DMA_MAPPING_ERROR (~(dma_addr_t)0) -#define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1)) +#define DMA_BIT_MASK(n) GENMASK_ULL(n - 1, 0) struct dma_iova_state { dma_addr_t addr; From 63b5aa01da0f38cdbd97d021477258e511631497 Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Tue, 4 Nov 2025 20:50:06 +0800 Subject: [PATCH 100/543] vfat: fix missing sb_min_blocksize() return value checks When emulating an nvme device on qemu with both logical_block_size and physical_block_size set to 8 KiB, but without format, a kernel panic was triggered during the early boot stage while attempting to mount a vfat filesystem. [95553.682035] EXT4-fs (nvme0n1): unable to set blocksize [95553.684326] EXT4-fs (nvme0n1): unable to set blocksize [95553.686501] EXT4-fs (nvme0n1): unable to set blocksize [95553.696448] ISOFS: unsupported/invalid hardware sector size 8192 [95553.697117] ------------[ cut here ]------------ [95553.697567] kernel BUG at fs/buffer.c:1582! 
[95553.697984] Oops: invalid opcode: 0000 [#1] SMP NOPTI [95553.698602] CPU: 0 UID: 0 PID: 7212 Comm: mount Kdump: loaded Not tainted 6.18.0-rc2+ #38 PREEMPT(voluntary) [95553.699511] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [95553.700534] RIP: 0010:folio_alloc_buffers+0x1bb/0x1c0 [95553.701018] Code: 48 8b 15 e8 93 18 02 65 48 89 35 e0 93 18 02 48 83 c4 10 5b 41 5c 41 5d 41 5e 41 5f 5d 31 d2 31 c9 31 f6 31 ff c3 cc cc cc cc <0f> 0b 90 66 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f [95553.702648] RSP: 0018:ffffd1b0c676f990 EFLAGS: 00010246 [95553.703132] RAX: ffff8cfc4176d820 RBX: 0000000000508c48 RCX: 0000000000000001 [95553.703805] RDX: 0000000000002000 RSI: 0000000000000000 RDI: 0000000000000000 [95553.704481] RBP: ffffd1b0c676f9c8 R08: 0000000000000000 R09: 0000000000000000 [95553.705148] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001 [95553.705816] R13: 0000000000002000 R14: fffff8bc8257e800 R15: 0000000000000000 [95553.706483] FS: 000072ee77315840(0000) GS:ffff8cfdd2c8d000(0000) knlGS:0000000000000000 [95553.707248] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [95553.707782] CR2: 00007d8f2a9e5a20 CR3: 0000000039d0c006 CR4: 0000000000772ef0 [95553.708439] PKRU: 55555554 [95553.708734] Call Trace: [95553.709015] [95553.709266] __getblk_slow+0xd2/0x230 [95553.709641] ? find_get_block_common+0x8b/0x530 [95553.710084] bdev_getblk+0x77/0xa0 [95553.710449] __bread_gfp+0x22/0x140 [95553.710810] fat_fill_super+0x23a/0xfc0 [95553.711216] ? __pfx_setup+0x10/0x10 [95553.711580] ? 
__pfx_vfat_fill_super+0x10/0x10 [95553.712014] vfat_fill_super+0x15/0x30 [95553.712401] get_tree_bdev_flags+0x141/0x1e0 [95553.712817] get_tree_bdev+0x10/0x20 [95553.713177] vfat_get_tree+0x15/0x20 [95553.713550] vfs_get_tree+0x2a/0x100 [95553.713910] vfs_cmd_create+0x62/0xf0 [95553.714273] __do_sys_fsconfig+0x4e7/0x660 [95553.714669] __x64_sys_fsconfig+0x20/0x40 [95553.715062] x64_sys_call+0x21ee/0x26a0 [95553.715453] do_syscall_64+0x80/0x670 [95553.715816] ? __fs_parse+0x65/0x1e0 [95553.716172] ? fat_parse_param+0x103/0x4b0 [95553.716587] ? vfs_parse_fs_param_source+0x21/0xa0 [95553.717034] ? __do_sys_fsconfig+0x3d9/0x660 [95553.717548] ? __x64_sys_fsconfig+0x20/0x40 [95553.717957] ? x64_sys_call+0x21ee/0x26a0 [95553.718360] ? do_syscall_64+0xb8/0x670 [95553.718734] ? __x64_sys_fsconfig+0x20/0x40 [95553.719141] ? x64_sys_call+0x21ee/0x26a0 [95553.719545] ? do_syscall_64+0xb8/0x670 [95553.719922] ? x64_sys_call+0x1405/0x26a0 [95553.720317] ? do_syscall_64+0xb8/0x670 [95553.720702] ? __x64_sys_close+0x3e/0x90 [95553.721080] ? x64_sys_call+0x1b5e/0x26a0 [95553.721478] ? do_syscall_64+0xb8/0x670 [95553.721841] ? irqentry_exit+0x43/0x50 [95553.722211] ? 
exc_page_fault+0x90/0x1b0 [95553.722681] entry_SYSCALL_64_after_hwframe+0x76/0x7e [95553.723166] RIP: 0033:0x72ee774f3afe [95553.723562] Code: 73 01 c3 48 8b 0d 0a 33 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 49 89 ca b8 af 01 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d da 32 0f 00 f7 d8 64 89 01 48 [95553.725188] RSP: 002b:00007ffe97148978 EFLAGS: 00000246 ORIG_RAX: 00000000000001af [95553.725892] RAX: ffffffffffffffda RBX: 00005dcfe53d0080 RCX: 000072ee774f3afe [95553.726526] RDX: 0000000000000000 RSI: 0000000000000006 RDI: 0000000000000003 [95553.727176] RBP: 00007ffe97148ac0 R08: 0000000000000000 R09: 000072ee775e7ac0 [95553.727818] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [95553.728459] R13: 00005dcfe53d04b0 R14: 000072ee77670b00 R15: 00005dcfe53d1a28 [95553.729086] The panic occurs as follows: 1. logical_block_size is 8KiB, causing {struct super_block *sb}->s_blocksize is initialized to 0. vfat_fill_super - fat_fill_super - sb_min_blocksize - sb_set_blocksize //return 0 when size is 8KiB. 2. __bread_gfp is called with size == 0, causing folio_alloc_buffers() to compute an offset equal to folio_size(folio), which triggers a BUG_ON. fat_fill_super - sb_bread - __bread_gfp // size == {struct super_block *sb}->s_blocksize == 0 - bdev_getblk - __getblk_slow - grow_buffers - grow_dev_folio - folio_alloc_buffers // size == 0 - folio_set_bh //offset == folio_size(folio) and panic To fix this issue, add proper return value checks for sb_min_blocksize(). Cc: stable@vger.kernel.org # v6.15 Fixes: a64e5a596067bd ("bdev: add back PAGE_SIZE block size validation for sb_set_blocksize()") Reviewed-by: Matthew Wilcox Reviewed-by: Darrick J. 
Wong Reviewed-by: Jan Kara Reviewed-by: OGAWA Hirofumi Reviewed-by: Christoph Hellwig Signed-off-by: Yongpeng Yang Link: https://patch.msgid.link/20251104125009.2111925-2-yangyongpeng.storage@gmail.com Acked-by: OGAWA Hirofumi Signed-off-by: Christian Brauner --- fs/fat/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 9648ed097816..9cfe20a3daaf 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1595,8 +1595,12 @@ int fat_fill_super(struct super_block *sb, struct fs_context *fc, setup(sb); /* flavour-specific stuff that needs options */ + error = -EINVAL; + if (!sb_min_blocksize(sb, 512)) { + fat_msg(sb, KERN_ERR, "unable to set blocksize"); + goto out_fail; + } error = -EIO; - sb_min_blocksize(sb, 512); bh = sb_bread(sb, 0); if (bh == NULL) { fat_msg(sb, KERN_ERR, "unable to read boot sector"); From f2c1f631630e01821fe4c3fdf6077bc7a8284f82 Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Tue, 4 Nov 2025 20:50:07 +0800 Subject: [PATCH 101/543] exfat: check return value of sb_min_blocksize in exfat_read_boot_sector sb_min_blocksize() may return 0. Check its return value to avoid accessing the filesystem super block when sb->s_blocksize is 0. 
Cc: stable@vger.kernel.org # v6.15 Fixes: 719c1e1829166d ("exfat: add super block operations") Reviewed-by: Christoph Hellwig Signed-off-by: Yongpeng Yang Link: https://patch.msgid.link/20251104125009.2111925-3-yangyongpeng.storage@gmail.com Signed-off-by: Christian Brauner --- fs/exfat/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 7f9592856bf7..74d451f732c7 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -433,7 +433,10 @@ static int exfat_read_boot_sector(struct super_block *sb) struct exfat_sb_info *sbi = EXFAT_SB(sb); /* set block size to read super block */ - sb_min_blocksize(sb, 512); + if (!sb_min_blocksize(sb, 512)) { + exfat_err(sb, "unable to set blocksize"); + return -EINVAL; + } /* read boot sector */ sbi->boot_bh = sb_bread(sb, 0); From e106e269c5cb38315eb0a0e7e38f71e9b20c8c66 Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Tue, 4 Nov 2025 20:50:08 +0800 Subject: [PATCH 102/543] isofs: check the return value of sb_min_blocksize() in isofs_fill_super sb_min_blocksize() may return 0. Check its return value to avoid opt->blocksize and sb->s_blocksize being 0.
Cc: stable@vger.kernel.org # v6.15 Fixes: 1b17a46c9243e9 ("isofs: convert isofs to use the new mount API") Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Yongpeng Yang Link: https://patch.msgid.link/20251104125009.2111925-4-yangyongpeng.storage@gmail.com Signed-off-by: Christian Brauner --- fs/isofs/inode.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 6f0e6b19383c..ad3143d4066b 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -610,6 +610,11 @@ static int isofs_fill_super(struct super_block *s, struct fs_context *fc) goto out_freesbi; } opt->blocksize = sb_min_blocksize(s, opt->blocksize); + if (!opt->blocksize) { + printk(KERN_ERR + "ISOFS: unable to set blocksize\n"); + goto out_freesbi; + } sbi->s_high_sierra = 0; /* default is iso9660 */ sbi->s_session = opt->session; From 124af0868ec6929ba838fb76d25f00c06ba8fc0d Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Tue, 4 Nov 2025 20:50:09 +0800 Subject: [PATCH 103/543] xfs: check the return value of sb_min_blocksize() in xfs_fs_fill_super sb_min_blocksize() may return 0. Check its return value to avoid accessing the filesystem super block when sb->s_blocksize is 0. Cc: stable@vger.kernel.org # v6.15 Fixes: a64e5a596067bd ("bdev: add back PAGE_SIZE block size validation for sb_set_blocksize()") Reviewed-by: Christoph Hellwig Signed-off-by: Yongpeng Yang Link: https://patch.msgid.link/20251104125009.2111925-5-yangyongpeng.storage@gmail.com Reviewed-by: Darrick J.
Wong Signed-off-by: Christian Brauner --- fs/xfs/xfs_super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e85a156dc17d..fbb8009f1c0f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1662,7 +1662,10 @@ xfs_fs_fill_super( if (error) return error; - sb_min_blocksize(sb, BBSIZE); + if (!sb_min_blocksize(sb, BBSIZE)) { + xfs_err(mp, "unable to set blocksize"); + return -EINVAL; + } sb->s_xattr = xfs_xattr_handlers; sb->s_export_op = &xfs_export_operations; #ifdef CONFIG_XFS_QUOTA From c014021253d77cd89b2d8788ce522283d83fbd40 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Mon, 27 Oct 2025 03:46:47 -0700 Subject: [PATCH 104/543] virtio-fs: fix incorrect check for fsvq->kobj In virtio_fs_add_queues_sysfs(), the code incorrectly checks fs->mqs_kobj after calling kobject_create_and_add(). Change the check to fsvq->kobj (fs->mqs_kobj -> fsvq->kobj) to ensure the per-queue kobject is successfully created. Fixes: 87cbdc396a31 ("virtio_fs: add sysfs entries for queue information") Signed-off-by: Alok Tiwari Link: https://patch.msgid.link/20251027104658.1668537-1-alok.a.tiwari@oracle.com Reviewed-by: Stefan Hajnoczi Signed-off-by: Christian Brauner --- fs/fuse/virtio_fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 6bc7c97b017d..b2f6486fe1d5 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -373,7 +373,7 @@ static int virtio_fs_add_queues_sysfs(struct virtio_fs *fs) sprintf(buff, "%d", i); fsvq->kobj = kobject_create_and_add(buff, fs->mqs_kobj); - if (!fs->mqs_kobj) { + if (!fsvq->kobj) { ret = -ENOMEM; goto out_del; } From 8637fa89e678422995301ddb20b74190dffcccee Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Tue, 4 Nov 2025 20:50:10 +0800 Subject: [PATCH 105/543] block: add __must_check attribute to sb_min_blocksize() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
When sb_min_blocksize() returns 0 and the return value is not checked, it may lead to a situation where sb->s_blocksize is 0 when accessing the filesystem super block. After commit a64e5a596067bd ("bdev: add back PAGE_SIZE block size validation for sb_set_blocksize()"), this becomes more likely to happen when the block device’s logical_block_size is larger than PAGE_SIZE and the filesystem is unformatted. Add the __must_check attribute to ensure callers always check the return value. Cc: stable@vger.kernel.org # v6.15 Suggested-by: Matthew Wilcox Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Yongpeng Yang Link: https://patch.msgid.link/20251104125009.2111925-6-yangyongpeng.storage@gmail.com Signed-off-by: Christian Brauner --- block/bdev.c | 2 +- include/linux/fs.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 810707cca970..638f0cd458ae 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -231,7 +231,7 @@ int sb_set_blocksize(struct super_block *sb, int size) EXPORT_SYMBOL(sb_set_blocksize); -int sb_min_blocksize(struct super_block *sb, int size) +int __must_check sb_min_blocksize(struct super_block *sb, int size) { int minsize = bdev_logical_block_size(sb->s_bdev); if (size < minsize) diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..3ea98c6cce81 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3423,8 +3423,8 @@ static inline void remove_inode_hash(struct inode *inode) extern void inode_sb_list_add(struct inode *inode); extern void inode_add_lru(struct inode *inode); -extern int sb_set_blocksize(struct super_block *, int); -extern int sb_min_blocksize(struct super_block *, int); +int sb_set_blocksize(struct super_block *sb, int size); +int __must_check sb_min_blocksize(struct super_block *sb, int size); int generic_file_mmap(struct file *, struct vm_area_struct *); int generic_file_mmap_prepare(struct vm_area_desc *desc); From 
90f601b497d76f40fa66795c3ecf625b6aced9fd Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Wed, 5 Nov 2025 02:29:23 +0000 Subject: [PATCH 106/543] binfmt_misc: restore write access before closing files opened by open_exec() bm_register_write() opens an executable file using open_exec(), which internally calls do_open_execat() and denies write access on the file to avoid modification while it is being executed. However, when an error occurs, bm_register_write() closes the file using filp_close() directly. This does not restore the write permission, which may cause subsequent write operations on the same file to fail. Fix this by calling exe_file_allow_write_access() before filp_close() to restore the write permission properly. Fixes: e7850f4d844e ("binfmt_misc: fix possible deadlock in bm_register_write") Signed-off-by: Zilin Guan Link: https://patch.msgid.link/20251105022923.1813587-1-zilin@seu.edu.cn Signed-off-by: Christian Brauner --- fs/binfmt_misc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index a839f960cd4a..a8b1d79e4af0 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -837,8 +837,10 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, inode_unlock(d_inode(root)); if (err) { - if (f) + if (f) { + exe_file_allow_write_access(f); filp_close(f, NULL); + } kfree(e); return err; } From 3cd2018e15b3d66d2187d92867e265f45ad79e6f Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 2 Nov 2025 20:09:21 +0100 Subject: [PATCH 107/543] spi: Try to get ACPI GPIO IRQ earlier Since commit d24cfee7f63d ("spi: Fix acpi deferred irq probe"), the acpi_dev_gpio_irq_get() call gets delayed till spi_probe() is called on the SPI device. If there is no driver for the SPI device then the move to spi_probe() results in acpi_dev_gpio_irq_get() never getting called. 
This may cause problems by leaving the GPIO pin floating because this call is responsible for setting up the GPIO pin direction and/or bias according to the values from the ACPI tables. Re-add the removed acpi_dev_gpio_irq_get() in acpi_register_spi_device() to ensure the GPIO pin is always correctly setup, while keeping the acpi_dev_gpio_irq_get() call added to spi_probe() to deal with -EPROBE_DEFER returns caused by the GPIO controller not having a driver yet. Link: https://bbs.archlinux.org/viewtopic.php?id=302348 Fixes: d24cfee7f63d ("spi: Fix acpi deferred irq probe") Cc: stable@vger.kernel.org Signed-off-by: Hans de Goede Link: https://patch.msgid.link/20251102190921.30068-1-hansg@kernel.org Signed-off-by: Mark Brown --- drivers/spi/spi.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 2e0647a06890..8588e8562220 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2851,6 +2851,16 @@ static acpi_status acpi_register_spi_device(struct spi_controller *ctlr, acpi_set_modalias(adev, acpi_device_hid(adev), spi->modalias, sizeof(spi->modalias)); + /* + * This gets re-tried in spi_probe() for -EPROBE_DEFER handling in case + * the GPIO controller does not have a driver yet. This needs to be done + * here too, because this call sets the GPIO direction and/or bias. + * Setting these needs to be done even if there is no driver, in which + * case spi_probe() will never get called. 
+ */ + if (spi->irq < 0) + spi->irq = acpi_dev_gpio_irq_get(adev, 0); + acpi_device_set_enumerated(adev); adev->power.flags.ignore_parent = true; From 997c06330fd5c2e220b692f2a358986c6c8fd5a2 Mon Sep 17 00:00:00 2001 From: Laurentiu Mihalcea Date: Tue, 4 Nov 2025 04:02:54 -0800 Subject: [PATCH 108/543] reset: imx8mp-audiomix: Fix bad mask values As per the i.MX8MP TRM, section 14.2 "AUDIO_BLK_CTRL", table 14.2.3.1.1 "memory map", the definition of the EARC control register shows that the EARC controller software reset is controlled via bit 0, while the EARC PHY software reset is controlled via bit 1. This means that the current definitions of IMX8MP_AUDIOMIX_EARC_RESET_MASK and IMX8MP_AUDIOMIX_EARC_PHY_RESET_MASK are wrong since their values would imply that the EARC controller software reset is controlled via bit 1 and the EARC PHY software reset is controlled via bit 2. Fix them. Fixes: a83bc87cd30a ("reset: imx8mp-audiomix: Prepare the code for more reset bits") Cc: stable@vger.kernel.org Reviewed-by: Shengjiu Wang Reviewed-by: Frank Li Reviewed-by: Daniel Baluta Signed-off-by: Laurentiu Mihalcea Signed-off-by: Philipp Zabel --- drivers/reset/reset-imx8mp-audiomix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/reset/reset-imx8mp-audiomix.c b/drivers/reset/reset-imx8mp-audiomix.c index 6b357adfe646..eceb37ff5dc5 100644 --- a/drivers/reset/reset-imx8mp-audiomix.c +++ b/drivers/reset/reset-imx8mp-audiomix.c @@ -14,8 +14,8 @@ #include #define IMX8MP_AUDIOMIX_EARC_RESET_OFFSET 0x200 -#define IMX8MP_AUDIOMIX_EARC_RESET_MASK BIT(1) -#define IMX8MP_AUDIOMIX_EARC_PHY_RESET_MASK BIT(2) +#define IMX8MP_AUDIOMIX_EARC_RESET_MASK BIT(0) +#define IMX8MP_AUDIOMIX_EARC_PHY_RESET_MASK BIT(1) #define IMX8MP_AUDIOMIX_DSP_RUNSTALL_OFFSET 0x108 #define IMX8MP_AUDIOMIX_DSP_RUNSTALL_MASK BIT(5) From a7da9c6a2fc08b6ad1a2e9aebbb14bcc59320374 Mon Sep 17 00:00:00 2001 From: Andrea della Porta Date: Tue, 21 Oct 2025 15:55:33 +0200 Subject: [PATCH 109/543] 
arm64: dts: broadcom: Assign clock rates in eth node for RPi5 In Raspberry Pi 5 DTS, the Ethernet clock rates must be assigned as the default clock register values are not valid for the Ethernet interface to function. This can be done either in rp1_clocks node or in rp1_eth node. Define the rates in rp1_eth node, as those clocks are 'leaf' clocks used specifically by the Ethernet device only. Fixes: 43456fdfc014 ("arm64: dts: broadcom: Enable RP1 ethernet for Raspberry Pi 5") Signed-off-by: Andrea della Porta Link: https://lore.kernel.org/r/20251021135533.5517-1-andrea.porta@suse.com Signed-off-by: Florian Fainelli --- arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts b/arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts index b8f256545022..09a849dd09b1 100644 --- a/arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts +++ b/arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts @@ -23,6 +23,10 @@ &pcie2 { }; &rp1_eth { + assigned-clocks = <&rp1_clocks RP1_CLK_ETH_TSU>, + <&rp1_clocks RP1_CLK_ETH>; + assigned-clock-rates = <50000000>, + <125000000>; status = "okay"; phy-mode = "rgmii-id"; phy-handle = <&phy1>; From 5e44c5a2cc84bed6b92cdfd9c567fcdb9f792604 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sun, 2 Nov 2025 13:14:42 +0200 Subject: [PATCH 110/543] arm64: dts: broadcom: bcm2712: rpi-5: Add ethernet0 alias The RP1 ethernet controller DT node contains a local-mac-address property to pass the MAC address from the boot loader to the kernel. The boot loader does not fill the MAC address as the ethernet0 alias is missing. Add it. 
Signed-off-by: Laurent Pinchart Reviewed-by: Andrea della Porta Link: https://lore.kernel.org/r/20251102111443.18206-1-laurent.pinchart@ideasonboard.com Fixes: 43456fdfc014 ("arm64: dts: broadcom: Enable RP1 ethernet for Raspberry Pi 5") Signed-off-by: Florian Fainelli --- arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts b/arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts index 09a849dd09b1..3e0319fdb93f 100644 --- a/arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts +++ b/arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts @@ -18,6 +18,12 @@ #include "bcm2712-rpi-5-b-ovl-rp1.dts" +/ { + aliases { + ethernet0 = &rp1_eth; + }; +}; + &pcie2 { #include "rp1-nexus.dtsi" }; From 94f54924b96d3565c6b559294b3401b5496c21ac Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 12 Sep 2025 15:43:21 +0900 Subject: [PATCH 111/543] btrfs: zoned: fix conventional zone capacity calculation When a block group contains both conventional zone and sequential zone, the capacity of the block group is wrongly set to the block group's full length. The capacity should be calculated in btrfs_load_block_group_* using the last allocation offset. 
Fixes: 568220fa9657 ("btrfs: zoned: support RAID0/1/10 on top of raid stripe tree") CC: stable@vger.kernel.org # v6.12+ Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 838149fa60ce..8f006dff8893 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1317,6 +1317,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, if (!btrfs_dev_is_sequential(device, info->physical)) { up_read(&dev_replace->rwsem); info->alloc_offset = WP_CONVENTIONAL; + info->capacity = device->zone_info->zone_size; return 0; } @@ -1683,8 +1684,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags); if (num_conventional > 0) { - /* Zone capacity is always zone size in emulation */ - cache->zone_capacity = cache->length; ret = calculate_alloc_pointer(cache, &last_alloc, new); if (ret) { btrfs_err(fs_info, @@ -1693,6 +1692,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } else if (map->num_stripes == num_conventional) { cache->alloc_offset = last_alloc; + cache->zone_capacity = cache->length; set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); goto out; } From 6a1ab50135ce829b834b448ce49867b5210a1641 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 16 Sep 2025 11:46:11 +0900 Subject: [PATCH 112/543] btrfs: zoned: fix stripe width calculation The stripe offset calculation in the zoned code for raid0 and raid10 wrongly uses map->stripe_size to calculate it. In fact, map->stripe_size is the size of the device extent composing the block group, which always is the zone_size on the zoned setup. Fix it by using BTRFS_STRIPE_LEN and BTRFS_STRIPE_LEN_SHIFT. Also, optimize the calculation a bit by doing the common calculation only once. 
Fixes: c0d90a79e8e6 ("btrfs: zoned: fix alloc_offset calculation for partly conventional block groups") CC: stable@vger.kernel.org # 6.17+ Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 56 ++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 8f006dff8893..b622c73fe30f 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1523,6 +1523,8 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; + u64 stripe_nr = 0, stripe_offset = 0; + u32 stripe_index = 0; if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", @@ -1530,28 +1532,26 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, return -EINVAL; } + if (last_alloc) { + u32 factor = map->num_stripes; + + stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT; + stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK; + stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); + } + for (int i = 0; i < map->num_stripes; i++) { if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { - u64 stripe_nr, full_stripe_nr; - u64 stripe_offset; - int stripe_index; - stripe_nr = div64_u64(last_alloc, map->stripe_size); - stripe_offset = stripe_nr * map->stripe_size; - full_stripe_nr = div_u64(stripe_nr, map->num_stripes); - div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - - zone_info[i].alloc_offset = - full_stripe_nr * map->stripe_size; + zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr); if (stripe_index > i) - zone_info[i].alloc_offset += map->stripe_size; + zone_info[i].alloc_offset += BTRFS_STRIPE_LEN; else if (stripe_index == i) - zone_info[i].alloc_offset += - (last_alloc - stripe_offset); + zone_info[i].alloc_offset += stripe_offset; } if 
(test_bit(0, active) != test_bit(i, active)) { @@ -1575,6 +1575,8 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; + u64 stripe_nr = 0, stripe_offset = 0; + u32 stripe_index = 0; if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", @@ -1582,6 +1584,14 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, return -EINVAL; } + if (last_alloc) { + u32 factor = map->num_stripes / map->sub_stripes; + + stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT; + stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK; + stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); + } + for (int i = 0; i < map->num_stripes; i++) { if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; @@ -1595,26 +1605,12 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, } if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { - u64 stripe_nr, full_stripe_nr; - u64 stripe_offset; - int stripe_index; - - stripe_nr = div64_u64(last_alloc, map->stripe_size); - stripe_offset = stripe_nr * map->stripe_size; - full_stripe_nr = div_u64(stripe_nr, - map->num_stripes / map->sub_stripes); - div_u64_rem(stripe_nr, - (map->num_stripes / map->sub_stripes), - &stripe_index); - - zone_info[i].alloc_offset = - full_stripe_nr * map->stripe_size; + zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr); if (stripe_index > (i / map->sub_stripes)) - zone_info[i].alloc_offset += map->stripe_size; + zone_info[i].alloc_offset += BTRFS_STRIPE_LEN; else if (stripe_index == (i / map->sub_stripes)) - zone_info[i].alloc_offset += - (last_alloc - stripe_offset); + zone_info[i].alloc_offset += stripe_offset; } if ((i % map->sub_stripes) == 0) { From bfe3d755ef7cec71aac6ecda34a107624735aac7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 29 Oct 2025 13:05:32 +0000 Subject: [PATCH 113/543] btrfs: do not update 
last_log_commit when logging inode due to a new name When logging that a new name exists, we skip updating the inode's last_log_commit field to prevent a later explicit fsync against the inode from doing nothing (as updating last_log_commit makes btrfs_inode_in_log() return true). We are detecting, at btrfs_log_inode(), that logging a new name is happening by checking the logging mode is not LOG_INODE_EXISTS, but that is not enough because we may log parent directories when logging a new name of a file in LOG_INODE_ALL mode - we need to check the logging_new_name field of the log context too. An example scenario where this results in an explicit fsync against a directory not persisting changes to the directory is the following: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt $ touch /mnt/foo $ sync $ mkdir /mnt/dir # Write some data to our file and fsync it. $ xfs_io -c "pwrite -S 0xab 0 64K" -c "fsync" /mnt/foo # Add a new link to our file. Since the file was logged before, we # update it in the log tree by calling btrfs_log_new_name(). $ ln /mnt/foo /mnt/dir/bar # fsync the root directory - we expect it to persist the dentry for # the new directory "dir". $ xfs_io -c "fsync" /mnt After mounting the fs the entry for directory "dir" does not exist, despite the explicit fsync on the root directory.
Here's why this happens: 1) When we fsync the file we log the inode, so that it's present in the log tree; 2) When adding the new link we enter btrfs_log_new_name(), and since the inode is in the log tree we proceed to updating the inode in the log tree; 3) We first set the inode's last_unlink_trans to the current transaction (early in btrfs_log_new_name()); 4) We then eventually enter btrfs_log_inode_parent(), and after logging the file's inode, we call btrfs_log_all_parents() because the inode's last_unlink_trans matches the current transaction's ID (updated in the previous step); 5) So btrfs_log_all_parents() logs the root directory by calling btrfs_log_inode() for the root's inode with a log mode of LOG_INODE_ALL so that new dentries are logged; 6) At btrfs_log_inode(), because the log mode is LOG_INODE_ALL, we update root inode's last_log_commit to the last transaction that changed the inode (->last_sub_trans field of the inode), which corresponds to the current transaction's ID; 7) Then later when user space explicitly calls fsync against the root directory, we enter btrfs_sync_file(), which calls skip_inode_logging() and that returns true, since its call to btrfs_inode_in_log() returns true and there are no ordered extents (it's a directory, never has ordered extents). This results in btrfs_sync_file() returning without syncing the log or committing the current transaction, so all the updates we did when logging the new name, including logging the root directory, are not persisted. So fix this by only updating the inode's last_log_commit if we are sure we are not logging a new name (if ctx->logging_new_name is false). A test case for fstests will follow soon.
Reported-by: Vyacheslav Kovalevsky Link: https://lore.kernel.org/linux-btrfs/03c5d7ec-5b3d-49d1-95bc-8970a7f82d87@gmail.com/ Fixes: 130341be7ffa ("btrfs: always update the logged transaction when logging new names") CC: stable@vger.kernel.org # 6.1+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 00a59fb79167..98599644986f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7122,7 +7122,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * a power failure unless the log was synced as part of an fsync * against any other unrelated inode. */ - if (inode_only != LOG_INODE_EXISTS) + if (!ctx->logging_new_name && inode_only != LOG_INODE_EXISTS) inode->last_log_commit = inode->last_sub_trans; spin_unlock(&inode->lock); From 5fea61aa1ca70c4b3738eebad9ce2d7e7938ebbd Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Wed, 5 Nov 2025 03:53:21 +0000 Subject: [PATCH 114/543] btrfs: scrub: put bio after errors in scrub_raid56_parity_stripe() scrub_raid56_parity_stripe() allocates a bio with bio_alloc(), but fails to release it on some error paths, leading to a potential memory leak. Add the missing bio_put() calls to properly drop the bio reference in those error cases. 
Fixes: 1009254bf22a3 ("btrfs: scrub: use scrub_stripe to implement RAID56 P/Q scrub") CC: stable@vger.kernel.org # 6.6+ Reviewed-by: Qu Wenruo Signed-off-by: Zilin Guan Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 651b11884f82..ba20d9286a34 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2203,6 +2203,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, &length, &bioc, NULL, NULL); if (ret < 0) { + bio_put(bio); btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); goto out; @@ -2212,6 +2213,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, btrfs_put_bioc(bioc); if (!rbio) { ret = -ENOMEM; + bio_put(bio); btrfs_bio_counter_dec(fs_info); goto out; } From c367af440e03eba7beb0c9f3fe540f9bcb69134a Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Wed, 5 Nov 2025 02:37:22 +0000 Subject: [PATCH 115/543] btrfs: release root after error in data_reloc_print_warning_inode() data_reloc_print_warning_inode() calls btrfs_get_fs_root() to obtain local_root, but fails to release its reference when paths_from_inode() returns an error. This causes a potential memory leak. Add a missing btrfs_put_root() call in the error path to properly decrease the reference count of local_root. 
Fixes: b9a9a85059cde ("btrfs: output affected files when relocation fails") CC: stable@vger.kernel.org # 6.6+ Reviewed-by: Qu Wenruo Signed-off-by: Zilin Guan Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b95175116ea3..d097532fd85b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -177,8 +177,10 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, return ret; } ret = paths_from_inode(inum, ipath); - if (ret < 0) + if (ret < 0) { + btrfs_put_root(local_root); goto err; + } /* * We deliberately ignore the bit ipath might have been too small to From 03b3bcd319b3ab5182bc9aaa0421351572c78ac0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 4 Nov 2025 14:48:30 -0800 Subject: [PATCH 116/543] nvme: fix admin request_queue lifetime The namespaces can access the controller's admin request_queue, and stale references on the namespaces may exist after tearing down the controller. Ensure the admin request_queue is active by moving the controller's 'put' to after all controller references have been released to ensure no one can access the request_queue. This fixes a reported use-after-free bug: BUG: KASAN: slab-use-after-free in blk_queue_enter+0x41c/0x4a0 Read of size 8 at addr ffff88c0a53819f8 by task nvme/3287 CPU: 67 UID: 0 PID: 3287 Comm: nvme Tainted: G E 6.13.2-ga1582f1a031e #15 Tainted: [E]=UNSIGNED_MODULE Hardware name: Jabil /EGS 2S MB1, BIOS 1.00 06/18/2025 Call Trace: dump_stack_lvl+0x4f/0x60 print_report+0xc4/0x620 ? _raw_spin_lock_irqsave+0x70/0xb0 ? _raw_read_unlock_irqrestore+0x30/0x30 ? blk_queue_enter+0x41c/0x4a0 kasan_report+0xab/0xe0 ? blk_queue_enter+0x41c/0x4a0 blk_queue_enter+0x41c/0x4a0 ? __irq_work_queue_local+0x75/0x1d0 ? blk_queue_start_drain+0x70/0x70 ? irq_work_queue+0x18/0x20 ? vprintk_emit.part.0+0x1cc/0x350 ?
wake_up_klogd_work_func+0x60/0x60 blk_mq_alloc_request+0x2b7/0x6b0 ? __blk_mq_alloc_requests+0x1060/0x1060 ? __switch_to+0x5b7/0x1060 nvme_submit_user_cmd+0xa9/0x330 nvme_user_cmd.isra.0+0x240/0x3f0 ? force_sigsegv+0xe0/0xe0 ? nvme_user_cmd64+0x400/0x400 ? vfs_fileattr_set+0x9b0/0x9b0 ? cgroup_update_frozen_flag+0x24/0x1c0 ? cgroup_leave_frozen+0x204/0x330 ? nvme_ioctl+0x7c/0x2c0 blkdev_ioctl+0x1a8/0x4d0 ? blkdev_common_ioctl+0x1930/0x1930 ? fdget+0x54/0x380 __x64_sys_ioctl+0x129/0x190 do_syscall_64+0x5b/0x160 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7f765f703b0b Code: ff ff ff 85 c0 79 9b 49 c7 c4 ff ff ff ff 5b 5d 4c 89 e0 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d dd 52 0f 00 f7 d8 64 89 01 48 RSP: 002b:00007ffe2cefe808 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007ffe2cefe860 RCX: 00007f765f703b0b RDX: 00007ffe2cefe860 RSI: 00000000c0484e41 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000000003 R09: 0000000000000000 R10: 00007f765f611d50 R11: 0000000000000202 R12: 0000000000000003 R13: 00000000c0484e41 R14: 0000000000000001 R15: 00007ffe2cefea60 Reported-by: Casey Chen Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fa4181d7de73..f1f719351f3f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4901,7 +4901,6 @@ void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl) */ nvme_stop_keep_alive(ctrl); blk_mq_destroy_queue(ctrl->admin_q); - blk_put_queue(ctrl->admin_q); if (ctrl->ops->flags & NVME_F_FABRICS) { blk_mq_destroy_queue(ctrl->fabrics_q); blk_put_queue(ctrl->fabrics_q); @@ -5045,6 +5044,8 @@ static void nvme_free_ctrl(struct device *dev) container_of(dev, struct nvme_ctrl, 
ctrl_device); struct nvme_subsystem *subsys = ctrl->subsys; + if (ctrl->admin_q) + blk_put_queue(ctrl->admin_q); if (!subsys || ctrl->instance != subsys->instance) ida_free(&nvme_instance_ida, ctrl->instance); nvme_free_cels(ctrl); From 6d08340d1e354787d6c65a8c3cdd4d41ffb8a5ed Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 4 Nov 2025 22:54:02 +0100 Subject: [PATCH 117/543] Revert "perf/x86: Always store regs->ip in perf_callchain_kernel()" This reverts commit 83f44ae0f8afcc9da659799db8693f74847e66b3. Currently we store initial stacktrace entry twice for non-HW pt_regs, which means callers that fail perf_hw_regs(regs) condition in perf_callchain_kernel. It's easy to reproduce this with bpftrace: # bpftrace -e 'tracepoint:sched:sched_process_exec { print(kstack()); }' Attaching 1 probe... bprm_execve+1767 bprm_execve+1767 do_execveat_common.isra.0+425 __x64_sys_execve+56 do_syscall_64+133 entry_SYSCALL_64_after_hwframe+118 When perf_callchain_kernel calls unwind_start with first_frame, AFAICS we do not skip regs->ip, but it's added as part of the unwind process. Hence reverting the extra perf_callchain_store for non-hw regs leg. I was not able to bisect this, so I'm not really sure why this was needed in v5.2 and why it's not working anymore, but I could see double entries as far as v5.10. I did the test for both ORC and framepointer unwind with and without this fix and except for the initial entry the stacktraces are the same.
Acked-by: Song Liu Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20251104215405.168643-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov Acked-by: Steven Rostedt (Google) --- arch/x86/events/core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 745caa6c15a3..fa6c47b50989 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2789,13 +2789,13 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re return; } - if (perf_callchain_store(entry, regs->ip)) - return; - - if (perf_hw_regs(regs)) + if (perf_hw_regs(regs)) { + if (perf_callchain_store(entry, regs->ip)) + return; unwind_start(&state, current, regs, NULL); - else + } else { unwind_start(&state, current, NULL, (void *)regs->sp); + } for (; !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); From 20a0bc10272fa17a44fc857c31574a8306f60d20 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 4 Nov 2025 22:54:03 +0100 Subject: [PATCH 118/543] x86/fgraph,bpf: Fix stack ORC unwind from kprobe_multi return probe Currently we don't get stack trace via ORC unwinder on top of fgraph exit handler. We can see that when generating stacktrace from kretprobe_multi bpf program which is based on fprobe/fgraph. The reason is that the ORC unwind code won't get past the return_to_handler callback installed by fgraph return probe machinery. Solving this by creating stack frame in return_to_handler expected by ftrace_graph_ret_addr function to recover original return address and continue with the unwind. Also updating the pt_regs data with cs/flags/rsp which are needed for successful stack retrieval from ebpf bpf_get_stackid helper.
- in get_perf_callchain we check user_mode(regs) so CS has to be set - in perf_callchain_kernel we call perf_hw_regs(regs), so EFLAGS/FIXED has to be unset Acked-by: Masami Hiramatsu (Google) Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20251104215405.168643-3-jolsa@kernel.org Signed-off-by: Alexei Starovoitov Acked-by: Steven Rostedt (Google) --- arch/x86/include/asm/ftrace.h | 5 +++++ arch/x86/kernel/ftrace_64.S | 8 +++++++- include/linux/ftrace.h | 10 +++++++++- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 93156ac4ffe0..b08c95872eed 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -56,6 +56,11 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs) return &arch_ftrace_regs(fregs)->regs; } +#define arch_ftrace_partial_regs(regs) do { \ + regs->flags &= ~X86_EFLAGS_FIXED; \ + regs->cs = __KERNEL_CS; \ +} while (0) + #define arch_ftrace_fill_perf_regs(fregs, _regs) do { \ (_regs)->ip = arch_ftrace_regs(fregs)->regs.ip; \ (_regs)->sp = arch_ftrace_regs(fregs)->regs.sp; \ diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 367da3638167..823dbdd0eb41 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -354,12 +354,17 @@ SYM_CODE_START(return_to_handler) UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR + /* Restore return_to_handler value that got eaten by previous ret instruction. */ + subq $8, %rsp + UNWIND_HINT_FUNC + /* Save ftrace_regs for function exit context */ subq $(FRAME_SIZE), %rsp movq %rax, RAX(%rsp) movq %rdx, RDX(%rsp) movq %rbp, RBP(%rsp) + movq %rsp, RSP(%rsp) movq %rsp, %rdi call ftrace_return_to_handler @@ -368,7 +373,8 @@ SYM_CODE_START(return_to_handler) movq RDX(%rsp), %rdx movq RAX(%rsp), %rax - addq $(FRAME_SIZE), %rsp + addq $(FRAME_SIZE) + 8, %rsp + /* * Jump back to the old return address. 
This cannot be JMP_NOSPEC rdi * since IBT would demand that contain ENDBR, which simply isn't so for diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7ded7df6e9b5..07f8c309e432 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -193,6 +193,10 @@ static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs #if !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) || \ defined(CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS) +#ifndef arch_ftrace_partial_regs +#define arch_ftrace_partial_regs(regs) do {} while (0) +#endif + static __always_inline struct pt_regs * ftrace_partial_regs(struct ftrace_regs *fregs, struct pt_regs *regs) { @@ -202,7 +206,11 @@ ftrace_partial_regs(struct ftrace_regs *fregs, struct pt_regs *regs) * Since arch_ftrace_get_regs() will check some members and may return * NULL, we can not use it. */ - return &arch_ftrace_regs(fregs)->regs; + regs = &arch_ftrace_regs(fregs)->regs; + + /* Allow arch specific updates to regs. */ + arch_ftrace_partial_regs(regs); + return regs; } #endif /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS || CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS */ From c9e208fa93cd66f8077ee15df0728e62b105a687 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 4 Nov 2025 22:54:04 +0100 Subject: [PATCH 119/543] selftests/bpf: Add stacktrace ips test for kprobe_multi/kretprobe_multi Adding test that attaches kprobe/kretprobe multi and verifies the ORC stacktrace matches expected functions. Adding bpf_testmod_stacktrace_test function to bpf_testmod kernel module which is called through several functions so we get reliable call path for stacktrace. The test is only for ORC unwinder to keep it simple. 
Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20251104215405.168643-4-jolsa@kernel.org Signed-off-by: Alexei Starovoitov Acked-by: Steven Rostedt (Google) --- .../selftests/bpf/prog_tests/stacktrace_ips.c | 104 ++++++++++++++++++ .../selftests/bpf/progs/stacktrace_ips.c | 41 +++++++ .../selftests/bpf/test_kmods/bpf_testmod.c | 26 +++++ 3 files changed, 171 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c create mode 100644 tools/testing/selftests/bpf/progs/stacktrace_ips.c diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c new file mode 100644 index 000000000000..6fca459ba550 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "stacktrace_ips.skel.h" + +#ifdef __x86_64__ +static int check_stacktrace_ips(int fd, __u32 key, int cnt, ...) +{ + __u64 ips[PERF_MAX_STACK_DEPTH]; + struct ksyms *ksyms = NULL; + int i, err = 0; + va_list args; + + /* sorted by addr */ + ksyms = load_kallsyms_local(); + if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_local")) + return -1; + + /* unlikely, but... 
*/ + if (!ASSERT_LT(cnt, PERF_MAX_STACK_DEPTH, "check_max")) + return -1; + + err = bpf_map_lookup_elem(fd, &key, ips); + if (err) + goto out; + + /* + * Compare all symbols provided via arguments with stacktrace ips, + * and their related symbol addresses.t + */ + va_start(args, cnt); + + for (i = 0; i < cnt; i++) { + unsigned long val; + struct ksym *ksym; + + val = va_arg(args, unsigned long); + ksym = ksym_search_local(ksyms, ips[i]); + if (!ASSERT_OK_PTR(ksym, "ksym_search_local")) + break; + ASSERT_EQ(ksym->addr, val, "stack_cmp"); + } + + va_end(args); + +out: + free_kallsyms_local(ksyms); + return err; +} + +static void test_stacktrace_ips_kprobe_multi(bool retprobe) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts, + .retprobe = retprobe + ); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct stacktrace_ips *skel; + + skel = stacktrace_ips__open_and_load(); + if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load")) + return; + + if (!skel->kconfig->CONFIG_UNWINDER_ORC) { + test__skip(); + goto cleanup; + } + + skel->links.kprobe_multi_test = bpf_program__attach_kprobe_multi_opts( + skel->progs.kprobe_multi_test, + "bpf_testmod_stacktrace_test", &opts); + if (!ASSERT_OK_PTR(skel->links.kprobe_multi_test, "bpf_program__attach_kprobe_multi_opts")) + goto cleanup; + + trigger_module_test_read(1); + + load_kallsyms(); + + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + +cleanup: + stacktrace_ips__destroy(skel); +} + +static void __test_stacktrace_ips(void) +{ + if (test__start_subtest("kprobe_multi")) + test_stacktrace_ips_kprobe_multi(false); + if (test__start_subtest("kretprobe_multi")) + test_stacktrace_ips_kprobe_multi(true); +} +#else +static void __test_stacktrace_ips(void) +{ + test__skip(); +} +#endif + +void 
test_stacktrace_ips(void) +{ + __test_stacktrace_ips(); +} diff --git a/tools/testing/selftests/bpf/progs/stacktrace_ips.c b/tools/testing/selftests/bpf/progs/stacktrace_ips.c new file mode 100644 index 000000000000..e2eb30945c1b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/stacktrace_ips.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook + +#include +#include +#include + +#ifndef PERF_MAX_STACK_DEPTH +#define PERF_MAX_STACK_DEPTH 127 +#endif + +typedef __u64 stack_trace_t[PERF_MAX_STACK_DEPTH]; + +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(max_entries, 16384); + __type(key, __u32); + __type(value, stack_trace_t); +} stackmap SEC(".maps"); + +extern bool CONFIG_UNWINDER_ORC __kconfig __weak; + +/* + * This function is here to have CONFIG_UNWINDER_ORC + * used and added to object BTF. + */ +int unused(void) +{ + return CONFIG_UNWINDER_ORC ? 0 : 1; +} + +__u32 stack_key; + +SEC("kprobe.multi") +int kprobe_multi_test(struct pt_regs *ctx) +{ + stack_key = bpf_get_stackid(ctx, &stackmap, 0); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 8074bc5f6f20..ed0a4721d8fd 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -417,6 +417,30 @@ noinline int bpf_testmod_fentry_test11(u64 a, void *b, short c, int d, return a + (long)b + c + d + (long)e + f + g + h + i + j + k; } +noinline void bpf_testmod_stacktrace_test(void) +{ + /* used for stacktrace test as attach function */ + asm volatile (""); +} + +noinline void bpf_testmod_stacktrace_test_3(void) +{ + bpf_testmod_stacktrace_test(); + asm volatile (""); +} + +noinline void bpf_testmod_stacktrace_test_2(void) +{ + bpf_testmod_stacktrace_test_3(); + asm volatile (""); +} + +noinline void bpf_testmod_stacktrace_test_1(void) +{ + 
bpf_testmod_stacktrace_test_2(); + asm volatile (""); +} + int bpf_testmod_fentry_ok; noinline ssize_t @@ -497,6 +521,8 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj, 21, 22, 23, 24, 25, 26) != 231) goto out; + bpf_testmod_stacktrace_test_1(); + bpf_testmod_fentry_ok = 1; out: return -EIO; /* always fail */ From 3490d29964bdd524366d266b655112cb549c7460 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 4 Nov 2025 22:54:05 +0100 Subject: [PATCH 120/543] selftests/bpf: Add stacktrace ips test for raw_tp Adding test that verifies we get expected initial 2 entries from stacktrace for rawtp probe via ORC unwind. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20251104215405.168643-5-jolsa@kernel.org Signed-off-by: Alexei Starovoitov Acked-by: Steven Rostedt (Google) --- .../selftests/bpf/prog_tests/stacktrace_ips.c | 46 +++++++++++++++++++ .../selftests/bpf/progs/stacktrace_ips.c | 8 ++++ 2 files changed, 54 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c index 6fca459ba550..c9efdd2a5b18 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c @@ -84,12 +84,58 @@ static void test_stacktrace_ips_kprobe_multi(bool retprobe) stacktrace_ips__destroy(skel); } +static void test_stacktrace_ips_raw_tp(void) +{ + __u32 info_len = sizeof(struct bpf_prog_info); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct bpf_prog_info info = {}; + struct stacktrace_ips *skel; + __u64 bpf_prog_ksym = 0; + int err; + + skel = stacktrace_ips__open_and_load(); + if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load")) + return; + + if (!skel->kconfig->CONFIG_UNWINDER_ORC) { + test__skip(); + goto cleanup; + } + + skel->links.rawtp_test = bpf_program__attach_raw_tracepoint( + skel->progs.rawtp_test, + "bpf_testmod_test_read"); + if (!ASSERT_OK_PTR(skel->links.rawtp_test, 
"bpf_program__attach_raw_tracepoint")) + goto cleanup; + + /* get bpf program address */ + info.jited_ksyms = ptr_to_u64(&bpf_prog_ksym); + info.nr_jited_ksyms = 1; + err = bpf_prog_get_info_by_fd(bpf_program__fd(skel->progs.rawtp_test), + &info, &info_len); + if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd")) + goto cleanup; + + trigger_module_test_read(1); + + load_kallsyms(); + + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 2, + bpf_prog_ksym, + ksym_get_addr("bpf_trace_run2")); + +cleanup: + stacktrace_ips__destroy(skel); +} + static void __test_stacktrace_ips(void) { if (test__start_subtest("kprobe_multi")) test_stacktrace_ips_kprobe_multi(false); if (test__start_subtest("kretprobe_multi")) test_stacktrace_ips_kprobe_multi(true); + if (test__start_subtest("raw_tp")) + test_stacktrace_ips_raw_tp(); } #else static void __test_stacktrace_ips(void) diff --git a/tools/testing/selftests/bpf/progs/stacktrace_ips.c b/tools/testing/selftests/bpf/progs/stacktrace_ips.c index e2eb30945c1b..a96c8150d7f5 100644 --- a/tools/testing/selftests/bpf/progs/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/progs/stacktrace_ips.c @@ -38,4 +38,12 @@ int kprobe_multi_test(struct pt_regs *ctx) return 0; } +SEC("raw_tp/bpf_testmod_test_read") +int rawtp_test(void *ctx) +{ + /* Skip ebpf program entry in the stack. */ + stack_key = bpf_get_stackid(ctx, &stackmap, 0); + return 0; +} + char _license[] SEC("license") = "GPL"; From 59b0afd01b2ce353ab422ea9c8375b03db313a21 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Mon, 27 Oct 2025 23:09:34 +0800 Subject: [PATCH 121/543] crypto: hisilicon/qm - Fix device reference leak in qm_get_qos_value The qm_get_qos_value() function calls bus_find_device_by_name() which increases the device reference count, but fails to call put_device() to balance the reference count and lead to a device reference leak. Add put_device() calls in both the error path and success path to properly balance the reference count. 
Found via static analysis. Fixes: 22d7a6c39cab ("crypto: hisilicon/qm - add pci bdf number check") Cc: stable@vger.kernel.org Signed-off-by: Miaoqian Lin Reviewed-by: Longfang Liu Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/qm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index a5b96adf2d1e..3b391a146635 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3871,10 +3871,12 @@ static ssize_t qm_get_qos_value(struct hisi_qm *qm, const char *buf, pdev = container_of(dev, struct pci_dev, dev); if (pci_physfn(pdev) != qm->pdev) { pci_err(qm->pdev, "the pdev input does not match the pf!\n"); + put_device(dev); return -EINVAL; } *fun_index = pdev->devfn; + put_device(dev); return 0; } From 82420bd4e17bdaba8453fbf9e10c58c9ed0c9727 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 6 Nov 2025 11:46:45 +0100 Subject: [PATCH 122/543] ALSA: hda/hdmi: Fix breakage at probing nvhdmi-mcp driver After restructuring and splitting the HDMI codec driver code, each HDMI codec driver contains the own build_controls and build_pcms ops. A copy-n-paste error put the wrong entries for nvhdmi-mcp driver; both build_controls and build_pcms are swapped. Unfortunately both callbacks have the very same form, and the compiler didn't complain it, either. This resulted in a NULL dereference because the PCM instance hasn't been initialized at calling the build_controls callback. Fix it by passing the proper entries. 
Fixes: ad781b550f9a ("ALSA: hda/hdmi: Rewrite to new probe method") Cc: Link: https://bugzilla.kernel.org/show_bug.cgi?id=220743 Link: https://patch.msgid.link/20251106104647.25805-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/hda/codecs/hdmi/nvhdmi-mcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/hda/codecs/hdmi/nvhdmi-mcp.c b/sound/hda/codecs/hdmi/nvhdmi-mcp.c index 8fd8d76fa72f..1c5fdfe872f2 100644 --- a/sound/hda/codecs/hdmi/nvhdmi-mcp.c +++ b/sound/hda/codecs/hdmi/nvhdmi-mcp.c @@ -350,8 +350,8 @@ static int nvhdmi_mcp_probe(struct hda_codec *codec, static const struct hda_codec_ops nvhdmi_mcp_codec_ops = { .probe = nvhdmi_mcp_probe, .remove = snd_hda_hdmi_simple_remove, - .build_controls = nvhdmi_mcp_build_pcms, - .build_pcms = nvhdmi_mcp_build_controls, + .build_pcms = nvhdmi_mcp_build_pcms, + .build_controls = nvhdmi_mcp_build_controls, .init = nvhdmi_mcp_init, .unsol_event = snd_hda_hdmi_simple_unsol_event, }; From 54afb047cd7eb40149f3fc42d69fd4ddde2be9f0 Mon Sep 17 00:00:00 2001 From: Edip Hazuri Date: Wed, 15 Oct 2025 21:10:44 +0300 Subject: [PATCH 123/543] platform/x86: hp-wmi: mark Victus 16-r0 and 16-s0 for victus_s fan and thermal profile support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds Victus 16-r0 (8bbe) and Victus 16-s0(8bd4, 8bd5) laptop DMI board name into existing list Signed-off-by: Edip Hazuri Link: https://patch.msgid.link/20251015181042.23961-3-edip@medip.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/hp/hp-wmi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/hp/hp-wmi.c b/drivers/platform/x86/hp/hp-wmi.c index 8b3533d6ba09..9a668e258795 100644 --- a/drivers/platform/x86/hp/hp-wmi.c +++ b/drivers/platform/x86/hp/hp-wmi.c @@ -92,8 +92,9 @@ static const char * const victus_thermal_profile_boards[] = { "8A25" }; -/* DMI Board names of Victus 16-r1000 and Victus 
16-s1000 laptops */ +/* DMI Board names of Victus 16-r and Victus 16-s laptops */ static const char * const victus_s_thermal_profile_boards[] = { + "8BBE", "8BD4", "8BD5", "8C99", "8C9C" }; From 5c72329716d0858621021193330594d5d26bf44d Mon Sep 17 00:00:00 2001 From: Jia Ston Date: Wed, 29 Oct 2025 05:18:38 +0000 Subject: [PATCH 124/543] platform/x86: huawei-wmi: add keys for HONOR models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HONOR MagicBook X16/X14 models produced in 2025 cannot use the Print Screen and YOYO keys properly, with the system reporting them as unknown key presses (codes: 0x028b and 0x028e). To resolve this, a key_entry is added for both the HONOR Print Screen key and the HONOR YOYO key, ensuring they function correctly on these models. Signed-off-by: Ston Jia Link: https://patch.msgid.link/20251029051804.220111-1-ston.jia@outlook.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/huawei-wmi.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/platform/x86/huawei-wmi.c b/drivers/platform/x86/huawei-wmi.c index c3772df34679..8a4c54089ace 100644 --- a/drivers/platform/x86/huawei-wmi.c +++ b/drivers/platform/x86/huawei-wmi.c @@ -81,6 +81,10 @@ static const struct key_entry huawei_wmi_keymap[] = { { KE_KEY, 0x289, { KEY_WLAN } }, // Huawei |M| key { KE_KEY, 0x28a, { KEY_CONFIG } }, + // HONOR YOYO key + { KE_KEY, 0x28b, { KEY_NOTIFICATION_CENTER } }, + // HONOR print screen + { KE_KEY, 0x28e, { KEY_PRINT } }, // Keyboard backlit { KE_IGNORE, 0x293, { KEY_KBDILLUMTOGGLE } }, { KE_IGNORE, 0x294, { KEY_KBDILLUMUP } }, From fb146a38cb119c8d69633851c7a2ce2c8d34861a Mon Sep 17 00:00:00 2001 From: Krishna Chomal Date: Sat, 18 Oct 2025 16:40:01 +0530 Subject: [PATCH 125/543] platform/x86: hp-wmi: Add Omen 16-wf1xxx fan support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The newer HP Omen laptops, such as Omen 16-wf1xxx, use 
the same WMI-based thermal profile interface as Victus 16-r1000 and 16-s1000 models. Add the DMI board name "8C78" to the victus_s_thermal_profile_boards list to enable proper fan and thermal mode control. Tested on: HP Omen 16-wf1xxx (board 8C78) Result: * Fan RPMs are readable * echo 0 | sudo tee /sys/devices/platform/hp-wmi/hwmon/*/pwm1_enable allows the fans to run on max RPM. Signed-off-by: Krishna Chomal Link: https://patch.msgid.link/20251018111001.56625-1-krishna.chomal108@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/hp/hp-wmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/hp/hp-wmi.c b/drivers/platform/x86/hp/hp-wmi.c index 9a668e258795..e10c75d91f24 100644 --- a/drivers/platform/x86/hp/hp-wmi.c +++ b/drivers/platform/x86/hp/hp-wmi.c @@ -95,7 +95,7 @@ static const char * const victus_thermal_profile_boards[] = { /* DMI Board names of Victus 16-r and Victus 16-s laptops */ static const char * const victus_s_thermal_profile_boards[] = { "8BBE", "8BD4", "8BD5", - "8C99", "8C9C" + "8C78", "8C99", "8C9C", }; enum hp_wmi_radio { From a229809c18926e79aeca232d5b502157beb0dec3 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 22 Oct 2025 14:17:33 -0700 Subject: [PATCH 126/543] platform/x86: intel-uncore-freq: Add additional client processors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Intel uncore frequency driver support for Pantherlake, Wildcatlake and Novalake processors. 
Signed-off-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20251022211733.3565526-1-sathyanarayanan.kuppuswamy@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- .../platform/x86/intel/uncore-frequency/uncore-frequency.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c index 2a6897035150..0dfc552b2802 100644 --- a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c +++ b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c @@ -256,6 +256,10 @@ static const struct x86_cpu_id intel_uncore_cpu_ids[] = { X86_MATCH_VFM(INTEL_ARROWLAKE, NULL), X86_MATCH_VFM(INTEL_ARROWLAKE_H, NULL), X86_MATCH_VFM(INTEL_LUNARLAKE_M, NULL), + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, NULL), + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, NULL), + X86_MATCH_VFM(INTEL_NOVALAKE, NULL), + X86_MATCH_VFM(INTEL_NOVALAKE_L, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, intel_uncore_cpu_ids); From 5f20bc206beb902e32b77216cb7935b46ca00b0a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 23 Oct 2025 12:46:14 -0700 Subject: [PATCH 127/543] platform/x86: ISST: isst_if.h: fix all kernel-doc warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix all kernel-doc warnings in : - don't use "[]" in the variable name in kernel-doc - add a few missing entries - change "power_domain" to "power_domain_id" in kernel-doc to match the struct member name - add a leading '@' on a few existing kernel-doc lines - use '_' instead of '-' in struct member names Examples (but not all 27 warnings): Warning: include/uapi/linux/isst_if.h:63 struct member 'cpu_map' not described in 'isst_if_cpu_maps' Warning: ../include/uapi/linux/isst_if.h:95 struct member 'req_count' not described in 'isst_if_io_regs' Warning: include/uapi/linux/isst_if.h:132 struct member 'mbox_cmd' not described in 'isst_if_mbox_cmds' 
Warning: ../include/uapi/linux/isst_if.h:183 struct member 'supported' not described in 'isst_core_power' Warning: ../include/uapi/linux/isst_if.h:206 struct member 'power_domain_id' not described in 'isst_clos_param' Warning: ../include/uapi/linux/isst_if.h:239 struct member 'assoc_info' not described in 'isst_if_clos_assoc_cmds' Warning: ../include/uapi/linux/isst_if.h:286 struct member 'sst_tf_support' not described in 'isst_perf_level_info' Warning: ../include/uapi/linux/isst_if.h:375 struct member 'trl_freq_mhz' not described in 'isst_perf_level_data_info' Warning: ../include/uapi/linux/isst_if.h:475 struct member 'max_buckets' not described in 'isst_turbo_freq_info' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20251023194615.180824-1-rdunlap@infradead.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/uapi/linux/isst_if.h | 50 +++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h index 8197a4800604..40aa545101a3 100644 --- a/include/uapi/linux/isst_if.h +++ b/include/uapi/linux/isst_if.h @@ -52,7 +52,7 @@ struct isst_if_cpu_map { /** * struct isst_if_cpu_maps - structure for CPU map IOCTL * @cmd_count: Number of CPU mapping command in cpu_map[] - * @cpu_map[]: Holds one or more CPU map data structure + * @cpu_map: Holds one or more CPU map data structure * * This structure used with ioctl ISST_IF_GET_PHY_ID to send * one or more CPU mapping commands. Here IOCTL return value indicates @@ -82,8 +82,8 @@ struct isst_if_io_reg { /** * struct isst_if_io_regs - structure for IO register commands - * @cmd_count: Number of io reg commands in io_reg[] - * @io_reg[]: Holds one or more io_reg command structure + * @req_count: Number of io reg commands in io_reg[] + * @io_reg: Holds one or more io_reg command structure * * This structure used with ioctl ISST_IF_IO_CMD to send * one or more read/write commands to PUNIT. 
Here IOCTL return value @@ -120,7 +120,7 @@ struct isst_if_mbox_cmd { /** * struct isst_if_mbox_cmds - structure for mailbox commands * @cmd_count: Number of mailbox commands in mbox_cmd[] - * @mbox_cmd[]: Holds one or more mbox commands + * @mbox_cmd: Holds one or more mbox commands * * This structure used with ioctl ISST_IF_MBOX_COMMAND to send * one or more mailbox commands to PUNIT. Here IOCTL return value @@ -152,7 +152,7 @@ struct isst_if_msr_cmd { /** * struct isst_if_msr_cmds - structure for msr commands * @cmd_count: Number of mailbox commands in msr_cmd[] - * @msr_cmd[]: Holds one or more msr commands + * @msr_cmd: Holds one or more msr commands * * This structure used with ioctl ISST_IF_MSR_COMMAND to send * one or more MSR commands. IOCTL return value indicates number of @@ -167,8 +167,9 @@ struct isst_if_msr_cmds { * struct isst_core_power - Structure to get/set core_power feature * @get_set: 0: Get, 1: Set * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @enable: Feature enable status + * @supported: Power domain supports SST_CP interface * @priority_type: Priority type for the feature (ordered/proportional) * * Structure to get/set core_power feature state using IOCTL @@ -187,11 +188,11 @@ struct isst_core_power { * struct isst_clos_param - Structure to get/set clos praram * @get_set: 0: Get, 1: Set * @socket_id: Socket/package id - * @power_domain: Power Domain id - * clos: Clos ID for the parameters - * min_freq_mhz: Minimum frequency in MHz - * max_freq_mhz: Maximum frequency in MHz - * prop_prio: Proportional priority from 0-15 + * @power_domain_id: Power Domain id + * @clos: Clos ID for the parameters + * @min_freq_mhz: Minimum frequency in MHz + * @max_freq_mhz: Maximum frequency in MHz + * @prop_prio: Proportional priority from 0-15 * * Structure to get/set per clos property using IOCTL * ISST_IF_CLOS_PARAM. 
@@ -209,7 +210,7 @@ struct isst_clos_param { /** * struct isst_if_clos_assoc - Structure to assign clos to a CPU * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @logical_cpu: CPU number * @clos: Clos ID to assign to the logical CPU * @@ -228,6 +229,7 @@ struct isst_if_clos_assoc { * @get_set: Request is for get or set * @punit_cpu_map: Set to 1 if the CPU number is punit numbering not * Linux CPU number + * @assoc_info: CLOS data for this CPU * * Structure used to get/set associate CPUs to clos using IOCTL * ISST_IF_CLOS_ASSOC. @@ -257,7 +259,7 @@ struct isst_tpmi_instance_count { /** * struct isst_perf_level_info - Structure to get information on SST-PP levels * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @logical_cpu: CPU number * @clos: Clos ID to assign to the logical CPU * @max_level: Maximum performance level supported by the platform @@ -267,8 +269,8 @@ struct isst_tpmi_instance_count { * @feature_state: SST-BF and SST-TF (enabled/disabled) status at current level * @locked: SST-PP performance level change is locked/unlocked * @enabled: SST-PP feature is enabled or not - * @sst-tf_support: SST-TF support status at this level - * @sst-bf_support: SST-BF support status at this level + * @sst_tf_support: SST-TF support status at this level + * @sst_bf_support: SST-BF support status at this level * * Structure to get SST-PP details using IOCTL ISST_IF_PERF_LEVELS. */ @@ -289,7 +291,7 @@ struct isst_perf_level_info { /** * struct isst_perf_level_control - Structure to set SST-PP level * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @level: level to set * * Structure used change SST-PP level using IOCTL ISST_IF_PERF_SET_LEVEL. 
@@ -303,7 +305,7 @@ struct isst_perf_level_control { /** * struct isst_perf_feature_control - Structure to activate SST-BF/SST-TF * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @feature: bit 0 = SST-BF state, bit 1 = SST-TF state * * Structure used to enable SST-BF/SST-TF using IOCTL ISST_IF_PERF_SET_FEATURE. @@ -320,7 +322,7 @@ struct isst_perf_feature_control { /** * struct isst_perf_level_data_info - Structure to get SST-PP level details * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @level: SST-PP level for which caller wants to get information * @tdp_ratio: TDP Ratio * @base_freq_mhz: Base frequency in MHz @@ -341,8 +343,8 @@ struct isst_perf_feature_control { * @pm_fabric_freq_mhz: Fabric (Uncore) minimum frequency * @max_buckets: Maximum trl buckets * @max_trl_levels: Maximum trl levels - * @bucket_core_counts[TRL_MAX_BUCKETS]: Number of cores per bucket - * @trl_freq_mhz[TRL_MAX_LEVELS][TRL_MAX_BUCKETS]: maximum frequency + * @bucket_core_counts: Number of cores per bucket + * @trl_freq_mhz: maximum frequency * for a bucket and trl level * * Structure used to get information on frequencies and TDP for a SST-PP @@ -402,7 +404,7 @@ struct isst_perf_level_fabric_info { /** * struct isst_perf_level_cpu_mask - Structure to get SST-PP level CPU mask * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @level: SST-PP level for which caller wants to get information * @punit_cpu_map: Set to 1 if the CPU number is punit numbering not * Linux CPU number. 
If 0 CPU buffer is copied to user space @@ -430,7 +432,7 @@ struct isst_perf_level_cpu_mask { /** * struct isst_base_freq_info - Structure to get SST-BF frequencies * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @level: SST-PP level for which caller wants to get information * @high_base_freq_mhz: High priority CPU base frequency * @low_base_freq_mhz: Low priority CPU base frequency @@ -453,9 +455,11 @@ struct isst_base_freq_info { /** * struct isst_turbo_freq_info - Structure to get SST-TF frequencies * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @level: SST-PP level for which caller wants to get information * @max_clip_freqs: Maximum number of low priority core clipping frequencies + * @max_buckets: Maximum trl buckets + * @max_trl_levels: Maximum trl levels * @lp_clip_freq_mhz: Clip frequencies per trl level * @bucket_core_counts: Maximum number of cores for a bucket * @trl_freq_mhz: Frequencies per trl level for each bucket From bd4f9f113dda07293ed4002a17d14f62121d324f Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Mon, 3 Nov 2025 14:01:44 -0500 Subject: [PATCH 128/543] platform/x86: alienware-wmi-wmax: Fix "Alienware m16 R1 AMD" quirk order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Quirks are matched using dmi_first_match(), therefore move the "Alienware m16 R1 AMD" entry above other m16 entries. 
Reported-by: Cihan Ozakca Fixes: e2468dc70074 ("Revert "platform/x86: alienware-wmi-wmax: Add G-Mode support to Alienware m16 R1"") Cc: stable@vger.kernel.org Signed-off-by: Kurt Borja Link: https://patch.msgid.link/20251103-family-supp-v1-1-a241075d1787@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/alienware-wmi-wmax.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/platform/x86/dell/alienware-wmi-wmax.c b/drivers/platform/x86/dell/alienware-wmi-wmax.c index f417dcc9af35..53f476604269 100644 --- a/drivers/platform/x86/dell/alienware-wmi-wmax.c +++ b/drivers/platform/x86/dell/alienware-wmi-wmax.c @@ -121,14 +121,6 @@ static const struct dmi_system_id awcc_dmi_table[] __initconst = { }, .driver_data = &generic_quirks, }, - { - .ident = "Alienware m16 R1", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m16 R1"), - }, - .driver_data = &g_series_quirks, - }, { .ident = "Alienware m16 R1 AMD", .matches = { @@ -137,6 +129,14 @@ static const struct dmi_system_id awcc_dmi_table[] __initconst = { }, .driver_data = &generic_quirks, }, + { + .ident = "Alienware m16 R1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m16 R1"), + }, + .driver_data = &g_series_quirks, + }, { .ident = "Alienware m16 R2", .matches = { From 173b23808768ce5a9210d7783b06dce8a0cb3c2e Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Mon, 3 Nov 2025 14:01:45 -0500 Subject: [PATCH 129/543] platform/x86: alienware-wmi-wmax: Drop redundant DMI entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The awcc_dmi_table[] uses DMI_MATCH() that supports partial matches. As there is already "Alienware Area-51m" entry, "Alienware Area-51m R2" entry is redundant. 
Signed-off-by: Kurt Borja Link: https://patch.msgid.link/20251103-family-supp-v1-2-a241075d1787@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/alienware-wmi-wmax.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/platform/x86/dell/alienware-wmi-wmax.c b/drivers/platform/x86/dell/alienware-wmi-wmax.c index 53f476604269..b911921575ad 100644 --- a/drivers/platform/x86/dell/alienware-wmi-wmax.c +++ b/drivers/platform/x86/dell/alienware-wmi-wmax.c @@ -97,14 +97,6 @@ static const struct dmi_system_id awcc_dmi_table[] __initconst = { }, .driver_data = &generic_quirks, }, - { - .ident = "Alienware Area-51m R2", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware Area-51m R2"), - }, - .driver_data = &generic_quirks, - }, { .ident = "Alienware m15 R5", .matches = { From e8c3c875e1017c04c594f0e6127ba82095b1cb87 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Mon, 3 Nov 2025 14:01:46 -0500 Subject: [PATCH 130/543] platform/x86: alienware-wmi-wmax: Add support for the whole "M" family MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for the whole "Alienware M" laptop family. 
Cc: stable@vger.kernel.org Signed-off-by: Kurt Borja Link: https://patch.msgid.link/20251103-family-supp-v1-3-a241075d1787@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- .../platform/x86/dell/alienware-wmi-wmax.c | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/drivers/platform/x86/dell/alienware-wmi-wmax.c b/drivers/platform/x86/dell/alienware-wmi-wmax.c index b911921575ad..53d09978efbd 100644 --- a/drivers/platform/x86/dell/alienware-wmi-wmax.c +++ b/drivers/platform/x86/dell/alienware-wmi-wmax.c @@ -98,18 +98,10 @@ static const struct dmi_system_id awcc_dmi_table[] __initconst = { .driver_data = &generic_quirks, }, { - .ident = "Alienware m15 R5", + .ident = "Alienware m15", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m15 R5"), - }, - .driver_data = &generic_quirks, - }, - { - .ident = "Alienware m15 R7", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m15 R7"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m15"), }, .driver_data = &generic_quirks, }, @@ -138,18 +130,18 @@ static const struct dmi_system_id awcc_dmi_table[] __initconst = { .driver_data = &generic_quirks, }, { - .ident = "Alienware m17 R5", + .ident = "Alienware m17", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m17 R5 AMD"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m17"), }, .driver_data = &generic_quirks, }, { - .ident = "Alienware m18 R2", + .ident = "Alienware m18", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m18 R2"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m18"), }, .driver_data = &generic_quirks, }, From 21ebfff1cf4727bc325c89b94ed93741f870744f Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Mon, 3 Nov 2025 14:01:47 -0500 Subject: [PATCH 131/543] platform/x86: alienware-wmi-wmax: Add support for the whole "X" family MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for the whole "Alienware X" laptop family. Cc: stable@vger.kernel.org Signed-off-by: Kurt Borja Link: https://patch.msgid.link/20251103-family-supp-v1-4-a241075d1787@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/alienware-wmi-wmax.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/platform/x86/dell/alienware-wmi-wmax.c b/drivers/platform/x86/dell/alienware-wmi-wmax.c index 53d09978efbd..c545eca9192f 100644 --- a/drivers/platform/x86/dell/alienware-wmi-wmax.c +++ b/drivers/platform/x86/dell/alienware-wmi-wmax.c @@ -146,26 +146,18 @@ static const struct dmi_system_id awcc_dmi_table[] __initconst = { .driver_data = &generic_quirks, }, { - .ident = "Alienware x15 R1", + .ident = "Alienware x15", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware x15 R1"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware x15"), }, .driver_data = &generic_quirks, }, { - .ident = "Alienware x15 R2", + .ident = "Alienware x17", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware x15 R2"), - }, - .driver_data = &generic_quirks, - }, - { - .ident = "Alienware x17 R2", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware x17 R2"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware x17"), }, .driver_data = &generic_quirks, }, From a6003d90f02863898babbcb3f55b1cd33f7867c2 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Mon, 3 Nov 2025 14:01:48 -0500 Subject: [PATCH 132/543] platform/x86: alienware-wmi-wmax: Add support for the whole "G" family MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for the whole "Dell G" laptop family. 
Cc: stable@vger.kernel.org Signed-off-by: Kurt Borja Link: https://patch.msgid.link/20251103-family-supp-v1-5-a241075d1787@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- .../platform/x86/dell/alienware-wmi-wmax.c | 56 +++---------------- 1 file changed, 8 insertions(+), 48 deletions(-) diff --git a/drivers/platform/x86/dell/alienware-wmi-wmax.c b/drivers/platform/x86/dell/alienware-wmi-wmax.c index c545eca9192f..1c92db1ac087 100644 --- a/drivers/platform/x86/dell/alienware-wmi-wmax.c +++ b/drivers/platform/x86/dell/alienware-wmi-wmax.c @@ -162,74 +162,34 @@ static const struct dmi_system_id awcc_dmi_table[] __initconst = { .driver_data = &generic_quirks, }, { - .ident = "Dell Inc. G15 5510", + .ident = "Dell Inc. G15", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Dell G15 5510"), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell G15"), }, .driver_data = &g_series_quirks, }, { - .ident = "Dell Inc. G15 5511", + .ident = "Dell Inc. G16", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Dell G15 5511"), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell G16"), }, .driver_data = &g_series_quirks, }, { - .ident = "Dell Inc. G15 5515", + .ident = "Dell Inc. G3", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Dell G15 5515"), + DMI_MATCH(DMI_PRODUCT_NAME, "G3"), }, .driver_data = &g_series_quirks, }, { - .ident = "Dell Inc. G15 5530", + .ident = "Dell Inc. G5", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Dell G15 5530"), - }, - .driver_data = &g_series_quirks, - }, - { - .ident = "Dell Inc. G16 7630", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Dell G16 7630"), - }, - .driver_data = &g_series_quirks, - }, - { - .ident = "Dell Inc. 
G3 3500", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "G3 3500"), - }, - .driver_data = &g_series_quirks, - }, - { - .ident = "Dell Inc. G3 3590", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "G3 3590"), - }, - .driver_data = &g_series_quirks, - }, - { - .ident = "Dell Inc. G5 5500", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "G5 5500"), - }, - .driver_data = &g_series_quirks, - }, - { - .ident = "Dell Inc. G5 5505", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "G5 5505"), + DMI_MATCH(DMI_PRODUCT_NAME, "G5"), }, .driver_data = &g_series_quirks, }, From db4a3f0fbedb0398f77b9047e8b8bb2b49f355bb Mon Sep 17 00:00:00 2001 From: Antheas Kapenekakis Date: Fri, 24 Oct 2025 17:21:50 +0200 Subject: [PATCH 133/543] platform/x86/amd/pmc: Add support for Van Gogh SoC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ROG Xbox Ally (non-X) SoC features a similar architecture to the Steam Deck. While the Steam Deck supports S3 (s2idle causes a crash), this support was dropped by the Xbox Ally which only supports S0ix suspend. Since the handler is missing here, this causes the device to not suspend and the AMD GPU driver to crash while trying to resume afterwards due to a power hang.
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4659 Signed-off-by: Antheas Kapenekakis Reviewed-by: Mario Limonciello (AMD) Acked-by: Shyam Sundar S K Link: https://patch.msgid.link/20251024152152.3981721-2-lkml@antheas.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmc/pmc.c | 3 +++ drivers/platform/x86/amd/pmc/pmc.h | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/platform/x86/amd/pmc/pmc.c b/drivers/platform/x86/amd/pmc/pmc.c index bd318fd02ccf..cae3fcafd4d7 100644 --- a/drivers/platform/x86/amd/pmc/pmc.c +++ b/drivers/platform/x86/amd/pmc/pmc.c @@ -106,6 +106,7 @@ static void amd_pmc_get_ip_info(struct amd_pmc_dev *dev) switch (dev->cpu_id) { case AMD_CPU_ID_PCO: case AMD_CPU_ID_RN: + case AMD_CPU_ID_VG: case AMD_CPU_ID_YC: case AMD_CPU_ID_CB: dev->num_ips = 12; @@ -517,6 +518,7 @@ static int amd_pmc_get_os_hint(struct amd_pmc_dev *dev) case AMD_CPU_ID_PCO: return MSG_OS_HINT_PCO; case AMD_CPU_ID_RN: + case AMD_CPU_ID_VG: case AMD_CPU_ID_YC: case AMD_CPU_ID_CB: case AMD_CPU_ID_PS: @@ -717,6 +719,7 @@ static const struct pci_device_id pmc_pci_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_RV) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_SP) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_SHP) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_VG) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_ROOT) }, { } diff --git a/drivers/platform/x86/amd/pmc/pmc.h b/drivers/platform/x86/amd/pmc/pmc.h index 62f3e51020fd..fe3f53eb5955 100644 --- a/drivers/platform/x86/amd/pmc/pmc.h +++ b/drivers/platform/x86/amd/pmc/pmc.h @@ -156,6 +156,7 @@ void amd_mp2_stb_deinit(struct amd_pmc_dev *dev); #define AMD_CPU_ID_RN 0x1630 #define AMD_CPU_ID_PCO AMD_CPU_ID_RV #define AMD_CPU_ID_CZN AMD_CPU_ID_RN +#define AMD_CPU_ID_VG 0x1645 #define AMD_CPU_ID_YC 0x14B5 #define AMD_CPU_ID_CB 0x14D8 #define AMD_CPU_ID_PS 0x14E8 From 
c0ddc54016636dd8dedfaf1a3b482a95058e1db2 Mon Sep 17 00:00:00 2001 From: Antheas Kapenekakis Date: Fri, 24 Oct 2025 17:21:51 +0200 Subject: [PATCH 134/543] platform/x86/amd/pmc: Add spurious_8042 to Xbox Ally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Xbox Ally features a Van Gogh SoC that has spurious interrupts during resume. We get the following logs: atkbd_receive_byte: 20 callbacks suppressed atkbd serio0: Spurious ACK on isa0060/serio0. Some program might be trying to access hardware directly. So, add the spurious_8042 quirk for it. It does not have a keyboard, so this does not result in any functional loss. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4659 Signed-off-by: Antheas Kapenekakis Reviewed-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/20251024152152.3981721-3-lkml@antheas.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmc/pmc-quirks.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/platform/x86/amd/pmc/pmc-quirks.c b/drivers/platform/x86/amd/pmc/pmc-quirks.c index d63aaad7ef59..eb641ce0e982 100644 --- a/drivers/platform/x86/amd/pmc/pmc-quirks.c +++ b/drivers/platform/x86/amd/pmc/pmc-quirks.c @@ -122,6 +122,14 @@ static const struct dmi_system_id fwbug_list[] = { DMI_MATCH(DMI_PRODUCT_NAME, "21A1"), } }, + { + .ident = "ROG Xbox Ally RC73YA", + .driver_data = &quirk_spurious_8042, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_BOARD_NAME, "RC73YA"), + } + }, /* https://bugzilla.kernel.org/show_bug.cgi?id=218024 */ { .ident = "V14 G4 AMN", From f945afe01c6768dcfed7868c671a26e1164c2284 Mon Sep 17 00:00:00 2001 From: Antheas Kapenekakis Date: Wed, 8 Oct 2025 15:50:57 +0200 Subject: [PATCH 135/543] platform/x86/amd: pmc: Add Lenovo Legion Go 2 to pmc quirk list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Lenovo Legion Go 2 takes a 
long time to resume from suspend. This is due to it having an nvme resume handler that interferes with IOMMU mappings. It is a common issue with older Lenovo laptops. Adding it to that quirk list fixes this issue. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4618 Suggested-by: Mario Limonciello Signed-off-by: Antheas Kapenekakis Reviewed-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/20251008135057.731928-1-lkml@antheas.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmc/pmc-quirks.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/platform/x86/amd/pmc/pmc-quirks.c b/drivers/platform/x86/amd/pmc/pmc-quirks.c index eb641ce0e982..404e62ad293a 100644 --- a/drivers/platform/x86/amd/pmc/pmc-quirks.c +++ b/drivers/platform/x86/amd/pmc/pmc-quirks.c @@ -212,6 +212,23 @@ static const struct dmi_system_id fwbug_list[] = { DMI_MATCH(DMI_PRODUCT_NAME, "82ND"), } }, + /* https://gitlab.freedesktop.org/drm/amd/-/issues/4618 */ + { + .ident = "Lenovo Legion Go 2", + .driver_data = &quirk_s2idle_bug, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "83N0"), + } + }, + { + .ident = "Lenovo Legion Go 2", + .driver_data = &quirk_s2idle_bug, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "83N1"), + } + }, /* https://gitlab.freedesktop.org/drm/amd/-/issues/2684 */ { .ident = "HP Laptop 15s-eq2xxx", From 6b6eddc63ce871897d3a5bc4f8f593e698aef104 Mon Sep 17 00:00:00 2001 From: Haotian Zhang Date: Wed, 5 Nov 2025 14:22:46 +0800 Subject: [PATCH 136/543] ASoC: cs4271: Fix regulator leak on probe failure The probe function enables regulators at the beginning but fails to disable them in its error handling path. If any operation after enabling the regulators fails, the probe will exit with an error, leaving the regulators permanently enabled, which could lead to a resource leak. 
Add a proper error handling path to call regulator_bulk_disable() before returning an error. Fixes: 9a397f473657 ("ASoC: cs4271: add regulator consumer support") Signed-off-by: Haotian Zhang Reviewed-by: Charles Keepax Link: https://patch.msgid.link/20251105062246.1955-1-vulab@iscas.ac.cn Signed-off-by: Mark Brown --- sound/soc/codecs/cs4271.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sound/soc/codecs/cs4271.c b/sound/soc/codecs/cs4271.c index 6a3cca3d26c7..ead447a5da7f 100644 --- a/sound/soc/codecs/cs4271.c +++ b/sound/soc/codecs/cs4271.c @@ -581,17 +581,17 @@ static int cs4271_component_probe(struct snd_soc_component *component) ret = regcache_sync(cs4271->regmap); if (ret < 0) - return ret; + goto err_disable_regulator; ret = regmap_update_bits(cs4271->regmap, CS4271_MODE2, CS4271_MODE2_PDN | CS4271_MODE2_CPEN, CS4271_MODE2_PDN | CS4271_MODE2_CPEN); if (ret < 0) - return ret; + goto err_disable_regulator; ret = regmap_update_bits(cs4271->regmap, CS4271_MODE2, CS4271_MODE2_PDN, 0); if (ret < 0) - return ret; + goto err_disable_regulator; /* Power-up sequence requires 85 uS */ udelay(85); @@ -601,6 +601,10 @@ static int cs4271_component_probe(struct snd_soc_component *component) CS4271_MODE2_MUTECAEQUB); return 0; + +err_disable_regulator: + regulator_bulk_disable(ARRAY_SIZE(cs4271->supplies), cs4271->supplies); + return ret; } static void cs4271_component_remove(struct snd_soc_component *component) From 1a58d865f423f4339edf59053e496089075fa950 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Wed, 29 Oct 2025 15:17:58 +0800 Subject: [PATCH 137/543] ASoC: sdw_utils: fix device reference leak in is_sdca_endpoint_present() The bus_find_device_by_name() function returns a device pointer with an incremented reference count, but the original code was missing put_device() calls in some return paths, leading to reference count leaks. 
Fix this by ensuring put_device() is called before function exit after bus_find_device_by_name() succeeds. This follows the same pattern used elsewhere in the kernel where bus_find_device_by_name() is properly paired with put_device(). Found via static analysis and code review. Fixes: 4f8ef33dd44a ("ASoC: soc_sdw_utils: skip the endpoint that doesn't present") Cc: stable@vger.kernel.org Signed-off-by: Miaoqian Lin Link: https://patch.msgid.link/20251029071804.8425-1-linmq006@gmail.com Signed-off-by: Mark Brown --- sound/soc/sdw_utils/soc_sdw_utils.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/sound/soc/sdw_utils/soc_sdw_utils.c b/sound/soc/sdw_utils/soc_sdw_utils.c index f7c8c16308de..3848c7df1916 100644 --- a/sound/soc/sdw_utils/soc_sdw_utils.c +++ b/sound/soc/sdw_utils/soc_sdw_utils.c @@ -1277,7 +1277,7 @@ static int is_sdca_endpoint_present(struct device *dev, struct sdw_slave *slave; struct device *sdw_dev; const char *sdw_codec_name; - int i; + int ret, i; dlc = kzalloc(sizeof(*dlc), GFP_KERNEL); if (!dlc) @@ -1307,13 +1307,16 @@ static int is_sdca_endpoint_present(struct device *dev, } slave = dev_to_sdw_dev(sdw_dev); - if (!slave) - return -EINVAL; + if (!slave) { + ret = -EINVAL; + goto put_device; + } /* Make sure BIOS provides SDCA properties */ if (!slave->sdca_data.interface_revision) { dev_warn(&slave->dev, "SDCA properties not found in the BIOS\n"); - return 1; + ret = 1; + goto put_device; } for (i = 0; i < slave->sdca_data.num_functions; i++) { @@ -1322,7 +1325,8 @@ static int is_sdca_endpoint_present(struct device *dev, if (dai_type == dai_info->dai_type) { dev_dbg(&slave->dev, "DAI type %d sdca function %s found\n", dai_type, slave->sdca_data.function[i].name); - return 1; + ret = 1; + goto put_device; } } @@ -1330,7 +1334,11 @@ static int is_sdca_endpoint_present(struct device *dev, "SDCA device function for DAI type %d not supported, skip endpoint\n", dai_info->dai_type); - return 0; + ret = 0; +
+put_device: + put_device(sdw_dev); + return ret; } int asoc_sdw_parse_sdw_endpoints(struct snd_soc_card *card, From 84f5526e4dce0a44d050ceb1b1bf21d43016d91b Mon Sep 17 00:00:00 2001 From: Niranjan H Y Date: Thu, 30 Oct 2025 20:46:37 +0530 Subject: [PATCH 138/543] ASoC: tas2783A: Fix issues in firmware parsing During firmware download, if the size of the firmware is too small, it wrongly assumes the firmware download is successful. If there is size mismatch with chunk's header, invalid memory is accessed. Fix these issues by throwing error during these cases. Fixes: 4cc9bd8d7b32 (ASoc: tas2783A: Add soundwire based codec driver) Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202510291226.2R3fbYNh-lkp@intel.com/ Signed-off-by: Niranjan H Y Link: https://patch.msgid.link/20251030151637.566-1-niranjan.hy@ti.com Signed-off-by: Mark Brown --- sound/soc/codecs/tas2783-sdw.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/tas2783-sdw.c b/sound/soc/codecs/tas2783-sdw.c index 1fb4227b711e..e273b80d033e 100644 --- a/sound/soc/codecs/tas2783-sdw.c +++ b/sound/soc/codecs/tas2783-sdw.c @@ -762,10 +762,17 @@ static void tas2783_fw_ready(const struct firmware *fmw, void *context) goto out; } - mutex_lock(&tas_dev->pde_lock); img_sz = fmw->size; buf = fmw->data; offset += FW_DL_OFFSET; + if (offset >= (img_sz - FW_FL_HDR)) { + dev_err(tas_dev->dev, + "firmware is too small"); + ret = -EINVAL; + goto out; + } + + mutex_lock(&tas_dev->pde_lock); while (offset < (img_sz - FW_FL_HDR)) { memset(&hdr, 0, sizeof(hdr)); offset += read_header(&buf[offset], &hdr); @@ -776,6 +783,14 @@ static void tas2783_fw_ready(const struct firmware *fmw, void *context) /* size also includes the header */ file_blk_size = hdr.length - FW_FL_HDR; + /* make sure that enough data is there */ + if (offset + file_blk_size > img_sz) { + ret = -EINVAL; + dev_err(tas_dev->dev, + "corrupt firmware file"); + break; 
+ } + switch (hdr.file_id) { case 0: ret = sdw_nwrite_no_pm(tas_dev->sdw_peripheral, @@ -808,7 +823,8 @@ static void tas2783_fw_ready(const struct firmware *fmw, void *context) break; } mutex_unlock(&tas_dev->pde_lock); - tas2783_update_calibdata(tas_dev); + if (!ret) + tas2783_update_calibdata(tas_dev); out: if (!ret) From 86d57d9c07d54e8cb385ffe800930816ccdba0c1 Mon Sep 17 00:00:00 2001 From: Robin Gong Date: Fri, 24 Oct 2025 13:53:20 +0800 Subject: [PATCH 139/543] spi: imx: keep dma request disabled before dma transfer setup Since the SDMA hardware configuration is postponed to the transfer phase, the DMA request has to be kept disabled before the DMA transfer setup, because there is a hardware limitation on the SDMA event enable (ENBLn), as described below: "It is thus essential for the Arm platform to program them before any DMA request is triggered to the SDMA, otherwise an unpredictable combination of channels may be started." Signed-off-by: Carlos Song Signed-off-by: Robin Gong Link: https://patch.msgid.link/20251024055320.408482-1-carlos.song@nxp.com Signed-off-by: Mark Brown --- drivers/spi/spi-imx.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index 155ddeb8fcd4..bbf1fd4fe1e9 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -519,9 +519,15 @@ static void mx51_ecspi_trigger(struct spi_imx_data *spi_imx) { u32 reg; - reg = readl(spi_imx->base + MX51_ECSPI_CTRL); - reg |= MX51_ECSPI_CTRL_XCH; - writel(reg, spi_imx->base + MX51_ECSPI_CTRL); + if (spi_imx->usedma) { + reg = readl(spi_imx->base + MX51_ECSPI_DMA); + reg |= MX51_ECSPI_DMA_TEDEN | MX51_ECSPI_DMA_RXDEN; + writel(reg, spi_imx->base + MX51_ECSPI_DMA); + } else { + reg = readl(spi_imx->base + MX51_ECSPI_CTRL); + reg |= MX51_ECSPI_CTRL_XCH; + writel(reg, spi_imx->base + MX51_ECSPI_CTRL); + } } static void mx51_ecspi_disable(struct spi_imx_data *spi_imx) @@ -759,7 +765,6 @@ static void mx51_setup_wml(struct spi_imx_data *spi_imx)
writel(MX51_ECSPI_DMA_RX_WML(spi_imx->wml - 1) | MX51_ECSPI_DMA_TX_WML(tx_wml) | MX51_ECSPI_DMA_RXT_WML(spi_imx->wml) | - MX51_ECSPI_DMA_TEDEN | MX51_ECSPI_DMA_RXDEN | MX51_ECSPI_DMA_RXTDEN, spi_imx->base + MX51_ECSPI_DMA); } @@ -1520,6 +1525,8 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx, reinit_completion(&spi_imx->dma_tx_completion); dma_async_issue_pending(controller->dma_tx); + spi_imx->devtype_data->trigger(spi_imx); + transfer_timeout = spi_imx_calculate_timeout(spi_imx, transfer->len); /* Wait SDMA to finish the data transfer.*/ From d0164c161923ac303bd843e04ebe95cfd03c6e19 Mon Sep 17 00:00:00 2001 From: Sukrit Bhatnagar Date: Thu, 6 Nov 2025 14:28:51 +0900 Subject: [PATCH 140/543] KVM: VMX: Fix check for valid GVA on an EPT violation On an EPT violation, bit 7 of the exit qualification is set if the guest linear-address is valid. The derived page fault error code should not be checked for this bit. Fixes: f3009482512e ("KVM: VMX: Set PFERR_GUEST_{FINAL,PAGE}_MASK if and only if the GVA is valid") Cc: stable@vger.kernel.org Signed-off-by: Sukrit Bhatnagar Reviewed-by: Xiaoyao Li Link: https://patch.msgid.link/20251106052853.3071088-1-Sukrit.Bhatnagar@sony.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h index bc5ece76533a..412d0829d7a2 100644 --- a/arch/x86/kvm/vmx/common.h +++ b/arch/x86/kvm/vmx/common.h @@ -98,7 +98,7 @@ static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa, error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK) ? PFERR_PRESENT_MASK : 0; - if (error_code & EPT_VIOLATION_GVA_IS_VALID) + if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID) error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? 
PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; From 9065b968752334f972e0d48e50c4463a172fc2a7 Mon Sep 17 00:00:00 2001 From: Nicolas Escande Date: Tue, 4 Nov 2025 09:39:57 +0100 Subject: [PATCH 141/543] wifi: ath11k: zero init info->status in wmi_process_mgmt_tx_comp() When reporting tx completion using ieee80211_tx_status_xxx() family of functions, the status part of the struct ieee80211_tx_info nested in the skb is used to report things like transmit rates & retry count to mac80211 On the TX data path, this is correctly memset to 0 before calling ieee80211_tx_status_ext(), but on the tx mgmt path this was not done. This leads to mac80211 treating garbage values as valid transmit counters (like tx retries for example) and accounting them as real statistics that makes their way to userland via station dump. The same issue was resolved in ath12k by commit 9903c0986f78 ("wifi: ath12k: Add memset and update default rate value in wmi tx completion") Tested-on: QCN9074 PCI WLAN.HK.2.9.0.1-01977-QCAHKSWPL_SILICONZ-1 Fixes: d5c65159f289 ("ath11k: driver for Qualcomm IEEE 802.11ax devices") Signed-off-by: Nicolas Escande Reviewed-by: Vasanthakumar Thiagarajan Reviewed-by: Baochen Qiang Link: https://patch.msgid.link/20251104083957.717825-1-nico.escande@gmail.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath11k/wmi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/wireless/ath/ath11k/wmi.c b/drivers/net/wireless/ath/ath11k/wmi.c index 0491e3fd6b5e..e3b444333dee 100644 --- a/drivers/net/wireless/ath/ath11k/wmi.c +++ b/drivers/net/wireless/ath/ath11k/wmi.c @@ -5961,6 +5961,9 @@ static int wmi_process_mgmt_tx_comp(struct ath11k *ar, dma_unmap_single(ar->ab->dev, skb_cb->paddr, msdu->len, DMA_TO_DEVICE); info = IEEE80211_SKB_CB(msdu); + memset(&info->status, 0, sizeof(info->status)); + info->status.rates[0].idx = -1; + if ((!(info->flags & IEEE80211_TX_CTL_NO_ACK)) && !tx_compl_param->status) { info->flags |= IEEE80211_TX_STAT_ACK; From 
3dc8c73365d3ca25c99e7e1a0f493039d7291df5 Mon Sep 17 00:00:00 2001 From: Haotian Zhang Date: Thu, 6 Nov 2025 22:31:14 +0800 Subject: [PATCH 142/543] ASoC: codecs: va-macro: fix resource leak in probe error path In the commit referenced by the Fixes tag, clk_hw_get_clk() was added in va_macro_probe() to get the fsgen clock, but forgot to add the corresponding clk_put() in va_macro_remove(). This leads to a clock reference leak when the driver is unloaded. Switch to devm_clk_hw_get_clk() to automatically manage the clock resource. Fixes: 30097967e056 ("ASoC: codecs: va-macro: use fsgen as clock") Suggested-by: Konrad Dybcio Signed-off-by: Haotian Zhang Reviewed-by: Konrad Dybcio Link: https://patch.msgid.link/20251106143114.729-1-vulab@iscas.ac.cn Signed-off-by: Mark Brown --- sound/soc/codecs/lpass-va-macro.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/lpass-va-macro.c b/sound/soc/codecs/lpass-va-macro.c index 2e1b77973a3e..92c177b82a02 100644 --- a/sound/soc/codecs/lpass-va-macro.c +++ b/sound/soc/codecs/lpass-va-macro.c @@ -1638,7 +1638,7 @@ static int va_macro_probe(struct platform_device *pdev) if (ret) goto err_clkout; - va->fsgen = clk_hw_get_clk(&va->hw, "fsgen"); + va->fsgen = devm_clk_hw_get_clk(dev, &va->hw, "fsgen"); if (IS_ERR(va->fsgen)) { ret = PTR_ERR(va->fsgen); goto err_clkout; From a9da90e618cd0669a22bcc06a96209db5dd96e9b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 15:41:19 +0100 Subject: [PATCH 143/543] wifi: mac80211: reject address change while connecting While connecting, the MAC address can already no longer be changed. The change is already rejected if netif_carrier_ok(), but of course that's not true yet while connecting. Check for auth_data or assoc_data, so the MAC address cannot be changed. Also more comprehensively check that there are no stations on the interface being changed - if any peer station is added it will know about our address already, so we cannot change it. 
Cc: stable@vger.kernel.org Fixes: 3c06e91b40db ("wifi: mac80211: Support POWERED_ADDR_CHANGE feature") Link: https://patch.msgid.link/20251105154119.f9f6c1df81bb.I9bb3760ede650fb96588be0d09a5a7bdec21b217@changeid Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index a7873832d4fa..0ca55b9655a7 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -223,6 +223,10 @@ static int ieee80211_can_powered_addr_change(struct ieee80211_sub_if_data *sdata if (netif_carrier_ok(sdata->dev)) return -EBUSY; + /* if any stations are set known (so they know this vif too), reject */ + if (sta_info_get_by_idx(sdata, 0)) + return -EBUSY; + /* First check no ROC work is happening on this iface */ list_for_each_entry(roc, &local->roc_list, list) { if (roc->sdata != sdata) @@ -242,12 +246,16 @@ static int ieee80211_can_powered_addr_change(struct ieee80211_sub_if_data *sdata ret = -EBUSY; } + /* + * More interface types could be added here but changing the + * address while powered makes the most sense in client modes. + */ switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - /* More interface types could be added here but changing the - * address while powered makes the most sense in client modes. - */ + /* refuse while connecting */ + if (sdata->u.mgd.auth_data || sdata->u.mgd.assoc_data) + return -EBUSY; break; default: ret = -EOPNOTSUPP; From f2a12cc3b97f062186568a7b94ddb7aa2ef68140 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 31 Oct 2025 13:47:39 +0800 Subject: [PATCH 144/543] erofs: avoid infinite loop due to incomplete zstd-compressed data Currently, the decompression logic incorrectly spins if compressed data is truncated in crafted (deliberately corrupted) images. 
Fixes: 7c35de4df105 ("erofs: Zstandard compression support") Reported-by: Robert Morris Closes: https://lore.kernel.org/r/50958.1761605413@localhost Signed-off-by: Gao Xiang Reviewed-by: Chunhai Guo Reviewed-by: Chao Yu --- fs/erofs/decompressor_zstd.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c index b4bfe14229f9..e38d93bb2104 100644 --- a/fs/erofs/decompressor_zstd.c +++ b/fs/erofs/decompressor_zstd.c @@ -172,7 +172,6 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq, dctx.bounce = strm->bounce; do { - dctx.avail_out = out_buf.size - out_buf.pos; dctx.inbuf_sz = in_buf.size; dctx.inbuf_pos = in_buf.pos; err = z_erofs_stream_switch_bufs(&dctx, &out_buf.dst, @@ -188,14 +187,18 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq, in_buf.pos = dctx.inbuf_pos; zerr = zstd_decompress_stream(stream, &out_buf, &in_buf); - if (zstd_is_error(zerr) || (!zerr && rq->outputsize)) { + dctx.avail_out = out_buf.size - out_buf.pos; + if (zstd_is_error(zerr) || + ((rq->outputsize + dctx.avail_out) && (!zerr || (zerr > 0 && + !(rq->inputsize + in_buf.size - in_buf.pos))))) { erofs_err(sb, "failed to decompress in[%u] out[%u]: %s", rq->inputsize, rq->outputsize, - zerr ? zstd_get_error_name(zerr) : "unexpected end of stream"); + zstd_is_error(zerr) ? zstd_get_error_name(zerr) : + "unexpected end of stream"); err = -EFSCORRUPTED; break; } - } while (rq->outputsize || out_buf.pos < out_buf.size); + } while (rq->outputsize + dctx.avail_out); if (dctx.kout) kunmap_local(dctx.kout); From a59e927ff46a967f84ddf94e89cbb045810e8974 Mon Sep 17 00:00:00 2001 From: Andrey Leonchikov Date: Wed, 5 Nov 2025 22:07:33 +0100 Subject: [PATCH 145/543] arm64: dts: rockchip: Fix USB power enable pin for BTT CB2 and Pi2 Fix a typo in the regulator GPIO definition. With the current definition, USB is powered off.
Valid definition can be found on "pinctrl" section: vcc5v0_usb2t_en: vcc5v0-usb2t-en { rockchip,pins = <3 RK_PD5 RK_FUNC_GPIO &pcfg_pull_none>; }; vcc5v0_usb2b_en: vcc5v0-usb2b-en { rockchip,pins = <4 RK_PC4 RK_FUNC_GPIO &pcfg_pull_none>; }; Fixes: bfbc663d2733a ("arm64: dts: rockchip: Add BigTreeTech CB2 and Pi2") Signed-off-by: Andrey Leonchikov Link: https://patch.msgid.link/20251105210741.850031-1-andreil499@gmail.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi b/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi index f74590af7e33..b6cf03a7ba66 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi @@ -187,7 +187,7 @@ vcc5v0_usb: regulator-vcc5v0-usb { vcc5v0_usb2b: regulator-vcc5v0-usb2b { compatible = "regulator-fixed"; enable-active-high; - gpio = <&gpio0 RK_PC4 GPIO_ACTIVE_HIGH>; + gpio = <&gpio4 RK_PC4 GPIO_ACTIVE_HIGH>; pinctrl-names = "default"; pinctrl-0 = <&vcc5v0_usb2b_en>; regulator-name = "vcc5v0_usb2b"; @@ -199,7 +199,7 @@ vcc5v0_usb2b: regulator-vcc5v0-usb2b { vcc5v0_usb2t: regulator-vcc5v0-usb2t { compatible = "regulator-fixed"; enable-active-high; - gpios = <&gpio0 RK_PD5 GPIO_ACTIVE_HIGH>; + gpios = <&gpio3 RK_PD5 GPIO_ACTIVE_HIGH>; pinctrl-names = "default"; pinctrl-0 = <&vcc5v0_usb2t_en>; regulator-name = "vcc5v0_usb2t"; From 74d4432421a3e2669fbccc08c0f4fc2980bf0e39 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 5 Nov 2025 21:29:08 +0200 Subject: [PATCH 146/543] docs: netlink: Couple of intro-specs documentation fixes Fix typo "handul" to "handful" and remove outdated limitation stating only generic netlink is supported (we have netlink-raw). 
Reviewed-by: Carolina Jubran Signed-off-by: Gal Pressman Link: https://patch.msgid.link/20251105192908.686458-1-gal@nvidia.com Signed-off-by: Jakub Kicinski --- Documentation/userspace-api/netlink/intro-specs.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/userspace-api/netlink/intro-specs.rst b/Documentation/userspace-api/netlink/intro-specs.rst index a4435ae4628d..e5ebc617754a 100644 --- a/Documentation/userspace-api/netlink/intro-specs.rst +++ b/Documentation/userspace-api/netlink/intro-specs.rst @@ -13,10 +13,10 @@ Simple CLI Kernel comes with a simple CLI tool which should be useful when developing Netlink related code. The tool is implemented in Python and can use a YAML specification to issue Netlink requests -to the kernel. Only Generic Netlink is supported. +to the kernel. The tool is located at ``tools/net/ynl/pyynl/cli.py``. It accepts -a handul of arguments, the most important ones are: +a handful of arguments, the most important ones are: - ``--spec`` - point to the spec file - ``--do $name`` / ``--dump $name`` - issue request ``$name`` From 32b415a9dc2c212e809b7ebc2b14bc3fbda2b9af Mon Sep 17 00:00:00 2001 From: Ian Forbes Date: Tue, 21 Oct 2025 14:01:28 -0500 Subject: [PATCH 147/543] drm/vmwgfx: Validate command header size against SVGA_CMD_MAX_DATASIZE This data originates from userspace and is used in buffer offset calculations which could potentially overflow causing an out-of-bounds access. 
Fixes: 8ce75f8ab904 ("drm/vmwgfx: Update device includes for DX device functionality") Reported-by: Rohit Keshri Signed-off-by: Ian Forbes Reviewed-by: Maaz Mombasawala Signed-off-by: Zack Rusin Link: https://patch.msgid.link/20251021190128.13014-1-ian.forbes@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c index d539f25b5fbe..3057f8baa7d2 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c @@ -3668,6 +3668,11 @@ static int vmw_cmd_check(struct vmw_private *dev_priv, cmd_id = header->id; + if (header->size > SVGA_CMD_MAX_DATASIZE) { + VMW_DEBUG_USER("SVGA3D command: %d is too big.\n", + cmd_id + SVGA_3D_CMD_BASE); + return -E2BIG; + } *size = header->size + sizeof(SVGA3dCmdHeader); cmd_id -= SVGA_3D_CMD_BASE; From c1962742ffff7e245f935903a4658eb6f94f6058 Mon Sep 17 00:00:00 2001 From: Ian Forbes Date: Thu, 30 Oct 2025 14:36:40 -0500 Subject: [PATCH 148/543] drm/vmwgfx: Use kref in vmw_bo_dirty Rather than using an ad hoc reference count use kref which is atomic and has underflow warnings. Signed-off-by: Ian Forbes Signed-off-by: Zack Rusin Link: https://patch.msgid.link/20251030193640.153697-1-ian.forbes@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c index 7de20e56082c..fd4e76486f2d 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c @@ -32,22 +32,22 @@ enum vmw_bo_dirty_method { /** * struct vmw_bo_dirty - Dirty information for buffer objects + * @ref_count: Reference count for this structure. Must be first member! 
* @start: First currently dirty bit * @end: Last currently dirty bit + 1 * @method: The currently used dirty method * @change_count: Number of consecutive method change triggers - * @ref_count: Reference count for this structure * @bitmap_size: The size of the bitmap in bits. Typically equal to the * nuber of pages in the bo. * @bitmap: A bitmap where each bit represents a page. A set bit means a * dirty page. */ struct vmw_bo_dirty { + struct kref ref_count; unsigned long start; unsigned long end; enum vmw_bo_dirty_method method; unsigned int change_count; - unsigned int ref_count; unsigned long bitmap_size; unsigned long bitmap[]; }; @@ -221,7 +221,7 @@ int vmw_bo_dirty_add(struct vmw_bo *vbo) int ret; if (dirty) { - dirty->ref_count++; + kref_get(&dirty->ref_count); return 0; } @@ -235,7 +235,7 @@ int vmw_bo_dirty_add(struct vmw_bo *vbo) dirty->bitmap_size = num_pages; dirty->start = dirty->bitmap_size; dirty->end = 0; - dirty->ref_count = 1; + kref_init(&dirty->ref_count); if (num_pages < PAGE_SIZE / sizeof(pte_t)) { dirty->method = VMW_BO_DIRTY_PAGETABLE; } else { @@ -274,10 +274,8 @@ void vmw_bo_dirty_release(struct vmw_bo *vbo) { struct vmw_bo_dirty *dirty = vbo->dirty; - if (dirty && --dirty->ref_count == 0) { - kvfree(dirty); + if (dirty && kref_put(&dirty->ref_count, (void *)kvfree)) vbo->dirty = NULL; - } } /** From eef295a8508202e750e4f103a97447f3c9d5e3d0 Mon Sep 17 00:00:00 2001 From: Ian Forbes Date: Mon, 3 Nov 2025 14:19:20 -0600 Subject: [PATCH 149/543] drm/vmwgfx: Restore Guest-Backed only cursor plane support The referenced fixes commit broke the cursor plane for configurations which have Guest-Backed surfaces but no cursor MOB support. 
Fixes: 965544150d1c ("drm/vmwgfx: Refactor cursor handling") Signed-off-by: Ian Forbes Signed-off-by: Zack Rusin Link: https://patch.msgid.link/20251103201920.381503-1-ian.forbes@broadcom.com --- drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.c | 16 +++++++++++++++- drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.h | 1 + 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.c index 718832b08d96..c46f17ba7236 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.c @@ -100,8 +100,10 @@ vmw_cursor_update_type(struct vmw_private *vmw, struct vmw_plane_state *vps) if (vmw->has_mob) { if ((vmw->capabilities2 & SVGA_CAP2_CURSOR_MOB) != 0) return VMW_CURSOR_UPDATE_MOB; + else + return VMW_CURSOR_UPDATE_GB_ONLY; } - + drm_warn_once(&vmw->drm, "Unknown Cursor Type!\n"); return VMW_CURSOR_UPDATE_NONE; } @@ -139,6 +141,7 @@ static u32 vmw_cursor_mob_size(enum vmw_cursor_update_type update_type, { switch (update_type) { case VMW_CURSOR_UPDATE_LEGACY: + case VMW_CURSOR_UPDATE_GB_ONLY: case VMW_CURSOR_UPDATE_NONE: return 0; case VMW_CURSOR_UPDATE_MOB: @@ -623,6 +626,7 @@ int vmw_cursor_plane_prepare_fb(struct drm_plane *plane, if (!surface || vps->cursor.legacy.id == surface->snooper.id) vps->cursor.update_type = VMW_CURSOR_UPDATE_NONE; break; + case VMW_CURSOR_UPDATE_GB_ONLY: case VMW_CURSOR_UPDATE_MOB: { bo = vmw_user_object_buffer(&vps->uo); if (bo) { @@ -737,6 +741,7 @@ void vmw_cursor_plane_atomic_update(struct drm_plane *plane, struct drm_atomic_state *state) { + struct vmw_bo *bo; struct drm_plane_state *new_state = drm_atomic_get_new_plane_state(state, plane); struct drm_plane_state *old_state = @@ -762,6 +767,15 @@ vmw_cursor_plane_atomic_update(struct drm_plane *plane, case VMW_CURSOR_UPDATE_MOB: vmw_cursor_update_mob(dev_priv, vps); break; + case VMW_CURSOR_UPDATE_GB_ONLY: + bo = vmw_user_object_buffer(&vps->uo); + if (bo) + 
vmw_send_define_cursor_cmd(dev_priv, bo->map.virtual, + vps->base.crtc_w, + vps->base.crtc_h, + vps->base.hotspot_x, + vps->base.hotspot_y); + break; case VMW_CURSOR_UPDATE_NONE: /* do nothing */ break; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.h b/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.h index 40694925a70e..0c2cc0699b0d 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.h @@ -33,6 +33,7 @@ static const u32 __maybe_unused vmw_cursor_plane_formats[] = { enum vmw_cursor_update_type { VMW_CURSOR_UPDATE_NONE = 0, VMW_CURSOR_UPDATE_LEGACY, + VMW_CURSOR_UPDATE_GB_ONLY, VMW_CURSOR_UPDATE_MOB, }; From 29528c8e643bb0c54da01237a35010c6438423d2 Mon Sep 17 00:00:00 2001 From: Shenghao Ding Date: Fri, 7 Nov 2025 13:49:59 +0800 Subject: [PATCH 150/543] ASoC: tas2781: fix getting the wrong device number The return value of device_property_read_u32_array used for getting the property is the status instead of the number of the property. Fixes: ef3bcde75d06 ("ASoC: tas2781: Add tas2781 driver") Signed-off-by: Shenghao Ding Link: https://patch.msgid.link/20251107054959.950-1-shenghao-ding@ti.com Signed-off-by: Mark Brown --- sound/soc/codecs/tas2781-i2c.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/tas2781-i2c.c b/sound/soc/codecs/tas2781-i2c.c index ba880b5de7e8..8f37aa00e62e 100644 --- a/sound/soc/codecs/tas2781-i2c.c +++ b/sound/soc/codecs/tas2781-i2c.c @@ -1957,7 +1957,8 @@ static void tasdevice_parse_dt(struct tasdevice_priv *tas_priv) { struct i2c_client *client = (struct i2c_client *)tas_priv->client; unsigned int dev_addrs[TASDEVICE_MAX_CHANNELS]; - int i, ndev = 0; + int ndev = 0; + int i, rc; if (tas_priv->isacpi) { ndev = device_property_read_u32_array(&client->dev, @@ -1968,8 +1969,12 @@ static void tasdevice_parse_dt(struct tasdevice_priv *tas_priv) } else { ndev = (ndev < ARRAY_SIZE(dev_addrs)) ? 
ndev : ARRAY_SIZE(dev_addrs); - ndev = device_property_read_u32_array(&client->dev, + rc = device_property_read_u32_array(&client->dev, "ti,audio-slots", dev_addrs, ndev); + if (rc != 0) { + ndev = 1; + dev_addrs[0] = client->addr; + } } tas_priv->irq = From 939edfaa10f1d22e6af6a84bf4bd96dc49c67302 Mon Sep 17 00:00:00 2001 From: Alvaro Gamez Machado Date: Thu, 6 Nov 2025 14:45:35 +0100 Subject: [PATCH 151/543] spi: xilinx: increase number of retries before declaring stall SPI devices using a (relatively) slow frequency need more retries before a stall is declared. For instance, a microblaze running at 83.25MHz and performing a 3-byte transaction at 10MHz/16 = 625kHz needed this stall value increased to at least 20. The SPI device is quite slow, but so is the microblaze, so set this value to 32 to give it even more margin. Signed-off-by: Alvaro Gamez Machado Reviewed-by: Ricardo Ribalda Link: https://patch.msgid.link/20251106134545.31942-1-alvaro.gamez@hazent.com Signed-off-by: Mark Brown --- drivers/spi/spi-xilinx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-xilinx.c b/drivers/spi/spi-xilinx.c index d59cc8a18484..c86dc56f38b4 100644 --- a/drivers/spi/spi-xilinx.c +++ b/drivers/spi/spi-xilinx.c @@ -300,7 +300,7 @@ static int xilinx_spi_txrx_bufs(struct spi_device *spi, struct spi_transfer *t) /* Read out all the data from the Rx FIFO */ rx_words = n_words; - stalled = 10; + stalled = 32; while (rx_words) { if (rx_words == n_words && !(stalled--) && !(sr & XSPI_SR_TX_EMPTY_MASK) && From 535fdfc5a228524552ee8810c9175e877e127c27 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 6 Nov 2025 15:52:13 +0000 Subject: [PATCH 152/543] arm64: Use load LSE atomics for the non-return per-CPU atomic operations The non-return per-CPU this_cpu_*() atomic operations are implemented as STADD/STCLR/STSET when FEAT_LSE is available.
On many microarchitecture implementations, these instructions tend to be executed "far" in the interconnect or memory subsystem (unless the data is already in the L1 cache). This is in general more efficient when there is contention as it avoids bouncing cache lines between CPUs. The load atomics (e.g. LDADD without XZR as destination), OTOH, tend to be executed "near" with the data loaded into the L1 cache. STADD executed back to back as in srcu_read_{lock,unlock}*() incur an additional overhead due to the default posting behaviour on several CPU implementations. Since the per-CPU atomics are unlikely to be used concurrently on the same memory location, encourage the hardware to execute them "near" by issuing load atomics - LDADD/LDCLR/LDSET - with the destination register unused (but not XZR). Signed-off-by: Catalin Marinas Link: https://lore.kernel.org/r/e7d539ed-ced0-4b96-8ecd-048a5b803b85@paulmck-laptop Reported-by: Paul E. McKenney Tested-by: Paul E. McKenney Cc: Will Deacon Reviewed-by: Palmer Dabbelt [will: Add comment and link to the discussion thread] Signed-off-by: Will Deacon --- arch/arm64/include/asm/percpu.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h index 9abcc8ef3087..b57b2bb00967 100644 --- a/arch/arm64/include/asm/percpu.h +++ b/arch/arm64/include/asm/percpu.h @@ -77,7 +77,7 @@ __percpu_##name##_case_##sz(void *ptr, unsigned long val) \ " stxr" #sfx "\t%w[loop], %" #w "[tmp], %[ptr]\n" \ " cbnz %w[loop], 1b", \ /* LSE atomics */ \ - #op_lse "\t%" #w "[val], %[ptr]\n" \ + #op_lse "\t%" #w "[val], %" #w "[tmp], %[ptr]\n" \ __nops(3)) \ : [loop] "=&r" (loop), [tmp] "=&r" (tmp), \ [ptr] "+Q"(*(u##sz *)ptr) \ @@ -124,9 +124,16 @@ PERCPU_RW_OPS(8) PERCPU_RW_OPS(16) PERCPU_RW_OPS(32) PERCPU_RW_OPS(64) -PERCPU_OP(add, add, stadd) -PERCPU_OP(andnot, bic, stclr) -PERCPU_OP(or, orr, stset) + +/* + * Use value-returning atomics for CPU-local ops as
they are more likely + * to execute "near" to the CPU (e.g. in L1$). + * + * https://lore.kernel.org/r/e7d539ed-ced0-4b96-8ecd-048a5b803b85@paulmck-laptop + */ +PERCPU_OP(add, add, ldadd) +PERCPU_OP(andnot, bic, ldclr) +PERCPU_OP(or, orr, ldset) PERCPU_RET_OP(add, add, ldadd) #undef PERCPU_RW_OPS From eeb8c19896952e18fb538ec76e603884070a6c6a Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Fri, 31 Oct 2025 11:11:37 +0000 Subject: [PATCH 153/543] Revert "ACPI: Suppress misleading SPCR console message when SPCR table is absent" This reverts commit bad3fa2fb9206f4dcec6ddef094ec2fbf6e8dcb2. Commit bad3fa2fb920 ("ACPI: Suppress misleading SPCR console message when SPCR table is absent") mistakenly assumes acpi_parse_spcr() returning 0 to indicate a failure to parse SPCR. While addressing the resultant incorrect logging it was deemed that dropping the message is a better approach as it is not particularly useful. Roll back the commit introducing the bug as a step towards dropping the log message. Link: https://lore.kernel.org/all/aQN0YWUYaPYWpgJM@willie-the-truck/ Signed-off-by: Punit Agrawal Signed-off-by: Will Deacon --- arch/arm64/kernel/acpi.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 7aca29e1d30b..fd164e8a35b2 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -197,8 +197,6 @@ static int __init acpi_fadt_sanity_check(void) */ void __init acpi_boot_table_init(void) { - int ret; - /* * Enable ACPI instead of device tree unless * - ACPI has been disabled explicitly (acpi=off), or @@ -252,12 +250,10 @@ void __init acpi_boot_table_init(void) * behaviour, use acpi=nospcr to disable console in ACPI SPCR * table as default serial console. 
*/ - ret = acpi_parse_spcr(earlycon_acpi_spcr_enable, + acpi_parse_spcr(earlycon_acpi_spcr_enable, !param_acpi_nospcr); - if (!ret || param_acpi_nospcr || !IS_ENABLED(CONFIG_ACPI_SPCR_TABLE)) - pr_info("Use ACPI SPCR as default console: No\n"); - else - pr_info("Use ACPI SPCR as default console: Yes\n"); + pr_info("Use ACPI SPCR as default console: %s\n", + param_acpi_nospcr ? "No" : "Yes"); if (IS_ENABLED(CONFIG_ACPI_BGRT)) acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); From 7991fda619f7251994ab364f03f3e6fc0aa143d9 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Fri, 31 Oct 2025 11:11:38 +0000 Subject: [PATCH 154/543] arm64: acpi: Drop message logging SPCR default console Commit f5a4af3c7527 ("ACPI: Add acpi=nospcr to disable ACPI SPCR as default console on ARM64") introduced a command line parameter to prevent using SPCR provided console as default. It also introduced a message to log this choice. Drop the message as it is not particularly useful and can be incorrect in situations where no SPCR is provided by the firmware. Link: https://lore.kernel.org/all/aQN0YWUYaPYWpgJM@willie-the-truck/ Signed-off-by: Punit Agrawal Signed-off-by: Will Deacon --- arch/arm64/kernel/acpi.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index fd164e8a35b2..c022c1acb8c7 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -252,8 +252,6 @@ void __init acpi_boot_table_init(void) */ acpi_parse_spcr(earlycon_acpi_spcr_enable, !param_acpi_nospcr); - pr_info("Use ACPI SPCR as default console: %s\n", - param_acpi_nospcr ? 
"No" : "Yes"); if (IS_ENABLED(CONFIG_ACPI_BGRT)) acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); From 0ec364c0c95fc85bcbc88f1a9a06ebe83c88e18c Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 Nov 2025 13:49:47 -0800 Subject: [PATCH 155/543] arm64: kprobes: check the return value of set_memory_rox() Since commit a166563e7ec3 ("arm64: mm: support large block mapping when rodata=full"), __change_memory_common has more chance to fail due to memory allocation failure when splitting page table. So check the return value of set_memory_rox(), then bail out if it fails otherwise we may have RW memory mapping for kprobes insn page. Fixes: 195a1b7d8388 ("arm64: kprobes: call set_memory_rox() for kprobe page") Reviewed-by: Ryan Roberts Reviewed-by: Dev Jain Signed-off-by: Yang Shi Signed-off-by: Will Deacon --- arch/arm64/kernel/probes/kprobes.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 8ab6104a4883..43a0361a8bf0 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -49,7 +49,10 @@ void *alloc_insn_page(void) addr = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); if (!addr) return NULL; - set_memory_rox((unsigned long)addr, 1); + if (set_memory_rox((unsigned long)addr, 1)) { + execmem_free(addr); + return NULL; + } return addr; } From ce2b3a50ad922abbba36425343a1bcec46903a26 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 6 Nov 2025 16:09:41 +0000 Subject: [PATCH 156/543] arm64: mm: Don't sleep in split_kernel_leaf_mapping() when in atomic context It has been reported that split_kernel_leaf_mapping() is trying to sleep in non-sleepable context. It does this when acquiring the pgtable_split_lock mutex, when either CONFIG_DEBUG_PAGEALLOC or CONFIG_KFENCE are enabled, which change linear map permissions within softirq context during memory allocation and/or freeing. 
All other paths into this function are called from sleepable context and so are safe. But it turns out that the memory for which these 2 features may attempt to modify the permissions is always mapped by pte, so there is no need to attempt to split the mapping. So let's exit early in these cases and avoid attempting to take the mutex. There is one wrinkle to this approach; late-initialized kfence allocates its pool from the buddy allocator, which may be block mapped. So we must hook that allocation and convert it to pte-mappings up front. Previously this was done as a side-effect of kfence protecting all the individual pages in its pool at init-time, but this no longer works due to the added early exit path in split_kernel_leaf_mapping(). So instead, do this via the existing arch_kfence_init_pool() arch hook, and reuse the existing linear_map_split_to_ptes() infrastructure. Closes: https://lore.kernel.org/all/f24b9032-0ec9-47b1-8b95-c0eeac7a31c5@roeck-us.net/ Fixes: a166563e7ec3 ("arm64: mm: support large block mapping when rodata=full") Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Ryan Roberts Reviewed-by: David Hildenbrand (Red Hat) Reviewed-by: Yang Shi Signed-off-by: Will Deacon --- arch/arm64/include/asm/kfence.h | 3 +- arch/arm64/mm/mmu.c | 92 +++++++++++++++++++++++---------- 2 files changed, 67 insertions(+), 28 deletions(-) diff --git a/arch/arm64/include/asm/kfence.h b/arch/arm64/include/asm/kfence.h index a81937fae9f6..21dbc9dda747 100644 --- a/arch/arm64/include/asm/kfence.h +++ b/arch/arm64/include/asm/kfence.h @@ -10,8 +10,6 @@ #include -static inline bool arch_kfence_init_pool(void) { return true; } - static inline bool kfence_protect_page(unsigned long addr, bool protect) { set_memory_valid(addr, 1, !protect); @@ -25,6 +23,7 @@ static inline bool arm64_kfence_can_set_direct_map(void) { return !kfence_early_init; } +bool arch_kfence_init_pool(void); #else /* CONFIG_KFENCE */ static inline bool arm64_kfence_can_set_direct_map(void) {
return false; } #endif /* CONFIG_KFENCE */ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index b8d37eb037fc..a364ac2c9c61 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -708,6 +708,16 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr) return ret; } +static inline bool force_pte_mapping(void) +{ + bool bbml2 = system_capabilities_finalized() ? + system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort(); + + return (!bbml2 && (rodata_full || arm64_kfence_can_set_direct_map() || + is_realm_world())) || + debug_pagealloc_enabled(); +} + static DEFINE_MUTEX(pgtable_split_lock); int split_kernel_leaf_mapping(unsigned long start, unsigned long end) @@ -723,6 +733,16 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end) if (!system_supports_bbml2_noabort()) return 0; + /* + * If the region is within a pte-mapped area, there is no need to try to + * split. Additionally, CONFIG_DEBUG_PAGEALLOC and CONFIG_KFENCE may + * change permissions from atomic context so for those cases (which are + * always pte-mapped), we must not go any further because taking the + * mutex below may sleep. + */ + if (force_pte_mapping() || is_kfence_address((void *)start)) + return 0; + /* * Ensure start and end are at least page-aligned since this is the * finest granularity we can split to. 
@@ -758,30 +778,30 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end) return ret; } -static int __init split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr, - unsigned long next, - struct mm_walk *walk) +static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr, + unsigned long next, struct mm_walk *walk) { + gfp_t gfp = *(gfp_t *)walk->private; pud_t pud = pudp_get(pudp); int ret = 0; if (pud_leaf(pud)) - ret = split_pud(pudp, pud, GFP_ATOMIC, false); + ret = split_pud(pudp, pud, gfp, false); return ret; } -static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr, - unsigned long next, - struct mm_walk *walk) +static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr, + unsigned long next, struct mm_walk *walk) { + gfp_t gfp = *(gfp_t *)walk->private; pmd_t pmd = pmdp_get(pmdp); int ret = 0; if (pmd_leaf(pmd)) { if (pmd_cont(pmd)) split_contpmd(pmdp); - ret = split_pmd(pmdp, pmd, GFP_ATOMIC, false); + ret = split_pmd(pmdp, pmd, gfp, false); /* * We have split the pmd directly to ptes so there is no need to @@ -793,9 +813,8 @@ static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr, return ret; } -static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr, - unsigned long next, - struct mm_walk *walk) +static int split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) { pte_t pte = __ptep_get(ptep); @@ -805,12 +824,18 @@ static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr, return 0; } -static const struct mm_walk_ops split_to_ptes_ops __initconst = { +static const struct mm_walk_ops split_to_ptes_ops = { .pud_entry = split_to_ptes_pud_entry, .pmd_entry = split_to_ptes_pmd_entry, .pte_entry = split_to_ptes_pte_entry, }; +static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp) +{ + return walk_kernel_page_table_range_lockless(start, end, + &split_to_ptes_ops, NULL, &gfp); +} + static 
bool linear_map_requires_bbml2 __initdata; u32 idmap_kpti_bbml2_flag; @@ -847,11 +872,9 @@ static int __init linear_map_split_to_ptes(void *__unused) * PTE. The kernel alias remains static throughout runtime so * can continue to be safely mapped with large mappings. */ - ret = walk_kernel_page_table_range_lockless(lstart, kstart, - &split_to_ptes_ops, NULL, NULL); + ret = range_split_to_ptes(lstart, kstart, GFP_ATOMIC); if (!ret) - ret = walk_kernel_page_table_range_lockless(kend, lend, - &split_to_ptes_ops, NULL, NULL); + ret = range_split_to_ptes(kend, lend, GFP_ATOMIC); if (ret) panic("Failed to split linear map\n"); flush_tlb_kernel_range(lstart, lend); @@ -1002,6 +1025,33 @@ static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); __kfence_pool = phys_to_virt(kfence_pool); } + +bool arch_kfence_init_pool(void) +{ + unsigned long start = (unsigned long)__kfence_pool; + unsigned long end = start + KFENCE_POOL_SIZE; + int ret; + + /* Exit early if we know the linear map is already pte-mapped. */ + if (!system_supports_bbml2_noabort() || force_pte_mapping()) + return true; + + /* Kfence pool is already pte-mapped for the early init case. */ + if (kfence_early_init) + return true; + + mutex_lock(&pgtable_split_lock); + ret = range_split_to_ptes(start, end, GFP_PGTABLE_KERNEL); + mutex_unlock(&pgtable_split_lock); + + /* + * Since the system supports bbml2_noabort, tlb invalidation is not + * required here; the pgtable mappings have been split to pte but larger + * entries may safely linger in the TLB. + */ + + return !ret; +} #else /* CONFIG_KFENCE */ static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; } @@ -1009,16 +1059,6 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) { #endif /* CONFIG_KFENCE */ -static inline bool force_pte_mapping(void) -{ - bool bbml2 = system_capabilities_finalized() ? 
- system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort(); - - return (!bbml2 && (rodata_full || arm64_kfence_can_set_direct_map() || - is_realm_world())) || - debug_pagealloc_enabled(); -} - static void __init map_mem(pgd_t *pgdp) { static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN); From 40a292f701474f7c21b27911677485efa233e94e Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 6 Nov 2025 16:09:42 +0000 Subject: [PATCH 157/543] arm64: mm: Optimize range_split_to_ptes() Enter lazy_mmu mode while splitting a range of memory to pte mappings. This causes barriers, which would otherwise be emitted after every pte (and pmd/pud) write, to be deferred until exiting lazy_mmu mode. For large systems, this is expected to significantly speed up fallback to pte-mapping the linear map for the case where the boot CPU has BBML2_NOABORT, but secondary CPUs do not. I haven't directly measured it, but this is equivalent to commit 1fcb7cea8a5f ("arm64: mm: Batch dsb and isb when populating pgtables"). Note that for the path from arch_kfence_init_pool(), we may sleep while allocating memory inside the lazy_mmu mode. Sleeping is not allowed by generic code inside lazy_mmu, but we know that the arm64 implementation is sleep-safe. So this is ok and follows the same pattern already used by split_kernel_leaf_mapping(). 
Signed-off-by: Ryan Roberts Reviewed-by: Yang Shi Signed-off-by: Will Deacon --- arch/arm64/mm/mmu.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a364ac2c9c61..652bb8c14035 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -832,8 +832,14 @@ static const struct mm_walk_ops split_to_ptes_ops = { static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp) { - return walk_kernel_page_table_range_lockless(start, end, + int ret; + + arch_enter_lazy_mmu_mode(); + ret = walk_kernel_page_table_range_lockless(start, end, &split_to_ptes_ops, NULL, &gfp); + arch_leave_lazy_mmu_mode(); + + return ret; } static bool linear_map_requires_bbml2 __initdata; From 53357f14f924a06cced46069755bb10c2a6891c1 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 6 Nov 2025 16:09:43 +0000 Subject: [PATCH 158/543] arm64: mm: Tidy up force_pte_mapping() Tidy up the implementation of force_pte_mapping() to make it easier to read and introduce the split_leaf_mapping_possible() helper to reduce code duplication in split_kernel_leaf_mapping() and arch_kfence_init_pool(). Suggested-by: David Hildenbrand (Red Hat) Signed-off-by: Ryan Roberts Reviewed-by: David Hildenbrand (Red Hat) Reviewed-by: Yang Shi Signed-off-by: Will Deacon --- arch/arm64/mm/mmu.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 652bb8c14035..2ba01dc8ef82 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -710,12 +710,26 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr) static inline bool force_pte_mapping(void) { - bool bbml2 = system_capabilities_finalized() ? + const bool bbml2 = system_capabilities_finalized() ? 
system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort(); - return (!bbml2 && (rodata_full || arm64_kfence_can_set_direct_map() || - is_realm_world())) || - debug_pagealloc_enabled(); + if (debug_pagealloc_enabled()) + return true; + if (bbml2) + return false; + return rodata_full || arm64_kfence_can_set_direct_map() || is_realm_world(); +} + +static inline bool split_leaf_mapping_possible(void) +{ + /* + * !BBML2_NOABORT systems should never run into scenarios where we would + * have to split. So exit early and let calling code detect it and raise + * a warning. + */ + if (!system_supports_bbml2_noabort()) + return false; + return !force_pte_mapping(); } static DEFINE_MUTEX(pgtable_split_lock); @@ -725,22 +739,11 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end) int ret; /* - * !BBML2_NOABORT systems should not be trying to change permissions on - * anything that is not pte-mapped in the first place. Just return early - * and let the permission change code raise a warning if not already - * pte-mapped. + * Exit early if the region is within a pte-mapped area or if we can't + * split. For the latter case, the permission change code will raise a + * warning if not already pte-mapped. */ - if (!system_supports_bbml2_noabort()) - return 0; - - /* - * If the region is within a pte-mapped area, there is no need to try to - * split. Additionally, CONFIG_DEBUG_PAGEALLOC and CONFIG_KFENCE may - * change permissions from atomic context so for those cases (which are - * always pte-mapped), we must not go any further because taking the - * mutex below may sleep. - */ - if (force_pte_mapping() || is_kfence_address((void *)start)) + if (!split_leaf_mapping_possible() || is_kfence_address((void *)start)) return 0; /* @@ -1039,7 +1042,7 @@ bool arch_kfence_init_pool(void) int ret; /* Exit early if we know the linear map is already pte-mapped. 
*/ - if (!system_supports_bbml2_noabort() || force_pte_mapping()) + if (!split_leaf_mapping_possible()) return true; /* Kfence pool is already pte-mapped for the early init case. */ From 62e72463ca714073962eda450e80c5d71dfb0dcb Mon Sep 17 00:00:00 2001 From: shechenglong Date: Fri, 31 Oct 2025 17:15:05 +0800 Subject: [PATCH 159/543] arm64: proton-pack: Drop print when !CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY Following the pattern established with other Spectre mitigations, do not print a message when the CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY Kconfig option is disabled. Suggested-by: Will Deacon Signed-off-by: shechenglong Signed-off-by: Will Deacon --- arch/arm64/kernel/proton-pack.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index f9a32dfde006..d833b7c1bba8 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -1042,8 +1042,6 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) if (arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) { /* No point mitigating Spectre-BHB alone. 
*/ - } else if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) { - pr_info_once("spectre-bhb mitigation disabled by compile time option\n"); } else if (cpu_mitigations_off() || __nospectre_bhb) { pr_info_once("spectre-bhb mitigation disabled by command line option\n"); } else if (supports_ecbhb(SCOPE_LOCAL_CPU)) { From 7f1635737823a6c0c412ccf3767a12bec642c10f Mon Sep 17 00:00:00 2001 From: shechenglong Date: Fri, 31 Oct 2025 17:15:06 +0800 Subject: [PATCH 160/543] arm64: proton-pack: Fix hard lockup due to print in scheduler context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Relocate the printk() calls from spectre_v4_mitigations_off() and spectre_v2_mitigations_off() into setup_system_capabilities() function, preventing hard lockups caused by printk calls in scheduler context: | _raw_spin_lock_nested+168 | ttwu_queue+180 (rq_lock(rq, &rf); 2nd acquiring the rq->__lock) | try_to_wake_up+548 | wake_up_process+32 | __up+88 | up+100 | __up_console_sem+96 | console_unlock+696 | vprintk_emit+428 | vprintk_default+64 | vprintk_func+220 | printk+104 | spectre_v4_enable_task_mitigation+344 | __switch_to+100 | __schedule+1028 (rq_lock(rq, &rf); 1st acquiring the rq->__lock) | schedule_idle+48 | do_idle+388 | cpu_startup_entry+44 | secondary_start_kernel+352 Suggested-by: Mark Rutland Suggested-by: Catalin Marinas Suggested-by: Will Deacon Signed-off-by: shechenglong Signed-off-by: Will Deacon --- arch/arm64/include/asm/spectre.h | 1 + arch/arm64/kernel/cpufeature.c | 6 ++++++ arch/arm64/kernel/proton-pack.c | 33 +++++++++++++++++--------------- 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index 8fef12626090..900454aaa292 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -117,6 +117,7 @@ void spectre_bhb_patch_wa3(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst); void 
spectre_bhb_patch_clearbhb(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst); +void spectre_print_disabled_mitigations(void); #endif /* __ASSEMBLY__ */ #endif /* __ASM_SPECTRE_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 5ed401ff79e3..e25b0f84a22d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -95,6 +95,7 @@ #include #include +#include /* Kernel representation of AT_HWCAP and AT_HWCAP2 */ static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly; @@ -3875,6 +3876,11 @@ static void __init setup_system_capabilities(void) */ if (system_uses_ttbr0_pan()) pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n"); + + /* + * Report Spectre mitigations status. + */ + spectre_print_disabled_mitigations(); } void __init setup_system_features(void) diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index d833b7c1bba8..c7d70d04c164 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -91,12 +91,7 @@ early_param("nospectre_v2", parse_spectre_v2_param); static bool spectre_v2_mitigations_off(void) { - bool ret = __nospectre_v2 || cpu_mitigations_off(); - - if (ret) - pr_info_once("spectre-v2 mitigation disabled by command line option\n"); - - return ret; + return __nospectre_v2 || cpu_mitigations_off(); } static const char *get_bhb_affected_string(enum mitigation_state bhb_state) @@ -421,13 +416,8 @@ early_param("ssbd", parse_spectre_v4_param); */ static bool spectre_v4_mitigations_off(void) { - bool ret = cpu_mitigations_off() || - __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED; - - if (ret) - pr_info_once("spectre-v4 mitigation disabled by command-line option\n"); - - return ret; + return cpu_mitigations_off() || + __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED; } /* Do we need to toggle the mitigation state on entry to/exit from the kernel? 
*/ @@ -1042,8 +1032,6 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) if (arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) { /* No point mitigating Spectre-BHB alone. */ - } else if (cpu_mitigations_off() || __nospectre_bhb) { - pr_info_once("spectre-bhb mitigation disabled by command line option\n"); } else if (supports_ecbhb(SCOPE_LOCAL_CPU)) { state = SPECTRE_MITIGATED; set_bit(BHB_HW, &system_bhb_mitigations); @@ -1197,3 +1185,18 @@ void unpriv_ebpf_notify(int new_state) pr_err("WARNING: %s", EBPF_WARN); } #endif + +void spectre_print_disabled_mitigations(void) +{ + /* Keep a single copy of the common message suffix to avoid duplication. */ + const char *spectre_disabled_suffix = "mitigation disabled by command-line option\n"; + + if (spectre_v2_mitigations_off()) + pr_info("spectre-v2 %s", spectre_disabled_suffix); + + if (spectre_v4_mitigations_off()) + pr_info("spectre-v4 %s", spectre_disabled_suffix); + + if (__nospectre_bhb || cpu_mitigations_off()) + pr_info("spectre-bhb %s", spectre_disabled_suffix); +} From 6d4a0fbd34a40c9f877b136de874dc3498031309 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Barna=C5=9B?= Date: Mon, 22 Sep 2025 13:04:26 +0000 Subject: [PATCH 161/543] arm64: Fail module loading if dynamic SCS patching fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Disallow a module to load if SCS dynamic patching fails for its code. For module loading, instead of running a dry-run to check for patching errors, try to run patching in the first run and propagate any errors so module loading will fail. 
Signed-off-by: Adrian Barnaś Reviewed-by: Ard Biesheuvel Signed-off-by: Will Deacon --- arch/arm64/include/asm/scs.h | 2 +- arch/arm64/kernel/module.c | 12 ++++++++++-- arch/arm64/kernel/pi/map_kernel.c | 2 +- arch/arm64/kernel/pi/patch-scs.c | 10 ++++++---- arch/arm64/kernel/pi/pi.h | 2 +- 5 files changed, 19 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h index a76f9b387a26..c59f6324f2bb 100644 --- a/arch/arm64/include/asm/scs.h +++ b/arch/arm64/include/asm/scs.h @@ -53,7 +53,7 @@ enum { EDYNSCS_INVALID_CFA_OPCODE = 4, }; -int __pi_scs_patch(const u8 eh_frame[], int size); +int __pi_scs_patch(const u8 eh_frame[], int size, bool skip_dry_run); #endif /* __ASSEMBLY __ */ diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index d6d443c4a01a..01acbff8a1ae 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -495,10 +495,18 @@ int module_finalize(const Elf_Ehdr *hdr, if (scs_is_dynamic()) { s = find_section(hdr, sechdrs, ".init.eh_frame"); if (s) { - ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size); - if (ret) + /* + * Because we can reject modules that are malformed + * so SCS patching fails, skip dry run and try to patch + * it in place. If patching fails, the module would not + * be loaded anyway. 
+ */ + ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size, true); + if (ret) { pr_err("module %s: error occurred during dynamic SCS patching (%d)\n", me->name, ret); + return -ENOEXEC; + } } } diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index e8ddbde31a83..659297f87cfa 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -104,7 +104,7 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) if (enable_scs) { scs_patch(__eh_frame_start + va_offset, - __eh_frame_end - __eh_frame_start); + __eh_frame_end - __eh_frame_start, false); asm("ic ialluis"); dynamic_scs_is_enabled = true; diff --git a/arch/arm64/kernel/pi/patch-scs.c b/arch/arm64/kernel/pi/patch-scs.c index 55d0cd64ef71..bbe7d30ed12b 100644 --- a/arch/arm64/kernel/pi/patch-scs.c +++ b/arch/arm64/kernel/pi/patch-scs.c @@ -225,7 +225,7 @@ static int scs_handle_fde_frame(const struct eh_frame *frame, return 0; } -int scs_patch(const u8 eh_frame[], int size) +int scs_patch(const u8 eh_frame[], int size, bool skip_dry_run) { int code_alignment_factor = 1; bool fde_use_sdata8 = false; @@ -277,11 +277,13 @@ int scs_patch(const u8 eh_frame[], int size) } } else { ret = scs_handle_fde_frame(frame, code_alignment_factor, - fde_use_sdata8, true); + fde_use_sdata8, !skip_dry_run); if (ret) return ret; - scs_handle_fde_frame(frame, code_alignment_factor, - fde_use_sdata8, false); + + if (!skip_dry_run) + scs_handle_fde_frame(frame, code_alignment_factor, + fde_use_sdata8, false); } p += sizeof(frame->size) + frame->size; diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h index 08ef9f80456b..aec3172d4003 100644 --- a/arch/arm64/kernel/pi/pi.h +++ b/arch/arm64/kernel/pi/pi.h @@ -27,7 +27,7 @@ extern pgd_t init_pg_dir[], init_pg_end[]; void init_feature_override(u64 boot_status, const void *fdt, int chosen); u64 kaslr_early_init(void *fdt, int chosen); void relocate_kernel(u64 offset); -int scs_patch(const u8 
eh_frame[], int size); +int scs_patch(const u8 eh_frame[], int size, bool skip_dry_run); void map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, From 8e8ae788964aa2573b4335026db4068540fa6a86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Barna=C5=9B?= Date: Mon, 22 Sep 2025 13:04:27 +0000 Subject: [PATCH 162/543] arm64: Reject modules with internal alternative callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During module loading, check if a callback function used by the alternatives specified in the '.altinstruction' ELF section (if present) is located in core kernel .text. If not fail module loading before callback is called. Reported-by: Fanqin Cui Closes: https://lore.kernel.org/all/20250807072700.348514-1-fanqincui@163.com/ Signed-off-by: Adrian Barnaś Reviewed-by: Ard Biesheuvel [will: Folded in 'noinstr' tweak from Mark] Signed-off-by: Will Deacon --- arch/arm64/include/asm/alternative.h | 7 +++++-- arch/arm64/kernel/alternative.c | 19 ++++++++++++------- arch/arm64/kernel/module.c | 9 +++++++-- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index 00d97b8a757f..51746005239b 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -26,9 +26,12 @@ void __init apply_alternatives_all(void); bool alternative_is_applied(u16 cpucap); #ifdef CONFIG_MODULES -void apply_alternatives_module(void *start, size_t length); +int apply_alternatives_module(void *start, size_t length); #else -static inline void apply_alternatives_module(void *start, size_t length) { } +static inline int apply_alternatives_module(void *start, size_t length) +{ + return 0; +} #endif void alt_cb_patch_nops(struct alt_instr *alt, __le32 *origptr, diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 
8ff6610af496..f5ec7e7c1d3f 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -139,9 +139,9 @@ static noinstr void clean_dcache_range_nopatch(u64 start, u64 end) } while (cur += d_size, cur < end); } -static void __apply_alternatives(const struct alt_region *region, - bool is_module, - unsigned long *cpucap_mask) +static int __apply_alternatives(const struct alt_region *region, + bool is_module, + unsigned long *cpucap_mask) { struct alt_instr *alt; __le32 *origptr, *updptr; @@ -166,10 +166,13 @@ static void __apply_alternatives(const struct alt_region *region, updptr = is_module ? origptr : lm_alias(origptr); nr_inst = alt->orig_len / AARCH64_INSN_SIZE; - if (ALT_HAS_CB(alt)) + if (ALT_HAS_CB(alt)) { alt_cb = ALT_REPL_PTR(alt); - else + if (is_module && !core_kernel_text((unsigned long)alt_cb)) + return -ENOEXEC; + } else { alt_cb = patch_alternative; + } alt_cb(alt, origptr, updptr, nr_inst); @@ -193,6 +196,8 @@ static void __apply_alternatives(const struct alt_region *region, bitmap_and(applied_alternatives, applied_alternatives, system_cpucaps, ARM64_NCAPS); } + + return 0; } static void __init apply_alternatives_vdso(void) @@ -277,7 +282,7 @@ void __init apply_boot_alternatives(void) } #ifdef CONFIG_MODULES -void apply_alternatives_module(void *start, size_t length) +int apply_alternatives_module(void *start, size_t length) { struct alt_region region = { .begin = start, @@ -287,7 +292,7 @@ void apply_alternatives_module(void *start, size_t length) bitmap_fill(all_capabilities, ARM64_NCAPS); - __apply_alternatives(&region, true, &all_capabilities[0]); + return __apply_alternatives(&region, true, &all_capabilities[0]); } #endif diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 01acbff8a1ae..24adb581af0e 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -489,8 +489,13 @@ int module_finalize(const Elf_Ehdr *hdr, int ret; s = find_section(hdr, sechdrs, ".altinstructions"); - if (s) - 
apply_alternatives_module((void *)s->sh_addr, s->sh_size); + if (s) { + ret = apply_alternatives_module((void *)s->sh_addr, s->sh_size); + if (ret < 0) { + pr_err("module %s: error occurred when applying alternatives\n", me->name); + return ret; + } + } if (scs_is_dynamic()) { s = find_section(hdr, sechdrs, ".init.eh_frame"); From 62b9ca1706e1bbb60d945a58de7c7b5826f6b2a2 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Wed, 5 Nov 2025 22:51:05 -0600 Subject: [PATCH 163/543] PM: hibernate: Emit an error when image writing fails If image writing fails, a return code is passed up to the caller, but none of the callers log anything to the log and so the only record of it is the return code that userspace gets. Adjust the logging so that the image size and speed of writing is only emitted on success and if there is an error, it's saved to the logs. Fixes: a06c6f5d3cc9 ("PM: hibernate: Move to crypto APIs for LZO compression") Reported-by: Askar Safin Closes: https://lore.kernel.org/linux-pm/20251105180506.137448-1-safinaskar@gmail.com/ Signed-off-by: Mario Limonciello (AMD) Tested-by: Askar Safin Cc: 6.9+ # 6.9+ [ rjw: Added missing braces after "else", changelog edits ] Link: https://patch.msgid.link/20251106045158.3198061-2-superm1@kernel.org Signed-off-by: Rafael J. 
Wysocki --- kernel/power/swap.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 0beff7eeaaba..7daa716d2cb1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -877,11 +877,14 @@ static int save_compressed_image(struct swap_map_handle *handle, stop = ktime_get(); if (!ret) ret = err2; - if (!ret) + if (!ret) { + swsusp_show_speed(start, stop, nr_to_write, "Wrote"); + pr_info("Image size after compression: %d kbytes\n", + (atomic_read(&compressed_size) / 1024)); pr_info("Image saving done\n"); - swsusp_show_speed(start, stop, nr_to_write, "Wrote"); - pr_info("Image size after compression: %d kbytes\n", - (atomic_read(&compressed_size) / 1024)); + } else { + pr_err("Image saving failed: %d\n", ret); + } out_clean: hib_finish_batch(&hb); From 66ededc694f1d06a71ca35a3c8e3689e9b85b3ce Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Wed, 5 Nov 2025 22:51:06 -0600 Subject: [PATCH 164/543] PM: hibernate: Use atomic64_t for compressed_size variable `compressed_size` can overflow, showing nonsensical values. Change from `atomic_t` to `atomic64_t` to prevent overflow. Fixes: a06c6f5d3cc9 ("PM: hibernate: Move to crypto APIs for LZO compression") Reported-by: Askar Safin Closes: https://lore.kernel.org/linux-pm/20251105180506.137448-1-safinaskar@gmail.com/ Signed-off-by: Mario Limonciello (AMD) Tested-by: Askar Safin Cc: 6.9+ # 6.9+ Link: https://patch.msgid.link/20251106045158.3198061-3-superm1@kernel.org Signed-off-by: Rafael J. 
Wysocki --- kernel/power/swap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7daa716d2cb1..e0441483dbee 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -635,7 +635,7 @@ struct cmp_data { }; /* Indicates the image size after compression */ -static atomic_t compressed_size = ATOMIC_INIT(0); +static atomic64_t compressed_size = ATOMIC_INIT(0); /* * Compression function that runs in its own thread. @@ -664,7 +664,7 @@ static int compress_threadfn(void *data) d->ret = crypto_acomp_compress(d->cr); d->cmp_len = d->cr->dlen; - atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len); + atomic64_add(d->cmp_len, &compressed_size); atomic_set_release(&d->stop, 1); wake_up(&d->done); } @@ -696,7 +696,7 @@ static int save_compressed_image(struct swap_map_handle *handle, hib_init_batch(&hb); - atomic_set(&compressed_size, 0); + atomic64_set(&compressed_size, 0); /* * We'll limit the number of threads for compression to limit memory @@ -879,8 +879,8 @@ static int save_compressed_image(struct swap_map_handle *handle, ret = err2; if (!ret) { swsusp_show_speed(start, stop, nr_to_write, "Wrote"); - pr_info("Image size after compression: %d kbytes\n", - (atomic_read(&compressed_size) / 1024)); + pr_info("Image size after compression: %lld kbytes\n", + (atomic64_read(&compressed_size) / 1024)); pr_info("Image saving done\n"); } else { pr_err("Image saving failed: %d\n", ret); From 0b6c10cb8479d0d1b7b208277df2e2afe082d4bd Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Wed, 5 Nov 2025 22:51:07 -0600 Subject: [PATCH 165/543] PM: hibernate: Fix style issues in save_compressed_image() Address two issues indicated by checkpatch: - Trailing statements should be on next line. - Prefer 'unsigned int' to bare use of 'unsigned'. 
Signed-off-by: Mario Limonciello (AMD) [ rjw: Changelog edits ] Link: https://patch.msgid.link/20251106045158.3198061-4-superm1@kernel.org Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index e0441483dbee..70ae21f7370d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -689,7 +689,7 @@ static int save_compressed_image(struct swap_map_handle *handle, ktime_t start; ktime_t stop; size_t off; - unsigned thr, run_threads, nr_threads; + unsigned int thr, run_threads, nr_threads; unsigned char *page = NULL; struct cmp_data *data = NULL; struct crc_data *crc = NULL; @@ -902,7 +902,8 @@ static int save_compressed_image(struct swap_map_handle *handle, } vfree(data); } - if (page) free_page((unsigned long)page); + if (page) + free_page((unsigned long)page); return ret; } From b6cfddd26ec55e865b4715f73e9bbb17a15091ed Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 31 Oct 2025 10:32:24 -0700 Subject: [PATCH 166/543] cxl: Adjust offset calculation for poison injection The HPA to DPA translation for poison injection assumes that the base address starts from where the CXL region begins. When the extended linear cache is active, the offset can be within the DRAM region. Adjust the offset so that it correctly reflects the offset within the CXL region. 
[ dj: Add fixes tag from Alison ] Fixes: c3dd67681c70 ("cxl/region: Add inject and clear poison by region offset") Link: https://patch.msgid.link/20251031173224.3537030-5-dave.jiang@intel.com Reviewed-by: Alison Schofield Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index b06fee1978ba..41b64d871c5a 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3702,6 +3702,7 @@ static int cxl_region_debugfs_poison_inject(void *data, u64 offset) if (validate_region_offset(cxlr, offset)) return -EINVAL; + offset -= cxlr->params.cache_size; rc = region_offset_to_dpa_result(cxlr, offset, &result); if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) { dev_dbg(&cxlr->dev, @@ -3734,6 +3735,7 @@ static int cxl_region_debugfs_poison_clear(void *data, u64 offset) if (validate_region_offset(cxlr, offset)) return -EINVAL; + offset -= cxlr->params.cache_size; rc = region_offset_to_dpa_result(cxlr, offset, &result); if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) { dev_dbg(&cxlr->dev, From 4fe5934db4a7187d358f1af1b3ef9b6dd59bce58 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Fri, 7 Nov 2025 13:11:41 +0530 Subject: [PATCH 167/543] ACPI: CPPC: Detect preferred core availability on online CPUs Commit 279f838a61f9 ("x86/amd: Detect preferred cores in amd_get_boost_ratio_numerator()") introduced the ability to detect the preferred core on AMD platforms by checking if there are at least two distinct highest_perf values. However, it uses for_each_present_cpu() to iterate through all the CPUs in the platform, which is problematic when the kernel is booted with "nosmt=force" commandline option. Hence limit the search to only the online CPUs. 
Fixes: 279f838a61f9 ("x86/amd: Detect preferred cores in amd_get_boost_ratio_numerator()") Reported-by: Christopher Harris Closes: https://lore.kernel.org/lkml/CAM+eXpdDT7KjLV0AxEwOLkSJ2QtrsvGvjA2cCHvt1d0k2_C4Cw@mail.gmail.com/ Reviewed-by: "Mario Limonciello (AMD) (kernel.org)" Tested-by: Chrisopher Harris Signed-off-by: Gautham R. Shenoy Link: https://patch.msgid.link/20251107074145.2340-2-gautham.shenoy@amd.com Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/cppc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index 7047124490f6..d7c8ef1e354d 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -196,7 +196,7 @@ int amd_detect_prefcore(bool *detected) break; } - for_each_present_cpu(cpu) { + for_each_online_cpu(cpu) { u32 tmp; int ret; From 6dd3b8a709a130a4d55c866af9804c81b8486d28 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Fri, 7 Nov 2025 13:11:42 +0530 Subject: [PATCH 168/543] ACPI: CPPC: Check _CPC validity for only the online CPUs per_cpu(cpc_desc_ptr, cpu) object is initialized for only the online CPUs via acpi_soft_cpu_online() --> __acpi_processor_start() --> acpi_cppc_processor_probe(). However the function acpi_cpc_valid() checks for the validity of the _CPC object for all the present CPUs. This breaks when the kernel is booted with "nosmt=force". Hence check the validity of the _CPC objects of only the online CPUs. Fixes: 2aeca6bd0277 ("ACPI: CPPC: Check present CPUs for determining _CPC is valid") Reported-by: Christopher Harris Closes: https://lore.kernel.org/lkml/CAM+eXpdDT7KjLV0AxEwOLkSJ2QtrsvGvjA2cCHvt1d0k2_C4Cw@mail.gmail.com/ Suggested-by: Mario Limonciello Reviewed-by: "Mario Limonciello (AMD) (kernel.org)" Tested-by: Chrisopher Harris Signed-off-by: Gautham R. Shenoy Link: https://patch.msgid.link/20251107074145.2340-3-gautham.shenoy@amd.com Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/cppc_acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 6c684e54fe01..da6b35ac8c87 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -460,7 +460,7 @@ bool acpi_cpc_valid(void) if (acpi_disabled) return false; - for_each_present_cpu(cpu) { + for_each_online_cpu(cpu) { cpc_ptr = per_cpu(cpc_desc_ptr, cpu); if (!cpc_ptr) return false; From 8821c8e80a65bc4eb73daf63b34aac6b8ad69461 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Fri, 7 Nov 2025 13:11:43 +0530 Subject: [PATCH 169/543] ACPI: CPPC: Perform fast check switch only for online CPUs per_cpu(cpc_desc_ptr, cpu) object is initialized for only the online CPUs via acpi_soft_cpu_online() --> __acpi_processor_start() --> acpi_cppc_processor_probe(). However the function cppc_allow_fast_switch() checks for the validity of the _CPC object for all the present CPUs. This breaks when the kernel is booted with "nosmt=force". Check fast_switch capability only on online CPUs Fixes: 15eece6c5b05 ("ACPI: CPPC: Fix NULL pointer dereference when nosmp is used") Reviewed-by: "Mario Limonciello (AMD) (kernel.org)" Signed-off-by: Gautham R. Shenoy Link: https://patch.msgid.link/20251107074145.2340-4-gautham.shenoy@amd.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/cppc_acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index da6b35ac8c87..7492c9922866 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -476,7 +476,7 @@ bool cppc_allow_fast_switch(void) struct cpc_desc *cpc_ptr; int cpu; - for_each_present_cpu(cpu) { + for_each_online_cpu(cpu) { cpc_ptr = per_cpu(cpc_desc_ptr, cpu); desired_reg = &cpc_ptr->cpc_regs[DESIRED_PERF]; if (!CPC_IN_SYSTEM_MEMORY(desired_reg) && From 0fce75870666b46b700cfbd3216380b422f975da Mon Sep 17 00:00:00 2001 From: "Gautham R. 
Shenoy" Date: Fri, 7 Nov 2025 13:11:44 +0530 Subject: [PATCH 170/543] ACPI: CPPC: Limit perf ctrs in PCC check only to online CPUs per_cpu(cpc_desc_ptr, cpu) object is initialized for only the online CPU via acpi_soft_cpu_online() --> __acpi_processor_start() --> acpi_cppc_processor_probe(). However the function cppc_perf_ctrs_in_pcc() checks if the CPPC perf-ctrs are in a PCC region for all the present CPUs, which breaks when the kernel is booted with "nosmt=force". Hence, limit the check only to the online CPUs. Fixes: ae2df912d1a5 ("ACPI: CPPC: Disable FIE if registers in PCC regions") Reviewed-by: "Mario Limonciello (AMD) (kernel.org)" Signed-off-by: Gautham R. Shenoy Link: https://patch.msgid.link/20251107074145.2340-5-gautham.shenoy@amd.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/cppc_acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 7492c9922866..3bdeeee3414e 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1435,7 +1435,7 @@ bool cppc_perf_ctrs_in_pcc(void) { int cpu; - for_each_present_cpu(cpu) { + for_each_online_cpu(cpu) { struct cpc_register_resource *ref_perf_reg; struct cpc_desc *cpc_desc; From 2cf95b9baa52262bfb645cb3c04f902dd50c29e2 Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Thu, 23 Oct 2025 17:01:08 +0530 Subject: [PATCH 171/543] EDAC/versalnet: Handle split messages for non-standard errors The current code assumes that only DDR errors have split messages. Ensure proper logging of non-standard event errors that may be split across multiple messages too. [ bp: Massage, move comment too, fix it up. 
] Fixes: d5fe2fec6c40 ("EDAC: Add a driver for the AMD Versal NET DDR controller") Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov (AMD) Link: https://patch.msgid.link/20251023113108.3467132-1-shubhrajyoti.datta@amd.com --- drivers/edac/versalnet_edac.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/edac/versalnet_edac.c b/drivers/edac/versalnet_edac.c index 1ded4c3f0213..1a1092793092 100644 --- a/drivers/edac/versalnet_edac.c +++ b/drivers/edac/versalnet_edac.c @@ -605,21 +605,23 @@ static int rpmsg_cb(struct rpmsg_device *rpdev, void *data, length = result[MSG_ERR_LENGTH]; offset = result[MSG_ERR_OFFSET]; + /* + * The data can come in two stretches. Construct the regs from two + * messages. The offset indicates the offset from which the data is to + * be taken. + */ + for (i = 0 ; i < length; i++) { + k = offset + i; + j = ERROR_DATA + i; + mc_priv->regs[k] = result[j]; + } + if (result[TOTAL_ERR_LENGTH] > length) { if (!mc_priv->part_len) mc_priv->part_len = length; else mc_priv->part_len += length; - /* - * The data can come in 2 stretches. 
Construct the regs from 2 - * messages the offset indicates the offset from which the data is to - * be taken - */ - for (i = 0 ; i < length; i++) { - k = offset + i; - j = ERROR_DATA + i; - mc_priv->regs[k] = result[j]; - } + if (mc_priv->part_len < result[TOTAL_ERR_LENGTH]) return 0; mc_priv->part_len = 0; @@ -705,7 +707,7 @@ static int rpmsg_cb(struct rpmsg_device *rpdev, void *data, /* Convert to bytes */ length = result[TOTAL_ERR_LENGTH] * 4; log_non_standard_event(sec_type, &amd_versalnet_guid, mc_priv->message, - sec_sev, (void *)&result[ERROR_DATA], length); + sec_sev, (void *)&mc_priv->regs, length); return 0; } From 4b93d211bbffd3dce76664d95f2306d23e7215ce Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Thu, 30 Oct 2025 08:02:28 +0530 Subject: [PATCH 172/543] ACPI: MRRM: Fix memory leaks and improve error handling Add proper error handling and resource cleanup to prevent memory leaks in add_boot_memory_ranges(). The function now checks for NULL return from kobject_create_and_add(), uses local buffer for range names to avoid dynamic allocation, and implements a cleanup path that removes previously created sysfs groups and kobjects on failure. This prevents resource leaks when kobject creation or sysfs group creation fails during boot memory range initialization. Signed-off-by: Kaushlendra Kumar Reviewed-by: Tony Luck Link: https://patch.msgid.link/20251030023228.3956296-1-kaushlendra.kumar@intel.com Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/acpi_mrrm.c | 51 +++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/drivers/acpi/acpi_mrrm.c b/drivers/acpi/acpi_mrrm.c index a6dbf623e557..6d69554c940e 100644 --- a/drivers/acpi/acpi_mrrm.c +++ b/drivers/acpi/acpi_mrrm.c @@ -152,26 +152,49 @@ ATTRIBUTE_GROUPS(memory_range); static __init int add_boot_memory_ranges(void) { - struct kobject *pkobj, *kobj; + struct kobject *pkobj, *kobj, **kobjs; int ret = -EINVAL; - char *name; + char name[16]; + int i; pkobj = kobject_create_and_add("memory_ranges", acpi_kobj); + if (!pkobj) + return -ENOMEM; - for (int i = 0; i < mrrm_mem_entry_num; i++) { - name = kasprintf(GFP_KERNEL, "range%d", i); - if (!name) { - ret = -ENOMEM; - break; - } - - kobj = kobject_create_and_add(name, pkobj); - - ret = sysfs_create_groups(kobj, memory_range_groups); - if (ret) - return ret; + kobjs = kcalloc(mrrm_mem_entry_num, sizeof(*kobjs), GFP_KERNEL); + if (!kobjs) { + kobject_put(pkobj); + return -ENOMEM; } + for (i = 0; i < mrrm_mem_entry_num; i++) { + scnprintf(name, sizeof(name), "range%d", i); + kobj = kobject_create_and_add(name, pkobj); + if (!kobj) { + ret = -ENOMEM; + goto cleanup; + } + + ret = sysfs_create_groups(kobj, memory_range_groups); + if (ret) { + kobject_put(kobj); + goto cleanup; + } + kobjs[i] = kobj; + } + + kfree(kobjs); + return 0; + +cleanup: + for (int j = 0; j < i; j++) { + if (kobjs[j]) { + sysfs_remove_groups(kobjs[j], memory_range_groups); + kobject_put(kobjs[j]); + } + } + kfree(kobjs); + kobject_put(pkobj); return ret; } From 3ad1b71fdc5707d14332d9ae710a237de936be9b Mon Sep 17 00:00:00 2001 From: Feng Jiang Date: Wed, 29 Oct 2025 17:44:28 +0800 Subject: [PATCH 173/543] riscv: Build loader.bin exclusively for Canaan K210 According to the explanation in commit ef10bdf9c3e6 ("riscv: Kconfig.socs: Split ARCH_CANAAN and SOC_CANAAN_K210"), loader.bin is a special feature of the Canaan K210 and is not applicable to other SoCs. 
Fixes: e79dfcbfb902 ("riscv: make image compression configurable") Signed-off-by: Feng Jiang Reviewed-by: Emil Renner Berthing Link: https://lore.kernel.org/r/20251029094429.553842-1-jiangfeng@kylinos.cn Signed-off-by: Paul Walmsley --- arch/riscv/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index ecf2fcce2d92..3998d4036f15 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -159,7 +159,7 @@ boot-image-$(CONFIG_KERNEL_LZO) := Image.lzo boot-image-$(CONFIG_KERNEL_ZSTD) := Image.zst boot-image-$(CONFIG_KERNEL_XZ) := Image.xz ifdef CONFIG_RISCV_M_MODE -boot-image-$(CONFIG_ARCH_CANAAN) := loader.bin +boot-image-$(CONFIG_SOC_CANAAN_K210) := loader.bin endif boot-image-$(CONFIG_EFI_ZBOOT) := vmlinuz.efi boot-image-$(CONFIG_XIP_KERNEL) := xipImage From 5e8632987dd1882ed4d1e1039032ab1b0c1ec12b Mon Sep 17 00:00:00 2001 From: Feng Jiang Date: Wed, 29 Oct 2025 17:44:29 +0800 Subject: [PATCH 174/543] riscv: Remove redundant judgment for the default build target The value of KBUILD_IMAGE is derived from $(boot-image-y), so there's no need for redundant checks before this. 
Signed-off-by: Feng Jiang Reviewed-by: Emil Renner Berthing Link: https://lore.kernel.org/r/20251029094429.553842-2-jiangfeng@kylinos.cn Signed-off-by: Paul Walmsley --- arch/riscv/Makefile | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 3998d4036f15..4c6de57f65ef 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -134,21 +134,6 @@ endif CHECKFLAGS += -D__riscv -D__riscv_xlen=$(BITS) # Default target when executing plain make -boot := arch/riscv/boot -ifeq ($(CONFIG_XIP_KERNEL),y) -KBUILD_IMAGE := $(boot)/xipImage -else -ifeq ($(CONFIG_RISCV_M_MODE)$(CONFIG_SOC_CANAAN_K210),yy) -KBUILD_IMAGE := $(boot)/loader.bin -else -ifeq ($(CONFIG_EFI_ZBOOT),) -KBUILD_IMAGE := $(boot)/Image.gz -else -KBUILD_IMAGE := $(boot)/vmlinuz.efi -endif -endif -endif - boot := arch/riscv/boot boot-image-y := Image boot-image-$(CONFIG_KERNEL_BZIP2) := Image.bz2 From dc20452e6caf962f04ede7f364267b0c37784ab4 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 7 Nov 2025 14:56:59 -0700 Subject: [PATCH 175/543] riscv: Fix CONFIG_AS_HAS_INSN for new .insn usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit 44aa25c000b4 ("riscv: asm: use .insn for making custom instructions"), builds using LLVM older than 19 or binutils older than 2.38 fail with: arch/riscv/include/asm/vdso/processor.h: Assembler messages: arch/riscv/include/asm/vdso/processor.h:27: Error: unrecognized opcode `0x100000f' arch/riscv/include/asm/vdso/processor.h:27: Error: unrecognized opcode `0x100000f' arch/riscv/include/asm/vdso/processor.h:27: Error: unrecognized opcode `0x100000f' arch/riscv/include/asm/vdso/processor.h:27: Error: unrecognized opcode `0x100000f' make[4]: *** [scripts/Makefile.build:287: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1 In file included from :4: In file included from lib/vdso/gettimeofday.c:6: In file included from include/vdso/datapage.h:21: In file
included from include/vdso/processor.h:10: arch/riscv/include/asm/vdso/processor.h:23:2: error: expected instruction format 23 | ALT_RISCV_PAUSE(); | ^ arch/riscv/include/asm/errata_list.h:47:3: note: expanded from macro 'ALT_RISCV_PAUSE' 47 | RISCV_PAUSE, /* Original RISC‑V pause insn */ \ | ^ arch/riscv/include/asm/insn-def.h:259:21: note: expanded from macro 'RISCV_PAUSE' 259 | #define RISCV_PAUSE ASM_INSN_I("0x100000f") | ^ arch/riscv/include/asm/asm.h:16:26: note: expanded from macro 'ASM_INSN_I' 16 | #define ASM_INSN_I(__x) ".insn " __x | ^ :5:7: note: instantiated into assembly here 5 | .insn 0x100000f | ^ binutils gained support for '.insn ' in 2.38 [1] and LLVM gained support in 19 [2]. Adjust the test for CONFIG_AS_HAS_INSN to ensure that all versions of .insn are supported before being used. Fixes: 44aa25c000b4 ("riscv: asm: use .insn for making custom instructions") Link: https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=a262b82fdbf4cda3b0648b1adc32245ca3f78b7a [1] Link: https://github.com/llvm/llvm-project/commit/2a086dce691e3cc34a2fc27f4fb255bb2cbbfac9 [2] Suggested-by: Andrew Jones Signed-off-by: Nathan Chancellor Reviewed-by: Andrew Jones Link: https://patch.msgid.link/20251107-riscv-fix-new-insn-usage-v1-1-9a186c5928a0@kernel.org Signed-off-by: Paul Walmsley --- arch/riscv/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 22cda9c452d2..fadec20b87a8 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -367,7 +367,7 @@ config RISCV_NONSTANDARD_CACHE_OPS systems to handle cache management. 
config AS_HAS_INSN - def_bool $(as-instr,.insn r 51$(comma) 0$(comma) 0$(comma) t0$(comma) t0$(comma) zero) + def_bool $(as-instr,.insn 0x100000f) config AS_HAS_OPTION_ARCH # https://github.com/llvm/llvm-project/commit/9e8ed3403c191ab9c4903e8eeb8f732ff8a43cb4 From 4da4e4bde1c453ac5cc2dce5def81d504ae257ee Mon Sep 17 00:00:00 2001 From: Nate Karstens Date: Thu, 6 Nov 2025 16:28:33 -0600 Subject: [PATCH 176/543] strparser: Fix signed/unsigned mismatch bug The `len` member of the sk_buff is an unsigned int. This is cast to `ssize_t` (a signed type) for the first sk_buff in the comparison, but not the second sk_buff. On 32-bit systems, this can result in an integer underflow for certain values because unsigned arithmetic is being used. This appears to be an oversight: if the intention was to use unsigned arithmetic, then the first cast would have been omitted. The change ensures both len values are cast to `ssize_t`. The underflow causes an issue with ktls when multiple TLS PDUs are included in a single TCP segment. The mainline kernel does not use strparser for ktls anymore, but this is still useful for other features that still use strparser, and for backporting. 
Signed-off-by: Nate Karstens Cc: stable@vger.kernel.org Fixes: 43a0c6751a32 ("strparser: Stream parser for messages") Reviewed-by: Jacob Keller Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20251106222835.1871628-1-nate.karstens@garmin.com Signed-off-by: Jakub Kicinski --- net/strparser/strparser.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 43b1f558b33d..e659fea2da70 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -238,7 +238,7 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, strp_parser_err(strp, -EMSGSIZE, desc); break; } else if (len <= (ssize_t)head->len - - skb->len - stm->strp.offset) { + (ssize_t)skb->len - stm->strp.offset) { /* Length must be into new skb (and also * greater than zero) */ From 57531b3416448d1ced36a2a974a4085ec43d57b0 Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Thu, 6 Nov 2025 17:12:09 +0100 Subject: [PATCH 177/543] selftests: net: local_termination: Wait for interfaces to come up It seems that most of the tests prepare the interfaces once before the test run (setup_prepare()), rely on setup_wait() to wait for link and only then run the test(s). local_termination brings the physical interfaces down and up during test run but never waits for them to come up. If the auto-negotiation takes some seconds, the first test packets are lost, which leads to false-negative test results. Use setup_wait() in run_test() to make sure auto-negotiation has been completed after all simple_if_init() calls on physical interfaces and test packets will not be lost because of the race against link establishment.
Fixes: 90b9566aa5cd3f ("selftests: forwarding: add a test for local_termination.sh") Reviewed-by: Vladimir Oltean Signed-off-by: Alexander Sverdlin Link: https://patch.msgid.link/20251106161213.459501-1-alexander.sverdlin@siemens.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/local_termination.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index ecd34f364125..892895659c7e 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -176,6 +176,8 @@ run_test() local rcv_dmac=$(mac_get $rcv_if_name) local should_receive + setup_wait + tcpdump_start $rcv_if_name mc_route_prepare $send_if_name From ad17e7e92a7c52ce70bb764813fcf99464f96903 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Thu, 6 Nov 2025 10:14:21 +0800 Subject: [PATCH 178/543] net: fec: correct rx_bytes statistic for the case SHIFT16 is set Two additional bytes in front of each frame received into the RX FIFO if SHIFT16 is set, so we need to subtract the extra two bytes from pkt_len to correct the statistic of rx_bytes. 
Fixes: 3ac72b7b63d5 ("net: fec: align IP header in hardware") Signed-off-by: Wei Fang Reviewed-by: Frank Li Link: https://patch.msgid.link/20251106021421.2096585-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fec_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 1edcfaee6819..3222359ac15b 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1835,6 +1835,8 @@ fec_enet_rx_queue(struct net_device *ndev, u16 queue_id, int budget) ndev->stats.rx_packets++; pkt_len = fec16_to_cpu(bdp->cbd_datlen); ndev->stats.rx_bytes += pkt_len; + if (fep->quirks & FEC_QUIRK_HAS_RACC) + ndev->stats.rx_bytes -= 2; index = fec_enet_get_bd_index(bdp, &rxq->bd); page = rxq->rx_skb_info[index].page; From 96a9178a29a6b84bb632ebeb4e84cf61191c73d5 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Thu, 6 Nov 2025 10:06:37 +0100 Subject: [PATCH 179/543] net: phy: micrel: lan8814 fix reset of the QSGMII interface The lan8814 is a quad-phy and it is using QSGMII towards the MAC. The problem is that every time one of the ports is configured, the PCS is reset for all the PHYs, meaning that the other ports can lose traffic until the link is established again. To fix this, do the reset one time for the entire PHY package.
Fixes: ece19502834d ("net: phy: micrel: 1588 support for LAN8814 phy") Signed-off-by: Horatiu Vultur Reviewed-by: Andrew Lunn Reviewed-by: Divya Koppera Link: https://patch.msgid.link/20251106090637.2030625-1-horatiu.vultur@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 6a1a424e3b30..01c87c9b7702 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -4380,12 +4380,6 @@ static int lan8814_config_init(struct phy_device *phydev) { struct kszphy_priv *lan8814 = phydev->priv; - /* Reset the PHY */ - lanphy_modify_page_reg(phydev, LAN8814_PAGE_COMMON_REGS, - LAN8814_QSGMII_SOFT_RESET, - LAN8814_QSGMII_SOFT_RESET_BIT, - LAN8814_QSGMII_SOFT_RESET_BIT); - /* Disable ANEG with QSGMII PCS Host side */ lanphy_modify_page_reg(phydev, LAN8814_PAGE_PORT_REGS, LAN8814_QSGMII_PCS1G_ANEG_CONFIG, @@ -4471,6 +4465,12 @@ static int lan8814_probe(struct phy_device *phydev) addr, sizeof(struct lan8814_shared_priv)); if (phy_package_init_once(phydev)) { + /* Reset the PHY */ + lanphy_modify_page_reg(phydev, LAN8814_PAGE_COMMON_REGS, + LAN8814_QSGMII_SOFT_RESET, + LAN8814_QSGMII_SOFT_RESET_BIT, + LAN8814_QSGMII_SOFT_RESET_BIT); + err = lan8814_release_coma_mode(phydev); if (err) return err; From 3f9eacf4f0705876a5d6526d7d320ca91d7d7a16 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Oct 2025 12:27:05 +0000 Subject: [PATCH 180/543] KVM: arm64: Make all 32bit ID registers fully writable 32bit ID registers aren't getting much love these days, and are often missed in updates. One of these updates broke restoring a GICv2 guest on a GICv3 machine. Instead of performing a piecemeal fix, just bite the bullet and make all 32bit ID regs fully writable. KVM itself never relies on them for anything, and if the VMM wants to mess up the guest, so be it. 
Fixes: 5cb57a1aff755 ("KVM: arm64: Zero ID_AA64PFR0_EL1.GIC when no GICv3 is presented to the guest") Reported-by: Peter Maydell Cc: stable@vger.kernel.org Reviewed-by: Oliver Upton Link: https://patch.msgid.link/20251030122707.2033690-2-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 59 ++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e67eb39ddc11..ad82264c6cbe 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2595,19 +2595,23 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, .val = 0, \ } -/* sys_reg_desc initialiser for known cpufeature ID registers */ -#define AA32_ID_SANITISED(name) { \ - ID_DESC(name), \ - .visibility = aa32_id_visibility, \ - .val = 0, \ -} - /* sys_reg_desc initialiser for writable ID registers */ #define ID_WRITABLE(name, mask) { \ ID_DESC(name), \ .val = mask, \ } +/* + * 32bit ID regs are fully writable when the guest is 32bit + * capable. Nothing in the KVM code should rely on 32bit features + * anyway, only 64bit, so let the VMM do its worse. 
+ */ +#define AA32_ID_WRITABLE(name) { \ + ID_DESC(name), \ + .visibility = aa32_id_visibility, \ + .val = GENMASK(31, 0), \ +} + /* sys_reg_desc initialiser for cpufeature ID registers that need filtering */ #define ID_FILTERED(sysreg, name, mask) { \ ID_DESC(sysreg), \ @@ -3128,40 +3132,39 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* AArch64 mappings of the AArch32 ID registers */ /* CRm=1 */ - AA32_ID_SANITISED(ID_PFR0_EL1), - AA32_ID_SANITISED(ID_PFR1_EL1), + AA32_ID_WRITABLE(ID_PFR0_EL1), + AA32_ID_WRITABLE(ID_PFR1_EL1), { SYS_DESC(SYS_ID_DFR0_EL1), .access = access_id_reg, .get_user = get_id_reg, .set_user = set_id_dfr0_el1, .visibility = aa32_id_visibility, .reset = read_sanitised_id_dfr0_el1, - .val = ID_DFR0_EL1_PerfMon_MASK | - ID_DFR0_EL1_CopDbg_MASK, }, + .val = GENMASK(31, 0) }, ID_HIDDEN(ID_AFR0_EL1), - AA32_ID_SANITISED(ID_MMFR0_EL1), - AA32_ID_SANITISED(ID_MMFR1_EL1), - AA32_ID_SANITISED(ID_MMFR2_EL1), - AA32_ID_SANITISED(ID_MMFR3_EL1), + AA32_ID_WRITABLE(ID_MMFR0_EL1), + AA32_ID_WRITABLE(ID_MMFR1_EL1), + AA32_ID_WRITABLE(ID_MMFR2_EL1), + AA32_ID_WRITABLE(ID_MMFR3_EL1), /* CRm=2 */ - AA32_ID_SANITISED(ID_ISAR0_EL1), - AA32_ID_SANITISED(ID_ISAR1_EL1), - AA32_ID_SANITISED(ID_ISAR2_EL1), - AA32_ID_SANITISED(ID_ISAR3_EL1), - AA32_ID_SANITISED(ID_ISAR4_EL1), - AA32_ID_SANITISED(ID_ISAR5_EL1), - AA32_ID_SANITISED(ID_MMFR4_EL1), - AA32_ID_SANITISED(ID_ISAR6_EL1), + AA32_ID_WRITABLE(ID_ISAR0_EL1), + AA32_ID_WRITABLE(ID_ISAR1_EL1), + AA32_ID_WRITABLE(ID_ISAR2_EL1), + AA32_ID_WRITABLE(ID_ISAR3_EL1), + AA32_ID_WRITABLE(ID_ISAR4_EL1), + AA32_ID_WRITABLE(ID_ISAR5_EL1), + AA32_ID_WRITABLE(ID_MMFR4_EL1), + AA32_ID_WRITABLE(ID_ISAR6_EL1), /* CRm=3 */ - AA32_ID_SANITISED(MVFR0_EL1), - AA32_ID_SANITISED(MVFR1_EL1), - AA32_ID_SANITISED(MVFR2_EL1), + AA32_ID_WRITABLE(MVFR0_EL1), + AA32_ID_WRITABLE(MVFR1_EL1), + AA32_ID_WRITABLE(MVFR2_EL1), ID_UNALLOCATED(3,3), - AA32_ID_SANITISED(ID_PFR2_EL1), + AA32_ID_WRITABLE(ID_PFR2_EL1), ID_HIDDEN(ID_DFR1_EL1), - 
AA32_ID_SANITISED(ID_MMFR5_EL1), + AA32_ID_WRITABLE(ID_MMFR5_EL1), ID_UNALLOCATED(3,7), /* AArch64 ID registers */ From 8a9866ff860052efc5f9766f3f87fae30c983156 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Oct 2025 12:27:06 +0000 Subject: [PATCH 181/543] KVM: arm64: Set ID_{AA64PFR0,PFR1}_EL1.GIC when GICv3 is configured Drive the idreg fields indicating the presence of GICv3 directly from the vgic code. This avoids having to do any sort of runtime clearing of the idreg. Fixes: 5cb57a1aff755 ("KVM: arm64: Zero ID_AA64PFR0_EL1.GIC when no GICv3 is presented to the guest") Reviewed-by: Oliver Upton Link: https://patch.msgid.link/20251030122707.2033690-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-init.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 1796b1a22a72..ca411cce4140 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -71,6 +71,7 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type); int kvm_vgic_create(struct kvm *kvm, u32 type) { struct kvm_vcpu *vcpu; + u64 aa64pfr0, pfr1; unsigned long i; int ret; @@ -161,10 +162,19 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; - if (type == KVM_DEV_TYPE_ARM_VGIC_V2) + aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; + pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; + + if (type == KVM_DEV_TYPE_ARM_VGIC_V2) { kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; - else + } else { INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions); + aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP); + pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC, GICv3); + } + + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0); + kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1); if (type == KVM_DEV_TYPE_ARM_VGIC_V3) kvm->arch.vgic.nassgicap = 
system_supports_direct_sgis(); From 50e7cce81b9b2fbd6f0104c1698959d45ce3cf58 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Oct 2025 12:27:07 +0000 Subject: [PATCH 182/543] KVM: arm64: Limit clearing of ID_{AA64PFR0,PFR1}_EL1.GIC to userspace irqchip Now that the idreg's GIC field is in sync with the irqchip, limit the runtime clearing of these fields to the pathological case where we do not have an in-kernel GIC. While we're at it, use the existing API instead of open-coded accessors to access the ID regs. Fixes: 5cb57a1aff755 ("KVM: arm64: Zero ID_AA64PFR0_EL1.GIC when no GICv3 is presented to the guest") Reviewed-by: Oliver Upton Link: https://patch.msgid.link/20251030122707.2033690-4-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index ad82264c6cbe..8ae2bca81614 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -5609,11 +5609,13 @@ int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu) guard(mutex)(&kvm->arch.config_lock); - if (!(static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) && - irqchip_in_kernel(kvm) && - kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)) { - kvm->arch.id_regs[IDREG_IDX(SYS_ID_AA64PFR0_EL1)] &= ~ID_AA64PFR0_EL1_GIC_MASK; - kvm->arch.id_regs[IDREG_IDX(SYS_ID_PFR1_EL1)] &= ~ID_PFR1_EL1_GIC_MASK; + if (!irqchip_in_kernel(kvm)) { + u64 val; + + val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val); + val = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; + kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, val); } if (vcpu_has_nv(vcpu)) { From 75360a9a338580990c1ee188d40a838c025bbd30 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 7 Nov 2025 10:48:46 -0800 Subject: [PATCH 183/543] KVM: arm64: vgic-v3: Reinstate IRQ lock ordering for LPI xarray Zenghui reports that 
running a KVM guest with an assigned device and lockdep enabled produces an unfriendly splat due to an inconsistent irq context when taking the lpi_xa's spinlock. This is no good as in rare cases the last reference to an LPI can get dropped after injection of a cached LPI translation. In this case, vgic_put_irq() will release the IRQ struct and take the lpi_xa's spinlock to erase it from the xarray. Reinstate the IRQ ordering and update the lockdep hint accordingly. Note that there is no irqsave equivalent of might_lock(), so just explicitly grab and release the spinlock on lockdep kernels. Reported-by: Zenghui Yu Closes: https://lore.kernel.org/kvmarm/b4d7cb0f-f007-0b81-46d1-998b15cc14bc@huawei.com/ Fixes: 982f31bbb5b0 ("KVM: arm64: vgic-v3: Don't require IRQs be disabled for LPI xarray lock") Signed-off-by: Oliver Upton Link: https://patch.msgid.link/20251107184847.1784820-2-oupton@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-debug.c | 16 ++++++++++++---- arch/arm64/kvm/vgic/vgic-init.c | 2 +- arch/arm64/kvm/vgic/vgic-its.c | 7 ++++--- arch/arm64/kvm/vgic/vgic.c | 23 +++++++++++++++-------- 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index 4c1209261b65..bb92853d1fd3 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -64,29 +64,37 @@ static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter) static int iter_mark_lpis(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long intid, flags; struct vgic_irq *irq; - unsigned long intid; int nr_lpis = 0; + xa_lock_irqsave(&dist->lpi_xa, flags); + xa_for_each(&dist->lpi_xa, intid, irq) { if (!vgic_try_get_irq_ref(irq)) continue; - xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); + __xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); nr_lpis++; } + xa_unlock_irqrestore(&dist->lpi_xa, flags); + return nr_lpis; } static void
iter_unmark_lpis(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long intid, flags; struct vgic_irq *irq; - unsigned long intid; xa_for_each_marked(&dist->lpi_xa, intid, irq, LPI_XA_MARK_DEBUG_ITER) { - xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); + xa_lock_irqsave(&dist->lpi_xa, flags); + __xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); + xa_unlock_irqrestore(&dist->lpi_xa, flags); + + /* vgic_put_irq() expects to be called outside of the xa_lock */ vgic_put_irq(kvm, irq); } } diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index ca411cce4140..da62edbc1205 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -53,7 +53,7 @@ void kvm_vgic_early_init(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - xa_init(&dist->lpi_xa); + xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ); } /* CREATION */ diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index ce3e3ed3f29f..f162206adb48 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -78,6 +78,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, { struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq; + unsigned long flags; int ret; /* In this case there is no put, since we keep the reference. 
*/ @@ -88,7 +89,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, if (!irq) return ERR_PTR(-ENOMEM); - ret = xa_reserve(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT); + ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT); if (ret) { kfree(irq); return ERR_PTR(ret); @@ -103,7 +104,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, irq->target_vcpu = vcpu; irq->group = 1; - xa_lock(&dist->lpi_xa); + xa_lock_irqsave(&dist->lpi_xa, flags); /* * There could be a race with another vgic_add_lpi(), so we need to @@ -125,7 +126,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, } out_unlock: - xa_unlock(&dist->lpi_xa); + xa_unlock_irqrestore(&dist->lpi_xa, flags); if (ret) return ERR_PTR(ret); diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 6dd5a10081e2..8d20c53faef0 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -28,7 +28,7 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { * kvm->arch.config_lock (mutex) * its->cmd_lock (mutex) * its->its_lock (mutex) - * vgic_dist->lpi_xa.xa_lock + * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled * vgic_cpu->ap_list_lock must be taken with IRQs disabled * vgic_irq->irq_lock must be taken with IRQs disabled * @@ -141,32 +141,39 @@ static __must_check bool vgic_put_irq_norelease(struct kvm *kvm, struct vgic_irq void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) { struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long flags; - if (irq->intid >= VGIC_MIN_LPI) - might_lock(&dist->lpi_xa.xa_lock); + /* + * Normally the lock is only taken when the refcount drops to 0. + * Acquire/release it early on lockdep kernels to make locking issues + * in rare release paths a bit more obvious. 
+ */ + if (IS_ENABLED(CONFIG_LOCKDEP) && irq->intid >= VGIC_MIN_LPI) { + guard(spinlock_irqsave)(&dist->lpi_xa.xa_lock); + } if (!__vgic_put_irq(kvm, irq)) return; - xa_lock(&dist->lpi_xa); + xa_lock_irqsave(&dist->lpi_xa, flags); vgic_release_lpi_locked(dist, irq); - xa_unlock(&dist->lpi_xa); + xa_unlock_irqrestore(&dist->lpi_xa, flags); } static void vgic_release_deleted_lpis(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - unsigned long intid; + unsigned long flags, intid; struct vgic_irq *irq; - xa_lock(&dist->lpi_xa); + xa_lock_irqsave(&dist->lpi_xa, flags); xa_for_each(&dist->lpi_xa, intid, irq) { if (irq->pending_release) vgic_release_lpi_locked(dist, irq); } - xa_unlock(&dist->lpi_xa); + xa_unlock_irqrestore(&dist->lpi_xa, flags); } void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) From 66768669f27d98b45b20ed401cca913c387a9934 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 7 Nov 2025 10:48:47 -0800 Subject: [PATCH 184/543] KVM: arm64: vgic-v3: Release reserved slot outside of lpi_xa's lock xa_release() expects to be called outside of the xa_lock. Fix vgic_add_lpi() to drop the lock before calling and restructure to get rid of the goto label. Reported-by: Zenghui Yu Closes: https://lore.kernel.org/kvmarm/d0853e82-7d95-5025-7abf-c6f1e0cdf7b5@huawei.com/ Fixes: 481c9ee846d2 ("KVM: arm64: vgic-its: Get rid of the lpi_list_lock") Signed-off-by: Oliver Upton Link: https://patch.msgid.link/20251107184847.1784820-3-oupton@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-its.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index f162206adb48..3f1c4b10fed9 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -115,21 +115,18 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, /* Someone was faster with adding this LPI, lets use that. 
*/ kfree(irq); irq = oldirq; - - goto out_unlock; + } else { + ret = xa_err(__xa_store(&dist->lpi_xa, intid, irq, 0)); } - ret = xa_err(__xa_store(&dist->lpi_xa, intid, irq, 0)); + xa_unlock_irqrestore(&dist->lpi_xa, flags); + if (ret) { xa_release(&dist->lpi_xa, intid); kfree(irq); - } -out_unlock: - xa_unlock_irqrestore(&dist->lpi_xa, flags); - - if (ret) return ERR_PTR(ret); + } /* * We "cache" the configuration table entries in our struct vgic_irq's. From 4af235bf645516481a82227d82d1352b9788903a Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 6 Nov 2025 17:28:25 -0800 Subject: [PATCH 185/543] MAINTAINERS: Switch myself to using kernel.org address I've been running into issues with the linux.dev email semi-periodically, switching to my kernel.org address while I go figure out a better home for my inbox. Signed-off-by: Oliver Upton Link: https://patch.msgid.link/20251107012830.1708225-1-oupton@kernel.org Signed-off-by: Marc Zyngier --- .mailmap | 3 ++- MAINTAINERS | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index d2edd256b19d..54dde6226079 100644 --- a/.mailmap +++ b/.mailmap @@ -603,7 +603,8 @@ Oleksij Rempel Oleksij Rempel Oliver Hartkopp Oliver Hartkopp -Oliver Upton +Oliver Upton +Oliver Upton Ondřej Jirman Oza Pawandeep Pali Rohár diff --git a/MAINTAINERS b/MAINTAINERS index 46126ce2f968..234b50c2c10b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13642,7 +13642,7 @@ F: virt/kvm/* KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64) M: Marc Zyngier -M: Oliver Upton +M: Oliver Upton R: Joey Gouly R: Suzuki K Poulose R: Zenghui Yu From e6965188f84a7883e6a0d3448e86b0cf29b24dfc Mon Sep 17 00:00:00 2001 From: Hamza Mahfooz Date: Wed, 5 Nov 2025 11:25:46 -0800 Subject: [PATCH 186/543] scsi: target: tcm_loop: Fix segfault in tcm_loop_tpg_address_show() If the allocation of tl_hba->sh fails in tcm_loop_driver_probe() and we attempt to dereference it in tcm_loop_tpg_address_show() we will get a segfault, see below for an 
example. So, check tl_hba->sh before dereferencing it. Unable to allocate struct scsi_host BUG: kernel NULL pointer dereference, address: 0000000000000194 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 PID: 8356 Comm: tokio-runtime-w Not tainted 6.6.104.2-4.azl3 #1 Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 09/28/2024 RIP: 0010:tcm_loop_tpg_address_show+0x2e/0x50 [tcm_loop] ... Call Trace: configfs_read_iter+0x12d/0x1d0 [configfs] vfs_read+0x1b5/0x300 ksys_read+0x6f/0xf0 ... Cc: stable@vger.kernel.org Fixes: 2628b352c3d4 ("tcm_loop: Show address of tpg in configfs") Signed-off-by: Hamza Mahfooz Reviewed-by: Chaitanya Kulkarni Reviewed-by: Allen Pais Link: https://patch.msgid.link/1762370746-6304-1-git-send-email-hamzamahfooz@linux.microsoft.com Signed-off-by: Martin K. Petersen --- drivers/target/loopback/tcm_loop.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c index c7b7da629741..01a8e349dc4d 100644 --- a/drivers/target/loopback/tcm_loop.c +++ b/drivers/target/loopback/tcm_loop.c @@ -894,6 +894,9 @@ static ssize_t tcm_loop_tpg_address_show(struct config_item *item, struct tcm_loop_tpg, tl_se_tpg); struct tcm_loop_hba *tl_hba = tl_tpg->tl_hba; + if (!tl_hba->sh) + return -ENODEV; + return snprintf(page, PAGE_SIZE, "%d:0:%d\n", tl_hba->sh->host_no, tl_tpg->tl_tpgt); } From dc55b3c3f61246e483e50c85d8d5366f9567e188 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Sat, 8 Nov 2025 00:45:19 +0000 Subject: [PATCH 187/543] KVM: SVM: Mark VMCB_LBR dirty when MSR_IA32_DEBUGCTLMSR is updated The APM lists the DbgCtlMsr field as being tracked by the VMCB_LBR clean bit. Always clear the bit when MSR_IA32_DEBUGCTLMSR is updated. 
The history is complicated, it was correctly cleared for L1 before commit 1d5a1b5860ed ("KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running"). At that point svm_set_msr() started to rely on svm_update_lbrv() to clear the bit, but when nested virtualization is enabled the latter does not always clear it even if MSR_IA32_DEBUGCTLMSR changed. Go back to clearing it directly in svm_set_msr(). Fixes: 1d5a1b5860ed ("KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running") Reported-by: Matteo Rizzo Reported-by: evn@google.com Co-developed-by: Jim Mattson Signed-off-by: Jim Mattson Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251108004524.1600006-2-yosry.ahmed@linux.dev Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 76055c0ba177..39538098002b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3004,7 +3004,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (data & DEBUGCTL_RESERVED_BITS) return 1; + if (svm_get_lbr_vmcb(svm)->save.dbgctl == data) + break; + svm_get_lbr_vmcb(svm)->save.dbgctl = data; + vmcb_mark_dirty(svm->vmcb, VMCB_LBR); svm_update_lbrv(vcpu); break; case MSR_VM_HSAVE_PA: From fbe5e5f030c22ae717ee422aaab0e00ea84fab5e Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Sat, 8 Nov 2025 00:45:20 +0000 Subject: [PATCH 188/543] KVM: nSVM: Always recalculate LBR MSR intercepts in svm_update_lbrv() svm_update_lbrv() is called when MSR_IA32_DEBUGCTLMSR is updated, and on nested transitions where LBRV is used. It checks whether LBRV enablement needs to be changed in the current VMCB, and if it does, it also recalculates intercepts to LBR MSRs. However, there are cases where intercepts need to be updated even when LBRV enablement doesn't. Example scenario: - L1 has MSR_IA32_DEBUGCTLMSR cleared. - L1 runs L2 without LBR_CTL_ENABLE (no LBRV).
- L2 sets DEBUGCTLMSR_LBR in MSR_IA32_DEBUGCTLMSR, svm_update_lbrv() sets LBR_CTL_ENABLE in VMCB02 and disables intercepts to LBR MSRs. - L2 exits to L1, svm_update_lbrv() is not called on this transition. - L1 clears MSR_IA32_DEBUGCTLMSR, svm_update_lbrv() finds that LBR_CTL_ENABLE is already cleared in VMCB01 and does nothing. - Intercepts remain disabled, L1 reads to LBR MSRs read the host MSRs. Fix it by always recalculating intercepts in svm_update_lbrv(). Fixes: 1d5a1b5860ed ("KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running") Cc: stable@vger.kernel.org Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251108004524.1600006-3-yosry.ahmed@linux.dev Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 39538098002b..53201f13a43c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -806,25 +806,29 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) vmcb_mark_dirty(to_vmcb, VMCB_LBR); } -void svm_enable_lbrv(struct kvm_vcpu *vcpu) +static void __svm_enable_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; - svm_recalc_lbr_msr_intercepts(vcpu); /* Move the LBR msrs to the vmcb02 so that the guest can see them. 
*/ if (is_guest_mode(vcpu)) svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); } -static void svm_disable_lbrv(struct kvm_vcpu *vcpu) +void svm_enable_lbrv(struct kvm_vcpu *vcpu) +{ + __svm_enable_lbrv(vcpu); + svm_recalc_lbr_msr_intercepts(vcpu); +} + +static void __svm_disable_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; - svm_recalc_lbr_msr_intercepts(vcpu); /* * Move the LBR msrs back to the vmcb01 to avoid copying them @@ -853,13 +857,18 @@ void svm_update_lbrv(struct kvm_vcpu *vcpu) (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); - if (enable_lbrv == current_enable_lbrv) - return; + if (enable_lbrv && !current_enable_lbrv) + __svm_enable_lbrv(vcpu); + else if (!enable_lbrv && current_enable_lbrv) + __svm_disable_lbrv(vcpu); - if (enable_lbrv) - svm_enable_lbrv(vcpu); - else - svm_disable_lbrv(vcpu); + /* + * During nested transitions, it is possible that the current VMCB has + * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa). + * In this case, even though LBR_CTL does not need an update, intercepts + * do, so always recalculate the intercepts here. + */ + svm_recalc_lbr_msr_intercepts(vcpu); } void disable_nmi_singlestep(struct vcpu_svm *svm) From 8a4821412cf2c1429fffa07c012dd150f2edf78c Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Sat, 8 Nov 2025 00:45:21 +0000 Subject: [PATCH 189/543] KVM: nSVM: Fix and simplify LBR virtualization handling with nested The current scheme for handling LBRV when nested is used is very complicated, especially when L1 does not enable LBRV (i.e. does not set LBR_CTL_ENABLE_MASK). To avoid copying LBRs between VMCB01 and VMCB02 on every nested transition, the current implementation switches between using VMCB01 or VMCB02 as the source of truth for the LBRs while L2 is running. 
If L2 enables LBR, VMCB02 is used as the source of truth. When L2 disables LBR, the LBRs are copied to VMCB01 and VMCB01 is used as the source of truth. This introduces significant complexity, and incorrect behavior in some cases. For example, on a nested #VMEXIT, the LBRs are only copied from VMCB02 to VMCB01 if LBRV is enabled in VMCB01. This is because L2's writes to MSR_IA32_DEBUGCTLMSR to enable LBR are intercepted and propagated to VMCB01 instead of VMCB02. However, LBRV is only enabled in VMCB02 when L2 is running. This means that if L2 enables LBR and exits to L1, the LBRs will not be propagated from VMCB02 to VMCB01, because LBRV is disabled in VMCB01. There is no meaningful difference in CPUID rate in L2 when copying LBRs on every nested transition vs. the current approach, so do the simple and correct thing and always copy LBRs between VMCB01 and VMCB02 on nested transitions (when LBRV is disabled by L1). Drop the conditional LBRs copying in __svm_{enable/disable}_lbrv() as it is now unnecessary. VMCB02 becomes the only source of truth for LBRs when L2 is running, regardless of LBRV being enabled by L1, drop svm_get_lbr_vmcb() and use svm->vmcb directly in its place. 
Fixes: 1d5a1b5860ed ("KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running") Cc: stable@vger.kernel.org Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251108004524.1600006-4-yosry.ahmed@linux.dev Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 20 ++++++----------- arch/x86/kvm/svm/svm.c | 46 +++++++++------------------------------ 2 files changed, 17 insertions(+), 49 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index a6443feab252..da6e80b3ac35 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -677,11 +677,10 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 */ svm_copy_lbrs(vmcb02, vmcb12); vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS; - svm_update_lbrv(&svm->vcpu); - - } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { + } else { svm_copy_lbrs(vmcb02, vmcb01); } + svm_update_lbrv(&svm->vcpu); } static inline bool is_evtinj_soft(u32 evtinj) @@ -833,11 +832,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, svm->soft_int_next_rip = vmcb12_rip; } - vmcb02->control.virt_ext = vmcb01->control.virt_ext & - LBR_CTL_ENABLE_MASK; - if (guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV)) - vmcb02->control.virt_ext |= - (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK); + /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */ if (!nested_vmcb_needs_vls_intercept(svm)) vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; @@ -1189,13 +1184,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm) kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && - (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) svm_copy_lbrs(vmcb12, vmcb02); - svm_update_lbrv(vcpu); - } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { + else svm_copy_lbrs(vmcb01, vmcb02); - svm_update_lbrv(vcpu); - } + + 
svm_update_lbrv(vcpu); if (vnmi) { if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 53201f13a43c..10c21e4c5406 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -808,13 +808,7 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) static void __svm_enable_lbrv(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; - - /* Move the LBR msrs to the vmcb02 so that the guest can see them. */ - if (is_guest_mode(vcpu)) - svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); + to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; } void svm_enable_lbrv(struct kvm_vcpu *vcpu) @@ -825,35 +819,15 @@ void svm_enable_lbrv(struct kvm_vcpu *vcpu) static void __svm_disable_lbrv(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); - svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; - - /* - * Move the LBR msrs back to the vmcb01 to avoid copying them - * on nested guest entries. - */ - if (is_guest_mode(vcpu)) - svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb); -} - -static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm) -{ - /* - * If LBR virtualization is disabled, the LBR MSRs are always kept in - * vmcb01. If LBR virtualization is enabled and L1 is running VMs of - * its own, the MSRs are moved between vmcb01 and vmcb02 as needed. - */ - return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? 
svm->vmcb : - svm->vmcb01.ptr; + to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; } void svm_update_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; - bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) || + bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) || (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); @@ -2733,19 +2707,19 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = svm->tsc_aux; break; case MSR_IA32_DEBUGCTLMSR: - msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl; + msr_info->data = svm->vmcb->save.dbgctl; break; case MSR_IA32_LASTBRANCHFROMIP: - msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from; + msr_info->data = svm->vmcb->save.br_from; break; case MSR_IA32_LASTBRANCHTOIP: - msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to; + msr_info->data = svm->vmcb->save.br_to; break; case MSR_IA32_LASTINTFROMIP: - msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from; + msr_info->data = svm->vmcb->save.last_excp_from; break; case MSR_IA32_LASTINTTOIP: - msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to; + msr_info->data = svm->vmcb->save.last_excp_to; break; case MSR_VM_HSAVE_PA: msr_info->data = svm->nested.hsave_msr; @@ -3013,10 +2987,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (data & DEBUGCTL_RESERVED_BITS) return 1; - if (svm_get_lbr_vmcb(svm)->save.dbgctl == data) + if (svm->vmcb->save.dbgctl == data) break; - svm_get_lbr_vmcb(svm)->save.dbgctl = data; + svm->vmcb->save.dbgctl = data; vmcb_mark_dirty(svm->vmcb, VMCB_LBR); svm_update_lbrv(vcpu); break; From 7a39c723b7472b8aaa2e0a67d2b6c7cf1c45cafb Mon Sep 17 00:00:00 2001 From: Baojun Xu Date: Sat, 8 Nov 2025 22:23:25 +0800 Subject: [PATCH 190/543] ALSA: hda/tas2781: Add new quirk for HP new projects 
Add new vendor_id and subsystem_id in quirk for HP new projects. Signed-off-by: Baojun Xu Link: https://patch.msgid.link/20251108142325.2563-1-baojun.xu@ti.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 4aec5067c59d..a9698bf26887 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -6694,6 +6694,15 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8e60, "HP Trekker ", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8e61, "HP Trekker ", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8e62, "HP Trekker ", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x103c, 0x8ed5, "HP Merino13X", ALC245_FIXUP_TAS2781_SPI_2), + SND_PCI_QUIRK(0x103c, 0x8ed6, "HP Merino13", ALC245_FIXUP_TAS2781_SPI_2), + SND_PCI_QUIRK(0x103c, 0x8ed7, "HP Merino14", ALC245_FIXUP_TAS2781_SPI_2), + SND_PCI_QUIRK(0x103c, 0x8ed8, "HP Merino16", ALC245_FIXUP_TAS2781_SPI_2), + SND_PCI_QUIRK(0x103c, 0x8ed9, "HP Merino14W", ALC245_FIXUP_TAS2781_SPI_2), + SND_PCI_QUIRK(0x103c, 0x8eda, "HP Merino16W", ALC245_FIXUP_TAS2781_SPI_2), + SND_PCI_QUIRK(0x103c, 0x8f40, "HP Lampas14", ALC287_FIXUP_TAS2781_I2C), + SND_PCI_QUIRK(0x103c, 0x8f41, "HP Lampas16", ALC287_FIXUP_TAS2781_I2C), + SND_PCI_QUIRK(0x103c, 0x8f42, "HP LampasW14", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x1043, 0x1032, "ASUS VivoBook X513EA", ALC256_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1034, "ASUS GU605C", ALC285_FIXUP_ASUS_GU605_SPI_SPEAKER2_TO_DAC1), SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), From 9b07cdf86a0b90556f5b68a6b20b35833b558df3 Mon Sep 17 00:00:00 2001 From: Haotian Zhang Date: Tue, 28 Oct 2025 11:05:09 +0800 Subject: [PATCH 191/543] pinctrl: cirrus: Fix fwnode leak in cs42l43_pin_probe() The driver calls fwnode_get_named_child_node() which takes a reference on the child 
node, but never releases it, which causes a reference leak. Fix by using devm_add_action_or_reset() to automatically release the reference when the device is removed. Fixes: d5282a539297 ("pinctrl: cs42l43: Add support for the cs42l43") Suggested-by: Charles Keepax Signed-off-by: Haotian Zhang Signed-off-by: Linus Walleij --- drivers/pinctrl/cirrus/pinctrl-cs42l43.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/pinctrl/cirrus/pinctrl-cs42l43.c b/drivers/pinctrl/cirrus/pinctrl-cs42l43.c index 68abb6d6cecd..a8f82104a384 100644 --- a/drivers/pinctrl/cirrus/pinctrl-cs42l43.c +++ b/drivers/pinctrl/cirrus/pinctrl-cs42l43.c @@ -532,6 +532,11 @@ static int cs42l43_gpio_add_pin_ranges(struct gpio_chip *chip) return ret; } +static void cs42l43_fwnode_put(void *data) +{ + fwnode_handle_put(data); +} + static int cs42l43_pin_probe(struct platform_device *pdev) { struct cs42l43 *cs42l43 = dev_get_drvdata(pdev->dev.parent); @@ -563,10 +568,20 @@ static int cs42l43_pin_probe(struct platform_device *pdev) priv->gpio_chip.ngpio = CS42L43_NUM_GPIOS; if (is_of_node(fwnode)) { - fwnode = fwnode_get_named_child_node(fwnode, "pinctrl"); + struct fwnode_handle *child; - if (fwnode && !fwnode->dev) - fwnode->dev = priv->dev; + child = fwnode_get_named_child_node(fwnode, "pinctrl"); + if (child) { + ret = devm_add_action_or_reset(&pdev->dev, + cs42l43_fwnode_put, child); + if (ret) { + fwnode_handle_put(child); + return ret; + } + if (!child->dev) + child->dev = priv->dev; + fwnode = child; + } } priv->gpio_chip.fwnode = fwnode; From 79280191c2fd7f24899bbd640003b5389d3c109c Mon Sep 17 00:00:00 2001 From: Henrique Carvalho Date: Fri, 7 Nov 2025 18:59:53 -0300 Subject: [PATCH 192/543] smb: client: fix cifs_pick_channel when channel needs reconnect cifs_pick_channel iterates candidate channels using cur. The reconnect-state test mistakenly used a different variable. 
This checked the wrong slot and would cause us to skip a healthy channel and to dispatch on one that needs reconnect, occasionally failing operations when a channel was down. Fix by replacing for the correct variable. Fixes: fc43a8ac396d ("cifs: cifs_pick_channel should try selecting active channels") Cc: stable@vger.kernel.org Reviewed-by: Shyam Prasad N Signed-off-by: Henrique Carvalho Signed-off-by: Steve French --- fs/smb/client/transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 051cd9dbba13..915cedde5d66 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -830,7 +830,7 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) if (!server || server->terminate) continue; - if (CIFS_CHAN_NEEDS_RECONNECT(ses, i)) + if (CIFS_CHAN_NEEDS_RECONNECT(ses, cur)) continue; /* From e8c73eb7db0a498cd4b22d2819e6ab1a6f506bd6 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Fri, 7 Nov 2025 22:01:39 +0800 Subject: [PATCH 193/543] cifs: client: fix memory leak in smb3_fs_context_parse_param The user calls fsconfig twice, but when the program exits, free() only frees ctx->source for the second fsconfig, not the first. Regarding fc->source, there is no code in the fs context related to its memory reclamation. To fix this memory leak, release the source memory corresponding to ctx or fc before each parsing. 
syzbot reported: BUG: memory leak unreferenced object 0xffff888128afa360 (size 96): backtrace (crc 79c9c7ba): kstrdup+0x3c/0x80 mm/util.c:84 smb3_fs_context_parse_param+0x229b/0x36c0 fs/smb/client/fs_context.c:1444 BUG: memory leak unreferenced object 0xffff888112c7d900 (size 96): backtrace (crc 79c9c7ba): smb3_fs_context_fullpath+0x70/0x1b0 fs/smb/client/fs_context.c:629 smb3_fs_context_parse_param+0x2266/0x36c0 fs/smb/client/fs_context.c:1438 Reported-by: syzbot+72afd4c236e6bc3f4bac@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=72afd4c236e6bc3f4bac Cc: stable@vger.kernel.org Reviewed-by: Paulo Alcantara (Red Hat) Signed-off-by: Edward Adam Davis Signed-off-by: Steve French --- fs/smb/client/fs_context.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index e60927b2a7c8..c2d5bb23040c 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -1435,12 +1435,14 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, cifs_errorf(fc, "Unknown error parsing devname\n"); goto cifs_parse_mount_err; } + kfree(ctx->source); ctx->source = smb3_fs_context_fullpath(ctx, '/'); if (IS_ERR(ctx->source)) { ctx->source = NULL; cifs_errorf(fc, "OOM when copying UNC string\n"); goto cifs_parse_mount_err; } + kfree(fc->source); fc->source = kstrdup(ctx->source, GFP_KERNEL); if (fc->source == NULL) { cifs_errorf(fc, "OOM when copying UNC string\n"); From e904d81ad1c04394e1cda4610de799a006cc141c Mon Sep 17 00:00:00 2001 From: Joshua Rogers Date: Fri, 7 Nov 2025 00:15:37 +0800 Subject: [PATCH 194/543] smb: server: rdma: avoid unmapping posted recv on accept failure smb_direct_prepare_negotiation() posts a recv and then, if smb_direct_accept_client() fails, calls put_recvmsg() on the same buffer. That unmaps and recycles a buffer that is still posted on the QP., which can lead to device DMA into unmapped or reused memory. 
Track whether the recv was posted and only return it if it was never posted. If accept fails after a post, leave it for teardown to drain and complete safely. Signed-off-by: Joshua Rogers Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 5d3b48e77012..3d8d8cb456c1 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -1883,6 +1883,7 @@ static int smb_direct_accept_client(struct smbdirect_socket *sc) static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc) { struct smbdirect_recv_io *recvmsg; + bool recv_posted = false; int ret; WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED); @@ -1899,6 +1900,7 @@ static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc) pr_err("Can't post recv: %d\n", ret); goto out_err; } + recv_posted = true; ret = smb_direct_accept_client(sc); if (ret) { @@ -1908,7 +1910,14 @@ static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc) return 0; out_err: - put_recvmsg(sc, recvmsg); + /* + * If the recv was never posted, return it to the free list. + * If it was posted, leave it alone so disconnect teardown can + * drain the QP and complete it (flush) and the completion path + * will unmap it exactly once. + */ + if (!recv_posted) + put_recvmsg(sc, recvmsg); return ret; } From 98a5fd31cbf72d46bf18e50b3ab0ce86d5f319a9 Mon Sep 17 00:00:00 2001 From: Joshua Rogers Date: Sat, 8 Nov 2025 22:59:23 +0800 Subject: [PATCH 195/543] ksmbd: close accepted socket when per-IP limit rejects connection When the per-IP connection limit is exceeded in ksmbd_kthread_fn(), the code sets ret = -EAGAIN and continues the accept loop without closing the just-accepted socket. That leaks one socket per rejected attempt from a single IP and enables a trivial remote DoS. Release client_sk before continuing. 
This bug was found with ZeroPath. Cc: stable@vger.kernel.org Signed-off-by: Joshua Rogers Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_tcp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index 7a1e3dcc2cde..d2e391c29464 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -290,8 +290,11 @@ static int ksmbd_kthread_fn(void *p) } } up_read(&conn_list_lock); - if (ret == -EAGAIN) + if (ret == -EAGAIN) { + /* Per-IP limit hit: release the just-accepted socket. */ + sock_release(client_sk); continue; + } skip_max_ip_conns_limit: if (server_conf.max_connections && From fe4b3a34e9a9654d98d274218dac0270779db0ae Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Sun, 9 Nov 2025 16:01:50 +0800 Subject: [PATCH 196/543] rust: Add -fno-isolate-erroneous-paths-dereference to bindgen_skip_c_flags It's used to work around an objtool issue since commit abb2a5572264 ("LoongArch: Add cflag -fno-isolate-erroneous-paths-dereference"), but it's then passed to bindgen and cause an error because Clang does not have this option. Fixes: abb2a5572264 ("LoongArch: Add cflag -fno-isolate-erroneous-paths-dereference") Acked-by: Miguel Ojeda Tested-by: Mingcong Bai Signed-off-by: Xi Ruoyao Signed-off-by: Huacai Chen --- rust/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/Makefile b/rust/Makefile index 3e545c1a0ff4..7842ad0a4ea7 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -298,7 +298,7 @@ bindgen_skip_c_flags := -mno-fp-ret-in-387 -mpreferred-stack-boundary=% \ -fno-inline-functions-called-once -fsanitize=bounds-strict \ -fstrict-flex-arrays=% -fmin-function-alignment=% \ -fzero-init-padding-bits=% -mno-fdpic \ - --param=% --param asan-% + --param=% --param asan-% -fno-isolate-erroneous-paths-dereference # Derived from `scripts/Makefile.clang`. 
BINDGEN_TARGET_x86 := x86_64-linux-gnu From f28abb9f96e65a28d46885afd6b70cfc4d5df5a2 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sun, 9 Nov 2025 16:02:00 +0800 Subject: [PATCH 197/543] LoongArch: Clarify 3 MSG interrupt features LoongArch's MSG interrupt features are used across multiple subsystems. Clarify these features to avoid misuse, existing users will be adjusted if necessary. MSGINT: Infrastructure, means the CPU core supports message interrupts. Indicated by CPUCFG1.MSGINT. AVECINT: AVEC interrupt controller based on MSGINT, means the CPU chip supports direct message interrupts. Indicated by IOCSR.FEATURES.DMSI. REDIRECTINT: REDIRECT interrupt controller based on MSGINT and AVECINT, means the CPU chip supports redirect message interrupts. Indicated by IOCSR.FEATURES.RMSI. For example: Loongson-3A5000/3C5000 doesn't support MSGINT/AVECINT/REDIRECTINT; Loongson-3A6000 supports MSGINT but doesn't support AVECINT/REDIRECTINT; Loongson-3C6000 supports MSGINT/AVECINT/REDIRECTINT.
Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/cpu-features.h | 2 ++ arch/loongarch/include/asm/cpu.h | 6 +++++- arch/loongarch/include/asm/loongarch.h | 1 + arch/loongarch/kernel/cpu-probe.c | 4 ++++ 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/include/asm/cpu-features.h b/arch/loongarch/include/asm/cpu-features.h index fc83bb32f9f0..bd5f0457ad21 100644 --- a/arch/loongarch/include/asm/cpu-features.h +++ b/arch/loongarch/include/asm/cpu-features.h @@ -67,6 +67,8 @@ #define cpu_has_hypervisor cpu_opt(LOONGARCH_CPU_HYPERVISOR) #define cpu_has_ptw cpu_opt(LOONGARCH_CPU_PTW) #define cpu_has_lspw cpu_opt(LOONGARCH_CPU_LSPW) +#define cpu_has_msgint cpu_opt(LOONGARCH_CPU_MSGINT) #define cpu_has_avecint cpu_opt(LOONGARCH_CPU_AVECINT) +#define cpu_has_redirectint cpu_opt(LOONGARCH_CPU_REDIRECTINT) #endif /* __ASM_CPU_FEATURES_H */ diff --git a/arch/loongarch/include/asm/cpu.h b/arch/loongarch/include/asm/cpu.h index dfb982fe8701..d4cd4041bee7 100644 --- a/arch/loongarch/include/asm/cpu.h +++ b/arch/loongarch/include/asm/cpu.h @@ -101,7 +101,9 @@ enum cpu_type_enum { #define CPU_FEATURE_HYPERVISOR 26 /* CPU has hypervisor (running in VM) */ #define CPU_FEATURE_PTW 27 /* CPU has hardware page table walker */ #define CPU_FEATURE_LSPW 28 /* CPU has LSPW (lddir/ldpte instructions) */ -#define CPU_FEATURE_AVECINT 29 /* CPU has AVEC interrupt */ +#define CPU_FEATURE_MSGINT 29 /* CPU has MSG interrupt */ +#define CPU_FEATURE_AVECINT 30 /* CPU has AVEC interrupt */ +#define CPU_FEATURE_REDIRECTINT 31 /* CPU has interrupt remapping */ #define LOONGARCH_CPU_CPUCFG BIT_ULL(CPU_FEATURE_CPUCFG) #define LOONGARCH_CPU_LAM BIT_ULL(CPU_FEATURE_LAM) @@ -132,6 +134,8 @@ enum cpu_type_enum { #define LOONGARCH_CPU_HYPERVISOR BIT_ULL(CPU_FEATURE_HYPERVISOR) #define LOONGARCH_CPU_PTW BIT_ULL(CPU_FEATURE_PTW) #define LOONGARCH_CPU_LSPW BIT_ULL(CPU_FEATURE_LSPW) +#define LOONGARCH_CPU_MSGINT BIT_ULL(CPU_FEATURE_MSGINT) #define LOONGARCH_CPU_AVECINT 
BIT_ULL(CPU_FEATURE_AVECINT) +#define LOONGARCH_CPU_REDIRECTINT BIT_ULL(CPU_FEATURE_REDIRECTINT) #endif /* _ASM_CPU_H */ diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 09dfd7eb406e..5b36fa57015f 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -1137,6 +1137,7 @@ #define IOCSRF_FLATMODE BIT_ULL(10) #define IOCSRF_VM BIT_ULL(11) #define IOCSRF_AVEC BIT_ULL(15) +#define IOCSRF_REDIRECT BIT_ULL(16) #define LOONGARCH_IOCSR_VENDOR 0x10 diff --git a/arch/loongarch/kernel/cpu-probe.c b/arch/loongarch/kernel/cpu-probe.c index cbfce2872d71..6f943d1391ff 100644 --- a/arch/loongarch/kernel/cpu-probe.c +++ b/arch/loongarch/kernel/cpu-probe.c @@ -157,6 +157,8 @@ static void cpu_probe_common(struct cpuinfo_loongarch *c) c->options |= LOONGARCH_CPU_TLB; if (config & CPUCFG1_IOCSR) c->options |= LOONGARCH_CPU_IOCSR; + if (config & CPUCFG1_MSGINT) + c->options |= LOONGARCH_CPU_MSGINT; if (config & CPUCFG1_UAL) { c->options |= LOONGARCH_CPU_UAL; elf_hwcap |= HWCAP_LOONGARCH_UAL; @@ -331,6 +333,8 @@ static inline void cpu_probe_loongson(struct cpuinfo_loongarch *c, unsigned int c->options |= LOONGARCH_CPU_EIODECODE; if (config & IOCSRF_AVEC) c->options |= LOONGARCH_CPU_AVECINT; + if (config & IOCSRF_REDIRECT) + c->options |= LOONGARCH_CPU_REDIRECTINT; if (config & IOCSRF_VM) c->options |= LOONGARCH_CPU_HYPERVISOR; } From 4e67526840fc55917581b90f6a4b65849a616dd8 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sun, 9 Nov 2025 16:02:00 +0800 Subject: [PATCH 198/543] LoongArch: Use physical addresses for CSR_MERRENTRY/CSR_TLBRENTRY Now we use virtual addresses to fill CSR_MERRENTRY/CSR_TLBRENTRY, but hardware hope physical addresses. Now it works well because the high bits are ignored above PA_BITS (48 bits), but explicitly use physical addresses can avoid potential bugs. So fix it. 
Cc: stable@vger.kernel.org Signed-off-by: Huacai Chen --- arch/loongarch/kernel/traps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index 3d9be6ca7ec5..da5926fead4a 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -1131,8 +1131,8 @@ static void configure_exception_vector(void) tlbrentry = (unsigned long)exception_handlers + 80*VECSIZE; csr_write64(eentry, LOONGARCH_CSR_EENTRY); - csr_write64(eentry, LOONGARCH_CSR_MERRENTRY); - csr_write64(tlbrentry, LOONGARCH_CSR_TLBRENTRY); + csr_write64(__pa(eentry), LOONGARCH_CSR_MERRENTRY); + csr_write64(__pa(tlbrentry), LOONGARCH_CSR_TLBRENTRY); } void per_cpu_trap_init(int cpu) From 43a9e6a10bdde32445ad2725f568e08a94e51dc9 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sun, 9 Nov 2025 16:02:00 +0800 Subject: [PATCH 199/543] LoongArch: Consolidate early_ioremap()/ioremap_prot() 1. Use phys_addr_t instead of u64, which can work for both 32/64 bits. 2. Check whether the input physical address is above TO_PHYS_MASK (and return NULL if so) for the DMW version. Note: In theory early_ioremap() also needs the TO_PHYS_MASK check, but the UEFI BIOS passes some DMW virtual addresses.
Cc: stable@vger.kernel.org Signed-off-by: Jiaxun Yang Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/io.h | 5 ++++- arch/loongarch/mm/ioremap.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/include/asm/io.h b/arch/loongarch/include/asm/io.h index eaff72b38dc8..0130185e0349 100644 --- a/arch/loongarch/include/asm/io.h +++ b/arch/loongarch/include/asm/io.h @@ -14,7 +14,7 @@ #include #include -extern void __init __iomem *early_ioremap(u64 phys_addr, unsigned long size); +extern void __init __iomem *early_ioremap(phys_addr_t phys_addr, unsigned long size); extern void __init early_iounmap(void __iomem *addr, unsigned long size); #define early_memremap early_ioremap @@ -25,6 +25,9 @@ extern void __init early_iounmap(void __iomem *addr, unsigned long size); static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size, pgprot_t prot) { + if (offset > TO_PHYS_MASK) + return NULL; + switch (pgprot_val(prot) & _CACHE_MASK) { case _CACHE_CC: return (void __iomem *)(unsigned long)(CACHE_BASE + offset); diff --git a/arch/loongarch/mm/ioremap.c b/arch/loongarch/mm/ioremap.c index df949a3d0f34..27c336959fe8 100644 --- a/arch/loongarch/mm/ioremap.c +++ b/arch/loongarch/mm/ioremap.c @@ -6,7 +6,7 @@ #include #include -void __init __iomem *early_ioremap(u64 phys_addr, unsigned long size) +void __init __iomem *early_ioremap(phys_addr_t phys_addr, unsigned long size) { return ((void __iomem *)TO_CACHE(phys_addr)); } From ce5ad03e459ecb3b4993a8f311fd4f2fb3e6ef81 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sun, 9 Nov 2025 16:02:01 +0800 Subject: [PATCH 200/543] LoongArch: Consolidate max_pfn & max_low_pfn calculation Now there are 5 places which calculate max_pfn & max_low_pfn: 1. in fdt_setup() for FDT systems; 2. in memblock_init() for ACPI systems; 3. in init_numa_memory() for NUMA systems; 4. in arch_mem_init() to recalculate for "mem=" cmdline; 5. in paging_init() to recalculate for NUMA systems.
Since memblock_init() is called both for ACPI and FDT systems, move the calculation out of the for_each_efi_memory_desc() loop can eliminate the first case. The last case is very questionable (may be derived from the MIPS/Loongson code) and breaks the "mem=" cmdline, so should be removed. And then the NUMA version of paging_init() can be also eliminated. After consolidation there are 3 places of calculation: 1. in memblock_init() for both ACPI and FDT systems; 2. in init_numa_memory() to recalculate for NUMA systems; 3. in arch_mem_init() to recalculate for the "mem=" cmdline. For all cases the calculation is: max_pfn = PFN_DOWN(memblock_end_of_DRAM()); max_low_pfn = min(PFN_DOWN(HIGHMEM_START), max_pfn); Cc: stable@vger.kernel.org Signed-off-by: Huacai Chen --- arch/loongarch/kernel/mem.c | 7 +++---- arch/loongarch/kernel/numa.c | 23 ++--------------------- arch/loongarch/kernel/setup.c | 5 ++--- arch/loongarch/mm/init.c | 2 -- 4 files changed, 7 insertions(+), 30 deletions(-) diff --git a/arch/loongarch/kernel/mem.c b/arch/loongarch/kernel/mem.c index aed901c57fb4..8ab1ffedc52c 100644 --- a/arch/loongarch/kernel/mem.c +++ b/arch/loongarch/kernel/mem.c @@ -13,7 +13,7 @@ void __init memblock_init(void) { u32 mem_type; - u64 mem_start, mem_end, mem_size; + u64 mem_start, mem_size; efi_memory_desc_t *md; /* Parse memory information */ @@ -21,7 +21,6 @@ void __init memblock_init(void) mem_type = md->type; mem_start = md->phys_addr; mem_size = md->num_pages << EFI_PAGE_SHIFT; - mem_end = mem_start + mem_size; switch (mem_type) { case EFI_LOADER_CODE: @@ -31,8 +30,6 @@ void __init memblock_init(void) case EFI_PERSISTENT_MEMORY: case EFI_CONVENTIONAL_MEMORY: memblock_add(mem_start, mem_size); - if (max_low_pfn < (mem_end >> PAGE_SHIFT)) - max_low_pfn = mem_end >> PAGE_SHIFT; break; case EFI_PAL_CODE: case EFI_UNUSABLE_MEMORY: @@ -49,6 +46,8 @@ void __init memblock_init(void) } } + max_pfn = PFN_DOWN(memblock_end_of_DRAM()); + max_low_pfn = min(PFN_DOWN(HIGHMEM_START), 
max_pfn); memblock_set_current_limit(PFN_PHYS(max_low_pfn)); /* Reserve the first 2MB */ diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c index d6e73e8f9c0b..ab9c660526a3 100644 --- a/arch/loongarch/kernel/numa.c +++ b/arch/loongarch/kernel/numa.c @@ -272,7 +272,8 @@ int __init init_numa_memory(void) node_mem_init(node); node_set_online(node); } - max_low_pfn = PHYS_PFN(memblock_end_of_DRAM()); + max_pfn = PFN_DOWN(memblock_end_of_DRAM()); + max_low_pfn = min(PFN_DOWN(HIGHMEM_START), max_pfn); setup_nr_node_ids(); loongson_sysconf.nr_nodes = nr_node_ids; @@ -283,26 +284,6 @@ int __init init_numa_memory(void) #endif -void __init paging_init(void) -{ - unsigned int node; - unsigned long zones_size[MAX_NR_ZONES] = {0, }; - - for_each_online_node(node) { - unsigned long start_pfn, end_pfn; - - get_pfn_range_for_nid(node, &start_pfn, &end_pfn); - - if (end_pfn > max_low_pfn) - max_low_pfn = end_pfn; - } -#ifdef CONFIG_ZONE_DMA32 - zones_size[ZONE_DMA32] = MAX_DMA32_PFN; -#endif - zones_size[ZONE_NORMAL] = max_low_pfn; - free_area_init(zones_size); -} - int pcibus_to_node(struct pci_bus *bus) { return dev_to_node(&bus->dev); diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 69c17d162fff..25a87378e48e 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -294,8 +294,6 @@ static void __init fdt_setup(void) early_init_dt_scan(fdt_pointer, __pa(fdt_pointer)); early_init_fdt_reserve_self(); - - max_low_pfn = PFN_PHYS(memblock_end_of_DRAM()); #endif } @@ -390,7 +388,8 @@ static void __init check_kernel_sections_mem(void) static void __init arch_mem_init(char **cmdline_p) { /* Recalculate max_low_pfn for "mem=xxx" */ - max_pfn = max_low_pfn = PHYS_PFN(memblock_end_of_DRAM()); + max_pfn = PFN_DOWN(memblock_end_of_DRAM()); + max_low_pfn = min(PFN_DOWN(HIGHMEM_START), max_pfn); if (usermem) pr_info("User-defined physical RAM map overwrite\n"); diff --git a/arch/loongarch/mm/init.c 
b/arch/loongarch/mm/init.c index c3e4586a7975..6bfd4b8dad1b 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -60,7 +60,6 @@ int __ref page_is_ram(unsigned long pfn) return memblock_is_memory(addr) && !memblock_is_reserved(addr); } -#ifndef CONFIG_NUMA void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; @@ -72,7 +71,6 @@ void __init paging_init(void) free_area_init(max_zone_pfns); } -#endif /* !CONFIG_NUMA */ void __ref free_initmem(void) { From a073d637c8cfbfbab39b7272226a3fbf3b887580 Mon Sep 17 00:00:00 2001 From: Tianyang Zhang Date: Sun, 9 Nov 2025 16:02:01 +0800 Subject: [PATCH 201/543] LoongArch: Let {pte,pmd}_modify() record the status of _PAGE_DIRTY Now if the PTE/PMD is dirty with _PAGE_DIRTY but without _PAGE_MODIFIED, after {pte,pmd}_modify() we lose _PAGE_DIRTY, then {pte,pmd}_dirty() return false and lead to data loss. This can happen in certain scenarios such as HW PTW doesn't set _PAGE_MODIFIED automatically, so here we need _PAGE_MODIFIED to record the dirty status (_PAGE_DIRTY). The new modification involves checking whether the original PTE/PMD has the _PAGE_DIRTY flag. If it exists, the _PAGE_MODIFIED bit is also set, ensuring that the {pte,pmd}_dirty() interface can always return accurate information. 
Cc: stable@vger.kernel.org Co-developed-by: Liupu Wang Signed-off-by: Liupu Wang Signed-off-by: Tianyang Zhang --- arch/loongarch/include/asm/pgtable.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index bd128696e96d..03fb60432fde 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -424,6 +424,9 @@ static inline unsigned long pte_accessible(struct mm_struct *mm, pte_t a) static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { + if (pte_val(pte) & _PAGE_DIRTY) + pte_val(pte) |= _PAGE_MODIFIED; + return __pte((pte_val(pte) & _PAGE_CHG_MASK) | (pgprot_val(newprot) & ~_PAGE_CHG_MASK)); } @@ -547,9 +550,11 @@ static inline struct page *pmd_page(pmd_t pmd) static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { - pmd_val(pmd) = (pmd_val(pmd) & _HPAGE_CHG_MASK) | - (pgprot_val(newprot) & ~_HPAGE_CHG_MASK); - return pmd; + if (pmd_val(pmd) & _PAGE_DIRTY) + pmd_val(pmd) |= _PAGE_MODIFIED; + + return __pmd((pmd_val(pmd) & _HPAGE_CHG_MASK) | + (pgprot_val(newprot) & ~_HPAGE_CHG_MASK)); } static inline pmd_t pmd_mkinvalid(pmd_t pmd) From 17f838512ae50203ae2e3ce9b9f2689cc67beaa3 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Sun, 9 Nov 2025 16:02:01 +0800 Subject: [PATCH 202/543] LoongArch: Remove __GFP_HIGHMEM masking in pud_alloc_one() Remove the unnecessary __GFP_HIGHMEM masking in pud_alloc_one(), which was introduced with commit 382739797f79ec2 ("loongarch: convert various functions to use ptdescs"). GFP_KERNEL doesn't contain __GFP_HIGHMEM. 
Signed-off-by: Vishal Moola (Oracle) Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/pgalloc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h index 1c63a9d9a6d3..08dcc698ec18 100644 --- a/arch/loongarch/include/asm/pgalloc.h +++ b/arch/loongarch/include/asm/pgalloc.h @@ -88,7 +88,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) { pud_t *pud; - struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0); if (!ptdesc) return NULL; From 4c8a7c9827726f6e987b7a04af8ef58f1c7fe8d3 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Sun, 9 Nov 2025 16:02:01 +0800 Subject: [PATCH 203/543] LoongArch: Refine the init_hw_perf_events() function (1) Use the existing CPUCFG6_PMNUM_SHIFT macro definition instead of the magic value 4 to get the PMU number. (2) Detect the value of PMU bits via CPUCFG instruction according to the ISA manual instead of hard-coded as 64, because the value may be different for various micro-architectures. 
Link: https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#_cpucfg Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/loongarch.h | 1 + arch/loongarch/kernel/perf_event.c | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 5b36fa57015f..3de03cb864b2 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -128,6 +128,7 @@ #define CPUCFG6_PMNUM GENMASK(7, 4) #define CPUCFG6_PMNUM_SHIFT 4 #define CPUCFG6_PMBITS GENMASK(13, 8) +#define CPUCFG6_PMBITS_SHIFT 8 #define CPUCFG6_UPM BIT(14) #define LOONGARCH_CPUCFG16 0x10 diff --git a/arch/loongarch/kernel/perf_event.c b/arch/loongarch/kernel/perf_event.c index 8ad098703488..9d257c8519c9 100644 --- a/arch/loongarch/kernel/perf_event.c +++ b/arch/loongarch/kernel/perf_event.c @@ -845,13 +845,14 @@ static const struct loongarch_perf_event *loongarch_pmu_map_raw_event(u64 config static int __init init_hw_perf_events(void) { - int counters; + int bits, counters; if (!cpu_has_pmp) return -ENODEV; pr_info("Performance counters: "); - counters = ((read_cpucfg(LOONGARCH_CPUCFG6) & CPUCFG6_PMNUM) >> 4) + 1; + bits = ((read_cpucfg(LOONGARCH_CPUCFG6) & CPUCFG6_PMBITS) >> CPUCFG6_PMBITS_SHIFT) + 1; + counters = ((read_cpucfg(LOONGARCH_CPUCFG6) & CPUCFG6_PMNUM) >> CPUCFG6_PMNUM_SHIFT) + 1; loongarch_pmu.num_counters = counters; loongarch_pmu.max_period = (1ULL << 63) - 1; @@ -867,7 +868,7 @@ static int __init init_hw_perf_events(void) on_each_cpu(reset_counters, NULL, 1); pr_cont("%s PMU enabled, %d %d-bit counters available to each CPU.\n", - loongarch_pmu.name, counters, 64); + loongarch_pmu.name, counters, bits); perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); From eeeeaafa62ea0cd4b86390f657dc0aea73bff4f5 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sun, 9 Nov 2025 16:02:01 +0800 Subject: [PATCH 204/543] LoongArch: Use correct 
accessor to read FWPC/MWPC CSR.FWPC and CSR.MWPC are 32bit registers, so use csr_read32() rather than csr_read64() to read the values of FWPC/MWPC. Cc: stable@vger.kernel.org Fixes: edffa33c7bb5a73 ("LoongArch: Add hardware breakpoints/watchpoints support") Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/hw_breakpoint.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/include/asm/hw_breakpoint.h b/arch/loongarch/include/asm/hw_breakpoint.h index 13b2462f3d8c..5faa97a87a9e 100644 --- a/arch/loongarch/include/asm/hw_breakpoint.h +++ b/arch/loongarch/include/asm/hw_breakpoint.h @@ -134,13 +134,13 @@ static inline void hw_breakpoint_thread_switch(struct task_struct *next) /* Determine number of BRP registers available. */ static inline int get_num_brps(void) { - return csr_read64(LOONGARCH_CSR_FWPC) & CSR_FWPC_NUM; + return csr_read32(LOONGARCH_CSR_FWPC) & CSR_FWPC_NUM; } /* Determine number of WRP registers available. */ static inline int get_num_wrps(void) { - return csr_read64(LOONGARCH_CSR_MWPC) & CSR_MWPC_NUM; + return csr_read32(LOONGARCH_CSR_MWPC) & CSR_MWPC_NUM; } #endif /* __KERNEL__ */ From df16b8956cae970027f4be4a1500272201e2d5c1 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Sun, 9 Nov 2025 16:02:01 +0800 Subject: [PATCH 205/543] LoongArch: kexec: Initialize the kexec_buf structure The kexec_buf structure was previously declared without initialization. commit bf454ec31add ("kexec_file: allow to place kexec_buf randomly") added a field that is always read but not consistently populated by all architectures. This un-initialized field will contain garbage. 
This is also triggering a UBSAN warning when the uninitialized data is accessed: ------------[ cut here ]------------ UBSAN: invalid-load in ./include/linux/kexec.h:210:10 load of value 252 is not a valid value for type '_Bool' Zero-initializing kexec_buf at declaration ensures all fields are cleanly set, preventing future instances of uninitialized memory being used. Fixes: bf454ec31add ("kexec_file: allow to place kexec_buf randomly") Link: https://lore.kernel.org/r/20250827-kbuf_all-v1-2-1df9882bb01a@debian.org Signed-off-by: Youling Tang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/kexec_efi.c | 2 +- arch/loongarch/kernel/kexec_elf.c | 2 +- arch/loongarch/kernel/machine_kexec_file.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/loongarch/kernel/kexec_efi.c b/arch/loongarch/kernel/kexec_efi.c index 45121b914f8f..5ee78ebb1546 100644 --- a/arch/loongarch/kernel/kexec_efi.c +++ b/arch/loongarch/kernel/kexec_efi.c @@ -42,7 +42,7 @@ static void *efi_kexec_load(struct kimage *image, { int ret; unsigned long text_offset, kernel_segment_number; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; struct kexec_segment *kernel_segment; struct loongarch_image_header *h; diff --git a/arch/loongarch/kernel/kexec_elf.c b/arch/loongarch/kernel/kexec_elf.c index 97b2f049801a..1b6b64744c7f 100644 --- a/arch/loongarch/kernel/kexec_elf.c +++ b/arch/loongarch/kernel/kexec_elf.c @@ -59,7 +59,7 @@ static void *elf_kexec_load(struct kimage *image, int ret; unsigned long text_offset, kernel_segment_number; struct elfhdr ehdr; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; struct kexec_elf_info elf_info; struct kexec_segment *kernel_segment; diff --git a/arch/loongarch/kernel/machine_kexec_file.c b/arch/loongarch/kernel/machine_kexec_file.c index dda236b51a88..fb57026f5f25 100644 --- a/arch/loongarch/kernel/machine_kexec_file.c +++ b/arch/loongarch/kernel/machine_kexec_file.c @@ -143,7 +143,7 @@ int load_other_segments(struct kimage *image, 
unsigned long initrd_load_addr = 0; unsigned long orig_segments = image->nr_segments; char *modified_cmdline = NULL; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; kbuf.image = image; /* Don't allocate anything below the kernel */ From 62cda5e54f7c5e773911b458dd4d10ee8c91b60b Mon Sep 17 00:00:00 2001 From: Qiang Ma Date: Sun, 9 Nov 2025 16:02:01 +0800 Subject: [PATCH 206/543] LoongArch: kexec: Print out debugging message if required When specifying '-d' for kexec_file_load interface, loaded locations of kernel/initrd/cmdline etc can be printed out to help debug. Commit eb7622d908a0 ("kexec_file, riscv: print out debugging message if required") fixes the same issue on RISC-V. So, remove kexec_image_info() because the content has been printed out in generic code. And on Loongson-3A5000, the printed messages look like below: kexec_file: kernel: 00000000d9aad283 kernel_size: 0x2e77f30 kexec_file(EFI): No LoongArch PE image header. kexec_file: Loaded initrd at 0x80000000 bufsz=0x1637cd0 memsz=0x1638000 kexec_file(ELF): Loaded kernel at 0x9c20000 bufsz=0x27f1800 memsz=0x2950000 kexec_file: nr_segments = 2 kexec_file: segment[0]: buf=0x00000000cc3e6c33 bufsz=0x27f1800 mem=0x9c20000 memsz=0x2950000 kexec_file: segment[1]: buf=0x00000000bb75a541 bufsz=0x1637cd0 mem=0x80000000 memsz=0x1638000 kexec_file: kexec_file_load: type:0, start:0xb15d000 head:0x18db60002 flags:0x8 Signed-off-by: Qiang Ma Signed-off-by: Huacai Chen --- arch/loongarch/kernel/machine_kexec.c | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/arch/loongarch/kernel/machine_kexec.c b/arch/loongarch/kernel/machine_kexec.c index e4b2bbc47e62..2d64b7c81e5e 100644 --- a/arch/loongarch/kernel/machine_kexec.c +++ b/arch/loongarch/kernel/machine_kexec.c @@ -39,34 +39,12 @@ static unsigned long systable_ptr; static unsigned long start_addr; static unsigned long first_ind_entry; -static void kexec_image_info(const struct kimage *kimage) -{ - unsigned long i; - - pr_debug("kexec kimage 
info:\n"); - pr_debug("\ttype: %d\n", kimage->type); - pr_debug("\tstart: %lx\n", kimage->start); - pr_debug("\thead: %lx\n", kimage->head); - pr_debug("\tnr_segments: %lu\n", kimage->nr_segments); - - for (i = 0; i < kimage->nr_segments; i++) { - pr_debug("\t segment[%lu]: %016lx - %016lx", i, - kimage->segment[i].mem, - kimage->segment[i].mem + kimage->segment[i].memsz); - pr_debug("\t\t0x%lx bytes, %lu pages\n", - (unsigned long)kimage->segment[i].memsz, - (unsigned long)kimage->segment[i].memsz / PAGE_SIZE); - } -} - int machine_kexec_prepare(struct kimage *kimage) { int i; char *bootloader = "kexec"; void *cmdline_ptr = (void *)KEXEC_CMDLINE_ADDR; - kexec_image_info(kimage); - kimage->arch.efi_boot = fw_arg0; kimage->arch.systable_ptr = fw_arg2; From 37e9d1a91382661c2d1f656b54c5d22dfe7a8606 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Sun, 9 Nov 2025 16:02:09 +0800 Subject: [PATCH 207/543] LoongArch: KVM: Set page with write attribute if dirty track disabled With secondary MMU page table, if there is a read page fault, the page's write attribute will not set even if it is writable from master MMU page table. This logic only works if dirty tracking is enabled, so page table should be set with _PAGE_WRITE if dirty tracking is disabled. It reduces extra page fault on secondary MMU page table if a VM finishes migration, when the master MMU page table is ready and the secondary MMU page is fresh. 
Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index 7c8143e79c12..a7fa458e3360 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -857,7 +857,7 @@ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) if (writeable) { prot_bits = kvm_pte_mkwriteable(prot_bits); - if (write) + if (write || !kvm_slot_dirty_track_enabled(memslot)) prot_bits = kvm_pte_mkdirty(prot_bits); } From d3c9515e4f9d10ccb113adb4809db5cc31e7ef65 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Sun, 9 Nov 2025 16:02:09 +0800 Subject: [PATCH 208/543] LoongArch: KVM: Add delay until timer interrupt injected When timer is fired in oneshot mode, CSR.TVAL will stop with value -1 rather than 0. However when the register CSR.TVAL is restored, it will continue to count down rather than stop there. Now the method is to write 0 to CSR.TVAL, wait to count down for 1 cycle at least, which is 10ns with a timer freq 100MHz, and then restore timer interrupt status. Here add a 2-cycle delay to ensure that the timer interrupt is injected. With this patch, the timer selftest case always passes.
Cc: stable@vger.kernel.org Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/timer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/loongarch/kvm/timer.c b/arch/loongarch/kvm/timer.c index 32dc213374be..29c2aaba63c3 100644 --- a/arch/loongarch/kvm/timer.c +++ b/arch/loongarch/kvm/timer.c @@ -4,6 +4,7 @@ */ #include +#include #include #include @@ -95,6 +96,7 @@ void kvm_restore_timer(struct kvm_vcpu *vcpu) * and set CSR TVAL with -1 */ write_gcsr_timertick(0); + __delay(2); /* Wait cycles until timer interrupt injected */ /* * Writing CSR_TINTCLR_TI to LOONGARCH_CSR_TINTCLR will clear From 5001bcf86edf2de02f025a0f789bcac37fa040e6 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Sun, 9 Nov 2025 16:02:09 +0800 Subject: [PATCH 209/543] LoongArch: KVM: Restore guest PMU if it is enabled On LoongArch system, guest PMU hardware is shared by guest and host but PMU interrupt is separated. PMU is pass-through to VM, and there is PMU context switch when exit to host and return to guest. There is an optimization to check whether PMU is enabled by guest. If not, it is not necessary to return to guest. However, if it is enabled, the PMU context for the guest needs to be switched on. Now KVM_REQ_PMU notification is set on vCPU context switch, but it is missing if there is no vCPU context switch while PMU is used by guest VM, so fix it. Cc: Fixes: f4e40ea9f78f ("LoongArch: KVM: Add PMU support for guest") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/vcpu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 30e3b089a596..b3995ff4b17e 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -132,6 +132,9 @@ static void kvm_lose_pmu(struct kvm_vcpu *vcpu) * Clear KVM_LARCH_PMU if the guest is not using PMU CSRs when * exiting the guest, so that the next time trap into the guest. * We don't need to deal with PMU CSRs contexts.
+ * + * Otherwise set the request bit KVM_REQ_PMU to restore guest PMU + * before entering guest VM */ val = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL0); val |= kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL1); @@ -139,6 +142,8 @@ static void kvm_lose_pmu(struct kvm_vcpu *vcpu) val |= kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL3); if (!(val & KVM_PMU_EVENT_ENABLED)) vcpu->arch.aux_inuse &= ~KVM_LARCH_PMU; + else + kvm_make_request(KVM_REQ_PMU, vcpu); kvm_restore_host_pmu(vcpu); } From 11f340ece403e71aa2b643a2562a58ed3ac12e2c Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Sun, 9 Nov 2025 16:02:09 +0800 Subject: [PATCH 210/543] LoongArch: KVM: Skip PMU checking on vCPU context switch PMU hardware about VM is switched on VM exit to host rather than vCPU context sched off, PMU is checked and restored on return to VM. It is not necessary to check PMU on vCPU context sched on callback, since the request is made on the VM exit entry or VM PMU CSR access abort routine already. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/vcpu.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index b3995ff4b17e..1245a6b35896 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -148,12 +148,6 @@ static void kvm_lose_pmu(struct kvm_vcpu *vcpu) kvm_restore_host_pmu(vcpu); } -static void kvm_restore_pmu(struct kvm_vcpu *vcpu) -{ - if ((vcpu->arch.aux_inuse & KVM_LARCH_PMU)) - kvm_make_request(KVM_REQ_PMU, vcpu); -} - static void kvm_check_pmu(struct kvm_vcpu *vcpu) { if (kvm_check_request(KVM_REQ_PMU, vcpu)) { @@ -304,7 +298,10 @@ static int kvm_pre_enter_guest(struct kvm_vcpu *vcpu) vcpu->arch.aux_inuse &= ~KVM_LARCH_SWCSR_LATEST; if (kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending()) { - kvm_lose_pmu(vcpu); + if (vcpu->arch.aux_inuse & KVM_LARCH_PMU) { + kvm_lose_pmu(vcpu); + kvm_make_request(KVM_REQ_PMU, vcpu); + } /* make sure the vcpu mode 
has been written */ smp_store_mb(vcpu->mode, OUTSIDE_GUEST_MODE); local_irq_enable(); @@ -1609,9 +1606,6 @@ static int _kvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_restore_timer(vcpu); kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); - /* Restore hardware PMU CSRs */ - kvm_restore_pmu(vcpu); - /* Don't bother restoring registers multiple times unless necessary */ if (vcpu->arch.aux_inuse & KVM_LARCH_HWCSR_USABLE) return 0; From 237e74bfa261fb0cf75bd08c9be0c5094018ee20 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Sun, 9 Nov 2025 16:02:09 +0800 Subject: [PATCH 211/543] LoongArch: KVM: Fix max supported vCPUs set with EIOINTC VM fails to boot with 256 vCPUs, the detailed command is qemu-system-loongarch64 -smp 256 and there is an error reported as follows: KVM_LOONGARCH_EXTIOI_INIT_NUM_CPU failed: Invalid argument There is typo issue in function kvm_eiointc_ctrl_access() when set max supported vCPUs. Cc: stable@vger.kernel.org Fixes: 47256c4c8b1b ("LoongArch: KVM: Avoid copy_*_user() with lock hold in kvm_eiointc_ctrl_access()") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/eiointc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index c32333695381..a1cc116b4dac 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -439,7 +439,7 @@ static int kvm_eiointc_ctrl_access(struct kvm_device *dev, spin_lock_irqsave(&s->lock, flags); switch (type) { case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU: - if (val >= EIOINTC_ROUTE_MAX_VCPUS) + if (val > EIOINTC_ROUTE_MAX_VCPUS) ret = -EINVAL; else s->num_cpu = val; From 77008e1b2ef73249bceb078a321a3ff6bc087afb Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 16 Oct 2025 21:36:30 -0400 Subject: [PATCH 212/543] mm/huge_memory: do not change split_huge_page*() target order silently Page cache folios from a file system that support large block size (LBS) can have minimal folio 
order greater than 0, thus a high order folio might not be able to be split down to order-0. Commit e220917fa507 ("mm: split a folio in minimum folio order chunks") bumps the target order of split_huge_page*() to the minimum allowed order when splitting a LBS folio. This causes confusion for some split_huge_page*() callers like memory failure handling code, since they expect after-split folios all have order-0 when split succeeds but in reality get min_order_for_split() order folios and give warnings. Fix it by failing a split if the folio cannot be split to the target order. Rename try_folio_split() to try_folio_split_to_order() to reflect the added new_order parameter. Remove its unused list parameter. [The test poisons LBS folios, which cannot be split to order-0 folios, and also tries to poison all memory. The non split LBS folios take more memory than the test anticipated, leading to OOM. The patch fixed the kernel warning and the test needs some change to avoid OOM.] Link: https://lkml.kernel.org/r/20251017013630.139907-1-ziy@nvidia.com Fixes: e220917fa507 ("mm: split a folio in minimum folio order chunks") Signed-off-by: Zi Yan Reported-by: syzbot+e6367ea2fdab6ed46056@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/68d2c943.a70a0220.1b52b.02b3.GAE@google.com/ Reviewed-by: Luis Chamberlain Reviewed-by: Pankaj Raghav Reviewed-by: Wei Yang Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Miaohe Lin Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Jane Chu Cc: Lance Yang Cc: Liam Howlett Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Cc: Ryan Roberts Cc: Christian Brauner Cc: Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 55 +++++++++++++++++------------------------ mm/huge_memory.c | 9 +------ mm/truncate.c | 6 +++-- 3 files changed, 28 insertions(+), 42 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f327d62fc985..71ac78b9f834 100644 --- 
a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -376,45 +376,30 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, int folio_split(struct folio *folio, unsigned int new_order, struct page *page, struct list_head *list); /* - * try_folio_split - try to split a @folio at @page using non uniform split. + * try_folio_split_to_order - try to split a @folio at @page to @new_order using + * non uniform split. * @folio: folio to be split - * @page: split to order-0 at the given page - * @list: store the after-split folios + * @page: split to @new_order at the given page + * @new_order: the target split order * - * Try to split a @folio at @page using non uniform split to order-0, if - * non uniform split is not supported, fall back to uniform split. + * Try to split a @folio at @page using non uniform split to @new_order, if + * non uniform split is not supported, fall back to uniform split. After-split + * folios are put back to LRU list. Use min_order_for_split() to get the lower + * bound of @new_order. * * Return: 0: split is successful, otherwise split failed. 
*/ -static inline int try_folio_split(struct folio *folio, struct page *page, - struct list_head *list) +static inline int try_folio_split_to_order(struct folio *folio, + struct page *page, unsigned int new_order) { - int ret = min_order_for_split(folio); - - if (ret < 0) - return ret; - - if (!non_uniform_split_supported(folio, 0, false)) - return split_huge_page_to_list_to_order(&folio->page, list, - ret); - return folio_split(folio, ret, page, list); + if (!non_uniform_split_supported(folio, new_order, /* warns= */ false)) + return split_huge_page_to_list_to_order(&folio->page, NULL, + new_order); + return folio_split(folio, new_order, page, NULL); } static inline int split_huge_page(struct page *page) { - struct folio *folio = page_folio(page); - int ret = min_order_for_split(folio); - - if (ret < 0) - return ret; - - /* - * split_huge_page() locks the page before splitting and - * expects the same page that has been split to be locked when - * returned. split_folio(page_folio(page)) cannot be used here - * because it converts the page to folio and passes the head - * page to be split. 
- */ - return split_huge_page_to_list_to_order(page, NULL, ret); + return split_huge_page_to_list_to_order(page, NULL, 0); } void deferred_split_folio(struct folio *folio, bool partially_mapped); @@ -597,14 +582,20 @@ static inline int split_huge_page(struct page *page) return -EINVAL; } +static inline int min_order_for_split(struct folio *folio) +{ + VM_WARN_ON_ONCE_FOLIO(1, folio); + return -EINVAL; +} + static inline int split_folio_to_list(struct folio *folio, struct list_head *list) { VM_WARN_ON_ONCE_FOLIO(1, folio); return -EINVAL; } -static inline int try_folio_split(struct folio *folio, struct page *page, - struct list_head *list) +static inline int try_folio_split_to_order(struct folio *folio, + struct page *page, unsigned int new_order) { VM_WARN_ON_ONCE_FOLIO(1, folio); return -EINVAL; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1d1b74950332..feac4aef7dfb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3653,8 +3653,6 @@ static int __folio_split(struct folio *folio, unsigned int new_order, min_order = mapping_min_folio_order(folio->mapping); if (new_order < min_order) { - VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u", - min_order); ret = -EINVAL; goto out; } @@ -3986,12 +3984,7 @@ int min_order_for_split(struct folio *folio) int split_folio_to_list(struct folio *folio, struct list_head *list) { - int ret = min_order_for_split(folio); - - if (ret < 0) - return ret; - - return split_huge_page_to_list_to_order(&folio->page, list, ret); + return split_huge_page_to_list_to_order(&folio->page, list, 0); } /* diff --git a/mm/truncate.c b/mm/truncate.c index 91eb92a5ce4f..9210cf808f5c 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -194,6 +194,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) size_t size = folio_size(folio); unsigned int offset, length; struct page *split_at, *split_at2; + unsigned int min_order; if (pos < start) offset = start - pos; @@ -223,8 +224,9 @@ bool 
truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) if (!folio_test_large(folio)) return true; + min_order = mapping_min_folio_order(folio->mapping); split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE); - if (!try_folio_split(folio, split_at, NULL)) { + if (!try_folio_split_to_order(folio, split_at, min_order)) { /* * try to split at offset + length to make sure folios within * the range can be dropped, especially to avoid memory waste @@ -254,7 +256,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) */ if (folio_test_large(folio2) && folio2->mapping == folio->mapping) - try_folio_split(folio2, split_at2, NULL); + try_folio_split_to_order(folio2, split_at2, min_order); folio_unlock(folio2); out: From e38f65d317df1fd2dcafe614d9c537475ecf9992 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Mon, 20 Oct 2025 20:08:50 -0400 Subject: [PATCH 213/543] kho: warn and fail on metadata or preserved memory in scratch area Patch series "KHO: kfence + KHO memory corruption fix", v3. This series fixes a memory corruption bug in KHO that occurs when KFENCE is enabled. The root cause is that KHO metadata, allocated via kzalloc(), can be randomly serviced by kfence_alloc(). When a kernel boots via KHO, the early memblock allocator is restricted to a "scratch area". This forces the KFENCE pool to be allocated within this scratch area, creating a conflict. If KHO metadata is subsequently placed in this pool, it gets corrupted during the next kexec operation. Google is using KHO and have had obscure crashes due to this memory corruption, with stacks all over the place. I would prefer this fix to be properly backported to stable so we can also automatically consume it once we switch to the upstream KHO. Patch 1/3 introduces a debug-only feature (CONFIG_KEXEC_HANDOVER_DEBUG) that adds checks to detect and fail any operation that attempts to place KHO metadata or preserved memory within the scratch area. 
This serves as a validation and diagnostic tool to confirm the problem without affecting production builds. Patch 2/3 increases the metadata bitmap size to PAGE_SIZE, so that the buddy allocator can be used. Patch 3/3 provides the fix by modifying KHO to allocate its metadata directly from the buddy allocator instead of slab. This bypasses the KFENCE interception entirely. This patch (of 3): It is invalid for KHO metadata or preserved memory regions to be located within the KHO scratch area, as this area is overwritten when the next kernel is loaded, and is used early in boot by the next kernel. This can lead to memory corruption. Add checks to kho_preserve_* and KHO's internal metadata allocators (xa_load_or_alloc, new_chunk) to verify that the physical address of the memory does not overlap with any defined scratch region. If an overlap is detected, the operation fails and a WARN_ON is triggered. To avoid performance overhead in production kernels, these checks are enabled only when CONFIG_KEXEC_HANDOVER_DEBUG is selected.
[rppt@kernel.org: fix KEXEC_HANDOVER_DEBUG Kconfig dependency] Link: https://lkml.kernel.org/r/aQHUyyFtiNZhx8jo@kernel.org [pasha.tatashin@soleen.com: build fix] Link: https://lkml.kernel.org/r/CA+CK2bBnorfsTymKtv4rKvqGBHs=y=MjEMMRg_tE-RME6n-zUw@mail.gmail.com Link: https://lkml.kernel.org/r/20251021000852.2924827-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20251021000852.2924827-2-pasha.tatashin@soleen.com Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") Signed-off-by: Pasha Tatashin Signed-off-by: Mike Rapoport Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Christian Brauner Cc: David Matlack Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Miguel Ojeda Cc: Randy Dunlap Cc: Samiullah Khawaja Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton --- kernel/Kconfig.kexec | 9 +++++ kernel/Makefile | 1 + kernel/kexec_handover.c | 57 +++++++++++++++++++++----------- kernel/kexec_handover_debug.c | 25 ++++++++++++++ kernel/kexec_handover_internal.h | 20 +++++++++++ 5 files changed, 93 insertions(+), 19 deletions(-) create mode 100644 kernel/kexec_handover_debug.c create mode 100644 kernel/kexec_handover_internal.h diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 422270d64820..54e581072617 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -109,6 +109,15 @@ config KEXEC_HANDOVER to keep data or state alive across the kexec. For this to work, both source and target kernels need to have this option enabled. +config KEXEC_HANDOVER_DEBUG + bool "Enable Kexec Handover debug checks" + depends on KEXEC_HANDOVER + help + This option enables extra sanity checks for the Kexec Handover + subsystem. Since, KHO performance is crucial in live update + scenarios and the extra code might be adding overhead it is + only optionally enabled. 
+ config CRASH_DUMP bool "kernel crash dumps" default ARCH_DEFAULT_CRASH_DUMP diff --git a/kernel/Makefile b/kernel/Makefile index df3dd8291bb6..9fe722305c9b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -83,6 +83,7 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 76f0940fb485..0bc9001e532a 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "KHO: " fmt +#include #include #include #include @@ -22,6 +23,7 @@ #include +#include "kexec_handover_internal.h" /* * KHO is tightly coupled with mm init and needs access to some of mm * internal APIs. @@ -133,26 +135,26 @@ static struct kho_out kho_out = { static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) { - void *elm, *res; + void *res = xa_load(xa, index); - elm = xa_load(xa, index); - if (elm) - return elm; + if (res) + return res; + + void *elm __free(kfree) = kzalloc(sz, GFP_KERNEL); - elm = kzalloc(sz, GFP_KERNEL); if (!elm) return ERR_PTR(-ENOMEM); + if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), sz))) + return ERR_PTR(-EINVAL); + res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); if (xa_is_err(res)) - res = ERR_PTR(xa_err(res)); - - if (res) { - kfree(elm); + return ERR_PTR(xa_err(res)); + else if (res) return res; - } - return elm; + return no_free_ptr(elm); } static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, @@ -345,15 +347,19 @@ static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, unsigned long order) { - struct khoser_mem_chunk *chunk; + struct 
khoser_mem_chunk *chunk __free(kfree) = NULL; chunk = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!chunk) - return NULL; + return ERR_PTR(-ENOMEM); + + if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE))) + return ERR_PTR(-EINVAL); + chunk->hdr.order = order; if (cur_chunk) KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk); - return chunk; + return no_free_ptr(chunk); } static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) @@ -374,14 +380,17 @@ static int kho_mem_serialize(struct kho_serialization *ser) struct khoser_mem_chunk *chunk = NULL; struct kho_mem_phys *physxa; unsigned long order; + int err = -ENOMEM; xa_for_each(&ser->track.orders, order, physxa) { struct kho_mem_phys_bits *bits; unsigned long phys; chunk = new_chunk(chunk, order); - if (!chunk) + if (IS_ERR(chunk)) { + err = PTR_ERR(chunk); goto err_free; + } if (!first_chunk) first_chunk = chunk; @@ -391,8 +400,10 @@ static int kho_mem_serialize(struct kho_serialization *ser) if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) { chunk = new_chunk(chunk, order); - if (!chunk) + if (IS_ERR(chunk)) { + err = PTR_ERR(chunk); goto err_free; + } } elm = &chunk->bitmaps[chunk->hdr.num_elms]; @@ -409,7 +420,7 @@ static int kho_mem_serialize(struct kho_serialization *ser) err_free: kho_mem_ser_free(first_chunk); - return -ENOMEM; + return err; } static void __init deserialize_bitmap(unsigned int order, @@ -465,8 +476,8 @@ static void __init kho_mem_deserialize(const void *fdt) * area for early allocations that happen before page allocator is * initialized. 
*/ -static struct kho_scratch *kho_scratch; -static unsigned int kho_scratch_cnt; +struct kho_scratch *kho_scratch; +unsigned int kho_scratch_cnt; /* * The scratch areas are scaled by default as percent of memory allocated from @@ -752,6 +763,9 @@ int kho_preserve_folio(struct folio *folio) const unsigned int order = folio_order(folio); struct kho_mem_track *track = &kho_out.ser.track; + if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order))) + return -EINVAL; + return __kho_preserve_order(track, pfn, order); } EXPORT_SYMBOL_GPL(kho_preserve_folio); @@ -775,6 +789,11 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages) unsigned long failed_pfn = 0; int err = 0; + if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT, + nr_pages << PAGE_SHIFT))) { + return -EINVAL; + } + while (pfn < end_pfn) { const unsigned int order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); diff --git a/kernel/kexec_handover_debug.c b/kernel/kexec_handover_debug.c new file mode 100644 index 000000000000..6efb696f5426 --- /dev/null +++ b/kernel/kexec_handover_debug.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover_debug.c - kexec handover optional debug functionality + * Copyright (C) 2025 Google LLC, Pasha Tatashin + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include "kexec_handover_internal.h" + +bool kho_scratch_overlap(phys_addr_t phys, size_t size) +{ + phys_addr_t scratch_start, scratch_end; + unsigned int i; + + for (i = 0; i < kho_scratch_cnt; i++) { + scratch_start = kho_scratch[i].addr; + scratch_end = kho_scratch[i].addr + kho_scratch[i].size; + + if (phys < scratch_end && (phys + size) > scratch_start) + return true; + } + + return false; +} diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h new file mode 100644 index 000000000000..3c3c7148ceed --- /dev/null +++ b/kernel/kexec_handover_internal.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef 
LINUX_KEXEC_HANDOVER_INTERNAL_H +#define LINUX_KEXEC_HANDOVER_INTERNAL_H + +#include +#include + +extern struct kho_scratch *kho_scratch; +extern unsigned int kho_scratch_cnt; + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUG +bool kho_scratch_overlap(phys_addr_t phys, size_t size); +#else +static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size) +{ + return false; +} +#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */ + +#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */ From a2fff99f92dae9c0eaf0d75de3def70ec68dad92 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Mon, 20 Oct 2025 20:08:51 -0400 Subject: [PATCH 214/543] kho: increase metadata bitmap size to PAGE_SIZE KHO memory preservation metadata is preserved in 512 byte chunks which requires their allocation from slab allocator. Slabs are not safe to be used with KHO because of kfence, and because partial slabs may lead leaks to the next kernel. Change the size to be PAGE_SIZE. The kfence specifically may cause memory corruption, where it randomly provides slab objects that can be within the scratch area. The reason for that is that kfence allocates its objects prior to KHO scratch is marked as CMA region. While this change could potentially increase metadata overhead on systems with sparsely preserved memory, this is being mitigated by ongoing work to reduce sparseness during preservation via 1G guest pages. Furthermore, this change aligns with future work on a stateless KHO, which will also use page-sized bitmaps for its radix tree metadata. 
Link: https://lkml.kernel.org/r/20251021000852.2924827-3-pasha.tatashin@soleen.com Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Christian Brauner Cc: David Matlack Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Miguel Ojeda Cc: Randy Dunlap Cc: Samiullah Khawaja Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 0bc9001e532a..9217d2fdd2d3 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -69,10 +69,10 @@ early_param("kho", kho_parse_enable); * Keep track of memory that is to be preserved across KHO. * * The serializing side uses two levels of xarrays to manage chunks of per-order - * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a - * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations - * each bitmap will cover 16M of address space. Thus, for 16G of memory at most - * 512K of bitmap memory will be needed for order 0. + * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order + * of a 8TB system would fit inside a single 4096 byte bitmap. For order 0 + * allocations each bitmap will cover 128M of address space. Thus, for 16G of + * memory at most 512K of bitmap memory will be needed for order 0. * * This approach is fully incremental, as the serialization progresses folios * can continue be aggregated to the tracker. The final step, immediately prior @@ -80,12 +80,14 @@ early_param("kho", kho_parse_enable); * successor kernel to parse. 
*/ -#define PRESERVE_BITS (512 * 8) +#define PRESERVE_BITS (PAGE_SIZE * 8) struct kho_mem_phys_bits { DECLARE_BITMAP(preserve, PRESERVE_BITS); }; +static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE); + struct kho_mem_phys { /* * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized @@ -133,19 +135,19 @@ static struct kho_out kho_out = { .finalized = false, }; -static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) +static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) { void *res = xa_load(xa, index); if (res) return res; - void *elm __free(kfree) = kzalloc(sz, GFP_KERNEL); + void *elm __free(kfree) = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!elm) return ERR_PTR(-ENOMEM); - if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), sz))) + if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE))) return ERR_PTR(-EINVAL); res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); @@ -218,8 +220,7 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, } } - bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS, - sizeof(*bits)); + bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS); if (IS_ERR(bits)) return PTR_ERR(bits); From fa759cd75bce5489eed34596daa53f721849a86f Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Mon, 20 Oct 2025 20:08:52 -0400 Subject: [PATCH 215/543] kho: allocate metadata directly from the buddy allocator KHO allocates metadata for its preserved memory map using the slab allocator via kzalloc(). This metadata is temporary and is used by the next kernel during early boot to find preserved memory. A problem arises when KFENCE is enabled. kzalloc() calls can be randomly intercepted by kfence_alloc(), which services the allocation from a dedicated KFENCE memory pool. This pool is allocated early in boot via memblock. 
When booting via KHO, the memblock allocator is restricted to a "scratch area", forcing the KFENCE pool to be allocated within it. This creates a conflict, as the scratch area is expected to be ephemeral and overwriteable by a subsequent kexec. If KHO metadata is placed in this KFENCE pool, it leads to memory corruption when the next kernel is loaded. To fix this, modify KHO to allocate its metadata directly from the buddy allocator instead of slab. Link: https://lkml.kernel.org/r/20251021000852.2924827-4-pasha.tatashin@soleen.com Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: David Matlack Cc: Alexander Graf Cc: Christian Brauner Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Miguel Ojeda Cc: Randy Dunlap Cc: Samiullah Khawaja Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton --- include/linux/gfp.h | 3 +++ kernel/kexec_handover.c | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0ceb4e09306c..623bee335383 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -7,6 +7,7 @@ #include #include #include +#include #include struct vm_area_struct; @@ -463,4 +464,6 @@ static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, /* This should be paired with folio_put() rather than free_contig_range(). */ #define folio_alloc_gigantic(...) 
alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__)) +DEFINE_FREE(free_page, void *, free_page((unsigned long)_T)) + #endif /* __LINUX_GFP_H */ diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 9217d2fdd2d3..2a8c20c238a8 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -142,7 +142,7 @@ static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) if (res) return res; - void *elm __free(kfree) = kzalloc(PAGE_SIZE, GFP_KERNEL); + void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL); if (!elm) return ERR_PTR(-ENOMEM); @@ -348,9 +348,9 @@ static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, unsigned long order) { - struct khoser_mem_chunk *chunk __free(kfree) = NULL; + struct khoser_mem_chunk *chunk __free(free_page) = NULL; - chunk = kzalloc(PAGE_SIZE, GFP_KERNEL); + chunk = (void *)get_zeroed_page(GFP_KERNEL); if (!chunk) return ERR_PTR(-ENOMEM); From fc745ff317566ec299e16346ebb9eacc8fe5b9d2 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 22 Oct 2025 18:57:19 +0800 Subject: [PATCH 216/543] mm/shmem: fix THP allocation and fallback loop The order check and fallback loop is updating the index value on every loop. This will cause the index to be wrongly aligned by a larger value while the loop shrinks the order. This may result in inserting and returning a folio of the wrong index and cause data corruption with some userspace workloads [1]. 
[kasong@tencent.com: introduce a temporary variable to improve code] Link: https://lkml.kernel.org/r/20251023065913.36925-1-ryncsn@gmail.com Link: https://lore.kernel.org/linux-mm/CAMgjq7DqgAmj25nDUwwu1U2cSGSn8n4-Hqpgottedy0S6YYeUw@mail.gmail.com/ [1] Link: https://lkml.kernel.org/r/20251022105719.18321-1-ryncsn@gmail.com Link: https://lore.kernel.org/linux-mm/CAMgjq7DqgAmj25nDUwwu1U2cSGSn8n4-Hqpgottedy0S6YYeUw@mail.gmail.com/ [1] Fixes: e7a2ab7b3bb5 ("mm: shmem: add mTHP support for anonymous shmem") Closes: https://lore.kernel.org/linux-mm/CAMgjq7DqgAmj25nDUwwu1U2cSGSn8n4-Hqpgottedy0S6YYeUw@mail.gmail.com/ Signed-off-by: Kairui Song Acked-by: David Hildenbrand Acked-by: Zi Yan Reviewed-by: Baolin Wang Reviewed-by: Barry Song Reviewed-by: Lorenzo Stoakes Cc: Dev Jain Cc: Hugh Dickins Cc: Liam Howlett Cc: Matthew Wilcox (Oracle) Cc: Nico Pache Cc: Ryan Roberts Cc: Signed-off-by: Andrew Morton --- mm/shmem.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index b9081b817d28..58701d14dd96 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1882,6 +1882,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, struct shmem_inode_info *info = SHMEM_I(inode); unsigned long suitable_orders = 0; struct folio *folio = NULL; + pgoff_t aligned_index; long pages; int error, order; @@ -1895,10 +1896,12 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, order = highest_order(suitable_orders); while (suitable_orders) { pages = 1UL << order; - index = round_down(index, pages); - folio = shmem_alloc_folio(gfp, order, info, index); - if (folio) + aligned_index = round_down(index, pages); + folio = shmem_alloc_folio(gfp, order, info, aligned_index); + if (folio) { + index = aligned_index; goto allocated; + } if (pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); From 7e76b75e5ab3339bebab3a4738226cd9b27d8c42 Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov Date: Tue, 30 Sep 2025 13:56:01 +0200 
Subject: [PATCH 217/543] mm/kmsan: fix kmsan kmalloc hook when no stack depots are allocated yet If no stack depot is allocated yet, kmsan called from kmalloc cannot allocate a stack depot, because the __GFP_RECLAIM flags are masked out. kmsan then fails to record the origin, which may result in KMSAN failing to report issues. Reusing flags from kmalloc without modifying them should be safe for kmsan. For example, the following chain of calls is possible: test_uninit_kmalloc -> kmalloc -> __kmalloc_cache_noprof -> slab_alloc_node -> slab_post_alloc_hook -> kmsan_slab_alloc -> kmsan_internal_poison_memory. Only when it is called in a context without flags present should the __GFP_RECLAIM flags be masked. With this change all kmsan tests start working reliably. Eric reported: : Yes, KMSAN seems to be at least partially broken currently. Besides the : fact that the kmsan KUnit test is currently failing (which I reported at : https://lore.kernel.org/r/20250911175145.GA1376@sol), I've confirmed that : the poly1305 KUnit test causes a KMSAN warning with Aleksei's patch : applied but does not cause a warning without it. The warning did get : reached via syzbot somehow : (https://lore.kernel.org/r/751b3d80293a6f599bb07770afcef24f623c7da0.1761026343.git.xiaopei01@kylinos.cn/), : so KMSAN must still work in some cases. But it didn't work for me.
Link: https://lkml.kernel.org/r/20250930115600.709776-2-aleksei.nikiforov@linux.ibm.com Link: https://lkml.kernel.org/r/20251022030213.GA35717@sol Fixes: 97769a53f117 ("mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation") Signed-off-by: Aleksei Nikiforov Reviewed-by: Alexander Potapenko Tested-by: Eric Biggers Cc: Alexei Starovoitov Cc: Dmitriy Vyukov Cc: Ilya Leoshkevich Cc: Marco Elver Cc: Signed-off-by: Andrew Morton --- mm/kmsan/core.c | 3 --- mm/kmsan/hooks.c | 6 ++++-- mm/kmsan/shadow.c | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 8bca7fece47f..35ceaa8adb41 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -72,9 +72,6 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0); - /* Don't sleep. */ - flags &= ~(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM); - handle = stack_depot_save(entries, nr_entries, flags); return stack_depot_set_extra_bits(handle, extra); } diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 2cee59d89c80..8f22d1f22981 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -84,7 +84,8 @@ void kmsan_slab_free(struct kmem_cache *s, void *object) if (s->ctor) return; kmsan_enter_runtime(); - kmsan_internal_poison_memory(object, s->object_size, GFP_KERNEL, + kmsan_internal_poison_memory(object, s->object_size, + GFP_KERNEL & ~(__GFP_RECLAIM), KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); } @@ -114,7 +115,8 @@ void kmsan_kfree_large(const void *ptr) kmsan_enter_runtime(); page = virt_to_head_page((void *)ptr); KMSAN_WARN_ON(ptr != page_address(page)); - kmsan_internal_poison_memory((void *)ptr, page_size(page), GFP_KERNEL, + kmsan_internal_poison_memory((void *)ptr, page_size(page), + GFP_KERNEL & ~(__GFP_RECLAIM), KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); } diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 54f3c3c962f0..55fdea199aaf 100644 
--- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -208,7 +208,7 @@ void kmsan_free_page(struct page *page, unsigned int order) return; kmsan_enter_runtime(); kmsan_internal_poison_memory(page_address(page), page_size(page), - GFP_KERNEL, + GFP_KERNEL & ~(__GFP_RECLAIM), KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); } From f5548c318d6520d4fa3c5ed6003eeb710763cbc5 Mon Sep 17 00:00:00 2001 From: Pedro Demarchi Gomes Date: Wed, 22 Oct 2025 12:30:59 -0300 Subject: [PATCH 218/543] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item Currently, scan_get_next_rmap_item() walks every page address in a VMA to locate mergeable pages. This becomes highly inefficient when scanning large virtual memory areas that contain mostly unmapped regions, causing ksmd to use large amount of cpu without deduplicating much pages. This patch replaces the per-address lookup with a range walk using walk_page_range(). The range walker allows KSM to skip over entire unmapped holes in a VMA, avoiding unnecessary lookups. This problem was previously discussed in [1]. Consider the following test program which creates a 32 TiB mapping in the virtual address space but only populates a single page: #include #include #include /* 32 TiB */ const size_t size = 32ul * 1024 * 1024 * 1024 * 1024; int main() { char *area = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_NORESERVE | MAP_PRIVATE | MAP_ANON, -1, 0); if (area == MAP_FAILED) { perror("mmap() failed\n"); return -1; } /* Populate a single page such that we get an anon_vma. */ *area = 0; /* Enable KSM. */ madvise(area, size, MADV_MERGEABLE); pause(); return 0; } $ ./ksm-sparse & $ echo 1 > /sys/kernel/mm/ksm/run Without this patch ksmd uses 100% of the cpu for a long time (more then 1 hour in my test machine) scanning all the 32 TiB virtual address space that contain only one mapped page. This makes ksmd essentially deadlocked not able to deduplicate anything of value. 
With this patch ksmd walks only the one mapped page and skips the rest of the 32 TiB virtual address space, making the scan fast using little cpu. Link: https://lkml.kernel.org/r/20251023035841.41406-1-pedrodemargomes@gmail.com Link: https://lkml.kernel.org/r/20251022153059.22763-1-pedrodemargomes@gmail.com Link: https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/ [1] Fixes: 31dbd01f3143 ("ksm: Kernel SamePage Merging") Signed-off-by: Pedro Demarchi Gomes Co-developed-by: David Hildenbrand Signed-off-by: David Hildenbrand Reported-by: craftfever Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: Chengming Zhou Cc: xu xin Cc: Signed-off-by: Andrew Morton --- mm/ksm.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 104 insertions(+), 9 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 7bc726b50b2f..c4e730409949 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2455,6 +2455,95 @@ static bool should_skip_rmap_item(struct folio *folio, return true; } +struct ksm_next_page_arg { + struct folio *folio; + struct page *page; + unsigned long addr; +}; + +static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct ksm_next_page_arg *private = walk->private; + struct vm_area_struct *vma = walk->vma; + pte_t *start_ptep = NULL, *ptep, pte; + struct mm_struct *mm = walk->mm; + struct folio *folio; + struct page *page; + spinlock_t *ptl; + pmd_t pmd; + + if (ksm_test_exit(mm)) + return 0; + + cond_resched(); + + pmd = pmdp_get_lockless(pmdp); + if (!pmd_present(pmd)) + return 0; + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) { + ptl = pmd_lock(mm, pmdp); + pmd = pmdp_get(pmdp); + + if (!pmd_present(pmd)) { + goto not_found_unlock; + } else if (pmd_leaf(pmd)) { + page = vm_normal_page_pmd(vma, addr, pmd); + if (!page) + goto not_found_unlock; + folio = 
page_folio(page); + + if (folio_is_zone_device(folio) || !folio_test_anon(folio)) + goto not_found_unlock; + + page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT); + goto found_unlock; + } + spin_unlock(ptl); + } + + start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!start_ptep) + return 0; + + for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) { + pte = ptep_get(ptep); + + if (!pte_present(pte)) + continue; + + page = vm_normal_page(vma, addr, pte); + if (!page) + continue; + folio = page_folio(page); + + if (folio_is_zone_device(folio) || !folio_test_anon(folio)) + continue; + goto found_unlock; + } + +not_found_unlock: + spin_unlock(ptl); + if (start_ptep) + pte_unmap(start_ptep); + return 0; +found_unlock: + folio_get(folio); + spin_unlock(ptl); + if (start_ptep) + pte_unmap(start_ptep); + private->page = page; + private->folio = folio; + private->addr = addr; + return 1; +} + +static struct mm_walk_ops ksm_next_page_ops = { + .pmd_entry = ksm_next_page_pmd_entry, + .walk_lock = PGWALK_RDLOCK, +}; + static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; @@ -2542,21 +2631,27 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) ksm_scan.address = vma->vm_end; while (ksm_scan.address < vma->vm_end) { + struct ksm_next_page_arg ksm_next_page_arg; struct page *tmp_page = NULL; - struct folio_walk fw; struct folio *folio; if (ksm_test_exit(mm)) break; - folio = folio_walk_start(&fw, vma, ksm_scan.address, 0); - if (folio) { - if (!folio_is_zone_device(folio) && - folio_test_anon(folio)) { - folio_get(folio); - tmp_page = fw.page; - } - folio_walk_end(&fw, vma); + int found; + + found = walk_page_range_vma(vma, ksm_scan.address, + vma->vm_end, + &ksm_next_page_ops, + &ksm_next_page_arg); + + if (found > 0) { + folio = ksm_next_page_arg.folio; + tmp_page = ksm_next_page_arg.page; + ksm_scan.address = ksm_next_page_arg.addr; + } else { + VM_WARN_ON_ONCE(found < 0); + ksm_scan.address = 
vma->vm_end - PAGE_SIZE; } if (tmp_page) { From fa5a061700364bc28ee1cb1095372f8033645dcb Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 22 Oct 2025 23:05:21 -0400 Subject: [PATCH 219/543] mm/huge_memory: preserve PG_has_hwpoisoned if a folio is split to >0 order folio split clears PG_has_hwpoisoned, but the flag should be preserved in after-split folios containing pages with PG_hwpoisoned flag if the folio is split to >0 order folios. Scan all pages in a to-be-split folio to determine which after-split folios need the flag. An alternatives is to change PG_has_hwpoisoned to PG_maybe_hwpoisoned to avoid the scan and set it on all after-split folios, but resulting false positive has undesirable negative impact. To remove false positive, caller of folio_test_has_hwpoisoned() and folio_contain_hwpoisoned_page() needs to do the scan. That might be causing a hassle for current and future callers and more costly than doing the scan in the split code. More details are discussed in [1]. This issue can be exposed via: 1. splitting a has_hwpoisoned folio to >0 order from debugfs interface; 2. truncating part of a has_hwpoisoned folio in truncate_inode_partial_folio(). And later accesses to a hwpoisoned page could be possible due to the missing has_hwpoisoned folio flag. This will lead to MCE errors. 
Link: https://lore.kernel.org/all/CAHbLzkoOZm0PXxE9qwtF4gKR=cpRXrSrJ9V9Pm2DJexs985q4g@mail.gmail.com/ [1] Link: https://lkml.kernel.org/r/20251023030521.473097-1-ziy@nvidia.com Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages") Signed-off-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Yang Shi Reviewed-by: Lorenzo Stoakes Reviewed-by: Lance Yang Reviewed-by: Miaohe Lin Reviewed-by: Baolin Wang Reviewed-by: Wei Yang Cc: Pankaj Raghav Cc: Barry Song Cc: Dev Jain Cc: Jane Chu Cc: Liam Howlett Cc: Luis Chamberalin Cc: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Cc: Nico Pache Cc: Ryan Roberts Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index feac4aef7dfb..b4ff49d96501 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3263,6 +3263,14 @@ bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins) caller_pins; } +static bool page_range_has_hwpoisoned(struct page *page, long nr_pages) +{ + for (; nr_pages; page++, nr_pages--) + if (PageHWPoison(page)) + return true; + return false; +} + /* * It splits @folio into @new_order folios and copies the @folio metadata to * all the resulting folios. 
@@ -3270,17 +3278,24 @@ bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins) static void __split_folio_to_order(struct folio *folio, int old_order, int new_order) { + /* Scan poisoned pages when split a poisoned folio to large folios */ + const bool handle_hwpoison = folio_test_has_hwpoisoned(folio) && new_order; long new_nr_pages = 1 << new_order; long nr_pages = 1 << old_order; long i; + folio_clear_has_hwpoisoned(folio); + + /* Check first new_nr_pages since the loop below skips them */ + if (handle_hwpoison && + page_range_has_hwpoisoned(folio_page(folio, 0), new_nr_pages)) + folio_set_has_hwpoisoned(folio); /* * Skip the first new_nr_pages, since the new folio from them have all * the flags from the original folio. */ for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) { struct page *new_head = &folio->page + i; - /* * Careful: new_folio is not a "real" folio before we cleared PageTail. * Don't pass it around before clear_compound_head(). @@ -3322,6 +3337,10 @@ static void __split_folio_to_order(struct folio *folio, int old_order, (1L << PG_dirty) | LRU_GEN_MASK | LRU_REFS_MASK)); + if (handle_hwpoison && + page_range_has_hwpoisoned(new_head, new_nr_pages)) + folio_set_has_hwpoisoned(new_folio); + new_folio->mapping = folio->mapping; new_folio->index = folio->index + i; @@ -3422,8 +3441,6 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, if (folio_test_anon(folio)) mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); - folio_clear_has_hwpoisoned(folio); - /* * split to new_order one order at a time. For uniform split, * folio is split to new_order directly. From 895b4c0c79b092d732544011c3cecaf7322c36a1 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 25 Oct 2025 10:42:33 +0800 Subject: [PATCH 220/543] fs/proc: fix uaf in proc_readdir_de() Pde is erased from subdir rbtree through rb_erase(), but not set the node to EMPTY, which may result in uaf access. 
We should use RB_CLEAR_NODE() to set the erased node to EMPTY, so that pde_subdir_next() will return NULL and the uaf access is avoided. We found a uaf issue during stress-ng testing; it requires running the getdent and tun testcases at the same time. The steps of the issue are as follows: 1) use getdent to traverse dir /proc/pid/net/dev_snmp6/, and current pde is tun3; 2) in the [time window] unregister netdevice tun3 and tun2, and erase them from rbtree. erase tun3 first, and then erase tun2. the pde(tun2) will be released to slab; 3) continue to getdent process, then pde_subdir_next() will return pde(tun2) which is released, it will cause a uaf access. CPU 0 | CPU 1 ------------------------------------------------------------------------- traverse dir /proc/pid/net/dev_snmp6/ | unregister_netdevice(tun->dev) //tun3 tun2 sys_getdents64() | iterate_dir() | proc_readdir() | proc_readdir_de() | snmp6_unregister_dev() pde_get(de); | proc_remove() read_unlock(&proc_subdir_lock); | remove_proc_subtree() | write_lock(&proc_subdir_lock); [time window] | rb_erase(&root->subdir_node, &parent->subdir); | write_unlock(&proc_subdir_lock); read_lock(&proc_subdir_lock); | next = pde_subdir_next(de); | pde_put(de); | de = next; //UAF | rbtree of dev_snmp6 | pde(tun3) / \ NULL pde(tun2) Link: https://lkml.kernel.org/r/20251025024233.158363-1-albin_yang@163.com Signed-off-by: Wei Yang Cc: Al Viro Cc: Christian Brauner Cc: wangzijie Cc: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton --- fs/proc/generic.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 176281112273..501889856461 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -698,6 +698,12 @@ void pde_put(struct proc_dir_entry *pde) } } +static void pde_erase(struct proc_dir_entry *pde, struct proc_dir_entry *parent) +{ + rb_erase(&pde->subdir_node, &parent->subdir); + RB_CLEAR_NODE(&pde->subdir_node); +} + /* * Remove a /proc entry and free it if it's not currently in
use. */ @@ -720,7 +726,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) WARN(1, "removing permanent /proc entry '%s'", de->name); de = NULL; } else { - rb_erase(&de->subdir_node, &parent->subdir); + pde_erase(de, parent); if (S_ISDIR(de->mode)) parent->nlink--; } @@ -764,7 +770,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) root->parent->name, root->name); return -EINVAL; } - rb_erase(&root->subdir_node, &parent->subdir); + pde_erase(root, parent); de = root; while (1) { @@ -776,7 +782,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) next->parent->name, next->name); return -EINVAL; } - rb_erase(&next->subdir_node, &de->subdir); + pde_erase(next, de); de = next; continue; } From 74207de2ba10c2973334906822dc94d2e859ffc5 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Mon, 27 Oct 2025 11:56:35 +0000 Subject: [PATCH 221/543] mm/memory: do not populate page table entries beyond i_size Patch series "Fix SIGBUS semantics with large folios", v3. Accessing memory within a VMA, but beyond i_size rounded up to the next page size, is supposed to generate SIGBUS. Darrick reported[1] an xfstests regression in v6.18-rc1. generic/749 failed due to missing SIGBUS. This was caused by my recent changes that try to fault in the whole folio where possible: 19773df031bc ("mm/fault: try to map the entire file folio in finish_fault()") 357b92761d94 ("mm/filemap: map entire large folio faultaround") These changes did not consider i_size when setting up PTEs, leading to xfstest breakage. However, the problem has been present in the kernel for a long time - since huge tmpfs was introduced in 2016. The kernel happily maps PMD-sized folios as PMD without checking i_size. And huge=always tmpfs allocates PMD-size folios on any writes. 
I considered this corner case when I implemented a large tmpfs, and my conclusion was that no one in their right mind should rely on receiving a SIGBUS signal when accessing beyond i_size. I cannot imagine how it could be useful for the workload. But apparently filesystem folks care a lot about preserving strict SIGBUS semantics. Generic/749 was introduced last year with reference to POSIX, but no real workloads were mentioned. It also acknowledged the tmpfs deviation from the test case. POSIX indeed says[3]: References within the address range starting at pa and continuing for len bytes to whole pages following the end of an object shall result in delivery of a SIGBUS signal. The patchset fixes the regression introduced by recent changes as well as more subtle SIGBUS breakage due to split failure on truncation. This patch (of 2): Accesses within a VMA, but beyond i_size rounded up to PAGE_SIZE, are supposed to generate SIGBUS. Recent changes attempted to fault in the full folio where possible. They did not respect i_size, which led to populating PTEs beyond i_size and breaking SIGBUS semantics. Darrick reported generic/749 breakage because of this. However, the problem existed before the recent changes. With huge=always tmpfs, any write to a file leads to a PMD-size allocation. The subsequent fault-in of the folio will install a PMD mapping regardless of i_size. Fix filemap_map_pages() and finish_fault() to not install: - PTEs beyond i_size; - PMD mappings across i_size; Make an exception for shmem/tmpfs that has for a long time intentionally mapped with PMDs across i_size. Link: https://lkml.kernel.org/r/20251027115636.82382-1-kirill@shutemov.name Link: https://lkml.kernel.org/r/20251027115636.82382-2-kirill@shutemov.name Signed-off-by: Kiryl Shutsemau Fixes: 6795801366da ("xfs: Support large folios") Reported-by: "Darrick J.
Wong" Cc: Al Viro Cc: Baolin Wang Cc: Christian Brauner Cc: Dave Chinner Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/filemap.c | 28 ++++++++++++++++++++-------- mm/memory.c | 20 +++++++++++++++++++- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 13f0259d993c..2f1e7e283a51 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3681,7 +3681,8 @@ static struct folio *next_uptodate_folio(struct xa_state *xas, static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, - unsigned long *rss, unsigned short *mmap_miss) + unsigned long *rss, unsigned short *mmap_miss, + bool can_map_large) { unsigned int ref_from_caller = 1; vm_fault_t ret = 0; @@ -3696,7 +3697,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, * The folio must not cross VMA or page table boundary. */ addr0 = addr - start * PAGE_SIZE; - if (folio_within_vma(folio, vmf->vma) && + if (can_map_large && folio_within_vma(folio, vmf->vma) && (addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) { vmf->pte -= start; page -= start; @@ -3811,13 +3812,27 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, unsigned long rss = 0; unsigned int nr_pages = 0, folio_type; unsigned short mmap_miss = 0, mmap_miss_saved; + bool can_map_large; rcu_read_lock(); folio = next_uptodate_folio(&xas, mapping, end_pgoff); if (!folio) goto out; - if (filemap_map_pmd(vmf, folio, start_pgoff)) { + file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; + end_pgoff = min(end_pgoff, file_end); + + /* + * Do not allow to map with PTEs beyond i_size and with PMD + * across i_size to preserve SIGBUS semantics. 
+ * + * Make an exception for shmem/tmpfs that for long time + * intentionally mapped with PMDs across i_size. + */ + can_map_large = shmem_mapping(mapping) || + file_end >= folio_next_index(folio); + + if (can_map_large && filemap_map_pmd(vmf, folio, start_pgoff)) { ret = VM_FAULT_NOPAGE; goto out; } @@ -3830,10 +3845,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, goto out; } - file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; - if (end_pgoff > file_end) - end_pgoff = file_end; - folio_type = mm_counter_file(folio); do { unsigned long end; @@ -3850,7 +3861,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, else ret |= filemap_map_folio_range(vmf, folio, xas.xa_index - folio->index, addr, - nr_pages, &rss, &mmap_miss); + nr_pages, &rss, &mmap_miss, + can_map_large); folio_unlock(folio); } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); diff --git a/mm/memory.c b/mm/memory.c index 74b45e258323..b59ae7ce42eb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include #include @@ -5501,8 +5502,25 @@ vm_fault_t finish_fault(struct vm_fault *vmf) return ret; } + if (!needs_fallback && vma->vm_file) { + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t file_end; + + file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + + /* + * Do not allow to map with PTEs beyond i_size and with PMD + * across i_size to preserve SIGBUS semantics. + * + * Make an exception for shmem/tmpfs that for long time + * intentionally mapped with PMDs across i_size. 
+ */ + needs_fallback = !shmem_mapping(mapping) && + file_end < folio_next_index(folio); + } + if (pmd_none(*vmf->pmd)) { - if (folio_test_pmd_mappable(folio)) { + if (!needs_fallback && folio_test_pmd_mappable(folio)) { ret = do_set_pmd(vmf, folio, page); if (ret != VM_FAULT_FALLBACK) return ret; From fa04f5b60fda62c98a53a60de3a1e763f11feb41 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Mon, 27 Oct 2025 11:56:36 +0000 Subject: [PATCH 222/543] mm/truncate: unmap large folio on split failure Accesses within VMA, but beyond i_size rounded up to PAGE_SIZE are supposed to generate SIGBUS. This behavior might not be respected on truncation. During truncation, the kernel splits a large folio in order to reclaim memory. As a side effect, it unmaps the folio and destroys PMD mappings of the folio. The folio will be refaulted as PTEs and SIGBUS semantics are preserved. However, if the split fails, PMD mappings are preserved and the user will not receive SIGBUS on any accesses within the PMD. Unmap the folio on split failure. It will lead to refault as PTEs and preserve SIGBUS semantics. Make an exception for shmem/tmpfs that for long time intentionally mapped with PMDs across i_size. Link: https://lkml.kernel.org/r/20251027115636.82382-3-kirill@shutemov.name Fixes: b9a8a4195c7d ("truncate,shmem: Handle truncates that split large folios") Signed-off-by: Kiryl Shutsemau Cc: Al Viro Cc: Baolin Wang Cc: Christian Brauner Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/truncate.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index 9210cf808f5c..3c5a50ae3274 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -177,6 +177,32 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio) return 0; } +static int try_folio_split_or_unmap(struct folio *folio, struct page *split_at, + unsigned long min_order) +{ + enum ttu_flags ttu_flags = + TTU_SYNC | + TTU_SPLIT_HUGE_PMD | + TTU_IGNORE_MLOCK; + int ret; + + ret = try_folio_split_to_order(folio, split_at, min_order); + + /* + * If the split fails, unmap the folio, so it will be refaulted + * with PTEs to respect SIGBUS semantics. + * + * Make an exception for shmem/tmpfs that for long time + * intentionally mapped with PMDs across i_size. + */ + if (ret && !shmem_mapping(folio->mapping)) { + try_to_unmap(folio, ttu_flags); + WARN_ON(folio_mapped(folio)); + } + + return ret; +} + /* * Handle partial folios. The folio may be entirely within the * range if a split has raced with us. 
If not, we zero the part of the @@ -226,7 +252,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) min_order = mapping_min_folio_order(folio->mapping); split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE); - if (!try_folio_split_to_order(folio, split_at, min_order)) { + if (!try_folio_split_or_unmap(folio, split_at, min_order)) { /* * try to split at offset + length to make sure folios within * the range can be dropped, especially to avoid memory waste @@ -250,13 +276,10 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) if (!folio_trylock(folio2)) goto out; - /* - * make sure folio2 is large and does not change its mapping. - * Its split result does not matter here. - */ + /* make sure folio2 is large and does not change its mapping */ if (folio_test_large(folio2) && folio2->mapping == folio->mapping) - try_folio_split_to_order(folio2, split_at2, min_order); + try_folio_split_or_unmap(folio2, split_at2, min_order); folio_unlock(folio2); out: From 0d6c356dd6547adac2b06b461528e3573f52d953 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Tue, 28 Oct 2025 12:10:12 -0700 Subject: [PATCH 223/543] mm/mm_init: fix hash table order logging in alloc_large_system_hash() When emitting the order of the allocation for a hash table, alloc_large_system_hash() unconditionally subtracts PAGE_SHIFT from log base 2 of the allocation size. 
This is not correct if the allocation size is smaller than a page, and yields a negative value for the order as seen below: TCP established hash table entries: 32 (order: -4, 256 bytes, linear) TCP bind hash table entries: 32 (order: -2, 1024 bytes, linear) Use get_order() to compute the order when emitting the hash table information to correctly handle cases where the allocation size is smaller than a page: TCP established hash table entries: 32 (order: 0, 256 bytes, linear) TCP bind hash table entries: 32 (order: 0, 1024 bytes, linear) Link: https://lkml.kernel.org/r/20251028191020.413002-1-isaacmanjarres@google.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Isaac J. Manjarres Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: David Hildenbrand Cc: Signed-off-by: Andrew Morton --- mm/mm_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index 3db2dea7db4c..7712d887b696 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2469,7 +2469,7 @@ void *__init alloc_large_system_hash(const char *tablename, panic("Failed to allocate %s hash table\n", tablename); pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", - tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, + tablename, 1UL << log2qty, get_order(size), size, virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear"); if (_hash_shift) From ec4d11fc4b2dd4a2fa8c9d801ee9753b74623554 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Tue, 28 Oct 2025 12:51:25 +0100 Subject: [PATCH 224/543] gcov: add support for GCC 15 Using gcov on kernels compiled with GCC 15 results in truncated 16-byte long .gcda files with no usable data. To fix this, update GCOV_COUNTERS to match the value defined by GCC 15. Tested with GCC 14.3.0 and GCC 15.2.0. 
Link: https://lkml.kernel.org/r/20251028115125.1319410-1-oberpar@linux.ibm.com Signed-off-by: Peter Oberparleiter Reported-by: Matthieu Baerts Closes: https://github.com/linux-test-project/lcov/issues/445 Tested-by: Matthieu Baerts Cc: Signed-off-by: Andrew Morton --- kernel/gcov/gcc_4_7.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index a08cc076f332..ffde93d051a4 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,9 @@ #include #include "gcov.h" -#if (__GNUC__ >= 14) +#if (__GNUC__ >= 15) +#define GCOV_COUNTERS 10 +#elif (__GNUC__ >= 14) #define GCOV_COUNTERS 9 #elif (__GNUC__ >= 10) #define GCOV_COUNTERS 8 From 04d1c9d60c6ec4c0003d433572eaa45f8b217788 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Tue, 28 Oct 2025 12:09:52 +0530 Subject: [PATCH 225/543] mm/mremap: honour writable bit in mremap pte batching Currently mremap folio pte batch ignores the writable bit during figuring out a set of similar ptes mapping the same folio. Suppose that the first pte of the batch is writable while the others are not - set_ptes will end up setting the writable bit on the other ptes, which is a violation of mremap semantics. Therefore, use FPB_RESPECT_WRITE to check the writable bit while determining the pte batch. 
Link: https://lkml.kernel.org/r/20251028063952.90313-1-dev.jain@arm.com Signed-off-by: Dev Jain Fixes: f822a9a81a31 ("mm: optimize mremap() by PTE batching") Reported-by: David Hildenbrand Debugged-by: David Hildenbrand Acked-by: David Hildenbrand Acked-by: Pedro Falcato Reviewed-by: Lorenzo Stoakes Cc: Barry Song Cc: Jann Horn Cc: Liam Howlett Cc: Vlastimil Babka Cc: [6.17+] Signed-off-by: Andrew Morton --- mm/mremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mremap.c b/mm/mremap.c index bd7314898ec5..419a0ea0a870 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -187,7 +187,7 @@ static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr if (!folio || !folio_test_large(folio)) return 1; - return folio_pte_batch(folio, ptep, pte, max_nr); + return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, FPB_RESPECT_WRITE); } static int move_ptes(struct pagetable_move_control *pmc, From 1abbdf3d57aa964e572940d67c9ec5dc87710738 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Wed, 29 Oct 2025 09:43:17 +0800 Subject: [PATCH 226/543] codetag: debug: handle existing CODETAG_EMPTY in mark_objexts_empty for slabobj_ext MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When alloc_slab_obj_exts() fails and then later succeeds in allocating a slab extension vector, it calls handle_failed_objexts_alloc() to mark all objects in the vector as empty. As a result all objects in this slab (slabA) will have their extensions set to CODETAG_EMPTY. Later on if this slabA is used to allocate a slabobj_ext vector for another slab (slabB), we end up with the slabB->obj_exts pointing to a slabobj_ext vector that itself has a non-NULL slabobj_ext equal to CODETAG_EMPTY. When slabB gets freed, free_slab_obj_exts() is called to free slabB->obj_exts vector. 
free_slab_obj_exts() calls mark_objexts_empty(slabB->obj_exts) which will generate a warning because it expects slabobj_ext vectors to have a NULL obj_ext, not CODETAG_EMPTY. Modify mark_objexts_empty() to skip the warning and setting the obj_ext value if it's already set to CODETAG_EMPTY. To quickly detect this WARN, I modified the code from WARN_ON(slab_exts[offs].ref.ct) to BUG_ON(slab_exts[offs].ref.ct == 1); We then obtained this message: [21630.898561] ------------[ cut here ]------------ [21630.898596] kernel BUG at mm/slub.c:2050! [21630.898611] Internal error: Oops - BUG: 00000000f2000800 [#1] SMP [21630.900372] Modules linked in: squashfs isofs vfio_iommu_type1 vhost_vsock vfio vhost_net vmw_vsock_virtio_transport_common vhost tap vhost_iotlb iommufd vsock binfmt_misc nfsv3 nfs_acl nfs lockd grace netfs tls rds dns_resolver tun brd overlay ntfs3 exfat btrfs blake2b_generic xor xor_neon raid6_pq loop sctp ip6_udp_tunnel udp_tunnel nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nf_tables rfkill ip_set sunrpc vfat fat joydev sg sch_fq_codel nfnetlink virtio_gpu sr_mod cdrom drm_client_lib virtio_dma_buf drm_shmem_helper drm_kms_helper drm ghash_ce backlight virtio_net virtio_blk virtio_scsi net_failover virtio_console failover virtio_mmio dm_mirror dm_region_hash dm_log dm_multipath dm_mod fuse i2c_dev virtio_pci virtio_pci_legacy_dev virtio_pci_modern_dev virtio virtio_ring autofs4 aes_neon_bs aes_ce_blk [last unloaded: hwpoison_inject] [21630.909177] CPU: 3 UID: 0 PID: 3787 Comm: kylin-process-m Kdump: loaded Tainted: G        W           6.18.0-rc1+ #74 PREEMPT(voluntary) [21630.910495] Tainted: [W]=WARN [21630.910867] Hardware name: QEMU KVM Virtual Machine, BIOS unknown 2/2/2022 [21630.911625] pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [21630.912392] pc : __free_slab+0x228/0x250 [21630.912868] lr : 
__free_slab+0x18c/0x250[21630.913334] sp : ffff8000a02f73e0 [21630.913830] x29: ffff8000a02f73e0 x28: fffffdffc43fc800 x27: ffff0000c0011c40 [21630.914677] x26: ffff0000c000cac0 x25: ffff00010fe5e5f0 x24: ffff000102199b40 [21630.915469] x23: 0000000000000003 x22: 0000000000000003 x21: ffff0000c0011c40 [21630.916259] x20: fffffdffc4086600 x19: fffffdffc43fc800 x18: 0000000000000000 [21630.917048] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 [21630.917837] x14: 0000000000000000 x13: 0000000000000000 x12: ffff70001405ee66 [21630.918640] x11: 1ffff0001405ee65 x10: ffff70001405ee65 x9 : ffff800080a295dc [21630.919442] x8 : ffff8000a02f7330 x7 : 0000000000000000 x6 : 0000000000003000 [21630.920232] x5 : 0000000024924925 x4 : 0000000000000001 x3 : 0000000000000007 [21630.921021] x2 : 0000000000001b40 x1 : 000000000000001f x0 : 0000000000000001 [21630.921810] Call trace: [21630.922130]  __free_slab+0x228/0x250 (P) [21630.922669]  free_slab+0x38/0x118 [21630.923079]  free_to_partial_list+0x1d4/0x340 [21630.923591]  __slab_free+0x24c/0x348 [21630.924024]  ___cache_free+0xf0/0x110 [21630.924468]  qlist_free_all+0x78/0x130 [21630.924922]  kasan_quarantine_reduce+0x114/0x148 [21630.925525]  __kasan_slab_alloc+0x7c/0xb0 [21630.926006]  kmem_cache_alloc_noprof+0x164/0x5c8 [21630.926699]  __alloc_object+0x44/0x1f8 [21630.927153]  __create_object+0x34/0xc8 [21630.927604]  kmemleak_alloc+0xb8/0xd8 [21630.928052]  kmem_cache_alloc_noprof+0x368/0x5c8 [21630.928606]  getname_flags.part.0+0xa4/0x610 [21630.929112]  getname_flags+0x80/0xd8 [21630.929557]  vfs_fstatat+0xc8/0xe0 [21630.929975]  __do_sys_newfstatat+0xa0/0x100 [21630.930469]  __arm64_sys_newfstatat+0x90/0xd8 [21630.931046]  invoke_syscall+0xd4/0x258 [21630.931685]  el0_svc_common.constprop.0+0xb4/0x240 [21630.932467]  do_el0_svc+0x48/0x68 [21630.932972]  el0_svc+0x40/0xe0 [21630.933472]  el0t_64_sync_handler+0xa0/0xe8 [21630.934151]  el0t_64_sync+0x1ac/0x1b0 [21630.934923] Code: aa1803e0 97ffef2b 
a9446bf9 17ffff9c (d4210000) [21630.936461] SMP: stopping secondary CPUs [21630.939550] Starting crashdump kernel... [21630.940108] Bye! Link: https://lkml.kernel.org/r/20251029014317.1533488-1-hao.ge@linux.dev Fixes: 09c46563ff6d ("codetag: debug: introduce OBJEXTS_ALLOC_FAIL to mark failed slab_ext allocations") Signed-off-by: Hao Ge Reviewed-by: Suren Baghdasaryan Cc: Christoph Lameter (Ampere) Cc: David Rientjes Cc: gehao Cc: Roman Gushchin Cc: Shakeel Butt Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/slub.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index f1a5373eee7b..1bf65c421325 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2046,7 +2046,11 @@ static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) if (slab_exts) { unsigned int offs = obj_to_index(obj_exts_slab->slab_cache, obj_exts_slab, obj_exts); - /* codetag should be NULL */ + + if (unlikely(is_codetag_empty(&slab_exts[offs].ref))) + return; + + /* codetag should be NULL here */ WARN_ON(slab_exts[offs].ref.ct); set_codetag_empty(&slab_exts[offs].ref); } From 91a54090026f84ceffaa12ac53c99b9f162946f6 Mon Sep 17 00:00:00 2001 From: Martin Kaiser Date: Thu, 30 Oct 2025 16:55:05 +0100 Subject: [PATCH 227/543] maple_tree: fix tracepoint string pointers maple_tree tracepoints contain pointers to function names. Such a pointer is saved when a tracepoint logs an event. There's no guarantee that it's still valid when the event is parsed later and the pointer is dereferenced. The kernel warns about these unsafe pointers. event 'ma_read' has unsafe pointer field 'fn' WARNING: kernel/trace/trace.c:3779 at ignore_event+0x1da/0x1e4 Mark the function names as tracepoint_string() to fix the events. One case that doesn't work without my patch would be trace-cmd record to save the binary ringbuffer and trace-cmd report to parse it in userspace. 
The address of __func__ can't be dereferenced from userspace but tracepoint_string will add an entry to /sys/kernel/tracing/printk_formats Link: https://lkml.kernel.org/r/20251030155537.87972-1-martin@kaiser.cx Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Martin Kaiser Acked-by: Liam R. Howlett Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 39bb779cb311..5aa4c9500018 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -64,6 +64,8 @@ #define CREATE_TRACE_POINTS #include +#define TP_FCT tracepoint_string(__func__) + /* * Kernel pointer hashing renders much of the maple tree dump useless as tagged * pointers get hashed to arbitrary values. @@ -2756,7 +2758,7 @@ static inline void mas_rebalance(struct ma_state *mas, MA_STATE(l_mas, mas->tree, mas->index, mas->last); MA_STATE(r_mas, mas->tree, mas->index, mas->last); - trace_ma_op(__func__, mas); + trace_ma_op(TP_FCT, mas); /* * Rebalancing occurs if a node is insufficient. Data is rebalanced @@ -2997,7 +2999,7 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last); - trace_ma_op(__func__, mas); + trace_ma_op(TP_FCT, mas); mast.l = &l_mas; mast.r = &r_mas; @@ -3172,7 +3174,7 @@ static bool mas_is_span_wr(struct ma_wr_state *wr_mas) return false; } - trace_ma_write(__func__, wr_mas->mas, wr_mas->r_max, entry); + trace_ma_write(TP_FCT, wr_mas->mas, wr_mas->r_max, entry); return true; } @@ -3416,7 +3418,7 @@ static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas) * of data may happen. 
*/ mas = wr_mas->mas; - trace_ma_op(__func__, mas); + trace_ma_op(TP_FCT, mas); if (unlikely(!mas->index && mas->last == ULONG_MAX)) return mas_new_root(mas, wr_mas->entry); @@ -3552,7 +3554,7 @@ static inline void mas_wr_node_store(struct ma_wr_state *wr_mas, } else { memcpy(wr_mas->node, newnode, sizeof(struct maple_node)); } - trace_ma_write(__func__, mas, 0, wr_mas->entry); + trace_ma_write(TP_FCT, mas, 0, wr_mas->entry); mas_update_gap(mas); mas->end = new_end; return; @@ -3596,7 +3598,7 @@ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas) mas->offset++; /* Keep mas accurate. */ } - trace_ma_write(__func__, mas, 0, wr_mas->entry); + trace_ma_write(TP_FCT, mas, 0, wr_mas->entry); /* * Only update gap when the new entry is empty or there is an empty * entry in the original two ranges. @@ -3717,7 +3719,7 @@ static inline void mas_wr_append(struct ma_wr_state *wr_mas, mas_update_gap(mas); mas->end = new_end; - trace_ma_write(__func__, mas, new_end, wr_mas->entry); + trace_ma_write(TP_FCT, mas, new_end, wr_mas->entry); return; } @@ -3731,7 +3733,7 @@ static void mas_wr_bnode(struct ma_wr_state *wr_mas) { struct maple_big_node b_node; - trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry); + trace_ma_write(TP_FCT, wr_mas->mas, 0, wr_mas->entry); memset(&b_node, 0, sizeof(struct maple_big_node)); mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); mas_commit_b_node(wr_mas, &b_node); @@ -5062,7 +5064,7 @@ void *mas_store(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); - trace_ma_write(__func__, mas, 0, entry); + trace_ma_write(TP_FCT, mas, 0, entry); #ifdef CONFIG_DEBUG_MAPLE_TREE if (MAS_WARN_ON(mas, mas->index > mas->last)) pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last, @@ -5163,7 +5165,7 @@ void mas_store_prealloc(struct ma_state *mas, void *entry) } store: - trace_ma_write(__func__, mas, 0, entry); + trace_ma_write(TP_FCT, mas, 0, entry); mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, 
mas_is_err(mas)); mas_destroy(mas); @@ -5882,7 +5884,7 @@ void *mtree_load(struct maple_tree *mt, unsigned long index) MA_STATE(mas, mt, index, index); void *entry; - trace_ma_read(__func__, &mas); + trace_ma_read(TP_FCT, &mas); rcu_read_lock(); retry: entry = mas_start(&mas); @@ -5925,7 +5927,7 @@ int mtree_store_range(struct maple_tree *mt, unsigned long index, MA_STATE(mas, mt, index, last); int ret = 0; - trace_ma_write(__func__, &mas, 0, entry); + trace_ma_write(TP_FCT, &mas, 0, entry); if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; @@ -6148,7 +6150,7 @@ void *mtree_erase(struct maple_tree *mt, unsigned long index) void *entry = NULL; MA_STATE(mas, mt, index, index); - trace_ma_op(__func__, &mas); + trace_ma_op(TP_FCT, &mas); mtree_lock(mt); entry = mas_erase(&mas); @@ -6485,7 +6487,7 @@ void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max) unsigned long copy = *index; #endif - trace_ma_read(__func__, &mas); + trace_ma_read(TP_FCT, &mas); if ((*index) > max) return NULL; From 2f6ce7e714ef842e43120ecd6a7ed287b502026d Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Thu, 30 Oct 2025 10:07:45 +0800 Subject: [PATCH 228/543] mm/damon/stat: change last_refresh_jiffies to a global variable Patch series "mm/damon: fixes for the jiffies-related issues", v2. On 32-bit systems, the kernel initializes jiffies to "-5 minutes" to make jiffies wrap bugs appear earlier. However, this may cause the time_before() series of functions to return unexpected values, resulting in DAMON not functioning as intended. Meanwhile, similar issues exist in some specific user operation scenarios. This patchset addresses these issues. The first patch is about the DAMON_STAT module, and the second patch is about the core layer's sysfs. This patch (of 2): In DAMON_STAT's damon_stat_damon_call_fn(), time_before_eq() is used to avoid unnecessarily frequent stat update. 
On 32-bit systems, the kernel initializes jiffies to "-5 minutes" to make jiffies wrap bugs appear earlier. However, this causes time_before_eq() in DAMON_STAT to unexpectedly return true during the first 5 minutes after boot on 32-bit systems (see [1] for more explanation, which fixes another jiffies-related issue before). As a result, DAMON_STAT does not update any monitoring results during that period, which becomes more confusing when DAMON_STAT_ENABLED_DEFAULT is enabled. There is also an issue unrelated to the system's word size[2]: if the user stops DAMON_STAT just after last_refresh_jiffies is updated and restarts it after 5 seconds or a longer delay, last_refresh_jiffies will retain an older value, causing time_before_eq() to return false and the update to happen earlier than expected. Fix these issues by making last_refresh_jiffies a global variable and initializing it each time DAMON_STAT is started. Link: https://lkml.kernel.org/r/20251030020746.967174-2-yanquanmin1@huawei.com Link: https://lkml.kernel.org/r/20250822025057.1740854-1-ekffu200098@gmail.com [1] Link: https://lore.kernel.org/all/20251028143250.50144-1-sj@kernel.org/ [2] Fixes: fabdd1e911da ("mm/damon/stat: calculate and expose estimated memory bandwidth") Signed-off-by: Quanmin Yan Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: ze zuo Cc: Signed-off-by: Andrew Morton --- mm/damon/stat.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/damon/stat.c b/mm/damon/stat.c index d8010968bbed..bf8626859902 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -46,6 +46,8 @@ MODULE_PARM_DESC(aggr_interval_us, static struct damon_ctx *damon_stat_context; +static unsigned long damon_stat_last_refresh_jiffies; + static void damon_stat_set_estimated_memory_bandwidth(struct damon_ctx *c) { struct damon_target *t; @@ -130,13 +132,12 @@ static void damon_stat_set_idletime_percentiles(struct damon_ctx *c) static int damon_stat_damon_call_fn(void *data) 
{ struct damon_ctx *c = data; - static unsigned long last_refresh_jiffies; /* avoid unnecessarily frequent stat update */ - if (time_before_eq(jiffies, last_refresh_jiffies + + if (time_before_eq(jiffies, damon_stat_last_refresh_jiffies + msecs_to_jiffies(5 * MSEC_PER_SEC))) return 0; - last_refresh_jiffies = jiffies; + damon_stat_last_refresh_jiffies = jiffies; aggr_interval_us = c->attrs.aggr_interval; damon_stat_set_estimated_memory_bandwidth(c); @@ -210,6 +211,8 @@ static int damon_stat_start(void) err = damon_start(&damon_stat_context, 1, true); if (err) return err; + + damon_stat_last_refresh_jiffies = jiffies; call_control.data = damon_stat_context; return damon_call(damon_stat_context, &call_control); } From 9fd7bb5083d1e1027b8ac1e365c29921ab88b177 Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Thu, 30 Oct 2025 10:07:46 +0800 Subject: [PATCH 229/543] mm/damon/sysfs: change next_update_jiffies to a global variable In DAMON's damon_sysfs_repeat_call_fn(), time_before() is used to compare the current jiffies with next_update_jiffies to determine whether to update the sysfs files at this moment. On 32-bit systems, the kernel initializes jiffies to "-5 minutes" to make jiffies wrap bugs appear earlier. However, this causes time_before() in damon_sysfs_repeat_call_fn() to unexpectedly return true during the first 5 minutes after boot on 32-bit systems (see [1] for more explanation, which fixes another jiffies-related issue before). As a result, DAMON does not update sysfs files during that period. There is also an issue unrelated to the system's word size[2]: if the user stops DAMON just after next_update_jiffies is updated and restarts it after 'refresh_ms' or a longer delay, next_update_jiffies will retain an older value, causing time_before() to return false and the update to happen earlier than expected. Fix these issues by making next_update_jiffies a global variable and initializing it each time DAMON is started. 
Link: https://lkml.kernel.org/r/20251030020746.967174-3-yanquanmin1@huawei.com Link: https://lkml.kernel.org/r/20250822025057.1740854-1-ekffu200098@gmail.com [1] Link: https://lore.kernel.org/all/20251029013038.66625-1-sj@kernel.org/ [2] Fixes: d809a7c64ba8 ("mm/damon/sysfs: implement refresh_ms file internal work") Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Quanmin Yan Cc: Kefeng Wang Cc: ze zuo Cc: Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index cd6815ecc04e..3c0d727788c8 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1552,16 +1552,17 @@ static struct damon_ctx *damon_sysfs_build_ctx( return ctx; } +static unsigned long damon_sysfs_next_update_jiffies; + static int damon_sysfs_repeat_call_fn(void *data) { struct damon_sysfs_kdamond *sysfs_kdamond = data; - static unsigned long next_update_jiffies; if (!sysfs_kdamond->refresh_ms) return 0; - if (time_before(jiffies, next_update_jiffies)) + if (time_before(jiffies, damon_sysfs_next_update_jiffies)) return 0; - next_update_jiffies = jiffies + + damon_sysfs_next_update_jiffies = jiffies + msecs_to_jiffies(sysfs_kdamond->refresh_ms); if (!mutex_trylock(&damon_sysfs_lock)) @@ -1607,6 +1608,9 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) } kdamond->damon_ctx = ctx; + damon_sysfs_next_update_jiffies = + jiffies + msecs_to_jiffies(kdamond->refresh_ms); + repeat_call_control->fn = damon_sysfs_repeat_call_fn; repeat_call_control->data = kdamond; repeat_call_control->repeat = true; From 7d9f7d390f6af3a29614e81e802e2b9c238eb7b2 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Thu, 30 Oct 2025 01:03:33 +0000 Subject: [PATCH 230/543] scripts/decode_stacktrace.sh: fix build ID and PC source parsing Support for parsing PC source info in stacktraces (e.g. 
'(P)') was added in commit 2bff77c665ed ("scripts/decode_stacktrace.sh: fix decoding of lines with an additional info"). However, this logic was placed after the build ID processing. This incorrect order fails to parse lines containing both elements, e.g.: drm_gem_mmap_obj+0x114/0x200 [drm 03d0564e0529947d67bb2008c3548be77279fd27] (P) This patch fixes the problem by extracting the PC source info first and then processing the module build ID. With this change, the line above is now properly parsed as such: drm_gem_mmap_obj (./include/linux/mmap_lock.h:212 ./include/linux/mm.h:811 drivers/gpu/drm/drm_gem.c:1177) drm (P) While here, also add a brief explanation to the build ID section. Link: https://lkml.kernel.org/r/20251030010347.2731925-1-cmllamas@google.com Fixes: 2bff77c665ed ("scripts/decode_stacktrace.sh: fix decoding of lines with an additional info") Signed-off-by: Carlos Llamas Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Luca Ceresoli Cc: Breno Leitao Cc: Catalin Marinas Cc: Marc Rutland Cc: Mark Brown Cc: Matthieu Baerts Cc: Miroslav Benes Cc: Puranjay Mohan Cc: Signed-off-by: Andrew Morton --- scripts/decode_stacktrace.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index c73cb802a0a3..8d01b741de62 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh @@ -277,12 +277,6 @@ handle_line() { fi done - if [[ ${words[$last]} =~ ^[0-9a-f]+\] ]]; then - words[$last-1]="${words[$last-1]} ${words[$last]}" - unset words[$last] spaces[$last] - last=$(( $last - 1 )) - fi - # Extract info after the symbol if present. E.g.: # func_name+0x54/0x80 (P) # ^^^ @@ -295,6 +289,14 @@ handle_line() { last=$(( $last - 1 )) fi + # Join module name with its build id if present, as these were + # split during tokenization (e.g. "[module" and "modbuildid]").
+ if [[ ${words[$last]} =~ ^[0-9a-f]+\] ]]; then + words[$last-1]="${words[$last-1]} ${words[$last]}" + unset words[$last] spaces[$last] + last=$(( $last - 1 )) + fi + if [[ ${words[$last]} =~ \[([^]]+)\] ]]; then module=${words[$last]} # some traces format is "(%pS)", which like "(foo+0x0/0x1 [bar])" From 9a6b60cb147d53968753a34805211d2e5e08c027 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Thu, 30 Oct 2025 07:51:52 +0900 Subject: [PATCH 231/543] nilfs2: avoid having an active sc_timer before freeing sci Because kthread_stop did not stop sc_task properly and returned -EINTR, the sc_timer was not properly closed, ultimately causing the problem [1] reported by syzbot when freeing sci due to the sc_timer not being closed. Because the thread sc_task main function nilfs_segctor_thread() returns 0 when it succeeds, when the return value of kthread_stop() is not 0 in nilfs_segctor_destroy(), we believe that it has not properly closed sc_timer. We use timer_shutdown_sync() to synchronously wait for sc_timer to shut down, and set the value of sc_task to NULL under the protection of lock sc_state_lock, so as to avoid the issue caused by sc_timer not being properly shut down.
[1] ODEBUG: free active (active state 0) object: 00000000dacb411a object type: timer_list hint: nilfs_construction_timeout Call trace: nilfs_segctor_destroy fs/nilfs2/segment.c:2811 [inline] nilfs_detach_log_writer+0x668/0x8cc fs/nilfs2/segment.c:2877 nilfs_put_super+0x4c/0x12c fs/nilfs2/super.c:509 Link: https://lkml.kernel.org/r/20251029225226.16044-1-konishi.ryusuke@gmail.com Fixes: 3f66cc261ccb ("nilfs2: use kthread_create and kthread_stop for the log writer thread") Signed-off-by: Ryusuke Konishi Reported-by: syzbot+24d8b70f039151f65590@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=24d8b70f039151f65590 Tested-by: syzbot+24d8b70f039151f65590@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Cc: [6.12+] Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index f15ca6fc400d..deee16bc9d4e 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2768,7 +2768,12 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) if (sci->sc_task) { wake_up(&sci->sc_wait_daemon); - kthread_stop(sci->sc_task); + if (kthread_stop(sci->sc_task)) { + spin_lock(&sci->sc_state_lock); + sci->sc_task = NULL; + timer_shutdown_sync(&sci->sc_timer); + spin_unlock(&sci->sc_state_lock); + } } spin_lock(&sci->sc_state_lock); From adfb6609c6809e107ded9a1cd46f519c882e64ea Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 31 Oct 2025 16:57:50 +0000 Subject: [PATCH 232/543] mm/huge_memory: initialise the tags of the huge zero folio On arm64 with MTE enabled, a page mapped as Normal Tagged (PROT_MTE) in user space will need to have its allocation tags initialised. This is normally done in the arm64 set_pte_at() after checking the memory attributes. Such page is also marked with the PG_mte_tagged flag to avoid subsequent clearing. Since this relies on having a struct page, pte_special() mappings are ignored. 
Commit d82d09e48219 ("mm/huge_memory: mark PMD mappings of the huge zero folio special") maps the huge zero folio special and the arm64 set_pmd_at() will no longer zero the tags. There is no guarantee that the tags are zero, especially if parts of this huge page have been previously tagged. It's fairly easy to detect this by regularly dropping the caches to force the reallocation of the huge zero folio. Allocate the huge zero folio with the __GFP_ZEROTAGS flag. In addition, do not warn in the arm64 __access_remote_tags() when reading tags from the huge zero page. I bundled the arm64 change in here as well since they are both related to the commit mapping the huge zero folio as special. [catalin.marinas@arm.com: handle arch mte_zero_clear_page_tags() code issuing MTE instructions] Link: https://lkml.kernel.org/r/aQi8dA_QpXM8XqrE@arm.com Link: https://lkml.kernel.org/r/20251031170133.280742-1-catalin.marinas@arm.com Fixes: d82d09e48219 ("mm/huge_memory: mark PMD mappings of the huge zero folio special") Signed-off-by: Catalin Marinas Acked-by: David Hildenbrand Reviewed-by: Lance Yang Tested-by: Beleswar Padhi Cc: Will Deacon Cc: Mark Brown Cc: Aishwarya TCV Cc: David Hildenbrand (Red Hat) Signed-off-by: Andrew Morton --- arch/arm64/kernel/mte.c | 3 ++- arch/arm64/mm/fault.c | 10 ++++++++++ mm/huge_memory.c | 3 ++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 43f7a2f39403..32148bf09c1d 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -476,7 +476,8 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, folio = page_folio(page); if (folio_test_hugetlb(folio)) - WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio)); + WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio) && + !is_huge_zero_folio(folio)); else WARN_ON_ONCE(!page_mte_tagged(page) && !is_zero_page(page)); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index d816ff44faff..125dfa6c613b 
100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -969,6 +969,16 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, void tag_clear_highpage(struct page *page) { + /* + * Check if MTE is supported and fall back to clear_highpage(). + * get_huge_zero_folio() unconditionally passes __GFP_ZEROTAGS and + * post_alloc_hook() will invoke tag_clear_highpage(). + */ + if (!system_supports_mte()) { + clear_highpage(page); + return; + } + /* Newly allocated page, shouldn't have been tagged yet */ WARN_ON_ONCE(!try_page_mte_tagging(page)); mte_zero_clear_page_tags(page_address(page)); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b4ff49d96501..323654fb4f8c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -214,7 +214,8 @@ static bool get_huge_zero_folio(void) if (likely(atomic_inc_not_zero(&huge_zero_refcount))) return true; - zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, + zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO | __GFP_ZEROTAGS) & + ~__GFP_MOVABLE, HPAGE_PMD_ORDER); if (!zero_folio) { count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); From 6f86d0534fddfbd08687fa0f01479d4226bc3c3d Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Fri, 31 Oct 2025 20:09:55 +0800 Subject: [PATCH 233/543] mm/secretmem: fix use-after-free race in fault handler When a page fault occurs in a secret memory file created with `memfd_secret(2)`, the kernel will allocate a new folio for it, mark the underlying page as not-present in the direct map, and add it to the file mapping. If two tasks cause a fault in the same page concurrently, both could end up allocating a folio and removing the page from the direct map, but only one would succeed in adding the folio to the file mapping. The task that failed undoes the effects of its attempt by (a) freeing the folio again and (b) putting the page back into the direct map. 
However, by doing these two operations in this order, the page becomes available to the allocator again before it is placed back in the direct mapping. If another task attempts to allocate the page between (a) and (b), and the kernel tries to access it via the direct map, it would result in a supervisor not-present page fault. Fix the ordering to restore the direct map before the folio is freed. Link: https://lkml.kernel.org/r/20251031120955.92116-1-lance.yang@linux.dev Fixes: 1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas") Signed-off-by: Lance Yang Reported-by: Google Big Sleep Closes: https://lore.kernel.org/linux-mm/CAEXGt5QeDpiHTu3K9tvjUTPqo+d-=wuCNYPa+6sWKrdQJ-ATdg@mail.gmail.com/ Acked-by: David Hildenbrand Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Signed-off-by: Andrew Morton --- mm/secretmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/secretmem.c b/mm/secretmem.c index 60137305bc20..b59350daffe3 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -82,13 +82,13 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf) __folio_mark_uptodate(folio); err = filemap_add_folio(mapping, folio, offset, gfp); if (unlikely(err)) { - folio_put(folio); /* * If a split of large page was required, it * already happened when we marked the page invalid * which guarantees that this call won't fail */ set_direct_map_default_noflush(folio_page(folio, 0)); + folio_put(folio); if (err == -EEXIST) goto retry; From bba717bbc466ab24d25964034f5e16ead1720512 Mon Sep 17 00:00:00 2001 From: Chris Li Date: Sun, 2 Nov 2025 07:11:07 -0800 Subject: [PATCH 234/543] MAINTAINERS: add Chris and Kairui as the swap maintainer We have been collaborating on a systematic effort to clean up and improve the Linux swap system, and might as well take responsibility for it. 
Link: https://lkml.kernel.org/r/20251102-swap-m-v1-1-582f275d5bce@kernel.org Signed-off-by: Chris Li Acked-by: Kairui Song Acked-by: Barry Song Acked-by: Baoquan He Cc: Johannes Weiner Cc: Kemeng Shi Cc: Michal Hocko Cc: Minchan Kim Cc: Nhat Pham Cc: Roman Gushchin Cc: SeongJae Park Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index ddecf1ef3bed..5b93346f464f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16498,12 +16498,12 @@ F: mm/secretmem.c MEMORY MANAGEMENT - SWAP M: Andrew Morton +M: Chris Li +M: Kairui Song R: Kemeng Shi -R: Kairui Song R: Nhat Pham R: Baoquan He R: Barry Song -R: Chris Li L: linux-mm@kvack.org S: Maintained F: Documentation/mm/swap-table.rst From 0b07092d09e54e49b85379a9c60f82d54a881514 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Mon, 3 Nov 2025 12:01:57 +0100 Subject: [PATCH 235/543] kho: fix out-of-bounds access of vmalloc chunk The list of pages in a vmalloc chunk is NULL-terminated. So when looping through the pages in a vmalloc chunk, both kho_restore_vmalloc() and kho_vmalloc_unpreserve_chunk() rightly make sure to stop when encountering a NULL page. But when the chunk is full, the loops do not stop and go past the bounds of chunk->phys, resulting in out-of-bounds memory access, and possibly the restoration or unpreservation of an invalid page. Fix this by making sure the processing of chunk stops at the end of the array. 
Link: https://lkml.kernel.org/r/20251103110159.8399-1-pratyush@kernel.org Fixes: a667300bd53f ("kho: add support for preserving vmalloc allocations") Signed-off-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Baoquan He Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 2a8c20c238a8..36fdce2667c5 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -889,7 +889,7 @@ static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk) __kho_unpreserve(track, pfn, pfn + 1); - for (int i = 0; chunk->phys[i]; i++) { + for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { pfn = PHYS_PFN(chunk->phys[i]); __kho_unpreserve(track, pfn, pfn + 1); } @@ -1012,7 +1012,7 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) while (chunk) { struct page *page; - for (int i = 0; chunk->phys[i]; i++) { + for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { phys_addr_t phys = chunk->phys[i]; if (idx + contig_pages > total_pages) From 7ecd2e439d1272ac02d798b0033a426e3b00dff5 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Mon, 3 Nov 2025 19:02:31 +0100 Subject: [PATCH 236/543] kho: fix unpreservation of higher-order vmalloc preservations kho_vmalloc_unpreserve_chunk() calls __kho_unpreserve() with end_pfn as pfn + 1. This happens to work for 0-order pages, but leaks higher order pages. For example, say order 2 pages back the allocation. During preservation, they get preserved in the order 2 bitmaps, but kho_vmalloc_unpreserve_chunk() would try to unpreserve them from the order 0 bitmaps, which should not have these bits set anyway, leaving the order 2 bitmaps untouched. This results in the pages being carried over to the next kernel. Nothing will free those pages in the next boot, leaking them. 
Fix this by taking the order into account when calculating the end PFN for __kho_unpreserve(). Link: https://lkml.kernel.org/r/20251103180235.71409-2-pratyush@kernel.org Fixes: a667300bd53f ("kho: add support for preserving vmalloc allocations") Signed-off-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Baoquan He Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 36fdce2667c5..e0bafe7c0ded 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -882,7 +882,8 @@ static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur return NULL; } -static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk) +static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk, + unsigned short order) { struct kho_mem_track *track = &kho_out.ser.track; unsigned long pfn = PHYS_PFN(virt_to_phys(chunk)); @@ -891,7 +892,7 @@ static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk) for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { pfn = PHYS_PFN(chunk->phys[i]); - __kho_unpreserve(track, pfn, pfn + 1); + __kho_unpreserve(track, pfn, pfn + (1 << order)); } } @@ -902,7 +903,7 @@ static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc) while (chunk) { struct kho_vmalloc_chunk *tmp = chunk; - kho_vmalloc_unpreserve_chunk(chunk); + kho_vmalloc_unpreserve_chunk(chunk, kho_vmalloc->order); chunk = KHOSER_LOAD_PTR(chunk->hdr.next); free_page((unsigned long)tmp); From b05addf6f0596edb1f82ab4059438c7ef2d2686d Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Mon, 3 Nov 2025 19:02:32 +0100 Subject: [PATCH 237/543] kho: warn and exit when unpreserved page wasn't preserved Calling __kho_unpreserve() on a pair of (pfn, end_pfn) that wasn't preserved is a bug. 
Currently, if that is done, the physxa or bits can be NULL. This results in a soft lockup since a NULL physxa or bits results in redoing the loop without ever making any progress. Return when physxa or bits are not found, but WARN first to loudly indicate invalid behaviour. Link: https://lkml.kernel.org/r/20251103180235.71409-3-pratyush@kernel.org Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") Signed-off-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Baoquan He Cc: Pasha Tatashin Cc: Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index e0bafe7c0ded..03d12e27189f 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -171,12 +171,12 @@ static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, const unsigned long pfn_high = pfn >> order; physxa = xa_load(&track->orders, order); - if (!physxa) - continue; + if (WARN_ON_ONCE(!physxa)) + return; bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); - if (!bits) - continue; + if (WARN_ON_ONCE(!bits)) + return; clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); From aaf46c6a6df6052881c2e75cba65aeb6f1cfa88a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 23 Oct 2025 22:21:02 -0700 Subject: [PATCH 238/543] tee: : - add ending ':' to some struct members as needed for kernel-doc - change struct name in kernel-doc to match the actual struct name (2x) - add a @params: kernel-doc entry multiple times Warning: tee.h:265 struct member 'ret_origin' not described in 'tee_ioctl_open_session_arg' Warning: tee.h:265 struct member 'num_params' not described in 'tee_ioctl_open_session_arg' Warning: tee.h:265 struct member 'params' not described in 'tee_ioctl_open_session_arg' Warning: tee.h:351 struct member 'num_params' not described in 'tee_iocl_supp_recv_arg' Warning: tee.h:351 struct member 
'params' not described in 'tee_iocl_supp_recv_arg' Warning: tee.h:372 struct member 'num_params' not described in 'tee_iocl_supp_send_arg' Warning: tee.h:372 struct member 'params' not described in 'tee_iocl_supp_send_arg' Warning: tee.h:298: expecting prototype for struct tee_ioctl_invoke_func_arg. Prototype was for struct tee_ioctl_invoke_arg instead Warning: tee.h:473: expecting prototype for struct tee_ioctl_invoke_func_arg. Prototype was for struct tee_ioctl_object_invoke_arg instead Signed-off-by: Randy Dunlap Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- include/uapi/linux/tee.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index 386ad36f1a0a..cab5cadca8ef 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -249,8 +249,9 @@ struct tee_ioctl_param { * @cancel_id: [in] Cancellation id, a unique value to identify this request * @session: [out] Session id * @ret: [out] return value - * @ret_origin [out] origin of the return value - * @num_params [in] number of parameters following this struct + * @ret_origin: [out] origin of the return value + * @num_params: [in] number of &struct tee_ioctl_param entries in @params + * @params: array of ioctl parameters */ struct tee_ioctl_open_session_arg { __u8 uuid[TEE_IOCTL_UUID_LEN]; @@ -276,14 +277,14 @@ struct tee_ioctl_open_session_arg { struct tee_ioctl_buf_data) /** - * struct tee_ioctl_invoke_func_arg - Invokes a function in a Trusted - * Application + * struct tee_ioctl_invoke_arg - Invokes a function in a Trusted Application * @func: [in] Trusted Application function, specific to the TA * @session: [in] Session id * @cancel_id: [in] Cancellation id, a unique value to identify this request * @ret: [out] return value - * @ret_origin [out] origin of the return value - * @num_params [in] number of parameters following this struct + * @ret_origin: [out] origin of the return value + * 
@num_params: [in] number of parameters following this struct + * @params: array of ioctl parameters */ struct tee_ioctl_invoke_arg { __u32 func; @@ -338,7 +339,8 @@ struct tee_ioctl_close_session_arg { /** * struct tee_iocl_supp_recv_arg - Receive a request for a supplicant function * @func: [in] supplicant function - * @num_params [in/out] number of parameters following this struct + * @num_params: [in/out] number of &struct tee_ioctl_param entries in @params + * @params: array of ioctl parameters * * @num_params is the number of params that tee-supplicant has room to * receive when input, @num_params is the number of actual params @@ -363,7 +365,8 @@ struct tee_iocl_supp_recv_arg { /** * struct tee_iocl_supp_send_arg - Send a response to a received request * @ret: [out] return value - * @num_params [in] number of parameters following this struct + * @num_params: [in] number of &struct tee_ioctl_param entries in @params + * @params: array of ioctl parameters */ struct tee_iocl_supp_send_arg { __u32 ret; @@ -454,11 +457,13 @@ struct tee_ioctl_shm_register_fd_data { */ /** - * struct tee_ioctl_invoke_func_arg - Invokes an object in a Trusted Application + * struct tee_ioctl_object_invoke_arg - Invokes an object in a + * Trusted Application * @id: [in] Object id * @op: [in] Object operation, specific to the object * @ret: [out] return value * @num_params: [in] number of parameters following this struct + * @params: array of ioctl parameters */ struct tee_ioctl_object_invoke_arg { __u64 id; From 05a1fc5efdd8560f34a3af39c9cf1e1526cc3ddf Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 9 Nov 2025 10:12:07 +0100 Subject: [PATCH 239/543] ALSA: usb-audio: Fix potential overflow of PCM transfer buffer The PCM stream data in USB-audio driver is transferred over USB URB packet buffers, and each packet size is determined dynamically. The packet sizes are limited by some factors such as wMaxPacketSize USB descriptor. 
OTOH, in the current code, the actually used packet sizes are determined only by the rate and the PPS, which may be bigger than the size limit above. This results in a buffer overflow, as reported by syzbot. Basically when the limit is smaller than the calculated packet size, it implies that something is wrong, most likely a weird USB descriptor. So the best option would be just to return an error at the parameter setup time before doing any further operations. This patch introduces such a sanity check, and returns -EINVAL when the packet size is greater than maxpacksize. The comparison with ep->packsize[1] alone should suffice since it's always equal to or greater than ep->packsize[0]. Reported-by: syzbot+bfd77469c8966de076f7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=bfd77469c8966de076f7 Link: https://lore.kernel.org/690b6b46.050a0220.3d0d33.0054.GAE@google.com Cc: Lizhi Xu Cc: Link: https://patch.msgid.link/20251109091211.12739-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/endpoint.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c index 880f5afcce60..cc15624ecaff 100644 --- a/sound/usb/endpoint.c +++ b/sound/usb/endpoint.c @@ -1362,6 +1362,11 @@ int snd_usb_endpoint_set_params(struct snd_usb_audio *chip, ep->sample_rem = ep->cur_rate % ep->pps; ep->packsize[0] = ep->cur_rate / ep->pps; ep->packsize[1] = (ep->cur_rate + (ep->pps - 1)) / ep->pps; + if (ep->packsize[1] > ep->maxpacksize) { + usb_audio_dbg(chip, "Too small maxpacksize %u for rate %u / pps %u\n", + ep->maxpacksize, ep->cur_rate, ep->pps); + return -EINVAL; + } /* calculate the frequency in 16.16 format */ ep->freqm = ep->freqn; From 66e9feb03e7cf8983b1d0c540e2dad90d5146d48 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 9 Nov 2025 16:53:39 +0100 Subject: [PATCH 240/543] spi: Add TODO comment about ACPI GPIO setup Add a TODO comment that ideally the ACPI/gpiolib core code should take care of setting GPIO
direction and/or bias according to ACPI GPIO resources. If this TODO gets implemented then the acpi_dev_gpio_irq_get() call in acpi_register_spi_device() can be dropped. Suggested-by: Andy Shevchenko Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20251109155340.26199-1-johannes.goede@oss.qualcomm.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 8588e8562220..e25df9990f82 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2857,6 +2857,8 @@ static acpi_status acpi_register_spi_device(struct spi_controller *ctlr, * here too, because this call sets the GPIO direction and/or bias. * Setting these needs to be done even if there is no driver, in which * case spi_probe() will never get called. + * TODO: ideally the setup of the GPIO should be handled in a generic + * manner in the ACPI/gpiolib core code. */ if (spi->irq < 0) spi->irq = acpi_dev_gpio_irq_get(adev, 0); From 4aa17144d5abc3c756883e3a010246f0dba8b468 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 14 Oct 2025 13:59:59 -0400 Subject: [PATCH 241/543] NFSD: free copynotify stateid in nfs4_free_ol_stateid() Typically a copynotify stateid is freed either when the parent's stateid is being closed/freed or in nfsd4_laundromat if the stateid hasn't been used in a lease period. However, consider the case where the server gets an OPEN (which creates a parent stateid), followed by a COPY_NOTIFY using that stateid, followed by a client reboot. The new client instance, while doing CREATE_SESSION, would force-expire the previous state of this client. This leads to the open state being freed through release_openowner-> nfs4_free_ol_stateid(), which finds that it still has a copynotify stateid associated with it.
We currently print a warning, which gets triggered: WARNING: CPU: 1 PID: 8858 at fs/nfsd/nfs4state.c:1550 nfs4_free_ol_stateid+0xb0/0x100 [nfsd] This patch, instead, frees the associated copynotify stateid here. If the parent stateid is freed (without freeing the copynotify stateids associated with it), it leads to list corruption when the laundromat ends up freeing the copynotify state later. [ 1626.839430] Internal error: Oops - BUG: 00000000f2000800 [#1] SMP [ 1626.842828] Modules linked in: nfnetlink_queue nfnetlink_log bluetooth cfg80211 rpcrdma rdma_cm iw_cm ib_cm ib_core nfsd nfs_acl lockd grace nfs_localio ext4 crc16 mbcache jbd2 overlay uinput snd_seq_dummy snd_hrtimer qrtr rfkill vfat fat uvcvideo snd_hda_codec_generic videobuf2_vmalloc videobuf2_memops snd_hda_intel uvc snd_intel_dspcfg videobuf2_v4l2 videobuf2_common snd_hda_codec snd_hda_core videodev snd_hwdep snd_seq mc snd_seq_device snd_pcm snd_timer snd soundcore sg loop auth_rpcgss vsock_loopback vmw_vsock_virtio_transport_common vmw_vsock_vmci_transport vmw_vmci vsock xfs 8021q garp stp llc mrp nvme ghash_ce e1000e nvme_core sr_mod nvme_keyring nvme_auth cdrom vmwgfx drm_ttm_helper ttm sunrpc dm_mirror dm_region_hash dm_log iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi fuse dm_multipath dm_mod nfnetlink [ 1626.855594] CPU: 2 UID: 0 PID: 199 Comm: kworker/u24:33 Kdump: loaded Tainted: G B W 6.17.0-rc7+ #22 PREEMPT(voluntary) [ 1626.857075] Tainted: [B]=BAD_PAGE, [W]=WARN [ 1626.857573] Hardware name: VMware, Inc.
VMware20,1/VBSA, BIOS VMW201.00V.24006586.BA64.2406042154 06/04/2024 [ 1626.858724] Workqueue: nfsd4 laundromat_main [nfsd] [ 1626.859304] pstate: 61400005 (nZCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) [ 1626.860010] pc : __list_del_entry_valid_or_report+0x148/0x200 [ 1626.860601] lr : __list_del_entry_valid_or_report+0x148/0x200 [ 1626.861182] sp : ffff8000881d7a40 [ 1626.861521] x29: ffff8000881d7a40 x28: 0000000000000018 x27: ffff0000c2a98200 [ 1626.862260] x26: 0000000000000600 x25: 0000000000000000 x24: ffff8000881d7b20 [ 1626.862986] x23: ffff0000c2a981e8 x22: 1fffe00012410e7d x21: ffff0000920873e8 [ 1626.863701] x20: ffff0000920873e8 x19: ffff000086f22998 x18: 0000000000000000 [ 1626.864421] x17: 20747562202c3839 x16: 3932326636383030 x15: 3030666666662065 [ 1626.865092] x14: 6220646c756f6873 x13: 0000000000000001 x12: ffff60004fd9e4a3 [ 1626.865713] x11: 1fffe0004fd9e4a2 x10: ffff60004fd9e4a2 x9 : dfff800000000000 [ 1626.866320] x8 : 00009fffb0261b5e x7 : ffff00027ecf2513 x6 : 0000000000000001 [ 1626.866938] x5 : ffff00027ecf2510 x4 : ffff60004fd9e4a3 x3 : 0000000000000000 [ 1626.867553] x2 : 0000000000000000 x1 : ffff000096069640 x0 : 000000000000006d [ 1626.868167] Call trace: [ 1626.868382] __list_del_entry_valid_or_report+0x148/0x200 (P) [ 1626.868876] _free_cpntf_state_locked+0xd0/0x268 [nfsd] [ 1626.869368] nfs4_laundromat+0x6f8/0x1058 [nfsd] [ 1626.869813] laundromat_main+0x24/0x60 [nfsd] [ 1626.870231] process_one_work+0x584/0x1050 [ 1626.870595] worker_thread+0x4c4/0xc60 [ 1626.870893] kthread+0x2f8/0x398 [ 1626.871146] ret_from_fork+0x10/0x20 [ 1626.871422] Code: aa1303e1 aa1403e3 910e8000 97bc55d7 (d4210000) [ 1626.871892] SMP: stopping secondary CPUs Reported-by: rtm@csail.mit.edu Closes: https://lore.kernel.org/linux-nfs/d8f064c1-a26f-4eed-b4f0-1f7f608f415f@oracle.com/T/#t Fixes: 624322f1adc5 ("NFSD add COPY_NOTIFY operation") Cc: stable@vger.kernel.org Signed-off-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 3 ++- 
1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c1b54322c412..de763e6d9b58 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1542,7 +1542,8 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid) release_all_access(stp); if (stp->st_stateowner) nfs4_put_stateowner(stp->st_stateowner); - WARN_ON(!list_empty(&stid->sc_cp_list)); + if (!list_empty(&stid->sc_cp_list)) + nfs4_free_cpntf_statelist(stid->sc_client->net, stid); kmem_cache_free(stateid_slab, stid); } From ff8141e49cf70d2d093a5228f5299ce188de6142 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 16 Oct 2025 09:49:55 -0400 Subject: [PATCH 242/543] NFSD: Skip close replay processing if XDR encoding fails The replay logic added by commit 9411b1d4c7df ("nfsd4: cleanup handling of nfsv4.0 closed stateid's") cannot be done if encoding failed due to a short send buffer; there's no guarantee that the operation encoder has actually encoded the data that is being copied to the replay cache. 
Reported-by: rtm@csail.mit.edu Closes: https://lore.kernel.org/linux-nfs/c3628d57-94ae-48cf-8c9e-49087a28cec9@oracle.com/T/#t Fixes: 9411b1d4c7df ("nfsd4: cleanup handling of nfsv4.0 closed stateid's") Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 6040a6145dad..cf5df0f50208 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -5925,8 +5925,7 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) */ warn_on_nonidempotent_op(op); xdr_truncate_encode(xdr, op_status_offset + XDR_UNIT); - } - if (so) { + } else if (so) { int len = xdr->buf->len - (op_status_offset + XDR_UNIT); so->so_replay.rp_status = op->status; From c96573c0d75db3f8478000d0d392a9cdb95adbed Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 16 Oct 2025 09:49:56 -0400 Subject: [PATCH 243/543] NFSD: Never cache a COMPOUND when the SEQUENCE operation fails RFC 8881 normatively mandates that operations where the initial SEQUENCE operation in a compound fails must not modify the slot's replay cache. nfsd4_cache_this() doesn't prevent such caching. So when SEQUENCE fails, cstate.data_offset is not set, allowing read_bytes_from_xdr_buf() to access uninitialized memory. 
Reported-by: rtm@csail.mit.edu Closes: https://lore.kernel.org/linux-nfs/c3628d57-94ae-48cf-8c9e-49087a28cec9@oracle.com/T/#t Fixes: 468de9e54a90 ("nfsd41: expand solo sequence check") Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index de763e6d9b58..c8c326679dca 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3487,7 +3487,20 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) struct nfsd4_slot *slot = resp->cstate.slot; unsigned int base; - dprintk("--> %s slot %p\n", __func__, slot); + /* + * RFC 5661 Section 2.10.6.1.2: + * + * Any time SEQUENCE ... returns an error ... [t]he replier MUST NOT + * modify the reply cache entry for the slot whenever an error is + * returned from SEQUENCE ... + * + * Because nfsd4_store_cache_entry is called only by + * nfsd4_sequence_done(), nfsd4_store_cache_entry() is called only + * when a SEQUENCE operation was part of the COMPOUND. + * nfs41_check_op_ordering() ensures SEQUENCE is the first op. + */ + if (resp->opcnt == 1 && resp->cstate.status != nfs_ok) + return; slot->sl_flags |= NFSD4_SLOT_INITIALIZED; slot->sl_opcnt = resp->opcnt; From 1cff14b7fc7f31363c39d0269563ce75c714f7ae Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 16 Oct 2025 09:49:57 -0400 Subject: [PATCH 244/543] nfsd: ensure SEQUENCE replay sends a valid reply. nfsd4_enc_sequence_replay() uses nfsd4_encode_operation() to encode a new SEQUENCE reply when replaying a request from the slot cache - only ops after the SEQUENCE are replayed from the cache in ->sl_data. However it does this in nfsd4_replay_cache_entry() which is called *before* nfsd4_sequence() has filled in reply fields. 
This means that in the replayed SEQUENCE reply: maxslots will be whatever the client sent; target_maxslots will be -1 (assuming init to zero, and nfsd4_encode_sequence() subtracts 1); status_flags will be zero. The incorrect maxslots value, in particular, can cause the client to think the slot table has been reduced in size so it can discard its knowledge of the current sequence number of the later slots, though the server has not discarded those slots. When the client later wants to use a later slot, it can get NFS4ERR_SEQ_MISORDERED from the server. This patch moves the setup of the reply into a new helper function and calls it *before* nfsd4_replay_cache_entry() is called. Only one of the updated fields was used after this point - maxslots. So the nfsd4_sequence struct has been extended to have separate maxslots for the request and the response. Reported-by: Olga Kornievskaia Closes: https://lore.kernel.org/linux-nfs/20251010194449.10281-1-okorniev@redhat.com/ Tested-by: Olga Kornievskaia Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 50 ++++++++++++++++++++++++++++++--------------- fs/nfsd/nfs4xdr.c | 2 +- fs/nfsd/xdr4.h | 3 ++- 3 files changed, 36 insertions(+), 19 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c8c326679dca..8a6960500217 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4363,6 +4363,36 @@ static bool replay_matches_cache(struct svc_rqst *rqstp, return true; } +/* + * Note that the response is constructed here both for the case + * of a new SEQUENCE request and for a replayed SEQUENCE request. + * We do not cache SEQUENCE responses as SEQUENCE is idempotent. 
+ */ +static void nfsd4_construct_sequence_response(struct nfsd4_session *session, + struct nfsd4_sequence *seq) +{ + struct nfs4_client *clp = session->se_client; + + seq->maxslots_response = max(session->se_target_maxslots, + seq->maxslots); + seq->target_maxslots = session->se_target_maxslots; + + switch (clp->cl_cb_state) { + case NFSD4_CB_DOWN: + seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; + break; + case NFSD4_CB_FAULT: + seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; + break; + default: + seq->status_flags = 0; + } + if (!list_empty(&clp->cl_revoked)) + seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; + if (atomic_read(&clp->cl_admin_revoked)) + seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED; +} + __be32 nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -4412,6 +4442,9 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("%s: slotid %d\n", __func__, seq->slotid); trace_nfsd_slot_seqid_sequence(clp, seq, slot); + + nfsd4_construct_sequence_response(session, seq); + status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_flags); if (status == nfserr_replay_cache) { status = nfserr_seq_misordered; @@ -4509,23 +4542,6 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } out: - seq->maxslots = max(session->se_target_maxslots, seq->maxslots); - seq->target_maxslots = session->se_target_maxslots; - - switch (clp->cl_cb_state) { - case NFSD4_CB_DOWN: - seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; - break; - case NFSD4_CB_FAULT: - seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; - break; - default: - seq->status_flags = 0; - } - if (!list_empty(&clp->cl_revoked)) - seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; - if (atomic_read(&clp->cl_admin_revoked)) - seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED; trace_nfsd_seq4_status(rqstp, seq); out_no_session: if (conn) diff --git a/fs/nfsd/nfs4xdr.c 
b/fs/nfsd/nfs4xdr.c index cf5df0f50208..67bb9c0b9fcb 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -5073,7 +5073,7 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, return nfserr; /* Note slotid's are numbered from zero: */ /* sr_highest_slotid */ - nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots - 1); + nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots_response - 1); if (nfserr != nfs_ok) return nfserr; /* sr_target_highest_slotid */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index ee0570cbdd9e..1ce8e12ae335 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -574,8 +574,9 @@ struct nfsd4_sequence { struct nfs4_sessionid sessionid; /* request/response */ u32 seqid; /* request/response */ u32 slotid; /* request/response */ - u32 maxslots; /* request/response */ + u32 maxslots; /* request */ u32 cachethis; /* request */ + u32 maxslots_response; /* response */ u32 target_maxslots; /* response */ u32 status_flags; /* response */ }; From 324be6dcbf09133a322db16977a84fbb45c16129 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 16 Oct 2025 11:09:13 -0400 Subject: [PATCH 245/543] Revert "SUNRPC: Make RPCSEC_GSS_KRB5 select CRYPTO instead of depending on it" Geert reports: > This is now commit d8e97cc476e33037 ("SUNRPC: Make RPCSEC_GSS_KRB5 > select CRYPTO instead of depending on it") in v6.18-rc1. > As RPCSEC_GSS_KRB5 defaults to "y", CRYPTO is now auto-enabled in > defconfigs that didn't enable it before. Revert while we work out a proper solution and then test it. 
Reported-by: Geert Uytterhoeven Closes: https://lore.kernel.org/linux-nfs/b97cea29-4ab7-4fb6-85ba-83f9830e524f@kernel.org/T/#t Signed-off-by: Chuck Lever --- net/sunrpc/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 33aafdc8392e..2d8b67dac7b5 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -18,10 +18,9 @@ config SUNRPC_SWAP config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism" - depends on SUNRPC + depends on SUNRPC && CRYPTO default y select SUNRPC_GSS - select CRYPTO select CRYPTO_SKCIPHER select CRYPTO_HASH help From 576c930e5e7dcb937648490611a83f1bf0171048 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Fri, 7 Nov 2025 18:12:14 +0100 Subject: [PATCH 246/543] drm/panthor: Flush shmem writes before mapping buffers CPU-uncached The shmem layer zeroes out the new pages using cached mappings, and if we don't CPU-flush we might leave dirty cachelines behind, leading to potential data leaks and/or asynchronous buffer corruption when dirty cachelines are evicted. Fixes: 8a1cc07578bf ("drm/panthor: Add GEM logical block") Signed-off-by: Boris Brezillon Reviewed-by: Steven Price Reviewed-by: Liviu Dudau Signed-off-by: Steven Price Link: https://patch.msgid.link/20251107171214.1186299-1-boris.brezillon@collabora.com --- drivers/gpu/drm/panthor/panthor_gem.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/gpu/drm/panthor/panthor_gem.c b/drivers/gpu/drm/panthor/panthor_gem.c index 156c7a0b62a2..3f43686f0195 100644 --- a/drivers/gpu/drm/panthor/panthor_gem.c +++ b/drivers/gpu/drm/panthor/panthor_gem.c @@ -288,6 +288,23 @@ panthor_gem_create_with_handle(struct drm_file *file, panthor_gem_debugfs_set_usage_flags(bo, 0); + /* If this is a write-combine mapping, we query the sgt to force a CPU + * cache flush (dma_map_sgtable() is called when the sgt is created). 
+ * This ensures the zero-ing is visible to any uncached mapping created + * by vmap/mmap. + * FIXME: Ideally this should be done when pages are allocated, not at + * BO creation time. + */ + if (shmem->map_wc) { + struct sg_table *sgt; + + sgt = drm_gem_shmem_get_pages_sgt(shmem); + if (IS_ERR(sgt)) { + ret = PTR_ERR(sgt); + goto out_put_gem; + } + } + /* * Allocate an id of idr table where the obj is registered * and handle has the id what user can see. @@ -296,6 +313,7 @@ panthor_gem_create_with_handle(struct drm_file *file, if (!ret) *size = bo->base.base.size; +out_put_gem: /* drop reference from allocate - handle holds it now. */ drm_gem_object_put(&shmem->base); From 994dec10991b53beac3e16109d876ae363e8a329 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Thu, 6 Nov 2025 22:00:00 +0200 Subject: [PATCH 247/543] drm/i915/psr: fix pipe to vblank conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First, we can't assume pipe == crtc index. If a pipe is fused off in between, it no longer holds. intel_crtc_for_pipe() is the only proper way to get from a pipe to the corresponding crtc. Second, drivers aren't supposed to access or index drm->vblank[] directly. There's drm_crtc_vblank_crtc() for this. Use both functions to fix the pipe to vblank conversion. 
Fixes: f02658c46cf7 ("drm/i915/psr: Add mechanism to notify PSR of pipe enable/disable") Cc: Jouni Högander Cc: stable@vger.kernel.org # v6.16+ Reviewed-by: Jouni Högander Link: https://patch.msgid.link/20251106200000.1455164-1-jani.nikula@intel.com Signed-off-by: Jani Nikula (cherry picked from commit 2750f6765d6974f7e163c5d540a96c8703f6d8dd) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_psr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index 10eb93a34cf2..d5e0a1e66944 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -888,7 +888,8 @@ static bool is_dc5_dc6_blocked(struct intel_dp *intel_dp) { struct intel_display *display = to_intel_display(intel_dp); u32 current_dc_state = intel_display_power_get_current_dc_state(display); - struct drm_vblank_crtc *vblank = &display->drm->vblank[intel_dp->psr.pipe]; + struct intel_crtc *crtc = intel_crtc_for_pipe(display, intel_dp->psr.pipe); + struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(&crtc->base); return (current_dc_state != DC_STATE_EN_UPTO_DC5 && current_dc_state != DC_STATE_EN_UPTO_DC6) || From 7aca00d950e782e66c34fbd045c9605eca343a36 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 18 Oct 2025 20:10:33 -0400 Subject: [PATCH 248/543] pnfs: Fix TLS logic in _nfs4_pnfs_v3_ds_connect() Don't try to add an RDMA transport to a client that is already marked as being a TCP/TLS transport. 
Fixes: 04a15263662a ("pnfs/flexfiles: connect to NFSv3 DS using TLS if MDS connection uses TLS") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/pnfs_nfs.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 7b32afb29782..ff48056bf750 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -809,8 +809,11 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, unsigned int retrans) { struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs_client *mds_clp = mds_srv->nfs_client; + enum xprtsec_policies xprtsec_policy = mds_clp->cl_xprtsec.policy; struct nfs4_pnfs_ds_addr *da; unsigned long connect_timeout = timeo * (retrans + 1) * HZ / 10; + int ds_proto; int status = 0; dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); @@ -834,27 +837,28 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, .xprtsec = clp->cl_xprtsec, }; - if (da->da_transport != clp->cl_proto && - clp->cl_proto != XPRT_TRANSPORT_TCP_TLS) - continue; - if (da->da_transport == XPRT_TRANSPORT_TCP && - mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS) + if (xprt_args.ident == XPRT_TRANSPORT_TCP && + clp->cl_proto == XPRT_TRANSPORT_TCP_TLS) xprt_args.ident = XPRT_TRANSPORT_TCP_TLS; - if (da->da_addr.ss_family != clp->cl_addr.ss_family) + if (xprt_args.ident != clp->cl_proto) + continue; + if (xprt_args.dstaddr->sa_family != + clp->cl_addr.ss_family) continue; /* Add this address as an alias */ rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, - rpc_clnt_test_and_add_xprt, NULL); + rpc_clnt_test_and_add_xprt, NULL); continue; } - if (da->da_transport == XPRT_TRANSPORT_TCP && - mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS) - da->da_transport = XPRT_TRANSPORT_TCP_TLS; - clp = get_v3_ds_connect(mds_srv, - &da->da_addr, - da->da_addrlen, da->da_transport, - timeo, retrans); + + ds_proto = da->da_transport; + if (ds_proto == XPRT_TRANSPORT_TCP && + 
xprtsec_policy != RPC_XPRTSEC_NONE) + ds_proto = XPRT_TRANSPORT_TCP_TLS; + + clp = get_v3_ds_connect(mds_srv, &da->da_addr, da->da_addrlen, + ds_proto, timeo, retrans); if (IS_ERR(clp)) continue; clp->cl_rpcclient->cl_softerr = 0; From 28e19737e1570c7c71890547c2e43c3e0da79df9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 18 Oct 2025 20:10:34 -0400 Subject: [PATCH 249/543] pnfs: Fix TLS logic in _nfs4_pnfs_v4_ds_connect() Don't try to add an RDMA transport to a client that is already marked as being a TCP/TLS transport. Fixes: a35518cae4b3 ("NFSv4.1/pnfs: fix NFS with TLS in pnfs") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/pnfs_nfs.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index ff48056bf750..9976cc16b689 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -884,7 +884,10 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, u32 minor_version) { struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs_client *mds_clp = mds_srv->nfs_client; + enum xprtsec_policies xprtsec_policy = mds_clp->cl_xprtsec.policy; struct nfs4_pnfs_ds_addr *da; + int ds_proto; int status = 0; dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); @@ -912,12 +915,8 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, .data = &xprtdata, }; - if (da->da_transport != clp->cl_proto && - clp->cl_proto != XPRT_TRANSPORT_TCP_TLS) - continue; - if (da->da_transport == XPRT_TRANSPORT_TCP && - mds_srv->nfs_client->cl_proto == - XPRT_TRANSPORT_TCP_TLS) { + if (xprt_args.ident == XPRT_TRANSPORT_TCP && + clp->cl_proto == XPRT_TRANSPORT_TCP_TLS) { struct sockaddr *addr = (struct sockaddr *)&da->da_addr; struct sockaddr_in *sin = @@ -948,7 +947,10 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, xprt_args.ident = XPRT_TRANSPORT_TCP_TLS; xprt_args.servername = servername; } - if (da->da_addr.ss_family != 
clp->cl_addr.ss_family) + if (xprt_args.ident != clp->cl_proto) + continue; + if (xprt_args.dstaddr->sa_family != + clp->cl_addr.ss_family) continue; /** @@ -962,15 +964,14 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, if (xprtdata.cred) put_cred(xprtdata.cred); } else { - if (da->da_transport == XPRT_TRANSPORT_TCP && - mds_srv->nfs_client->cl_proto == - XPRT_TRANSPORT_TCP_TLS) - da->da_transport = XPRT_TRANSPORT_TCP_TLS; - clp = nfs4_set_ds_client(mds_srv, - &da->da_addr, - da->da_addrlen, - da->da_transport, timeo, - retrans, minor_version); + ds_proto = da->da_transport; + if (ds_proto == XPRT_TRANSPORT_TCP && + xprtsec_policy != RPC_XPRTSEC_NONE) + ds_proto = XPRT_TRANSPORT_TCP_TLS; + + clp = nfs4_set_ds_client(mds_srv, &da->da_addr, + da->da_addrlen, ds_proto, + timeo, retrans, minor_version); if (IS_ERR(clp)) continue; @@ -981,7 +982,6 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, clp = ERR_PTR(-EIO); continue; } - } } From 8ab523ce78d4ca13add6b4ecbacff0f84c274603 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 18 Oct 2025 20:10:35 -0400 Subject: [PATCH 250/543] pnfs: Set transport security policy to RPC_XPRTSEC_NONE unless using TLS The default setting for the transport security policy must be RPC_XPRTSEC_NONE, when using a TCP or RDMA connection without TLS. Conversely, when using TLS, the security policy needs to be set. 
Fixes: 6c0a8c5fcf71 ("NFS: Have struct nfs_client carry a TLS policy field") Signed-off-by: Trond Myklebust Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfs/nfs3client.c | 14 ++++++++++++-- fs/nfs/nfs4client.c | 14 ++++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 0d7310c1ee0c..5d97c1d38bb6 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -2,6 +2,7 @@ #include #include #include +#include #include "internal.h" #include "nfs3_fs.h" #include "netns.h" @@ -98,7 +99,11 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, .net = mds_clp->cl_net, .timeparms = &ds_timeout, .cred = mds_srv->cred, - .xprtsec = mds_clp->cl_xprtsec, + .xprtsec = { + .policy = RPC_XPRTSEC_NONE, + .cert_serial = TLS_NO_CERT, + .privkey_serial = TLS_NO_PRIVKEY, + }, .connect_timeout = connect_timeout, .reconnect_timeout = connect_timeout, }; @@ -111,9 +116,14 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, cl_init.hostname = buf; switch (ds_proto) { + case XPRT_TRANSPORT_TCP_TLS: + if (mds_clp->cl_xprtsec.policy != RPC_XPRTSEC_NONE) + cl_init.xprtsec = mds_clp->cl_xprtsec; + else + ds_proto = XPRT_TRANSPORT_TCP; + fallthrough; case XPRT_TRANSPORT_RDMA: case XPRT_TRANSPORT_TCP: - case XPRT_TRANSPORT_TCP_TLS: if (mds_clp->cl_nconnect > 1) cl_init.nconnect = mds_clp->cl_nconnect; } diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 5998d6bd8a4f..3a4baed993c9 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" #include "callback.h" #include "delegation.h" @@ -983,7 +984,11 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, .net = mds_clp->cl_net, .timeparms = &ds_timeout, .cred = mds_srv->cred, - .xprtsec = mds_srv->nfs_client->cl_xprtsec, + .xprtsec = { + .policy = RPC_XPRTSEC_NONE, + .cert_serial = TLS_NO_CERT, + .privkey_serial = TLS_NO_PRIVKEY, + 
}, }; char buf[INET6_ADDRSTRLEN + 1]; @@ -992,9 +997,14 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, cl_init.hostname = buf; switch (ds_proto) { + case XPRT_TRANSPORT_TCP_TLS: + if (mds_srv->nfs_client->cl_xprtsec.policy != RPC_XPRTSEC_NONE) + cl_init.xprtsec = mds_srv->nfs_client->cl_xprtsec; + else + ds_proto = XPRT_TRANSPORT_TCP; + fallthrough; case XPRT_TRANSPORT_RDMA: case XPRT_TRANSPORT_TCP: - case XPRT_TRANSPORT_TCP_TLS: if (mds_clp->cl_nconnect > 1) { cl_init.nconnect = mds_clp->cl_nconnect; cl_init.max_connect = NFS_MAX_TRANSPORTS; From fb2cba0854a7f315c8100a807a6959b99d72479e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 18 Oct 2025 20:10:36 -0400 Subject: [PATCH 251/543] NFS: Check the TLS certificate fields in nfs_match_client() If the TLS security policy is of type RPC_XPRTSEC_TLS_X509, then the cert_serial and privkey_serial fields need to match as well since they define the client's identity, as presented to the server. Fixes: 90c9550a8d65 ("NFS: support the kernel keyring for TLS") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 4e3dcc157a83..54699299d5b1 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -338,6 +338,14 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat /* Match the xprt security policy */ if (clp->cl_xprtsec.policy != data->xprtsec.policy) continue; + if (clp->cl_xprtsec.policy == RPC_XPRTSEC_TLS_X509) { + if (clp->cl_xprtsec.cert_serial != + data->xprtsec.cert_serial) + continue; + if (clp->cl_xprtsec.privkey_serial != + data->xprtsec.privkey_serial) + continue; + } refcount_inc(&clp->cl_count); return clp; From 51a491f2708de79da76791523d40926921823b7e Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 27 Oct 2025 09:08:31 -0400 Subject: [PATCH 252/543] nfs/localio: remove unecessary ENOTBLK handling in DIO WRITE support Each 
filesystem is meant to fall back to retrying DIO in terms of buffered IO when it might encounter -ENOTBLK when issuing DIO (which can happen if the VFS cannot invalidate the page cache). So NFS doesn't need special handling for -ENOTBLK. Also, explicitly initialize a couple of DIO-related iocb members rather than simply relying on data structure zeroing. Fixes: c817248fc831 ("nfs/localio: add proper O_DIRECT support for READ and WRITE") Reported-by: Christoph Hellwig Signed-off-by: Mike Snitzer Signed-off-by: Anna Schumaker --- fs/nfs/localio.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 2c0455e91571..0383d6eb2f46 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -315,6 +315,7 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, iocb->hdr = hdr; iocb->kiocb.ki_flags &= ~IOCB_APPEND; + iocb->kiocb.ki_complete = NULL; iocb->aio_complete_work = NULL; iocb->end_iter_index = -1; @@ -484,6 +485,7 @@ nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw) /* Use buffered IO */ iocb->offset[0] = hdr->args.offset; iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len); + iocb->iter_is_dio_aligned[0] = false; iocb->n_iters = 1; } @@ -803,7 +805,7 @@ static void nfs_local_call_write(struct work_struct *work) iocb->kiocb.ki_complete = nfs_local_write_aio_complete; iocb->aio_complete_work = nfs_local_write_aio_complete_work; } -retry: + iocb->kiocb.ki_pos = iocb->offset[i]; status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]); if (status != -EIOCBQUEUED) { @@ -823,15 +825,6 @@ static void nfs_local_call_write(struct work_struct *work) nfs_local_pgio_done(iocb->hdr, status); break; } - } else if (unlikely(status == -ENOTBLK && - (iocb->kiocb.ki_flags & IOCB_DIRECT))) { - /* VFS will return -ENOTBLK if DIO WRITE fails to - * invalidate the page cache. Retry using buffered IO. 
- */ - iocb->kiocb.ki_flags &= ~IOCB_DIRECT; - iocb->kiocb.ki_complete = NULL; - iocb->aio_complete_work = NULL; - goto retry; } nfs_local_pgio_done(iocb->hdr, status); if (iocb->hdr->task.tk_status) From f2060bdc21d70f3d8a4753a9fd3b0b02cb48c0bc Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 27 Oct 2025 09:08:32 -0400 Subject: [PATCH 253/543] nfs/localio: add refcounting for each iocb IO associated with NFS pgio header Improve completion handling of as many as 3 IOs associated with each misaligned DIO by using an atomic_t to track completion of each IO. Update nfs_local_pgio_done() to use precise atomic_t accounting for the remaining iov_iters (up to 3) associated with each iocb, so that each NFS LOCALIO pgio header is only released after all IOs have completed. But also allow early return if/when a short read or write occurs. Fixes reported BUG: KASAN: slab-use-after-free in nfs_local_call_read: https://lore.kernel.org/linux-nfs/aPSvi5Yr2lGOh5Jh@dell-per750-06-vm-07.rhts.eng.pek2.redhat.com/ Reported-by: Yongcheng Yang Fixes: c817248fc831 ("nfs/localio: add proper O_DIRECT support for READ and WRITE") Signed-off-by: Mike Snitzer Signed-off-by: Anna Schumaker --- fs/nfs/localio.c | 110 +++++++++++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 43 deletions(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 0383d6eb2f46..647fa19b0479 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -42,7 +42,7 @@ struct nfs_local_kiocb { /* Begin mostly DIO-specific members */ size_t end_len; short int end_iter_index; - short int n_iters; + atomic_t n_iters; bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS]; loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned; struct iov_iter iters[NFSLOCAL_MAX_IOS]; @@ -407,6 +407,7 @@ nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, iters[n_iters].count = local_dio->start_len; iocb->offset[n_iters] = iocb->hdr->args.offset; iocb->iter_is_dio_aligned[n_iters] = false; + atomic_inc(&iocb->n_iters); 
++n_iters; } @@ -425,6 +426,7 @@ nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, /* Save index and length of end */ iocb->end_iter_index = n_iters; iocb->end_len = local_dio->end_len; + atomic_inc(&iocb->n_iters); ++n_iters; } @@ -448,7 +450,6 @@ nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, } ++n_iters; - iocb->n_iters = n_iters; return n_iters; } @@ -474,6 +475,12 @@ nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw) } len = hdr->args.count - total; + /* + * For each iocb, iocb->n_iter is always at least 1 and we always + * end io after first nfs_local_pgio_done call unless misaligned DIO. + */ + atomic_set(&iocb->n_iters, 1); + if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { struct nfs_local_dio local_dio; @@ -486,7 +493,6 @@ nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw) iocb->offset[0] = hdr->args.offset; iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len); iocb->iter_is_dio_aligned[0] = false; - iocb->n_iters = 1; } static void @@ -506,9 +512,11 @@ nfs_local_pgio_init(struct nfs_pgio_header *hdr, hdr->task.tk_start = ktime_get(); } -static void -nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status) +static bool +nfs_local_pgio_done(struct nfs_local_kiocb *iocb, long status, bool force) { + struct nfs_pgio_header *hdr = iocb->hdr; + /* Must handle partial completions */ if (status >= 0) { hdr->res.count += status; @@ -519,6 +527,12 @@ nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status) hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status); hdr->task.tk_status = status; } + + if (force) + return true; + + BUG_ON(atomic_read(&iocb->n_iters) <= 0); + return atomic_dec_and_test(&iocb->n_iters); } static void @@ -549,11 +563,11 @@ static inline void nfs_local_pgio_aio_complete(struct nfs_local_kiocb *iocb) queue_work(nfsiod_workqueue, &iocb->work); } -static void -nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) +static void nfs_local_read_done(struct nfs_local_kiocb 
*iocb) { struct nfs_pgio_header *hdr = iocb->hdr; struct file *filp = iocb->kiocb.ki_filp; + long status = hdr->task.tk_status; if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */ @@ -574,12 +588,18 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) status > 0 ? status : 0, hdr->res.eof); } +static inline void nfs_local_read_iocb_done(struct nfs_local_kiocb *iocb) +{ + nfs_local_read_done(iocb); + nfs_local_pgio_release(iocb); +} + static void nfs_local_read_aio_complete_work(struct work_struct *work) { struct nfs_local_kiocb *iocb = container_of(work, struct nfs_local_kiocb, work); - nfs_local_pgio_release(iocb); + nfs_local_read_iocb_done(iocb); } static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret) @@ -587,8 +607,10 @@ static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret) struct nfs_local_kiocb *iocb = container_of(kiocb, struct nfs_local_kiocb, kiocb); - nfs_local_pgio_done(iocb->hdr, ret); - nfs_local_read_done(iocb, ret); + /* AIO completion of DIO read should always be last to complete */ + if (unlikely(!nfs_local_pgio_done(iocb, ret, false))) + return; + nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */ } @@ -599,10 +621,13 @@ static void nfs_local_call_read(struct work_struct *work) struct file *filp = iocb->kiocb.ki_filp; const struct cred *save_cred; ssize_t status; + int n_iters; save_cred = override_creds(filp->f_cred); - for (int i = 0; i < iocb->n_iters ; i++) { + n_iters = atomic_read(&iocb->n_iters); + for (int i = 0; i < n_iters ; i++) { + /* DIO-aligned middle is always issued last with AIO completion */ if (iocb->iter_is_dio_aligned[i]) { iocb->kiocb.ki_flags |= IOCB_DIRECT; iocb->kiocb.ki_complete = nfs_local_read_aio_complete; @@ -612,18 +637,14 @@ static void nfs_local_call_read(struct work_struct *work) iocb->kiocb.ki_pos = iocb->offset[i]; status = 
filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]); if (status != -EIOCBQUEUED) { - nfs_local_pgio_done(iocb->hdr, status); - if (iocb->hdr->task.tk_status) + if (nfs_local_pgio_done(iocb, status, false)) { + nfs_local_read_iocb_done(iocb); break; + } } } revert_creds(save_cred); - - if (status != -EIOCBQUEUED) { - nfs_local_read_done(iocb, status); - nfs_local_pgio_release(iocb); - } } static int @@ -738,11 +759,10 @@ static void nfs_local_vfs_getattr(struct nfs_local_kiocb *iocb) fattr->du.nfs3.used = stat.blocks << 9; } -static void -nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) +static void nfs_local_write_done(struct nfs_local_kiocb *iocb) { struct nfs_pgio_header *hdr = iocb->hdr; - struct inode *inode = hdr->inode; + long status = hdr->task.tk_status; dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0); @@ -761,10 +781,17 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset); status = -ENOSPC; /* record -ENOSPC in terms of nfs_local_pgio_done */ - nfs_local_pgio_done(hdr, status); + (void) nfs_local_pgio_done(iocb, status, true); } if (hdr->task.tk_status < 0) - nfs_reset_boot_verifier(inode); + nfs_reset_boot_verifier(hdr->inode); +} + +static inline void nfs_local_write_iocb_done(struct nfs_local_kiocb *iocb) +{ + nfs_local_write_done(iocb); + nfs_local_vfs_getattr(iocb); + nfs_local_pgio_release(iocb); } static void nfs_local_write_aio_complete_work(struct work_struct *work) @@ -772,8 +799,7 @@ static void nfs_local_write_aio_complete_work(struct work_struct *work) struct nfs_local_kiocb *iocb = container_of(work, struct nfs_local_kiocb, work); - nfs_local_vfs_getattr(iocb); - nfs_local_pgio_release(iocb); + nfs_local_write_iocb_done(iocb); } static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret) @@ -781,8 +807,10 @@ static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret) struct nfs_local_kiocb *iocb = container_of(kiocb, 
struct nfs_local_kiocb, kiocb); - nfs_local_pgio_done(iocb->hdr, ret); - nfs_local_write_done(iocb, ret); + /* AIO completion of DIO write should always be last to complete */ + if (unlikely(!nfs_local_pgio_done(iocb, ret, false))) + return; + nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */ } @@ -793,13 +821,17 @@ static void nfs_local_call_write(struct work_struct *work) struct file *filp = iocb->kiocb.ki_filp; unsigned long old_flags = current->flags; const struct cred *save_cred; + bool force_done = false; ssize_t status; + int n_iters; current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; save_cred = override_creds(filp->f_cred); file_start_write(filp); - for (int i = 0; i < iocb->n_iters ; i++) { + n_iters = atomic_read(&iocb->n_iters); + for (int i = 0; i < n_iters ; i++) { + /* DIO-aligned middle is always issued last with AIO completion */ if (iocb->iter_is_dio_aligned[i]) { iocb->kiocb.ki_flags |= IOCB_DIRECT; iocb->kiocb.ki_complete = nfs_local_write_aio_complete; @@ -812,35 +844,27 @@ static void nfs_local_call_write(struct work_struct *work) if (unlikely(status >= 0 && status < iocb->iters[i].count)) { /* partial write */ if (i == iocb->end_iter_index) { - /* Must not account partial end, otherwise, due - * to end being issued before middle: the partial + /* Must not account DIO partial end, otherwise (due + * to end being issued before middle): the partial * write accounting in nfs_local_write_done() * would incorrectly advance hdr->args.offset */ status = 0; } else { - /* Partial write at start or buffered middle, - * exit early. 
- */ - nfs_local_pgio_done(iocb->hdr, status); - break; + /* Partial write at start or middle, force done */ + force_done = true; } } - nfs_local_pgio_done(iocb->hdr, status); - if (iocb->hdr->task.tk_status) + if (nfs_local_pgio_done(iocb, status, force_done)) { + nfs_local_write_iocb_done(iocb); break; + } } } file_end_write(filp); revert_creds(save_cred); current->flags = old_flags; - - if (status != -EIOCBQUEUED) { - nfs_local_write_done(iocb, status); - nfs_local_vfs_getattr(iocb); - nfs_local_pgio_release(iocb); - } } static int From d0497dd27452c79a48414df813a16cd12d274b3b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 27 Oct 2025 09:08:33 -0400 Subject: [PATCH 254/543] nfs/localio: backfill missing partial read support for misaligned DIO Misaligned DIO read can be split into 3 IOs, must handle potential for short read from each component IO (follows same pattern used for handling partial writes, except upper layer read code handles advancing offset before retry). Fixes: c817248fc831 ("nfs/localio: add proper O_DIRECT support for READ and WRITE") Signed-off-by: Mike Snitzer Signed-off-by: Anna Schumaker --- fs/nfs/localio.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 647fa19b0479..9c205f8b5e59 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -414,7 +414,7 @@ nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, /* Setup misaligned end? * If so, the end is purposely setup to be issued using buffered IO * before the middle (which will use DIO, if DIO-aligned, with AIO). - * This creates problems if/when the end results in a partial write. + * This creates problems if/when the end results in short read or write. * So must save index and length of end to handle this corner case. 
*/ if (local_dio->end_len) { @@ -580,8 +580,9 @@ static void nfs_local_read_done(struct nfs_local_kiocb *iocb) */ hdr->res.replen = 0; - if (hdr->res.count != hdr->args.count || - hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp))) + /* nfs_readpage_result() handles short read */ + + if (hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp))) hdr->res.eof = true; dprintk("%s: read %ld bytes eof %d.\n", __func__, @@ -620,6 +621,7 @@ static void nfs_local_call_read(struct work_struct *work) container_of(work, struct nfs_local_kiocb, work); struct file *filp = iocb->kiocb.ki_filp; const struct cred *save_cred; + bool force_done = false; ssize_t status; int n_iters; @@ -637,7 +639,21 @@ static void nfs_local_call_read(struct work_struct *work) iocb->kiocb.ki_pos = iocb->offset[i]; status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]); if (status != -EIOCBQUEUED) { - if (nfs_local_pgio_done(iocb, status, false)) { + if (unlikely(status >= 0 && status < iocb->iters[i].count)) { + /* partial read */ + if (i == iocb->end_iter_index) { + /* Must not account DIO partial end, otherwise (due + * to end being issued before middle): the partial + * read accounting in nfs_local_read_done() + * would incorrectly advance hdr->args.offset + */ + status = 0; + } else { + /* Partial read at start or middle, force done */ + force_done = true; + } + } + if (nfs_local_pgio_done(iocb, status, force_done)) { nfs_local_read_iocb_done(iocb); break; } From d32ddfeb559342e89a4d06b1df4e7e5e96df3762 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 27 Oct 2025 13:52:28 -0400 Subject: [PATCH 255/543] nfs/localio: Ensure DIO WRITE's IO on stable storage upon completion LOCALIO's misaligned DIO WRITE support requires synchronous IO for any misaligned head and/or tail that are issued using buffered IO. In addition, it is important that the O_DIRECT middle be on stable storage upon its completion via AIO. 
Otherwise, a misaligned DIO WRITE could mix buffered IO for the head/tail and direct IO for the DIO-aligned middle -- which could lead to problems associated with deferred writes to stable storage (such as out of order partial completions causing incorrect advancement of the file's offset, etc). Fixes: c817248fc831 ("nfs/localio: add proper O_DIRECT support for READ and WRITE") Signed-off-by: Mike Snitzer Signed-off-by: Anna Schumaker --- fs/nfs/localio.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 9c205f8b5e59..839dbda0b370 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -485,8 +485,12 @@ nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw) struct nfs_local_dio local_dio; if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) && - nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0) + nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0) { + /* Ensure DIO WRITE's IO on stable storage upon completion */ + if (rw == ITER_SOURCE) + iocb->kiocb.ki_flags |= IOCB_DSYNC|IOCB_SYNC; return; /* is DIO-aligned */ + } } /* Use buffered IO */ From eb2d6774cc0d9d6ab8f924825695a85c14b2e0c2 Mon Sep 17 00:00:00 2001 From: Niranjan H Y Date: Mon, 10 Nov 2025 20:56:46 +0530 Subject: [PATCH 256/543] ASoC: SDCA: bug fix while parsing mipi-sdca-control-cn-list "struct sdca_control" declares "values" field as integer array. But the memory allocated to it is of char array. This causes crash for sdca_parse_function API. This patch addresses the issue by allocating correct data size. 
Signed-off-by: Niranjan H Y Reviewed-by: Charles Keepax Link: https://patch.msgid.link/20251110152646.192-1-niranjan.hy@ti.com Signed-off-by: Mark Brown --- sound/soc/sdca/sdca_functions.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c index 13f68f7b6dd6..0ccb6775f4de 100644 --- a/sound/soc/sdca/sdca_functions.c +++ b/sound/soc/sdca/sdca_functions.c @@ -894,7 +894,8 @@ static int find_sdca_entity_control(struct device *dev, struct sdca_entity *enti return ret; } - control->values = devm_kzalloc(dev, hweight64(control->cn_list), GFP_KERNEL); + control->values = devm_kcalloc(dev, hweight64(control->cn_list), + sizeof(int), GFP_KERNEL); if (!control->values) return -ENOMEM; From c93433fd4e2bbbe7caa67b53d808b4a084852ff3 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Mon, 10 Nov 2025 12:12:52 +0100 Subject: [PATCH 257/543] platform/x86: msi-wmi-platform: Only load on MSI devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It turns out that the GUID used by the msi-wmi-platform driver (ABBC0F6E-8EA1-11D1-00A0-C90629100000) is not unique, but was instead copied from the Windows Driver Samples. This means that this driver could load on devices from other manufacturers that also copied this GUID, potentially causing hardware errors. Prevent this by only loading on devices whitelisted via DMI. The DMI matches were taken from the msi-ec driver.
Reported-by: Antheas Kapenekakis Fixes: 9c0beb6b29e7 ("platform/x86: wmi: Add MSI WMI Platform driver") Tested-by: Antheas Kapenekakis Signed-off-by: Armin Wolf Link: https://patch.msgid.link/20251110111253.16204-2-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/Kconfig | 1 + drivers/platform/x86/msi-wmi-platform.c | 41 ++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index c122016d82f1..c883a28e0916 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -545,6 +545,7 @@ config MSI_WMI config MSI_WMI_PLATFORM tristate "MSI WMI Platform features" depends on ACPI_WMI + depends on DMI depends on HWMON help Say Y here if you want to have support for WMI-based platform features diff --git a/drivers/platform/x86/msi-wmi-platform.c b/drivers/platform/x86/msi-wmi-platform.c index dc5e9878cb68..bd2687828a2e 100644 --- a/drivers/platform/x86/msi-wmi-platform.c +++ b/drivers/platform/x86/msi-wmi-platform.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -448,7 +449,45 @@ static struct wmi_driver msi_wmi_platform_driver = { .probe = msi_wmi_platform_probe, .no_singleton = true, }; -module_wmi_driver(msi_wmi_platform_driver); + +/* + * MSI reused the WMI GUID from the WMI-ACPI sample code provided by Microsoft, + * so other manufacturers might use it as well for their WMI-ACPI implementations. 
+ */ +static const struct dmi_system_id msi_wmi_platform_whitelist[] __initconst = { + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "MICRO-STAR INT"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Micro-Star International"), + }, + }, + { } +}; + +static int __init msi_wmi_platform_module_init(void) +{ + if (!dmi_check_system(msi_wmi_platform_whitelist)) { + if (!force) + return -ENODEV; + + pr_warn("Ignoring DMI whitelist\n"); + } + + return wmi_driver_register(&msi_wmi_platform_driver); +} + +static void __exit msi_wmi_platform_module_exit(void) +{ + wmi_driver_unregister(&msi_wmi_platform_driver); +} + +module_init(msi_wmi_platform_module_init); +module_exit(msi_wmi_platform_module_exit); + MODULE_AUTHOR("Armin Wolf "); MODULE_DESCRIPTION("MSI WMI platform features"); From 97b726eb1dc2b4a2532544eb3da72bb6acbd39a3 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Mon, 10 Nov 2025 12:12:53 +0100 Subject: [PATCH 258/543] platform/x86: msi-wmi-platform: Fix typo in WMI GUID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The WMI driver core only supports GUID strings containing only uppercase characters, however the GUID string used by the msi-wmi-platform driver contains a single lowercase character. This prevents the WMI driver core from matching said driver to its WMI device. Fix this by turning the lowercase character into an uppercase character. Also update the WMI driver development guide to warn about this.
Reported-by: Antheas Kapenekakis Fixes: 9c0beb6b29e7 ("platform/x86: wmi: Add MSI WMI Platform driver") Tested-by: Antheas Kapenekakis Signed-off-by: Armin Wolf Link: https://patch.msgid.link/20251110111253.16204-3-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- Documentation/wmi/driver-development-guide.rst | 1 + drivers/platform/x86/msi-wmi-platform.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/wmi/driver-development-guide.rst b/Documentation/wmi/driver-development-guide.rst index 99ef21fc1c1e..5680303ae314 100644 --- a/Documentation/wmi/driver-development-guide.rst +++ b/Documentation/wmi/driver-development-guide.rst @@ -54,6 +54,7 @@ to matching WMI devices using a struct wmi_device_id table: :: static const struct wmi_device_id foo_id_table[] = { + /* Only use uppercase letters! */ { "936DA01F-9ABD-4D9D-80C7-02AF85C822A8", NULL }, { } }; diff --git a/drivers/platform/x86/msi-wmi-platform.c b/drivers/platform/x86/msi-wmi-platform.c index bd2687828a2e..e912fcc12d12 100644 --- a/drivers/platform/x86/msi-wmi-platform.c +++ b/drivers/platform/x86/msi-wmi-platform.c @@ -29,7 +29,7 @@ #define DRIVER_NAME "msi-wmi-platform" -#define MSI_PLATFORM_GUID "ABBC0F6E-8EA1-11d1-00A0-C90629100000" +#define MSI_PLATFORM_GUID "ABBC0F6E-8EA1-11D1-00A0-C90629100000" #define MSI_WMI_PLATFORM_INTERFACE_VERSION 2 From 0b2f7be548006b0651e1e8320790f49723265cbc Mon Sep 17 00:00:00 2001 From: Nitin Gote Date: Mon, 27 Oct 2025 14:56:43 +0530 Subject: [PATCH 259/543] drm/xe/xe3: Add WA_14024681466 for Xe3_LPG Apply WA_14024681466 to Xe3_LPG graphics IP versions from 30.00 to 30.05. v2: (Matthew Roper) - Remove stepping filter as workaround applies to all steppings. - Add an engine class filter so it only applies to the RENDER engine. 
Signed-off-by: Nitin Gote Link: https://patch.msgid.link/20251027092643.335904-1-nitin.r.gote@intel.com Reviewed-by: Matt Roper Signed-off-by: Matt Roper (cherry picked from commit 071089a69e199bd810ff31c4c933bd528e502743) Cc: stable@vger.kernel.org # v6.16+ Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/regs/xe_gt_regs.h | 1 + drivers/gpu/drm/xe/xe_wa.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h index 51f2a03847f9..f680c8b8f258 100644 --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h @@ -168,6 +168,7 @@ #define XEHP_SLICE_COMMON_ECO_CHICKEN1 XE_REG_MCR(0x731c, XE_REG_OPTION_MASKED) #define MSC_MSAA_REODER_BUF_BYPASS_DISABLE REG_BIT(14) +#define FAST_CLEAR_VALIGN_FIX REG_BIT(13) #define XE2LPM_CCCHKNREG1 XE_REG(0x82a8) diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c index cd03891654a1..c33719e2e0df 100644 --- a/drivers/gpu/drm/xe/xe_wa.c +++ b/drivers/gpu/drm/xe/xe_wa.c @@ -916,6 +916,10 @@ static const struct xe_rtp_entry_sr lrc_was[] = { XE_RTP_RULES(GRAPHICS_VERSION_RANGE(3000, 3003), ENGINE_CLASS(RENDER)), XE_RTP_ACTIONS(SET(COMMON_SLICE_CHICKEN4, SBE_PUSH_CONSTANT_BEHIND_FIX_ENABLE)) }, + { XE_RTP_NAME("14024681466"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(3000, 3005), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(XEHP_SLICE_COMMON_ECO_CHICKEN1, FAST_CLEAR_VALIGN_FIX)) + }, }; static __maybe_unused const struct xe_rtp_entry oob_was[] = { From fa3376319b83ba8b7fd55f2c1a268dcbf9d6eedc Mon Sep 17 00:00:00 2001 From: Tangudu Tilak Tirumalesh Date: Thu, 30 Oct 2025 21:16:26 +0530 Subject: [PATCH 260/543] drm/xe/xe3: Extend wa_14023061436 Extend wa_14023061436 to Graphics Versions 30.03, 30.04 and 30.05. 
Signed-off-by: Tangudu Tilak Tirumalesh Reviewed-by: Matt Roper Link: https://patch.msgid.link/20251030154626.3124565-1-tilak.tirumalesh.tangudu@intel.com Signed-off-by: Matt Roper (cherry picked from commit 0dd656d06f50ae4cedf160634cf13fd9e0944cf7) Cc: stable@vger.kernel.org # v6.17+ Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_wa.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c index c33719e2e0df..917b97317d11 100644 --- a/drivers/gpu/drm/xe/xe_wa.c +++ b/drivers/gpu/drm/xe/xe_wa.c @@ -679,6 +679,8 @@ static const struct xe_rtp_entry_sr engine_was[] = { }, { XE_RTP_NAME("14023061436"), XE_RTP_RULES(GRAPHICS_VERSION_RANGE(3000, 3001), + FUNC(xe_rtp_match_first_render_or_compute), OR, + GRAPHICS_VERSION_RANGE(3003, 3005), FUNC(xe_rtp_match_first_render_or_compute)), XE_RTP_ACTIONS(SET(TDL_CHICKEN, QID_WAIT_FOR_THREAD_NOT_RUN_DISABLE)) }, From 240372edaf854c9136f5ead45f2d8cd9496a9cb3 Mon Sep 17 00:00:00 2001 From: Nitin Gote Date: Thu, 6 Nov 2025 15:35:17 +0530 Subject: [PATCH 261/543] drm/xe/xe3lpg: Extend Wa_15016589081 for xe3lpg Wa_15016589081 applies to Xe3_LPG renderCS Signed-off-by: Nitin Gote Link: https://patch.msgid.link/20251106100516.318863-2-nitin.r.gote@intel.com Signed-off-by: Matt Roper (cherry picked from commit 715974499a2199bd199fb4630501f55545342ea4) Cc: stable@vger.kernel.org # v6.16+ Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_wa.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c index 917b97317d11..3cf30718b200 100644 --- a/drivers/gpu/drm/xe/xe_wa.c +++ b/drivers/gpu/drm/xe/xe_wa.c @@ -922,6 +922,11 @@ static const struct xe_rtp_entry_sr lrc_was[] = { XE_RTP_RULES(GRAPHICS_VERSION_RANGE(3000, 3005), ENGINE_CLASS(RENDER)), XE_RTP_ACTIONS(SET(XEHP_SLICE_COMMON_ECO_CHICKEN1, FAST_CLEAR_VALIGN_FIX)) }, + { XE_RTP_NAME("15016589081"), + XE_RTP_RULES(GRAPHICS_VERSION(3000), GRAPHICS_STEP(A0, B0), + 
ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(CHICKEN_RASTER_1, DIS_CLIP_NEGATIVE_BOUNDING_BOX)) + }, }; static __maybe_unused const struct xe_rtp_entry oob_was[] = { From 6a218b9c3183ed19d5703130025282cf20463d87 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 5 Nov 2025 22:03:04 -0500 Subject: [PATCH 262/543] nfs/localio: do not issue misaligned DIO out-of-order From https://lore.kernel.org/linux-nfs/aQHASIumLJyOoZGH@infradead.org/ On Wed, Oct 29, 2025 at 12:20:40AM -0700, Christoph Hellwig wrote: > On Mon, Oct 27, 2025 at 12:18:30PM -0400, Mike Snitzer wrote: > > LOCALIO's misaligned DIO will issue head/tail followed by O_DIRECT > > middle (via AIO completion of that aligned middle). So out of order > > relative to file offset. > > That's in general a really bad idea. It will obviously work, but > both on SSDs and out of place write file systems it is a sure way > to increase your garbage collection overhead a lot down the line. Fix this by never issuing misaligned DIO out of order. This fix means the DIO-aligned middle will only use AIO completion if there is no misaligned end segment. Otherwise, all 3 segments of a misaligned DIO will be issued without AIO completion to ensure file offset increases properly for all partial READ or WRITE situations. Factoring out nfs_local_iter_setup() helps standardize repetitive nfs_local_iters_setup_dio() code and is inspired by cleanup work that Chuck Lever did on the NFSD Direct code. 
Fixes: c817248fc831 ("nfs/localio: add proper O_DIRECT support for READ and WRITE") Reported-by: Christoph Hellwig Signed-off-by: Mike Snitzer Signed-off-by: Anna Schumaker --- fs/nfs/localio.c | 128 +++++++++++++++++++---------------------------- 1 file changed, 52 insertions(+), 76 deletions(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 839dbda0b370..656976b4f42c 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -44,8 +44,7 @@ struct nfs_local_kiocb { short int end_iter_index; atomic_t n_iters; bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS]; - loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned; - struct iov_iter iters[NFSLOCAL_MAX_IOS]; + struct iov_iter iters[NFSLOCAL_MAX_IOS] ____cacheline_aligned; /* End mostly DIO-specific members */ }; @@ -314,6 +313,7 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, init_sync_kiocb(&iocb->kiocb, file); iocb->hdr = hdr; + iocb->kiocb.ki_pos = hdr->args.offset; iocb->kiocb.ki_flags &= ~IOCB_APPEND; iocb->kiocb.ki_complete = NULL; iocb->aio_complete_work = NULL; @@ -389,13 +389,24 @@ static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i, return true; } +static void +nfs_local_iter_setup(struct iov_iter *iter, int rw, struct bio_vec *bvec, + unsigned int nvecs, unsigned long total, + size_t start, size_t len) +{ + iov_iter_bvec(iter, rw, bvec, nvecs, total); + if (start) + iov_iter_advance(iter, start); + iov_iter_truncate(iter, len); +} + /* * Setup as many as 3 iov_iter based on extents described by @local_dio. * Returns the number of iov_iter that were setup. */ static int nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, - unsigned int nvecs, size_t len, + unsigned int nvecs, unsigned long total, struct nfs_local_dio *local_dio) { int n_iters = 0; @@ -403,41 +414,17 @@ nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, /* Setup misaligned start? 
*/ if (local_dio->start_len) { - iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); - iters[n_iters].count = local_dio->start_len; - iocb->offset[n_iters] = iocb->hdr->args.offset; - iocb->iter_is_dio_aligned[n_iters] = false; - atomic_inc(&iocb->n_iters); + nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, + nvecs, total, 0, local_dio->start_len); ++n_iters; } - /* Setup misaligned end? - * If so, the end is purposely setup to be issued using buffered IO - * before the middle (which will use DIO, if DIO-aligned, with AIO). - * This creates problems if/when the end results in short read or write. - * So must save index and length of end to handle this corner case. + /* + * Setup DIO-aligned middle, if there is no misaligned end (below) + * then AIO completion is used, see nfs_local_call_{read,write} */ - if (local_dio->end_len) { - iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); - iocb->offset[n_iters] = local_dio->end_offset; - iov_iter_advance(&iters[n_iters], - local_dio->start_len + local_dio->middle_len); - iocb->iter_is_dio_aligned[n_iters] = false; - /* Save index and length of end */ - iocb->end_iter_index = n_iters; - iocb->end_len = local_dio->end_len; - atomic_inc(&iocb->n_iters); - ++n_iters; - } - - /* Setup DIO-aligned middle to be issued last, to allow for - * DIO with AIO completion (see nfs_local_call_{read,write}). 
- */ - iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); - if (local_dio->start_len) - iov_iter_advance(&iters[n_iters], local_dio->start_len); - iters[n_iters].count -= local_dio->end_len; - iocb->offset[n_iters] = local_dio->middle_offset; + nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, nvecs, + total, local_dio->start_len, local_dio->middle_len); iocb->iter_is_dio_aligned[n_iters] = nfs_iov_iter_aligned_bvec(&iters[n_iters], @@ -445,11 +432,22 @@ nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) { trace_nfs_local_dio_misaligned(iocb->hdr->inode, - iocb->hdr->args.offset, len, local_dio); + local_dio->start_len, local_dio->middle_len, local_dio); return 0; /* no DIO-aligned IO possible */ } + iocb->end_iter_index = n_iters; ++n_iters; + /* Setup misaligned end? */ + if (local_dio->end_len) { + nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, + nvecs, total, local_dio->start_len + + local_dio->middle_len, local_dio->end_len); + iocb->end_iter_index = n_iters; + ++n_iters; + } + + atomic_set(&iocb->n_iters, n_iters); return n_iters; } @@ -476,7 +474,7 @@ nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw) len = hdr->args.count - total; /* - * For each iocb, iocb->n_iter is always at least 1 and we always + * For each iocb, iocb->n_iters is always at least 1 and we always * end io after first nfs_local_pgio_done call unless misaligned DIO. 
*/ atomic_set(&iocb->n_iters, 1); @@ -494,9 +492,7 @@ nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw) } /* Use buffered IO */ - iocb->offset[0] = hdr->args.offset; iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len); - iocb->iter_is_dio_aligned[0] = false; } static void @@ -633,30 +629,20 @@ static void nfs_local_call_read(struct work_struct *work) n_iters = atomic_read(&iocb->n_iters); for (int i = 0; i < n_iters ; i++) { - /* DIO-aligned middle is always issued last with AIO completion */ if (iocb->iter_is_dio_aligned[i]) { iocb->kiocb.ki_flags |= IOCB_DIRECT; - iocb->kiocb.ki_complete = nfs_local_read_aio_complete; - iocb->aio_complete_work = nfs_local_read_aio_complete_work; - } + /* Only use AIO completion if DIO-aligned segment is last */ + if (i == iocb->end_iter_index) { + iocb->kiocb.ki_complete = nfs_local_read_aio_complete; + iocb->aio_complete_work = nfs_local_read_aio_complete_work; + } + } else + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; - iocb->kiocb.ki_pos = iocb->offset[i]; status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]); if (status != -EIOCBQUEUED) { - if (unlikely(status >= 0 && status < iocb->iters[i].count)) { - /* partial read */ - if (i == iocb->end_iter_index) { - /* Must not account DIO partial end, otherwise (due - * to end being issued before middle): the partial - * read accounting in nfs_local_read_done() - * would incorrectly advance hdr->args.offset - */ - status = 0; - } else { - /* Partial read at start or middle, force done */ - force_done = true; - } - } + if (unlikely(status >= 0 && status < iocb->iters[i].count)) + force_done = true; /* Partial read */ if (nfs_local_pgio_done(iocb, status, force_done)) { nfs_local_read_iocb_done(iocb); break; @@ -851,30 +837,20 @@ static void nfs_local_call_write(struct work_struct *work) file_start_write(filp); n_iters = atomic_read(&iocb->n_iters); for (int i = 0; i < n_iters ; i++) { - /* DIO-aligned middle is always issued last with AIO completion */ if 
(iocb->iter_is_dio_aligned[i]) { iocb->kiocb.ki_flags |= IOCB_DIRECT; - iocb->kiocb.ki_complete = nfs_local_write_aio_complete; - iocb->aio_complete_work = nfs_local_write_aio_complete_work; - } + /* Only use AIO completion if DIO-aligned segment is last */ + if (i == iocb->end_iter_index) { + iocb->kiocb.ki_complete = nfs_local_write_aio_complete; + iocb->aio_complete_work = nfs_local_write_aio_complete_work; + } + } else + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; - iocb->kiocb.ki_pos = iocb->offset[i]; status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]); if (status != -EIOCBQUEUED) { - if (unlikely(status >= 0 && status < iocb->iters[i].count)) { - /* partial write */ - if (i == iocb->end_iter_index) { - /* Must not account DIO partial end, otherwise (due - * to end being issued before middle): the partial - * write accounting in nfs_local_write_done() - * would incorrectly advance hdr->args.offset - */ - status = 0; - } else { - /* Partial write at start or middle, force done */ - force_done = true; - } - } + if (unlikely(status >= 0 && status < iocb->iters[i].count)) + force_done = true; /* Partial write */ if (nfs_local_pgio_done(iocb, status, force_done)) { nfs_local_write_iocb_done(iocb); break; From 85d2c2392ac6348e1171d627497034a341a250c1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 28 Oct 2025 17:27:43 -0400 Subject: [PATCH 263/543] NFSv2/v3: Fix error handling in nfs_atomic_open_v23() When nfs_do_create() returns an EEXIST error, it means that a regular file could not be created. That could mean that a symlink needs to be resolved. If that's the case, a lookup needs to be kicked off. 
Reported-by: Stephen Abbene Link: https://bugzilla.kernel.org/show_bug.cgi?id=220710 Fixes: 7c6c5249f061 ("NFS: add atomic_open for NFSv3 to handle O_TRUNC correctly.") Signed-off-by: Trond Myklebust Reviewed-by: NeilBrown Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 46d9c65d50f8..ea9f6ca8f30f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2268,11 +2268,12 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, return -ENAMETOOLONG; if (open_flags & O_CREAT) { - file->f_mode |= FMODE_CREATED; error = nfs_do_create(dir, dentry, mode, open_flags); - if (error) + if (!error) { + file->f_mode |= FMODE_CREATED; + return finish_open(file, dentry, NULL); + } else if (error != -EEXIST || open_flags & O_EXCL) return error; - return finish_open(file, dentry, NULL); } if (d_in_lookup(dentry)) { /* The only flags nfs_lookup considers are From 7a7a3456520b309a0bffa1d9d62bd6c9dcab89b3 Mon Sep 17 00:00:00 2001 From: Yang Xiuwei Date: Thu, 30 Oct 2025 11:03:25 +0800 Subject: [PATCH 264/543] NFS: sysfs: fix leak when nfs_client kobject add fails If adding the second kobject fails, drop both references to avoid sysfs residue and memory leak. 
Fixes: e96f9268eea6 ("NFS: Make all of /sys/fs/nfs network-namespace unique") Signed-off-by: Yang Xiuwei Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/sysfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 545148d42dcc..ea6e6168092b 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -189,6 +189,7 @@ static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent, return p; kobject_put(&p->kobject); + kobject_put(&p->nfs_net_kobj); } return NULL; } From 1f214e9c3aef2d0936be971072e991d78a174d71 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 31 Oct 2025 10:51:42 -0400 Subject: [PATCH 265/543] NFSv4: Fix an incorrect parameter when calling nfs4_call_sync() The Smatch static checker noted that in _nfs4_proc_lookupp(), the flag RPC_TASK_TIMEOUT is being passed as an argument to nfs4_init_sequence(), which is clearly incorrect. Since LOOKUPP is an idempotent operation, nfs4_init_sequence() should not ask the server to cache the result. The RPC_TASK_TIMEOUT flag needs to be passed down to the RPC layer. 
Reported-by: Dan Carpenter Reported-by: Harshit Mogalapalli Fixes: 76998ebb9158 ("NFSv4: Observe the NFS_MOUNT_SOFTREVAL flag in _nfs4_proc_lookupp") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 411776718494..93c6ce04332b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4715,16 +4715,19 @@ static int _nfs4_proc_lookupp(struct inode *inode, }; unsigned short task_flags = 0; - if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL) + if (server->flags & NFS_MOUNT_SOFTREVAL) task_flags |= RPC_TASK_TIMEOUT; + if (server->caps & NFS_CAP_MOVEABLE) + task_flags |= RPC_TASK_MOVEABLE; args.bitmask = nfs4_bitmask(server, fattr->label); nfs_fattr_init(fattr); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); dprintk("NFS call lookupp ino=0x%lx\n", inode->i_ino); - status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, - &res.seq_res, task_flags); + status = nfs4_do_call_sync(clnt, server, &msg, &args.seq_args, + &res.seq_res, task_flags); dprintk("NFS reply lookupp: %d\n", status); return status; } From 55fb52ffdd62850d667ebed842815e072d3c9961 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sun, 2 Nov 2025 20:16:12 +0200 Subject: [PATCH 266/543] Bluetooth: MGMT: cancel mesh send timer when hdev removed mesh_send_done timer is not canceled when hdev is removed, which causes crash if the timer triggers after hdev is gone. Cancel the timer when MGMT removes the hdev, like other MGMT timers. Should fix the BUG: sporadically seen by BlueZ test bot (in "Mesh - Send cancel - 1" test). Log: ------ BUG: KASAN: slab-use-after-free in run_timer_softirq+0x76b/0x7d0 ... 
Freed by task 36: kasan_save_stack+0x24/0x50 kasan_save_track+0x14/0x30 __kasan_save_free_info+0x3a/0x60 __kasan_slab_free+0x43/0x70 kfree+0x103/0x500 device_release+0x9a/0x210 kobject_put+0x100/0x1e0 vhci_release+0x18b/0x240 ------ Fixes: b338d91703fa ("Bluetooth: Implement support for Mesh") Link: https://lore.kernel.org/linux-bluetooth/67364c09.0c0a0220.113cba.39ff@mx.google.com/ Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 79762bfaea5f..262bf984d2aa 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -9497,6 +9497,7 @@ void mgmt_index_removed(struct hci_dev *hdev) cancel_delayed_work_sync(&hdev->discov_off); cancel_delayed_work_sync(&hdev->service_cache); cancel_delayed_work_sync(&hdev->rpa_expired); + cancel_delayed_work_sync(&hdev->mesh_send_done); } void mgmt_power_on(struct hci_dev *hdev, int err) From 23d22f2f71768034d6ef86168213843fc49bf550 Mon Sep 17 00:00:00 2001 From: Raphael Pinsonneault-Thibeault Date: Wed, 5 Nov 2025 14:28:41 -0500 Subject: [PATCH 267/543] Bluetooth: btusb: reorder cleanup in btusb_disconnect to avoid UAF There is a KASAN: slab-use-after-free read in btusb_disconnect(). Calling "usb_driver_release_interface(&btusb_driver, data->intf)" will free the btusb data associated with the interface. The same data is then used later in the function, hence the UAF. Fix by moving the accesses to btusb data to before the data is free'd. 
Reported-by: syzbot+2fc81b50a4f8263a159b@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=2fc81b50a4f8263a159b Tested-by: syzbot+2fc81b50a4f8263a159b@syzkaller.appspotmail.com Fixes: fd913ef7ce619 ("Bluetooth: btusb: Add out-of-band wakeup support") Signed-off-by: Raphael Pinsonneault-Thibeault Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 5e9ebf0c5312..a722446ec73d 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -4361,6 +4361,11 @@ static void btusb_disconnect(struct usb_interface *intf) hci_unregister_dev(hdev); + if (data->oob_wake_irq) + device_init_wakeup(&data->udev->dev, false); + if (data->reset_gpio) + gpiod_put(data->reset_gpio); + if (intf == data->intf) { if (data->isoc) usb_driver_release_interface(&btusb_driver, data->isoc); @@ -4371,17 +4376,11 @@ static void btusb_disconnect(struct usb_interface *intf) usb_driver_release_interface(&btusb_driver, data->diag); usb_driver_release_interface(&btusb_driver, data->intf); } else if (intf == data->diag) { - usb_driver_release_interface(&btusb_driver, data->intf); if (data->isoc) usb_driver_release_interface(&btusb_driver, data->isoc); + usb_driver_release_interface(&btusb_driver, data->intf); } - if (data->oob_wake_irq) - device_init_wakeup(&data->udev->dev, false); - - if (data->reset_gpio) - gpiod_put(data->reset_gpio); - hci_free_dev(hdev); } From 3b78f50918276ab28fb22eac9aa49401ac436a3b Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Mon, 3 Nov 2025 20:29:46 +0200 Subject: [PATCH 268/543] Bluetooth: 6lowpan: reset link-local header on ipv6 recv path Bluetooth 6lowpan.c netdev has header_ops, so it must set link-local header for RX skb, otherwise things crash, eg. with AF_PACKET SOCK_RAW Add missing skb_reset_mac_header() for uncompressed ipv6 RX path. 
For the compressed one, it is done in lowpan_header_decompress(). Log: (BlueZ 6lowpan-tester Client Recv Raw - Success) ------ kernel BUG at net/core/skbuff.c:212! Call Trace: ... packet_rcv (net/packet/af_packet.c:2152) ... __local_bh_enable_ip (kernel/softirq.c:407) netif_rx (net/core/dev.c:5648) chan_recv_cb (net/bluetooth/6lowpan.c:294 net/bluetooth/6lowpan.c:359) ------ Fixes: 18722c247023 ("Bluetooth: Enable 6LoWPAN support for BT LE devices") Reviewed-by: Paul Menzel Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/6lowpan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index f0c862091bff..f1d29fa4b411 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -289,6 +289,7 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, local_skb->pkt_type = PACKET_HOST; local_skb->dev = dev; + skb_reset_mac_header(local_skb); skb_set_transport_header(local_skb, sizeof(struct ipv6hdr)); if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) { From b454505bf57a2e4f5d49951d4deb03730a9348d9 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Mon, 3 Nov 2025 20:29:47 +0200 Subject: [PATCH 269/543] Bluetooth: 6lowpan: fix BDADDR_LE vs ADDR_LE_DEV address type confusion Bluetooth 6lowpan.c confuses BDADDR_LE and ADDR_LE_DEV address types, e.g. debugfs "connect" command takes the former, and "disconnect" and "connect" to already connected device take the latter. This is due to using same value both for l2cap_chan_connect and hci_conn_hash_lookup_le which take different dst_type values. Fix address type passed to hci_conn_hash_lookup_le(). Retain the debugfs API difference between "connect" and "disconnect" commands since it's been like this since 2015 and nobody apparently complained. 
Fixes: f5ad4ffceba0 ("Bluetooth: 6lowpan: Use hci_conn_hash_lookup_le() when possible") Reviewed-by: Paul Menzel Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/6lowpan.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index f1d29fa4b411..0d8c2e2e9a6c 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -957,10 +957,11 @@ static struct l2cap_chan *bt_6lowpan_listen(void) } static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, - struct l2cap_conn **conn) + struct l2cap_conn **conn, bool disconnect) { struct hci_conn *hcon; struct hci_dev *hdev; + int le_addr_type; int n; n = sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu", @@ -971,13 +972,32 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, if (n < 7) return -EINVAL; + if (disconnect) { + /* The "disconnect" debugfs command has used different address + * type constants than "connect" since 2015. Let's retain that + * for now even though it's obviously buggy... 
+ */ + *addr_type += 1; + } + + switch (*addr_type) { + case BDADDR_LE_PUBLIC: + le_addr_type = ADDR_LE_DEV_PUBLIC; + break; + case BDADDR_LE_RANDOM: + le_addr_type = ADDR_LE_DEV_RANDOM; + break; + default: + return -EINVAL; + } + /* The LE_PUBLIC address type is ignored because of BDADDR_ANY */ hdev = hci_get_route(addr, BDADDR_ANY, BDADDR_LE_PUBLIC); if (!hdev) return -ENOENT; hci_dev_lock(hdev); - hcon = hci_conn_hash_lookup_le(hdev, addr, *addr_type); + hcon = hci_conn_hash_lookup_le(hdev, addr, le_addr_type); hci_dev_unlock(hdev); hci_dev_put(hdev); @@ -1104,7 +1124,7 @@ static ssize_t lowpan_control_write(struct file *fp, buf[buf_size] = '\0'; if (memcmp(buf, "connect ", 8) == 0) { - ret = get_l2cap_conn(&buf[8], &addr, &addr_type, &conn); + ret = get_l2cap_conn(&buf[8], &addr, &addr_type, &conn, false); if (ret == -EINVAL) return ret; @@ -1141,7 +1161,7 @@ static ssize_t lowpan_control_write(struct file *fp, } if (memcmp(buf, "disconnect ", 11) == 0) { - ret = get_l2cap_conn(&buf[11], &addr, &addr_type, &conn); + ret = get_l2cap_conn(&buf[11], &addr, &addr_type, &conn, true); if (ret < 0) return ret; From e060088db0bdf7932e0e3c2d24b7371c4c5b867c Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Mon, 3 Nov 2025 20:29:48 +0200 Subject: [PATCH 270/543] Bluetooth: L2CAP: export l2cap_chan_hold for modules l2cap_chan_put() is exported, so export also l2cap_chan_hold() for modules. 
l2cap_chan_hold() has use case in net/bluetooth/6lowpan.c Signed-off-by: Pauli Virtanen Reviewed-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index d08320380ad6..35c57657bcf4 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -497,6 +497,7 @@ void l2cap_chan_hold(struct l2cap_chan *c) kref_get(&c->kref); } +EXPORT_SYMBOL_GPL(l2cap_chan_hold); struct l2cap_chan *l2cap_chan_hold_unless_zero(struct l2cap_chan *c) { From 98454bc812f3611551e4b1f81732da4aa7b9597e Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Mon, 3 Nov 2025 20:29:49 +0200 Subject: [PATCH 271/543] Bluetooth: 6lowpan: Don't hold spin lock over sleeping functions disconnect_all_peers() calls sleeping function (l2cap_chan_close) under spinlock. Holding the lock doesn't actually do any good -- we work on a local copy of the list, and the lock doesn't protect against peer->chan having already been freed. Fix by taking refcounts of peer->chan instead. Clean up the code and old comments a bit. Take devices_lock instead of RCU, because the kfree_rcu(); l2cap_chan_put(); construct in chan_close_cb() does not guarantee peer->chan is necessarily valid in RCU. Also take l2cap_chan_lock() which is required for l2cap_chan_close(). Log: (bluez 6lowpan-tester Client Connect - Disable) ------ BUG: sleeping function called from invalid context at kernel/locking/mutex.c:575 ... ... l2cap_send_disconn_req (net/bluetooth/l2cap_core.c:938 net/bluetooth/l2cap_core.c:1495) ... ? 
__pfx_l2cap_chan_close (net/bluetooth/l2cap_core.c:809) do_enable_set (net/bluetooth/6lowpan.c:1048 net/bluetooth/6lowpan.c:1068) ------ Fixes: 90305829635d ("Bluetooth: 6lowpan: Converting rwlocks to use RCU") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/6lowpan.c | 74 +++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 28 deletions(-) diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 0d8c2e2e9a6c..588d7e94e606 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -53,6 +53,11 @@ static bool enable_6lowpan; static struct l2cap_chan *listen_chan; static DEFINE_MUTEX(set_lock); +enum { + LOWPAN_PEER_CLOSING, + LOWPAN_PEER_MAXBITS +}; + struct lowpan_peer { struct list_head list; struct rcu_head rcu; @@ -61,6 +66,8 @@ struct lowpan_peer { /* peer addresses in various formats */ unsigned char lladdr[ETH_ALEN]; struct in6_addr peer_addr; + + DECLARE_BITMAP(flags, LOWPAN_PEER_MAXBITS); }; struct lowpan_btle_dev { @@ -1014,41 +1021,52 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, static void disconnect_all_peers(void) { struct lowpan_btle_dev *entry; - struct lowpan_peer *peer, *tmp_peer, *new_peer; - struct list_head peers; + struct lowpan_peer *peer; + int nchans; - INIT_LIST_HEAD(&peers); - - /* We make a separate list of peers as the close_cb() will - * modify the device peers list so it is better not to mess - * with the same list at the same time. + /* l2cap_chan_close() cannot be called from RCU, and lock ordering + * chan->lock > devices_lock prevents taking write side lock, so copy + * then close. 
*/ rcu_read_lock(); - - list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { - list_for_each_entry_rcu(peer, &entry->peers, list) { - new_peer = kmalloc(sizeof(*new_peer), GFP_ATOMIC); - if (!new_peer) - break; - - new_peer->chan = peer->chan; - INIT_LIST_HEAD(&new_peer->list); - - list_add(&new_peer->list, &peers); - } - } - + list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) + list_for_each_entry_rcu(peer, &entry->peers, list) + clear_bit(LOWPAN_PEER_CLOSING, peer->flags); rcu_read_unlock(); - spin_lock(&devices_lock); - list_for_each_entry_safe(peer, tmp_peer, &peers, list) { - l2cap_chan_close(peer->chan, ENOENT); + do { + struct l2cap_chan *chans[32]; + int i; - list_del_rcu(&peer->list); - kfree_rcu(peer, rcu); - } - spin_unlock(&devices_lock); + nchans = 0; + + spin_lock(&devices_lock); + + list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { + list_for_each_entry_rcu(peer, &entry->peers, list) { + if (test_and_set_bit(LOWPAN_PEER_CLOSING, + peer->flags)) + continue; + + l2cap_chan_hold(peer->chan); + chans[nchans++] = peer->chan; + + if (nchans >= ARRAY_SIZE(chans)) + goto done; + } + } + +done: + spin_unlock(&devices_lock); + + for (i = 0; i < nchans; ++i) { + l2cap_chan_lock(chans[i]); + l2cap_chan_close(chans[i], ENOENT); + l2cap_chan_unlock(chans[i]); + l2cap_chan_put(chans[i]); + } + } while (nchans); } struct set_enable { From 15f32cabf426143ec1d2c8faf2bbca40c0fbd61b Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Mon, 3 Nov 2025 20:29:50 +0200 Subject: [PATCH 272/543] Bluetooth: 6lowpan: add missing l2cap_chan_lock() l2cap_chan_close() needs to be called in l2cap_chan_lock(), otherwise l2cap_le_sig_cmd() etc. may run concurrently. Add missing locks around l2cap_chan_close(). 
Fixes: 6b8d4a6a0314 ("Bluetooth: 6LoWPAN: Use connected oriented channel instead of fixed one") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/6lowpan.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 588d7e94e606..2c21ae8abadc 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -927,7 +927,9 @@ static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type) BT_DBG("peer %p chan %p", peer, peer->chan); + l2cap_chan_lock(peer->chan); l2cap_chan_close(peer->chan, ENOENT); + l2cap_chan_unlock(peer->chan); return 0; } @@ -1089,7 +1091,9 @@ static void do_enable_set(struct work_struct *work) mutex_lock(&set_lock); if (listen_chan) { + l2cap_chan_lock(listen_chan); l2cap_chan_close(listen_chan, 0); + l2cap_chan_unlock(listen_chan); l2cap_chan_put(listen_chan); } @@ -1148,7 +1152,9 @@ static ssize_t lowpan_control_write(struct file *fp, mutex_lock(&set_lock); if (listen_chan) { + l2cap_chan_lock(listen_chan); l2cap_chan_close(listen_chan, 0); + l2cap_chan_unlock(listen_chan); l2cap_chan_put(listen_chan); listen_chan = NULL; } @@ -1310,7 +1316,9 @@ static void __exit bt_6lowpan_exit(void) debugfs_remove(lowpan_control_debugfs); if (listen_chan) { + l2cap_chan_lock(listen_chan); l2cap_chan_close(listen_chan, 0); + l2cap_chan_unlock(listen_chan); l2cap_chan_put(listen_chan); } From 41bf23338a501e745c398e0faee948dd05d0be98 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 4 Nov 2025 17:02:04 -0500 Subject: [PATCH 273/543] Bluetooth: hci_conn: Fix not cleaning up PA_LINK connections Contrary to what was stated on d36349ea73d8 ("Bluetooth: hci_conn: Fix running bis_cleanup for hci_conn->type PA_LINK") the PA_LINK does in fact need to run bis_cleanup in order to terminate the PA Sync, since that is bound to the listening socket which is the entity that controls the lifetime of PA Sync, so if it is closed/released the PA Sync shall be
terminated, terminating the PA Sync shall not result in the BIG Sync being terminated since once the latter is established it doesn't depend on the former anymore. If the user wants to reconnect/rebind a number of BIS(s) it shall keep the socket open until it no longer needs the PA Sync, which means it retains full control of the lifetime of both PA and BIG Syncs. Fixes: d36349ea73d8 ("Bluetooth: hci_conn: Fix running bis_cleanup for hci_conn->type PA_LINK") Fixes: a7bcffc673de ("Bluetooth: Add PA_LINK to distinguish BIG sync and PA sync connections") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 33 +++++++++++++++++++-------------- net/bluetooth/hci_event.c | 7 +------ net/bluetooth/hci_sync.c | 2 +- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index c5dedf39a129..6fc0692abf05 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -769,21 +769,23 @@ static void find_bis(struct hci_conn *conn, void *data) d->count++; } -static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *conn) +static int hci_le_big_terminate(struct hci_dev *hdev, struct hci_conn *conn) { struct iso_list_data *d; int ret; - bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", big, conn->sync_handle); + bt_dev_dbg(hdev, "hcon %p big 0x%2.2x sync_handle 0x%4.4x", conn, + conn->iso_qos.bcast.big, conn->sync_handle); d = kzalloc(sizeof(*d), GFP_KERNEL); if (!d) return -ENOMEM; - d->big = big; + d->big = conn->iso_qos.bcast.big; d->sync_handle = conn->sync_handle; - if (test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) { + if (conn->type == PA_LINK && + test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) { hci_conn_hash_list_flag(hdev, find_bis, PA_LINK, HCI_CONN_PA_SYNC, d); @@ -801,6 +803,9 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *c d->big_sync_term = true; } + if (!d->pa_sync_term && !d->big_sync_term) + return 0; + ret
= hci_cmd_sync_queue(hdev, big_terminate_sync, d, terminate_big_destroy); if (ret) @@ -852,8 +857,7 @@ static void bis_cleanup(struct hci_conn *conn) hci_le_terminate_big(hdev, conn); } else { - hci_le_big_terminate(hdev, conn->iso_qos.bcast.big, - conn); + hci_le_big_terminate(hdev, conn); } } @@ -994,19 +998,20 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t conn->mtu = hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu; break; case CIS_LINK: - case BIS_LINK: - case PA_LINK: /* conn->src should reflect the local identity address */ hci_copy_identity_address(hdev, &conn->src, &conn->src_type); - /* set proper cleanup function */ - if (!bacmp(dst, BDADDR_ANY)) - conn->cleanup = bis_cleanup; - else if (conn->role == HCI_ROLE_MASTER) + if (conn->role == HCI_ROLE_MASTER) conn->cleanup = cis_cleanup; - conn->mtu = hdev->iso_mtu ? hdev->iso_mtu : - hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu; + conn->mtu = hdev->iso_mtu; + break; + case PA_LINK: + case BIS_LINK: + /* conn->src should reflect the local identity address */ + hci_copy_identity_address(hdev, &conn->src, &conn->src_type); + conn->cleanup = bis_cleanup; + conn->mtu = hdev->iso_mtu; break; case SCO_LINK: if (lmp_esco_capable(hdev)) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index f20c826509b6..03328c1dd090 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -7001,14 +7001,9 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, continue; } - if (ev->status != 0x42) { + if (ev->status != 0x42) /* Mark PA sync as established */ set_bit(HCI_CONN_PA_SYNC, &bis->flags); - /* Reset cleanup callback of PA Sync so it doesn't - * terminate the sync when deleting the connection. 
- */ - conn->cleanup = NULL; - } bis->sync_handle = conn->sync_handle; bis->iso_qos.bcast.big = ev->handle; diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 73fc41b68b68..6e76798ec786 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -6999,7 +6999,7 @@ static void create_pa_complete(struct hci_dev *hdev, void *data, int err) hci_dev_lock(hdev); - if (!hci_conn_valid(hdev, conn)) + if (hci_conn_valid(hdev, conn)) clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); if (!err) From b623390045a81fc559decb9bfeb79319721d3dfb Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Sun, 9 Nov 2025 09:05:08 -0800 Subject: [PATCH 274/543] NFS: Fix LTP test failures when timestamps are delegated The utimes01 and utime06 tests fail when delegated timestamps are enabled, specifically in subtests that modify the atime and mtime fields using the 'nobody' user ID. The problem can be reproduced as follow: # echo "/media *(rw,no_root_squash,sync)" >> /etc/exports # export -ra # mount -o rw,nfsvers=4.2 127.0.0.1:/media /tmpdir # cd /opt/ltp # ./runltp -d /tmpdir -s utimes01 # ./runltp -d /tmpdir -s utime06 This issue occurs because nfs_setattr does not verify the inode's UID against the caller's fsuid when delegated timestamps are permitted for the inode. This patch adds the UID check and if it does not match then the request is sent to the server for permission checking. 
Fixes: e12912d94137 ("NFSv4: Add support for delegated atime and mtime attributes") Signed-off-by: Dai Ngo Signed-off-by: Anna Schumaker --- fs/nfs/inode.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 18b57c7c2f97..13ad70fc00d8 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -718,6 +718,8 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct nfs_fattr *fattr; loff_t oldsize = i_size_read(inode); int error = 0; + kuid_t task_uid = current_fsuid(); + kuid_t owner_uid = inode->i_uid; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); @@ -739,9 +741,11 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) { spin_lock(&inode->i_lock); if (attr->ia_valid & ATTR_MTIME_SET) { - nfs_set_timestamps_to_ts(inode, attr); - attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET| + if (uid_eq(task_uid, owner_uid)) { + nfs_set_timestamps_to_ts(inode, attr); + attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET| ATTR_ATIME|ATTR_ATIME_SET); + } } else { nfs_update_timestamps(inode, attr->ia_valid); attr->ia_valid &= ~(ATTR_MTIME|ATTR_ATIME); @@ -751,10 +755,12 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, attr->ia_valid & ATTR_ATIME && !(attr->ia_valid & ATTR_MTIME)) { if (attr->ia_valid & ATTR_ATIME_SET) { - spin_lock(&inode->i_lock); - nfs_set_timestamps_to_ts(inode, attr); - spin_unlock(&inode->i_lock); - attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET); + if (uid_eq(task_uid, owner_uid)) { + spin_lock(&inode->i_lock); + nfs_set_timestamps_to_ts(inode, attr); + spin_unlock(&inode->i_lock); + attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET); + } } else { nfs_update_delegated_atime(inode); attr->ia_valid &= ~ATTR_ATIME; From d3c9c213c0b86ac5dd8fe2c53c24db20f1f510bc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Nov 2025 14:30:41 -0700 Subject: [PATCH 275/543] io_uring/rw: ensure allocated iovec gets cleared for 
early failure A previous commit reused the recycling infrastructure for early cleanup, but this is not enough for the case where our internal caches have overflowed. If this happens, then the allocated iovec can get leaked if the request is also aborted early. Reinstate the previous forced free of the iovec for that situation. Cc: stable@vger.kernel.org Reported-by: syzbot+3c93637d7648c24e1fd0@syzkaller.appspotmail.com Tested-by: syzbot+3c93637d7648c24e1fd0@syzkaller.appspotmail.com Fixes: 9ac273ae3dc2 ("io_uring/rw: use io_rw_recycle() from cleanup path") Link: https://lore.kernel.org/io-uring/69122a59.a70a0220.22f260.00fd.GAE@google.com/ Signed-off-by: Jens Axboe --- io_uring/rw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/io_uring/rw.c b/io_uring/rw.c index 5b2241a5813c..abe68ba9c9dc 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -463,7 +463,10 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) void io_readv_writev_cleanup(struct io_kiocb *req) { + struct io_async_rw *rw = req->async_data; + lockdep_assert_held(&req->ctx->uring_lock); + io_vec_free(&rw->vec); io_rw_recycle(req, 0); } From 6a77267d97b5b6cd0e35099ab4eb054e5f965ee6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 10 Nov 2025 13:03:53 +0000 Subject: [PATCH 276/543] io_uring/query: return number of available queries It's useful to know which query opcodes are available. Extend the structure and return that. It's a trivial change, and even though it can be painlessly extended later, it'd still require adding a v2 of the structure.
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/query.h | 3 +++ io_uring/query.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h index 5d754322a27c..3539ccbfd064 100644 --- a/include/uapi/linux/io_uring/query.h +++ b/include/uapi/linux/io_uring/query.h @@ -36,6 +36,9 @@ struct io_uring_query_opcode { __u64 enter_flags; /* Bitmask of all supported IOSQE_* flags */ __u64 sqe_flags; + /* The number of available query opcodes */ + __u32 nr_query_opcodes; + __u32 __pad; }; #endif diff --git a/io_uring/query.c b/io_uring/query.c index 645301bd2c82..cf02893ba911 100644 --- a/io_uring/query.c +++ b/io_uring/query.c @@ -20,6 +20,8 @@ static ssize_t io_query_ops(void *data) e->ring_setup_flags = IORING_SETUP_FLAGS; e->enter_flags = IORING_ENTER_FLAGS; e->sqe_flags = SQE_VALID_FLAGS; + e->nr_query_opcodes = __IO_URING_QUERY_MAX; + e->__pad = 0; return sizeof(*e); } From 1534ff77757e44bcc4b98d0196bc5c0052fce5fa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 Nov 2025 11:10:54 +0000 Subject: [PATCH 277/543] sctp: prevent possible shift-out-of-bounds in sctp_transport_update_rto syzbot reported a possible shift-out-of-bounds [1] Blamed commit added rto_alpha_max and rto_beta_max set to 1000. It is unclear if some sctp users are setting very large rto_alpha and/or rto_beta. In order to prevent user regression, perform the test at run time. Also add READ_ONCE() annotations as sysctl values can change under us. 
[1] UBSAN: shift-out-of-bounds in net/sctp/transport.c:509:41 shift exponent 64 is too large for 32-bit type 'unsigned int' CPU: 0 UID: 0 PID: 16704 Comm: syz.2.2320 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/02/2025 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x16c/0x1f0 lib/dump_stack.c:120 ubsan_epilogue lib/ubsan.c:233 [inline] __ubsan_handle_shift_out_of_bounds+0x27f/0x420 lib/ubsan.c:494 sctp_transport_update_rto.cold+0x1c/0x34b net/sctp/transport.c:509 sctp_check_transmitted+0x11c4/0x1c30 net/sctp/outqueue.c:1502 sctp_outq_sack+0x4ef/0x1b20 net/sctp/outqueue.c:1338 sctp_cmd_process_sack net/sctp/sm_sideeffect.c:840 [inline] sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1372 [inline] Fixes: b58537a1f562 ("net: sctp: fix permissions for rto_alpha and rto_beta knobs") Reported-by: syzbot+f8c46c8b2b7f6e076e99@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/690c81ae.050a0220.3d0d33.014e.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Daniel Borkmann Acked-by: Xin Long Link: https://patch.msgid.link/20251106111054.3288127-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sctp/transport.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 0d48c61fe6ad..0c56d9673cc1 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -486,6 +486,7 @@ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt) if (tp->rttvar || tp->srtt) { struct net *net = tp->asoc->base.net; + unsigned int rto_beta, rto_alpha; /* 6.3.1 C3) When a new RTT measurement R' is made, set * RTTVAR <- (1 - RTO.Beta) * RTTVAR + RTO.Beta * |SRTT - R'| * SRTT <- (1 - RTO.Alpha) * SRTT + RTO.Alpha * R' @@ -497,10 +498,14 @@ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt) * For example, assuming the default value of RTO.Alpha of * 1/8, rto_alpha would be 
expressed as 3. */ - tp->rttvar = tp->rttvar - (tp->rttvar >> net->sctp.rto_beta) - + (((__u32)abs((__s64)tp->srtt - (__s64)rtt)) >> net->sctp.rto_beta); - tp->srtt = tp->srtt - (tp->srtt >> net->sctp.rto_alpha) - + (rtt >> net->sctp.rto_alpha); + rto_beta = READ_ONCE(net->sctp.rto_beta); + if (rto_beta < 32) + tp->rttvar = tp->rttvar - (tp->rttvar >> rto_beta) + + (((__u32)abs((__s64)tp->srtt - (__s64)rtt)) >> rto_beta); + rto_alpha = READ_ONCE(net->sctp.rto_alpha); + if (rto_alpha < 32) + tp->srtt = tp->srtt - (tp->srtt >> rto_alpha) + + (rtt >> rto_alpha); } else { /* 6.3.1 C2) When the first RTT measurement R is made, set * SRTT <- R, RTTVAR <- R/2. From e781122d76f018ad17752ab1018b3ffbf7fad84e Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Thu, 6 Nov 2025 17:56:20 -0300 Subject: [PATCH 278/543] net/sched: Abort __tc_modify_qdisc if parent is a clsact/ingress qdisc Wang reported an illegal configuration [1] where the user attempts to add a child qdisc to the ingress qdisc as follows: tc qdisc add dev eth0 handle ffff:0 ingress tc qdisc add dev eth0 handle ffe0:0 parent ffff:a fq To solve this, we reject any configuration attempt to add a child qdisc to ingress or clsact. 
[1] https://lore.kernel.org/netdev/20251105022213.1981982-1-wangliang74@huawei.com/ Fixes: 5e50da01d0ce ("[NET_SCHED]: Fix endless loops (part 2): "simple" qdiscs") Reported-by: Wang Liang Closes: https://lore.kernel.org/netdev/20251105022213.1981982-1-wangliang74@huawei.com/ Reviewed-by: Pedro Tammela Acked-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Reviewed-by: Cong Wang Link: https://patch.msgid.link/20251106205621.3307639-1-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/sch_api.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 1e058b46d3e1..f56b18c8aebf 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1599,6 +1599,11 @@ static int __tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, NL_SET_ERR_MSG(extack, "Failed to find specified qdisc"); return -ENOENT; } + if (p->flags & TCQ_F_INGRESS) { + NL_SET_ERR_MSG(extack, + "Cannot add children to ingress/clsact qdisc"); + return -EOPNOTSUPP; + } q = qdisc_leaf(p, clid, extack); if (IS_ERR(q)) return PTR_ERR(q); From 60260ad935861f6b8db7c65c23faa41c98d8fb15 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Thu, 6 Nov 2025 17:56:21 -0300 Subject: [PATCH 279/543] selftests/tc-testing: Create tests trying to add children to clsact/ingress qdiscs In response to Wang's bug report [1], add the following test cases: - Try and fail to add an fq child to an ingress qdisc - Try and fail to add an fq child to a clsact qdisc [1] https://lore.kernel.org/netdev/20251105022213.1981982-1-wangliang74@huawei.com/ Reviewed-by: Pedro Tammela Acked-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Reviewed-by: Cong Wang Link: https://patch.msgid.link/20251106205621.3307639-2-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- .../tc-testing/tc-tests/infra/qdiscs.json | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json 
b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 998e5a2f4579..0091bcd91c2c 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -961,5 +961,49 @@ "teardown": [ "$TC qdisc del dev $DUMMY root" ] + }, + { + "id": "4989", + "name": "Try to add an fq child to an ingress qdisc", + "category": [ + "qdisc", + "ingress" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DUMMY handle ffff:0 ingress" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY parent ffff:0 handle ffe0:0 fq", + "expExitCode": "2", + "verifyCmd": "$TC -j qdisc ls dev $DUMMY handle ffe0:", + "matchJSON": [], + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY ingress" + ] + }, + { + "id": "c2b0", + "name": "Try to add an fq child to a clsact qdisc", + "category": [ + "qdisc", + "ingress" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DUMMY handle ffff:0 clsact" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY parent ffff:0 handle ffe0:0 fq", + "expExitCode": "2", + "verifyCmd": "$TC -j qdisc ls dev $DUMMY handle ffe0:", + "matchJSON": [], + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY clsact" + ] } ] From dd4adb986a86727ed8f56c48b6d0695f1e211e65 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Oct 2025 12:27:24 -0400 Subject: [PATCH 280/543] selftests/tracing: Run sample events to clear page cache events The tracing selftest "event-filter-function.tc" was failing because it first runs the "sample_events" function that triggers the kmem_cache_free event and it looks at what function was used during a call to "ls". But the first time it calls this, it could trigger events that are used to pull pages into the page cache. The rest of the test uses the function it finds during that call to see if it will be called in subsequent "sample_events" calls. 
But if there's no need to pull pages into the page cache, it will not trigger that function and the test will fail. Call the "sample_events" twice to trigger all the page cache work before it calls it to find a function to use in subsequent checks. Cc: stable@vger.kernel.org Fixes: eb50d0f250e96 ("selftests/ftrace: Choose target function for filter test from samples") Signed-off-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) Signed-off-by: Shuah Khan --- .../selftests/ftrace/test.d/filter/event-filter-function.tc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc index c62165fabd0c..cfa16aa1f39a 100644 --- a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc +++ b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc @@ -20,6 +20,10 @@ sample_events() { echo 0 > tracing_on echo 0 > events/enable +# Clear functions caused by page cache; run sample_events twice +sample_events +sample_events + echo "Get the most frequently calling function" echo > trace sample_events From 762e7e174da91cf4babfe77e45bc6b67334b1503 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Sun, 9 Nov 2025 14:46:35 +0100 Subject: [PATCH 281/543] net: dsa: tag_brcm: do not mark link local traffic as offloaded Broadcom switches locally terminate link local traffic and do not forward it, so we should not mark it as offloaded. In some situations we still want/need to flood this traffic, e.g. if STP is disabled, or it is explicitly enabled via the group_fwd_mask. But if the skb is marked as offloaded, the kernel will assume this was already done in hardware, and the packets never reach other bridge ports. So ensure that link local traffic is never marked as offloaded, so that the kernel can forward/flood these packets in software if needed. 
Since the local termination is not configurable, check the destination MAC, and never mark packets as offloaded if it is a link local ether address. While modern switches set the tag reason code to BRCM_EG_RC_PROT_TERM for trapped link local traffic, they also set it for link local traffic that is flooded (01:80:c2:00:00:10 to 01:80:c2:00:00:2f), so we cannot use it and need to look at the destination address for them as well. Fixes: 964dbf186eaa ("net: dsa: tag_brcm: add support for legacy tags") Fixes: 0e62f543bed0 ("net: dsa: Fix duplicate frames flooded by learning") Signed-off-by: Jonas Gorski Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20251109134635.243951-1-jonas.gorski@gmail.com Signed-off-by: Jakub Kicinski --- net/dsa/tag_brcm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index d9c77fa553b5..eadb358179ce 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -176,7 +176,8 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, BRCM_TAG_LEN); - dsa_default_offload_fwd_mark(skb); + if (likely(!is_link_local_ether_addr(eth_hdr(skb)->h_dest))) + dsa_default_offload_fwd_mark(skb); return skb; } @@ -250,7 +251,8 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, len); - dsa_default_offload_fwd_mark(skb); + if (likely(!is_link_local_ether_addr(eth_hdr(skb)->h_dest))) + dsa_default_offload_fwd_mark(skb); dsa_strip_etype_header(skb, len); From 41d0c31be29fdee2535028ce70a6661e3a67bb25 Mon Sep 17 00:00:00 2001 From: Zahari Doychev Date: Thu, 6 Nov 2025 16:15:28 +0100 Subject: [PATCH 282/543] tools: ynl: call nested attribute free function for indexed arrays When freeing indexed arrays, the corresponding free function should be called for each entry of the indexed array.
For example, for 'struct tc_act_attrs' 'tc_act_attrs_free(...)' needs to be called for each entry. Previously, memory leaks were reported when enabling the ASAN analyzer. ================================================================= ==874==ERROR: LeakSanitizer: detected memory leaks Direct leak of 24 byte(s) in 1 object(s) allocated from: #0 0x7f221fd20cb5 in malloc ./debug/gcc/gcc/libsanitizer/asan/asan_malloc_linux.cpp:67 #1 0x55c98db048af in tc_act_attrs_set_options_vlan_parms ../generated/tc-user.h:2813 #2 0x55c98db048af in main ./linux/tools/net/ynl/samples/tc-filter-add.c:71 Direct leak of 24 byte(s) in 1 object(s) allocated from: #0 0x7f221fd20cb5 in malloc ./debug/gcc/gcc/libsanitizer/asan/asan_malloc_linux.cpp:67 #1 0x55c98db04a93 in tc_act_attrs_set_options_vlan_parms ../generated/tc-user.h:2813 #2 0x55c98db04a93 in main ./linux/tools/net/ynl/samples/tc-filter-add.c:74 Direct leak of 10 byte(s) in 2 object(s) allocated from: #0 0x7f221fd20cb5 in malloc ./debug/gcc/gcc/libsanitizer/asan/asan_malloc_linux.cpp:67 #1 0x55c98db0527d in tc_act_attrs_set_kind ../generated/tc-user.h:1622 SUMMARY: AddressSanitizer: 58 byte(s) leaked in 4 allocation(s). The following diff illustrates the changes introduced compared to the previous version of the code.
void tc_flower_attrs_free(struct tc_flower_attrs *obj) { + unsigned int i; + free(obj->indev); + for (i = 0; i < obj->_count.act; i++) + tc_act_attrs_free(&obj->act[i]); free(obj->act); free(obj->key_eth_dst); free(obj->key_eth_dst_mask); Signed-off-by: Zahari Doychev Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20251106151529.453026-3-zahari.doychev@linux.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/ynl_gen_c.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 58086b101057..aadeb3abcad8 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -861,6 +861,18 @@ class TypeIndexedArray(Type): return [f"{member} = {self.c_name};", f"{presence} = n_{self.c_name};"] + def free_needs_iter(self): + return self.sub_type == 'nest' + + def _free_lines(self, ri, var, ref): + lines = [] + if self.sub_type == 'nest': + lines += [ + f"for (i = 0; i < {var}->{ref}_count.{self.c_name}; i++)", + f'{self.nested_render_name}_free(&{var}->{ref}{self.c_name}[i]);', + ] + lines += f"free({var}->{ref}{self.c_name});", + return lines class TypeNestTypeValue(Type): def _complex_member_type(self, ri): From 2554559aba883803475e4ca4fae22eaad6d33d86 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 6 Nov 2025 19:02:52 +0100 Subject: [PATCH 283/543] bonding: fix mii_status when slave is down netif_carrier_ok() doesn't check if the slave is up. Before the below commit, netif_running() was also checked. 
Fixes: 23a6037ce76c ("bonding: Remove support for use_carrier") Signed-off-by: Nicolas Dichtel Acked-by: Jay Vosburgh Link: https://patch.msgid.link/20251106180252.3974772-1-nicolas.dichtel@6wind.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e95e593cd12d..5abef8a3b775 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2120,7 +2120,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, /* check for initial state */ new_slave->link = BOND_LINK_NOCHANGE; if (bond->params.miimon) { - if (netif_carrier_ok(slave_dev)) { + if (netif_running(slave_dev) && netif_carrier_ok(slave_dev)) { if (bond->params.updelay) { bond_set_slave_link_state(new_slave, BOND_LINK_BACK, @@ -2665,7 +2665,8 @@ static int bond_miimon_inspect(struct bonding *bond) bond_for_each_slave_rcu(bond, slave, iter) { bond_propose_link_state(slave, BOND_LINK_NOCHANGE); - link_state = netif_carrier_ok(slave->dev); + link_state = netif_running(slave->dev) && + netif_carrier_ok(slave->dev); switch (slave->link) { case BOND_LINK_UP: From ec33f2e5a2d0dbbfd71435209aee812fdc9369b8 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Fri, 7 Nov 2025 10:40:29 +0800 Subject: [PATCH 284/543] net/smc: fix mismatch between CLC header and proposal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current CLC proposal message construction uses a mix of `ini->smc_type_v1/v2` and `pclc_base->hdr.typev1/v2` to decide whether to include optional extensions (IPv6 prefix extension for v1, and v2 extension). This leads to a critical inconsistency: when `smc_clc_prfx_set()` fails - for example, in IPv6-only environments with only link-local addresses, or when the local IP address and the outgoing interface’s network address are not in the same subnet. 
As a result, the proposal message is assembled using the stale `ini->smc_type_v1` value—causing the IPv6 prefix extension to be included even though the header indicates v1 is not supported. The peer then receives a malformed CLC proposal where the header type does not match the payload, and immediately resets the connection. The fix ensures consistency between the CLC header flags and the actual payload by synchronizing `ini->smc_type_v1` with `pclc_base->hdr.typev1` when prefix setup fails. Fixes: 8c3dca341aea ("net/smc: build and send V2 CLC proposal") Signed-off-by: D. Wythe Reviewed-by: Alexandra Winter Link: https://patch.msgid.link/20251107024029.88753-1-alibuda@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/smc/smc_clc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 157aace169d4..87c87edadde7 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -890,6 +890,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) return SMC_CLC_DECL_CNFERR; } pclc_base->hdr.typev1 = SMC_TYPE_N; + ini->smc_type_v1 = SMC_TYPE_N; } else { pclc_base->iparea_offset = htons(sizeof(*pclc_smcd)); plen += sizeof(*pclc_prfx) + From 3072f00bba764082fa41b3c3a2a7b013335353d2 Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Thu, 6 Nov 2025 14:45:11 +0000 Subject: [PATCH 285/543] net/handshake: Fix memory leak in tls_handshake_accept() In tls_handshake_accept(), a netlink message is allocated using genlmsg_new(). In the error handling path, genlmsg_cancel() is called to cancel the message construction, but the message itself is not freed. This leads to a memory leak. Fix this by calling nlmsg_free() in the error path after genlmsg_cancel() to release the allocated memory. 
Fixes: 2fd5532044a89 ("net/handshake: Add a kernel API for requesting a TLSv1.3 handshake") Signed-off-by: Zilin Guan Reviewed-by: Chuck Lever Link: https://patch.msgid.link/20251106144511.3859535-1-zilin@seu.edu.cn Signed-off-by: Jakub Kicinski --- net/handshake/tlshd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c index 081093dfd553..8f9532a15f43 100644 --- a/net/handshake/tlshd.c +++ b/net/handshake/tlshd.c @@ -259,6 +259,7 @@ static int tls_handshake_accept(struct handshake_req *req, out_cancel: genlmsg_cancel(msg, hdr); + nlmsg_free(msg); out: return ret; } From 49b3916465176a5abcb29a0e464825f553d55d58 Mon Sep 17 00:00:00 2001 From: Aksh Garg Date: Thu, 6 Nov 2025 14:53:04 +0530 Subject: [PATCH 286/543] net: ethernet: ti: am65-cpsw-qos: fix IET verify/response timeout The CPSW module uses the MAC_VERIFY_CNT bit field in the CPSW_PN_IET_VERIFY_REG_k register to set the verify/response timeout count. This register specifies the number of clock cycles to wait before resending a verify packet if the verification fails. The verify/response timeout count, as being set by the function am65_cpsw_iet_set_verify_timeout_count() is hardcoded for 125MHz clock frequency, which varies based on PHY mode and link speed. The respective clock frequencies are as follows: - RGMII mode: * 1000 Mbps: 125 MHz * 100 Mbps: 25 MHz * 10 Mbps: 2.5 MHz - QSGMII/SGMII mode: 125 MHz (all speeds) Fix this by adding logic to calculate the correct timeout counts based on the actual PHY interface mode and link speed. 
Fixes: 49a2eb9068246 ("net: ethernet: ti: am65-cpsw-qos: Add Frame Preemption MAC Merge support") Signed-off-by: Aksh Garg Link: https://patch.msgid.link/20251106092305.1437347-2-a-garg7@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-qos.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-qos.c b/drivers/net/ethernet/ti/am65-cpsw-qos.c index fa96db7c1a13..ad06942ce461 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-qos.c +++ b/drivers/net/ethernet/ti/am65-cpsw-qos.c @@ -276,9 +276,31 @@ static int am65_cpsw_iet_set_verify_timeout_count(struct am65_cpsw_port *port) /* The number of wireside clocks contained in the verify * timeout counter. The default is 0x1312d0 * (10ms at 125Mhz in 1G mode). + * The frequency of the clock depends on the link speed + * and the PHY interface. */ - val = 125 * HZ_PER_MHZ; /* assuming 125MHz wireside clock */ + switch (port->slave.phy_if) { + case PHY_INTERFACE_MODE_RGMII: + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_RXID: + case PHY_INTERFACE_MODE_RGMII_TXID: + if (port->qos.link_speed == SPEED_1000) + val = 125 * HZ_PER_MHZ; /* 125 MHz at 1000Mbps*/ + else if (port->qos.link_speed == SPEED_100) + val = 25 * HZ_PER_MHZ; /* 25 MHz at 100Mbps*/ + else + val = (25 * HZ_PER_MHZ) / 10; /* 2.5 MHz at 10Mbps*/ + break; + case PHY_INTERFACE_MODE_QSGMII: + case PHY_INTERFACE_MODE_SGMII: + val = 125 * HZ_PER_MHZ; /* 125 MHz */ + break; + + default: + netdev_err(port->ndev, "selected mode does not supported IET\n"); + return -EOPNOTSUPP; + } val /= MILLIHZ_PER_HZ; /* count per ms timeout */ val *= verify_time_ms; /* count for timeout ms */ From d4b00d132d7cb70a74bc039c91c1d6120943c71b Mon Sep 17 00:00:00 2001 From: Aksh Garg Date: Thu, 6 Nov 2025 14:53:05 +0530 Subject: [PATCH 287/543] net: ethernet: ti: am65-cpsw-qos: fix IET verify retry mechanism The am65_cpsw_iet_verify_wait() function attempts verification 20 times, 
toggling the AM65_CPSW_PN_IET_MAC_LINKFAIL bit in each iteration. When the LINKFAIL bit transitions from 1 to 0, the MAC merge layer initiates the verification process and waits for the timeout configured in MAC_VERIFY_CNT before automatically retransmitting. The MAC_VERIFY_CNT register is configured according to the user-defined verify/response timeout in am65_cpsw_iet_set_verify_timeout_count(). As per IEEE 802.3 Clause 99, the hardware performs this automatic retry up to 3 times. Current implementation toggles LINKFAIL after the user-configured verify/response timeout in each iteration, forcing the hardware to restart verification instead of respecting the MAC_VERIFY_CNT timeout. This bypasses the hardware's automatic retry mechanism. Fix this by moving the LINKFAIL bit toggle outside the retry loop and reducing the retry count from 20 to 3. The software now only monitors the status register while the hardware autonomously handles the 3 verification attempts at proper MAC_VERIFY_CNT intervals. 
Fixes: 49a2eb9068246 ("net: ethernet: ti: am65-cpsw-qos: Add Frame Preemption MAC Merge support") Signed-off-by: Aksh Garg Link: https://patch.msgid.link/20251106092305.1437347-3-a-garg7@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-qos.c | 29 +++++++++++++------------ 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-qos.c b/drivers/net/ethernet/ti/am65-cpsw-qos.c index ad06942ce461..66e8b224827b 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-qos.c +++ b/drivers/net/ethernet/ti/am65-cpsw-qos.c @@ -317,20 +317,21 @@ static int am65_cpsw_iet_verify_wait(struct am65_cpsw_port *port) u32 ctrl, status; int try; - try = 20; + try = 3; + + /* Reset the verify state machine by writing 1 + * to LINKFAIL + */ + ctrl = readl(port->port_base + AM65_CPSW_PN_REG_IET_CTRL); + ctrl |= AM65_CPSW_PN_IET_MAC_LINKFAIL; + writel(ctrl, port->port_base + AM65_CPSW_PN_REG_IET_CTRL); + + /* Clear MAC_LINKFAIL bit to start Verify. */ + ctrl = readl(port->port_base + AM65_CPSW_PN_REG_IET_CTRL); + ctrl &= ~AM65_CPSW_PN_IET_MAC_LINKFAIL; + writel(ctrl, port->port_base + AM65_CPSW_PN_REG_IET_CTRL); + do { - /* Reset the verify state machine by writing 1 - * to LINKFAIL - */ - ctrl = readl(port->port_base + AM65_CPSW_PN_REG_IET_CTRL); - ctrl |= AM65_CPSW_PN_IET_MAC_LINKFAIL; - writel(ctrl, port->port_base + AM65_CPSW_PN_REG_IET_CTRL); - - /* Clear MAC_LINKFAIL bit to start Verify. 
*/ - ctrl = readl(port->port_base + AM65_CPSW_PN_REG_IET_CTRL); - ctrl &= ~AM65_CPSW_PN_IET_MAC_LINKFAIL; - writel(ctrl, port->port_base + AM65_CPSW_PN_REG_IET_CTRL); - msleep(port->qos.iet.verify_time_ms); status = readl(port->port_base + AM65_CPSW_PN_REG_IET_STATUS); @@ -352,7 +353,7 @@ static int am65_cpsw_iet_verify_wait(struct am65_cpsw_port *port) netdev_dbg(port->ndev, "MAC Merge verify error\n"); return -ENODEV; } - } while (try-- > 0); + } while (--try > 0); netdev_dbg(port->ndev, "MAC Merge verify timeout\n"); return -ETIMEDOUT; From 0725e6afb55128be21a2ca36e9674f573ccec173 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 7 Nov 2025 06:40:25 +0000 Subject: [PATCH 288/543] tipc: Fix use-after-free in tipc_mon_reinit_self(). syzbot reported use-after-free of tipc_net(net)->monitors[] in tipc_mon_reinit_self(). [0] The array is protected by RTNL, but tipc_mon_reinit_self() iterates over it without RTNL. tipc_mon_reinit_self() is called from tipc_net_finalize(), which is always under RTNL except for tipc_net_finalize_work(). Let's hold RTNL in tipc_net_finalize_work(). 
[0]: BUG: KASAN: slab-use-after-free in __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] BUG: KASAN: slab-use-after-free in _raw_spin_lock_irqsave+0xa7/0xf0 kernel/locking/spinlock.c:162 Read of size 1 at addr ffff88805eae1030 by task kworker/0:7/5989 CPU: 0 UID: 0 PID: 5989 Comm: kworker/0:7 Not tainted syzkaller #0 PREEMPT_{RT,(full)} Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/18/2025 Workqueue: events tipc_net_finalize_work Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xca/0x240 mm/kasan/report.c:482 kasan_report+0x118/0x150 mm/kasan/report.c:595 __kasan_check_byte+0x2a/0x40 mm/kasan/common.c:568 kasan_check_byte include/linux/kasan.h:399 [inline] lock_acquire+0x8d/0x360 kernel/locking/lockdep.c:5842 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0xa7/0xf0 kernel/locking/spinlock.c:162 rtlock_slowlock kernel/locking/rtmutex.c:1894 [inline] rwbase_rtmutex_lock_state kernel/locking/spinlock_rt.c:160 [inline] rwbase_write_lock+0xd3/0x7e0 kernel/locking/rwbase_rt.c:244 rt_write_lock+0x76/0x110 kernel/locking/spinlock_rt.c:243 write_lock_bh include/linux/rwlock_rt.h:99 [inline] tipc_mon_reinit_self+0x79/0x430 net/tipc/monitor.c:718 tipc_net_finalize+0x115/0x190 net/tipc/net.c:140 process_one_work kernel/workqueue.c:3236 [inline] process_scheduled_works+0xade/0x17b0 kernel/workqueue.c:3319 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3400 kthread+0x70e/0x8a0 kernel/kthread.c:463 ret_from_fork+0x439/0x7d0 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 Allocated by task 6089: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:388 [inline] __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:405 kasan_kmalloc include/linux/kasan.h:260 [inline] 
__kmalloc_cache_noprof+0x1a8/0x320 mm/slub.c:4407 kmalloc_noprof include/linux/slab.h:905 [inline] kzalloc_noprof include/linux/slab.h:1039 [inline] tipc_mon_create+0xc3/0x4d0 net/tipc/monitor.c:657 tipc_enable_bearer net/tipc/bearer.c:357 [inline] __tipc_nl_bearer_enable+0xe16/0x13f0 net/tipc/bearer.c:1047 __tipc_nl_compat_doit net/tipc/netlink_compat.c:371 [inline] tipc_nl_compat_doit+0x3bc/0x5f0 net/tipc/netlink_compat.c:393 tipc_nl_compat_handle net/tipc/netlink_compat.c:-1 [inline] tipc_nl_compat_recv+0x83c/0xbe0 net/tipc/netlink_compat.c:1321 genl_family_rcv_msg_doit+0x215/0x300 net/netlink/genetlink.c:1115 genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0x60e/0x790 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x208/0x470 net/netlink/af_netlink.c:2552 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1320 [inline] netlink_unicast+0x846/0xa10 net/netlink/af_netlink.c:1346 netlink_sendmsg+0x805/0xb30 net/netlink/af_netlink.c:1896 sock_sendmsg_nosec net/socket.c:714 [inline] __sock_sendmsg+0x21c/0x270 net/socket.c:729 ____sys_sendmsg+0x508/0x820 net/socket.c:2614 ___sys_sendmsg+0x21f/0x2a0 net/socket.c:2668 __sys_sendmsg net/socket.c:2700 [inline] __do_sys_sendmsg net/socket.c:2705 [inline] __se_sys_sendmsg net/socket.c:2703 [inline] __x64_sys_sendmsg+0x1a1/0x260 net/socket.c:2703 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 6088: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:576 poison_slab_object mm/kasan/common.c:243 [inline] __kasan_slab_free+0x5b/0x80 mm/kasan/common.c:275 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2422 [inline] slab_free mm/slub.c:4695 [inline] kfree+0x195/0x550 mm/slub.c:4894 tipc_l2_device_event+0x380/0x650 
net/tipc/bearer.c:-1 notifier_call_chain+0x1b3/0x3e0 kernel/notifier.c:85 call_netdevice_notifiers_extack net/core/dev.c:2267 [inline] call_netdevice_notifiers net/core/dev.c:2281 [inline] unregister_netdevice_many_notify+0x14d7/0x1fe0 net/core/dev.c:12166 unregister_netdevice_many net/core/dev.c:12229 [inline] unregister_netdevice_queue+0x33c/0x380 net/core/dev.c:12073 unregister_netdevice include/linux/netdevice.h:3385 [inline] __tun_detach+0xe4d/0x1620 drivers/net/tun.c:621 tun_detach drivers/net/tun.c:637 [inline] tun_chr_close+0x10d/0x1c0 drivers/net/tun.c:3433 __fput+0x458/0xa80 fs/file_table.c:468 task_work_run+0x1d4/0x260 kernel/task_work.c:227 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline] exit_to_user_mode_loop+0xec/0x110 kernel/entry/common.c:43 exit_to_user_mode_prepare include/linux/irq-entry-common.h:225 [inline] syscall_exit_to_user_mode_work include/linux/entry-common.h:175 [inline] syscall_exit_to_user_mode include/linux/entry-common.h:210 [inline] do_syscall_64+0x2bd/0x3b0 arch/x86/entry/syscall_64.c:100 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 46cb01eeeb86 ("tipc: update mon's self addr when node addr generated") Reported-by: syzbot+d7dad7fd4b3921104957@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/690c323a.050a0220.baf87.007f.GAE@google.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Link: https://patch.msgid.link/20251107064038.2361188-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/tipc/net.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tipc/net.c b/net/tipc/net.c index 0e95572e56b4..7e65d0b0c4a8 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -145,7 +145,9 @@ void tipc_net_finalize_work(struct work_struct *work) { struct tipc_net *tn = container_of(work, struct tipc_net, work); + rtnl_lock(); tipc_net_finalize(tipc_link_net(tn->bcl), tn->trial_addr); + rtnl_unlock(); } void tipc_net_stop(struct net *net) From e6ca8f533ed41129fcf052297718f417f021cc7d Mon Sep 
17 00:00:00 2001 From: Buday Csaba Date: Sat, 8 Nov 2025 07:49:22 +0100 Subject: [PATCH 289/543] net: mdio: fix resource leak in mdiobus_register_device() Fix a possible leak in mdiobus_register_device() when both a reset-gpio and a reset-controller are present. Clean up the already claimed reset-gpio, when the registration of the reset-controller fails, so when an error code is returned, the device retains its state before the registration attempt. Link: https://lore.kernel.org/all/20251106144603.39053c81@kernel.org/ Fixes: 71dd6c0dff51 ("net: phy: add support for reset-controller") Signed-off-by: Buday Csaba Link: https://patch.msgid.link/4b419377f8dd7d2f63f919d0f74a336c734f8fff.1762584481.git.buday.csaba@prolan.hu Signed-off-by: Jakub Kicinski --- drivers/net/phy/mdio_bus.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index cad6ed3aa10b..4354241137d5 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -73,8 +73,11 @@ int mdiobus_register_device(struct mdio_device *mdiodev) return err; err = mdiobus_register_reset(mdiodev); - if (err) + if (err) { + gpiod_put(mdiodev->reset_gpio); + mdiodev->reset_gpio = NULL; return err; + } /* Assert the reset signal */ mdio_device_reset(mdiodev, 1); From 49c8d2c1f94cc2f4d1a108530d7ba52614b874c2 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 7 Nov 2025 06:03:37 -0800 Subject: [PATCH 290/543] net: netpoll: fix incorrect refcount handling causing incorrect cleanup commit efa95b01da18 ("netpoll: fix use after free") incorrectly ignored the refcount and prematurely set dev->npinfo to NULL during netpoll cleanup, leading to improper behavior and memory leaks. Scenario causing lack of proper cleanup: 1) A netpoll is associated with a NIC (e.g., eth0) and netdev->npinfo is allocated, and refcnt = 1 - Keep in mind that npinfo is shared among all netpoll instances. In this case, there is just one. 
2) Another netpoll is also associated with the same NIC and npinfo->refcnt += 1. - Now dev->npinfo->refcnt = 2; - There is just one npinfo associated to the netdev. 3) When the first netpoll goes to clean up: - The first cleanup succeeds and clears np->dev->npinfo, ignoring refcnt. - It basically calls `RCU_INIT_POINTER(np->dev->npinfo, NULL);` - Set dev->npinfo = NULL, without proper cleanup - ->ndo_netpoll_cleanup() is not called either 4) Now the second target tries to clean up - The second cleanup fails because np->dev->npinfo is already NULL. * In this case, ops->ndo_netpoll_cleanup() was never called, and the skb pool is not cleaned as well (for the second netpoll instance) - This leaks npinfo and skbpool skbs, which is clearly reported by kmemleak. Revert commit efa95b01da18 ("netpoll: fix use after free") and add clarifying comments emphasizing that npinfo cleanup should only happen once the refcount reaches zero, ensuring stable and correct netpoll behavior. Cc: # 3.17.x Cc: Jay Vosburgh Fixes: efa95b01da18 ("netpoll: fix use after free") Signed-off-by: Breno Leitao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20251107-netconsole_torture-v10-1-749227b55f63@debian.org Signed-off-by: Jakub Kicinski --- net/core/netpoll.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c85f740065fc..331764845e8f 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -811,6 +811,10 @@ static void __netpoll_cleanup(struct netpoll *np) if (!npinfo) return; + /* At this point, there is a single npinfo instance per netdevice, and + * its refcnt tracks how many netpoll structures are linked to it. We + * only perform npinfo cleanup when the refcnt decrements to zero.
+ */ if (refcount_dec_and_test(&npinfo->refcnt)) { const struct net_device_ops *ops; @@ -820,8 +824,7 @@ static void __netpoll_cleanup(struct netpoll *np) RCU_INIT_POINTER(np->dev->npinfo, NULL); call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info); - } else - RCU_INIT_POINTER(np->dev->npinfo, NULL); + } skb_pool_flush(np); } From 39acc6a95eefcf814efa226d8813f89e7e03496e Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 7 Nov 2025 06:03:38 -0800 Subject: [PATCH 291/543] selftest: netcons: refactor target creation Extract the netconsole target creation from create_dynamic_target(), by moving it from create_dynamic_target() into a new helper function. This enables other tests to use the creation of netconsole targets with arbitrary parameters and no sleep. The new helper will be utilized by forthcoming torture-type selftests that require dynamic target management. Signed-off-by: Breno Leitao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20251107-netconsole_torture-v10-2-749227b55f63@debian.org Signed-off-by: Jakub Kicinski --- .../drivers/net/lib/sh/lib_netcons.sh | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh index 8e1085e89647..9b5ef8074440 100644 --- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh +++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh @@ -113,31 +113,39 @@ function set_network() { configure_ip } -function create_dynamic_target() { - local FORMAT=${1:-"extended"} +function _create_dynamic_target() { + local FORMAT="${1:?FORMAT parameter required}" + local NCPATH="${2:?NCPATH parameter required}" DSTMAC=$(ip netns exec "${NAMESPACE}" \ ip link show "${DSTIF}" | awk '/ether/ {print $2}') # Create a dynamic target - mkdir "${NETCONS_PATH}" + mkdir "${NCPATH}" - echo "${DSTIP}" > "${NETCONS_PATH}"/remote_ip - echo "${SRCIP}" > "${NETCONS_PATH}"/local_ip - echo 
"${DSTMAC}" > "${NETCONS_PATH}"/remote_mac - echo "${SRCIF}" > "${NETCONS_PATH}"/dev_name + echo "${DSTIP}" > "${NCPATH}"/remote_ip + echo "${SRCIP}" > "${NCPATH}"/local_ip + echo "${DSTMAC}" > "${NCPATH}"/remote_mac + echo "${SRCIF}" > "${NCPATH}"/dev_name if [ "${FORMAT}" == "basic" ] then # Basic target does not support release - echo 0 > "${NETCONS_PATH}"/release - echo 0 > "${NETCONS_PATH}"/extended + echo 0 > "${NCPATH}"/release + echo 0 > "${NCPATH}"/extended elif [ "${FORMAT}" == "extended" ] then - echo 1 > "${NETCONS_PATH}"/extended + echo 1 > "${NCPATH}"/extended fi - echo 1 > "${NETCONS_PATH}"/enabled + echo 1 > "${NCPATH}"/enabled + +} + +function create_dynamic_target() { + local FORMAT=${1:-"extended"} + local NCPATH=${2:-"$NETCONS_PATH"} + _create_dynamic_target "${FORMAT}" "${NCPATH}" # This will make sure that the kernel was able to # load the netconsole driver configuration. The console message From 6701896eb90998ff16338f199144bd9deefb79ba Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 7 Nov 2025 06:03:39 -0800 Subject: [PATCH 292/543] selftest: netcons: create a torture test Create a netconsole test that puts a lot of pressure on the netconsole list manipulation. Do it by creating dynamic targets and deleting targets while messages are being sent. Also put interface down while the messages are being sent, as creating parallel targets. The code launches three background jobs on distinct schedules: * Toggle netcons target every 30 iterations * create and delete random_target every 50 iterations * toggle iface every 70 iterations This creates multiple concurrency sources that interact with netconsole states. This is good practice to simulate stress, and exercise netpoll and netconsole locks. 
This test already found an issue as reported in [1] Link: https://lore.kernel.org/all/20250901-netpoll_memleak-v1-1-34a181977dfc@debian.org/ [1] Signed-off-by: Breno Leitao Reviewed-by: Andre Carvalho Reviewed-by: Simon Horman Link: https://patch.msgid.link/20251107-netconsole_torture-v10-3-749227b55f63@debian.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/Makefile | 1 + .../selftests/drivers/net/netcons_torture.sh | 130 ++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/netcons_torture.sh diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 6e41635bd55a..71ee69e524d7 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -18,6 +18,7 @@ TEST_PROGS := \ netcons_fragmented_msg.sh \ netcons_overflow.sh \ netcons_sysdata.sh \ + netcons_torture.sh \ netpoll_basic.py \ ping.py \ psp.py \ diff --git a/tools/testing/selftests/drivers/net/netcons_torture.sh b/tools/testing/selftests/drivers/net/netcons_torture.sh new file mode 100755 index 000000000000..2ce9ee3719d1 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netcons_torture.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# Repeatedly send kernel messages, toggles netconsole targets on and off, +# creates and deletes targets in parallel, and toggles the source interface to +# simulate stress conditions. +# +# This test aims to verify the robustness of netconsole under dynamic +# configurations and concurrent operations. +# +# The major goal is to run this test with LOCKDEP, Kmemleak and KASAN to make +# sure no issues is reported. 
+# +# Author: Breno Leitao + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh + +# Number of times the main loop run +ITERATIONS=${1:-150} + +# Only test extended format +FORMAT="extended" +# And ipv6 only +IP_VERSION="ipv6" + +# Create, enable and delete some targets. +create_and_delete_random_target() { + COUNT=2 + RND_PREFIX=$(mktemp -u netcons_rnd_XXXX_) + + if [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}${COUNT}" ] || \ + [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}0" ]; then + echo "Function didn't finish yet, skipping it." >&2 + return + fi + + # enable COUNT targets + for i in $(seq ${COUNT}) + do + RND_TARGET="${RND_PREFIX}"${i} + RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}" + + # Basic population so the target can come up + _create_dynamic_target "${FORMAT}" "${RND_TARGET_PATH}" + done + + echo "netconsole selftest: ${COUNT} additional targets were created" > /dev/kmsg + # disable them all + for i in $(seq ${COUNT}) + do + RND_TARGET="${RND_PREFIX}"${i} + RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}" + if [[ $(cat "${RND_TARGET_PATH}/enabled") -eq 1 ]] + then + echo 0 > "${RND_TARGET_PATH}"/enabled + fi + rmdir "${RND_TARGET_PATH}" + done +} + +# Disable and enable the target mid-air, while messages +# are being transmitted. +toggle_netcons_target() { + for i in $(seq 2) + do + if [ ! 
-d "${NETCONS_PATH}" ] + then + break + fi + echo 0 > "${NETCONS_PATH}"/enabled 2> /dev/null || true + # Try to enable a bit harder, given it might fail to enable + # Write to `enabled` might fail depending on the lock, which is + # highly contentious here + for _ in $(seq 5) + do + echo 1 > "${NETCONS_PATH}"/enabled 2> /dev/null || true + done + done +} + +toggle_iface(){ + ip link set "${SRCIF}" down + ip link set "${SRCIF}" up +} + +# Start here + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network "${IP_VERSION}" +# Create a dynamic target for netconsole +create_dynamic_target "${FORMAT}" + +for i in $(seq "$ITERATIONS") +do + for _ in $(seq 10) + do + echo "${MSG}: ${TARGET} ${i}" > /dev/kmsg + done + wait + + if (( i % 30 == 0 )); then + toggle_netcons_target & + fi + + if (( i % 50 == 0 )); then + # create some targets, enable them, send msg and disable + # all in a parallel thread + create_and_delete_random_target & + fi + + if (( i % 70 == 0 )); then + toggle_iface & + fi +done +wait + +exit "${EXIT_STATUS}" From 236682db3b6fe71cad76ac5e920ea4c14a33178e Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 7 Nov 2025 06:03:40 -0800 Subject: [PATCH 293/543] selftest: netcons: add test for netconsole over bonded interfaces This patch adds a selftest that verifies netconsole functionality over bonded network interfaces using netdevsim. It sets up two bonded interfaces acting as transmit (TX) and receive (RX) ends, placed in separate network namespaces. 
The test sends kernel log messages and verifies that they are properly received on the bonded RX interfaces with both IPv4 and IPv6, and using basic and extended netconsole formats. This patchset aims to test a long-standing netpoll subsystem where netpoll has multiple users. (in this case netconsole and bonding). A similar selftest has been discussed in [1] and [2]. This test also tries to enable bonding and netpoll in different order, just to guarantee that all the possibilities are exercised. Link: https://lore.kernel.org/all/20250905-netconsole_torture-v3-0-875c7febd316@debian.org/ [1] Link: https://lore.kernel.org/lkml/96b940137a50e5c387687bb4f57de8b0435a653f.1404857349.git.decot@googlers.com/ [2] Signed-off-by: Breno Leitao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20251107-netconsole_torture-v10-4-749227b55f63@debian.org Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/bonding/Makefile | 2 + .../selftests/drivers/net/bonding/config | 4 + .../net/bonding/netcons_over_bonding.sh | 361 ++++++++++++++++++ .../drivers/net/lib/sh/lib_netcons.sh | 54 ++- 4 files changed, 414 insertions(+), 7 deletions(-) create mode 100755 tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile index 402d4ee84f2e..6c5c60adb5e8 100644 --- a/tools/testing/selftests/drivers/net/bonding/Makefile +++ b/tools/testing/selftests/drivers/net/bonding/Makefile @@ -14,6 +14,7 @@ TEST_PROGS := \ dev_addr_lists.sh \ mode-1-recovery-updelay.sh \ mode-2-recovery-updelay.sh \ + netcons_over_bonding.sh \ # end of TEST_PROGS TEST_FILES := \ @@ -24,6 +25,7 @@ TEST_FILES := \ TEST_INCLUDES := \ ../../../net/lib.sh \ + ../lib/sh/lib_netcons.sh \ ../../../net/forwarding/lib.sh \ # end of TEST_INCLUDES diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index 6bb290abd48b..991494376223 
100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -1,5 +1,6 @@ CONFIG_BONDING=y CONFIG_BRIDGE=y +CONFIG_CONFIGFS_FS=y CONFIG_DUMMY=y CONFIG_INET_ESP=y CONFIG_INET_ESP_OFFLOAD=y @@ -9,6 +10,9 @@ CONFIG_MACVLAN=y CONFIG_NET_ACT_GACT=y CONFIG_NET_CLS_FLOWER=y CONFIG_NET_CLS_MATCHALL=m +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETCONSOLE_EXTENDED_LOG=y CONFIG_NETDEVSIM=m CONFIG_NET_SCH_INGRESS=y CONFIG_NLMON=y diff --git a/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh b/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh new file mode 100755 index 000000000000..477cc9379500 --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh @@ -0,0 +1,361 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 +# +# This selftest exercises trying to have multiple netpoll users at the same +# time. +# +# This selftest has multiple smalls test inside, and the goal is to +# get interfaces with bonding and netconsole in different orders in order +# to catch any possible issue. +# +# The main test composes of four interfaces being created using netdevsim; two +# of them are bonded to serve as the netconsole's transmit interface. The +# remaining two interfaces are similarly bonded and assigned to a separate +# network namespace, which acts as the receive interface, where socat monitors +# for incoming messages. +# +# A netconsole message is then sent to ensure it is properly received across +# this configuration. +# +# Later, run a few other tests, to make sure that bonding and netconsole +# cannot coexist. +# +# The test's objective is to exercise netpoll usage when managed simultaneously +# by multiple subsystems (netconsole and bonding). 
+# +# Author: Breno Leitao + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true +modprobe bonding 2> /dev/null || true +modprobe veth 2> /dev/null || true + +# The content of kmsg will be saved to the following file +OUTPUT_FILE="/tmp/${TARGET}" + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup_bond EXIT + +FORMAT="extended" +IP_VERSION="ipv4" +VETH0="veth"$(( RANDOM % 256)) +VETH1="veth"$((256 + RANDOM % 256)) +TXNS="" +RXNS="" + +# Create "bond_tx_XX" and "bond_rx_XX" interfaces, and set DSTIF and SRCIF with +# the bonding interfaces +function setup_bonding_ifaces() { + local RAND=$(( RANDOM % 100 )) + BOND_TX_MAIN_IF="bond_tx_$RAND" + BOND_RX_MAIN_IF="bond_rx_$RAND" + + # Setup TX + if ! ip -n "${TXNS}" link add "${BOND_TX_MAIN_IF}" type bond mode balance-rr + then + echo "Failed to create bond TX interface. Is CONFIG_BONDING set?" >&2 + # only clean nsim ifaces and namespace. Nothing else has been + # initialized + cleanup_bond_nsim + trap - EXIT + exit "${ksft_skip}" + fi + + # create_netdevsim() got the interface up, but it needs to be down + # before being enslaved. 
+ ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" down + ip -n "${TXNS}" \ + link set "${BOND_TX2_SLAVE_IF}" down + ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + ip -n "${TXNS}" \ + link set "${BOND_TX2_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + ip -n "${TXNS}" \ + link set "${BOND_TX_MAIN_IF}" up + + # Setup RX + ip -n "${RXNS}" \ + link add "${BOND_RX_MAIN_IF}" type bond mode balance-rr + ip -n "${RXNS}" \ + link set "${BOND_RX1_SLAVE_IF}" down + ip -n "${RXNS}" \ + link set "${BOND_RX2_SLAVE_IF}" down + ip -n "${RXNS}" \ + link set "${BOND_RX1_SLAVE_IF}" master "${BOND_RX_MAIN_IF}" + ip -n "${RXNS}" \ + link set "${BOND_RX2_SLAVE_IF}" master "${BOND_RX_MAIN_IF}" + ip -n "${RXNS}" \ + link set "${BOND_RX_MAIN_IF}" up + + export DSTIF="${BOND_RX_MAIN_IF}" + export SRCIF="${BOND_TX_MAIN_IF}" +} + +# Create 4 netdevsim interfaces. Two of them will be bound to TX bonding iface +# and the other two will be bond to the RX interface (on the other namespace) +function create_ifaces_bond() { + BOND_TX1_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_TX_1}" "${TXNS}") + BOND_TX2_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_TX_2}" "${TXNS}") + BOND_RX1_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_RX_1}" "${RXNS}") + BOND_RX2_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_RX_2}" "${RXNS}") +} + +# netdevsim link BOND_TX to BOND_RX interfaces +function link_ifaces_bond() { + local BOND_TX1_SLAVE_IFIDX + local BOND_TX2_SLAVE_IFIDX + local BOND_RX1_SLAVE_IFIDX + local BOND_RX2_SLAVE_IFIDX + local TXNS_FD + local RXNS_FD + + BOND_TX1_SLAVE_IFIDX=$(ip netns exec "${TXNS}" \ + cat /sys/class/net/"$BOND_TX1_SLAVE_IF"/ifindex) + BOND_TX2_SLAVE_IFIDX=$(ip netns exec "${TXNS}" \ + cat /sys/class/net/"$BOND_TX2_SLAVE_IF"/ifindex) + BOND_RX1_SLAVE_IFIDX=$(ip netns exec "${RXNS}" \ + cat /sys/class/net/"$BOND_RX1_SLAVE_IF"/ifindex) + BOND_RX2_SLAVE_IFIDX=$(ip netns exec "${RXNS}" \ + cat /sys/class/net/"$BOND_RX2_SLAVE_IF"/ifindex) + + exec {TXNS_FD} "$NSIM_DEV_SYS_LINK" 
+ echo "${TXNS_FD}:$BOND_TX2_SLAVE_IFIDX $RXNS_FD:$BOND_RX2_SLAVE_IFIDX" \ + > "$NSIM_DEV_SYS_LINK" + + exec {TXNS_FD}<&- + exec {RXNS_FD}<&- +} + +function create_all_ifaces() { + # setup_ns function is coming from lib.sh + setup_ns TXNS RXNS + export NAMESPACE="${RXNS}" + + # Create two interfaces for RX and two for TX + create_ifaces_bond + # Link netlink ifaces + link_ifaces_bond +} + +# configure DSTIF and SRCIF IPs +function configure_ifaces_ips() { + local IP_VERSION=${1:-"ipv4"} + select_ipv4_or_ipv6 "${IP_VERSION}" + + ip -n "${RXNS}" addr add "${DSTIP}"/24 dev "${DSTIF}" + ip -n "${RXNS}" link set "${DSTIF}" up + + ip -n "${TXNS}" addr add "${SRCIP}"/24 dev "${SRCIF}" + ip -n "${TXNS}" link set "${SRCIF}" up +} + +function test_enable_netpoll_on_enslaved_iface() { + echo 0 > "${NETCONS_PATH}"/enabled + + # At this stage, BOND_TX1_SLAVE_IF is enslaved to BOND_TX_MAIN_IF, and + # linked to BOND_RX1_SLAVE_IF inside the namespace. + echo "${BOND_TX1_SLAVE_IF}" > "${NETCONS_PATH}"/dev_name + + # This should fail with the following message in dmesg: + # netpoll: netconsole: ethX is a slave device, aborting + set +e + enable_netcons_ns 2> /dev/null + set -e + + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 1 ]] + then + echo "test failed: Bonding and netpoll cannot co-exists." >&2 + exit "${ksft_fail}" + fi +} + +function test_delete_bond_and_reenable_target() { + ip -n "${TXNS}" \ + link delete "${BOND_TX_MAIN_IF}" type bond + + # BOND_TX1_SLAVE_IF is not attached to a bond interface anymore + # netpoll can be plugged in there + echo "${BOND_TX1_SLAVE_IF}" > "${NETCONS_PATH}"/dev_name + + # this should work, since the interface is not enslaved + enable_netcons_ns + + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]] + then + echo "test failed: Unable to start netpoll on an unbond iface." 
>&2 + exit "${ksft_fail}" + fi +} + +# Send a netconsole message to the netconsole target +function test_send_netcons_msg_through_bond_iface() { + # Listen for netconsole port inside the namespace and + # destination interface + listen_port_and_save_to "${OUTPUT_FILE}" "${IP_VERSION}" & + # Wait for socat to start and listen to the port. + wait_for_port "${RXNS}" "${PORT}" "${IP_VERSION}" + # Send the message + echo "${MSG}: ${TARGET}" > /dev/kmsg + # Wait until socat saves the file to disk + busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" + # Make sure the message was received in the dst part + # and exit + validate_result "${OUTPUT_FILE}" "${FORMAT}" + # kill socat in case it is still running + pkill_socat +} + +# BOND_TX1_SLAVE_IF has netconsole enabled on it, bind it to BOND_TX_MAIN_IF. +# Given BOND_TX_MAIN_IF was deleted, recreate it first +function test_enslave_netcons_enabled_iface { + # netconsole got disabled while the interface was down + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]] + then + echo "test failed: netconsole expected to be enabled against BOND_TX1_SLAVE_IF" >&2 + exit "${ksft_fail}" + fi + + # recreate the bonding iface. it got deleted by previous + # test (test_delete_bond_and_reenable_target) + ip -n "${TXNS}" \ + link add "${BOND_TX_MAIN_IF}" type bond mode balance-rr + + # sub-interface need to be down before attaching to bonding + # This will also disable netconsole. + ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" down + ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + ip -n "${TXNS}" \ + link set "${BOND_TX_MAIN_IF}" up + + # netconsole got disabled while the interface was down + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 1 ]] + then + echo "test failed: Device is part of a bond iface, cannot have netcons enabled" >&2 + exit "${ksft_fail}" + fi +} + +# Get netconsole enabled on a bonding interface and attach a second +# sub-interface. 
+function test_enslave_iface_to_bond { + # BOND_TX_MAIN_IF has only BOND_TX1_SLAVE_IF right now + echo "${BOND_TX_MAIN_IF}" > "${NETCONS_PATH}"/dev_name + enable_netcons_ns + + # netcons is attached to bond0 and BOND_TX1_SLAVE_IF is + # part of BOND_TX_MAIN_IF. Attach BOND_TX2_SLAVE_IF to BOND_TX_MAIN_IF. + ip -n "${TXNS}" \ + link set "${BOND_TX2_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]] + then + echo "test failed: Netconsole should be enabled on bonding interface. Failed" >&2 + exit "${ksft_fail}" + fi +} + +function test_enslave_iff_disabled_netpoll_iface { + local ret + + # Create two interfaces. veth interfaces are known to have + # IFF_DISABLE_NETPOLL set + if ! ip link add "${VETH0}" type veth peer name "${VETH1}" + then + echo "Failed to create veth TX interface. Is CONFIG_VETH set?" >&2 + exit "${ksft_skip}" + fi + set +e + # This will print RTNETLINK answers: Device or resource busy + ip link set "${VETH0}" master "${BOND_TX_MAIN_IF}" 2> /dev/null + ret=$? 
+ set -e + if [[ $ret -eq 0 ]] + then + echo "test failed: veth interface could not be enslaved" + exit "${ksft_fail}" + fi +} + +# Given that netconsole picks the current net namespace, we need to enable it +# from inside the TXNS namespace +function enable_netcons_ns() { + ip netns exec "${TXNS}" sh -c \ + "mount -t configfs configfs /sys/kernel/config && echo 1 > $NETCONS_PATH/enabled" +} + +#################### +# Tests start here # +#################### + +# Create regular interfaces using netdevsim and link them +create_all_ifaces + +# Setup the bonding interfaces +# BOND_RX_MAIN_IF has BOND_RX{1,2}_SLAVE_IF +# BOND_TX_MAIN_IF has BOND_TX{1,2}_SLAVE_IF +setup_bonding_ifaces + +# Configure the ips as BOND_RX1_SLAVE_IF and BOND_TX1_SLAVE_IF +configure_ifaces_ips "${IP_VERSION}" + +_create_dynamic_target "${FORMAT}" "${NETCONS_PATH}" +enable_netcons_ns +set_user_data + +# Test #1 : Create an bonding interface and attach netpoll into +# the bonding interface. Netconsole/netpoll should work on +# the bonding interface. +test_send_netcons_msg_through_bond_iface +echo "test #1: netpoll on bonding interface worked. Test passed" >&2 + +# Test #2: Attach netpoll to an enslaved interface +# Try to attach netpoll to an enslaved sub-interface (while still being part of +# a bonding interface), which shouldn't be allowed +test_enable_netpoll_on_enslaved_iface +echo "test #2: netpoll correctly rejected enslaved interface (expected behavior). Test passed." >&2 + +# Test #3: Unplug the sub-interface from bond and enable netconsole +# Detach the interface from a bonding interface and attach netpoll again +test_delete_bond_and_reenable_target +echo "test #3: Able to attach to an unbound interface. Test passed." >&2 + +# Test #4: Enslave a sub-interface that had netconsole enabled +# Try to enslave an interface that has netconsole/netpoll enabled. 
+# Previous test has netconsole enabled in BOND_TX1_SLAVE_IF, try to enslave it +test_enslave_netcons_enabled_iface +echo "test #4: Enslaving an interface with netpoll attached. Test passed." >&2 + +# Test #5: Enslave a sub-interface to a bonding interface +# Enslave an interface to a bond interface that has netpoll attached +# At this stage, BOND_TX_MAIN_IF is created and BOND_TX1_SLAVE_IF is part of +# it. Netconsole is currently disabled +test_enslave_iface_to_bond +echo "test #5: Enslaving an interface to bond+netpoll. Test passed." >&2 + +# Test #6: Enslave a IFF_DISABLE_NETPOLL sub-interface to a bonding interface +# At this stage, BOND_TX_MAIN_IF has both sub interface and netconsole is +# enabled. This test will try to enslave an a veth (IFF_DISABLE_NETPOLL) interface +# and it should fail, with netpoll: veth0 doesn't support polling +test_enslave_iff_disabled_netpoll_iface +echo "test #6: Enslaving IFF_DISABLE_NETPOLL ifaces to bond iface is not supported. Test passed." >&2 + +cleanup_bond +trap - EXIT +exit "${EXIT_STATUS}" diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh index 9b5ef8074440..87f89fd92f8c 100644 --- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh +++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh @@ -11,9 +11,11 @@ set -euo pipefail LIBDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") SRCIF="" # to be populated later +SRCIP="" # to be populated later SRCIP4="192.0.2.1" SRCIP6="fc00::1" DSTIF="" # to be populated later +DSTIP="" # to be populated later DSTIP4="192.0.2.2" DSTIP6="fc00::2" @@ -28,17 +30,23 @@ NETCONS_PATH="${NETCONS_CONFIGFS}"/"${TARGET}" # NAMESPACE will be populated by setup_ns with a random value NAMESPACE="" -# IDs for netdevsim +# IDs for netdevsim. We either use NSIM_DEV_{1,2}_ID for standard test +# or NSIM_BOND_{T,R}X_{1,2} for the bonding tests. Not both at the +# same time. 
NSIM_DEV_1_ID=$((256 + RANDOM % 256)) NSIM_DEV_2_ID=$((512 + RANDOM % 256)) +NSIM_BOND_TX_1=$((768 + RANDOM % 256)) +NSIM_BOND_TX_2=$((1024 + RANDOM % 256)) +NSIM_BOND_RX_1=$((1280 + RANDOM % 256)) +NSIM_BOND_RX_2=$((1536 + RANDOM % 256)) NSIM_DEV_SYS_NEW="/sys/bus/netdevsim/new_device" +NSIM_DEV_SYS_LINK="/sys/bus/netdevsim/link_device" # Used to create and delete namespaces source "${LIBDIR}"/../../../../net/lib.sh # Create netdevsim interfaces create_ifaces() { - echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_NEW" echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_NEW" udevadm settle 2> /dev/null || true @@ -137,9 +145,6 @@ function _create_dynamic_target() { then echo 1 > "${NCPATH}"/extended fi - - echo 1 > "${NCPATH}"/enabled - } function create_dynamic_target() { @@ -147,6 +152,8 @@ function create_dynamic_target() { local NCPATH=${2:-"$NETCONS_PATH"} _create_dynamic_target "${FORMAT}" "${NCPATH}" + echo 1 > "${NCPATH}"/enabled + # This will make sure that the kernel was able to # load the netconsole driver configuration. The console message # gets more organized/sequential as well. @@ -193,14 +200,26 @@ function do_cleanup() { echo "${DEFAULT_PRINTK_VALUES}" > /proc/sys/kernel/printk } -function cleanup() { +function cleanup_netcons() { # delete netconsole dynamic reconfiguration - echo 0 > "${NETCONS_PATH}"/enabled + # do not fail if the target is already disabled + if [[ ! 
-d "${NETCONS_PATH}" ]] + then + # in some cases this is called before netcons path is created + return + fi + if [[ $(cat "${NETCONS_PATH}"/enabled) != 0 ]] + then + echo 0 > "${NETCONS_PATH}"/enabled || true + fi # Remove all the keys that got created during the selftest find "${NETCONS_PATH}/userdata/" -mindepth 1 -type d -delete # Remove the configfs entry rmdir "${NETCONS_PATH}" +} +function cleanup() { + cleanup_netcons do_cleanup } @@ -377,3 +396,24 @@ function wait_for_port() { # more frequently on IPv6 sleep 1 } + +# Clean up netdevsim ifaces created for bonding test +function cleanup_bond_nsim() { + ip -n "${TXNS}" \ + link delete "${BOND_TX_MAIN_IF}" type bond || true + ip -n "${RXNS}" \ + link delete "${BOND_RX_MAIN_IF}" type bond || true + + cleanup_netdevsim "$NSIM_BOND_TX_1" + cleanup_netdevsim "$NSIM_BOND_TX_2" + cleanup_netdevsim "$NSIM_BOND_RX_1" + cleanup_netdevsim "$NSIM_BOND_RX_2" +} + +# cleanup tests that use bonding interfaces +function cleanup_bond() { + cleanup_netcons + cleanup_bond_nsim + cleanup_all_ns + ip link delete "${VETH0}" || true +} From 7fe0d21f5633af8c3fab9f0ef0706c6156623484 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 10 Nov 2025 14:26:18 +0200 Subject: [PATCH 294/543] wifi: mac80211: skip rate verification for not captured PSDUs If for example the sniffer did not follow any AIDs in an MU frame, then some of the information may not be filled in or is even expected to be invalid. As an example, in that case it is expected that Nss is zero. 
Fixes: 2ff5e52e7836 ("radiotap: add 0-length PSDU "not captured" type") Signed-off-by: Benjamin Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20251110142554.83a2858ee15b.I9f78ce7984872f474722f9278691ae16378f0a3e@changeid Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 6af43dfefdd6..5b4c3fe9970a 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -5360,10 +5360,14 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, if (WARN_ON(!local->started)) goto drop; - if (likely(!(status->flag & RX_FLAG_FAILED_PLCP_CRC))) { + if (likely(!(status->flag & RX_FLAG_FAILED_PLCP_CRC) && + !(status->flag & RX_FLAG_NO_PSDU && + status->zero_length_psdu_type == + IEEE80211_RADIOTAP_ZERO_LEN_PSDU_NOT_CAPTURED))) { /* - * Validate the rate, unless a PLCP error means that - * we probably can't have a valid rate here anyway. + * Validate the rate, unless there was a PLCP error which may + * have an invalid rate or the PSDU was not captured and may be + * missing rate information. */ switch (status->encoding) { From eaa7ce66c3e2ccda035022b5e8af09caabecd635 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 10 Nov 2025 14:01:54 +0200 Subject: [PATCH 295/543] wifi: mac80211_hwsim: Fix possible NULL dereference The 'vif' pointer in the Tx information might be NULL, e.g., in case of injected frames etc. and is not checked in all paths. Fix it. While at it, also directly use the local 'vif' pointer. 
Fixes: a37a6f54439b ("wifi: mac80211_hwsim: Add simulation support for NAN device") Signed-off-by: Ilan Peer Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-wireless/aNJUlyIiSTW9zZdr@stanley.mountain Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20251110140128.ec00ae795a32.I9c65659b52434189d8b2ba06710d482669a3887a@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/virtual/mac80211_hwsim.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c index d28bf18d57ec..5903d82e1ab1 100644 --- a/drivers/net/wireless/virtual/mac80211_hwsim.c +++ b/drivers/net/wireless/virtual/mac80211_hwsim.c @@ -2003,8 +2003,14 @@ static void mac80211_hwsim_tx(struct ieee80211_hw *hw, struct ieee80211_sta *sta = control->sta; struct ieee80211_bss_conf *bss_conf; + /* This can happen in case of monitor injection */ + if (!vif) { + ieee80211_free_txskb(hw, skb); + return; + } + if (link != IEEE80211_LINK_UNSPECIFIED) { - bss_conf = rcu_dereference(txi->control.vif->link_conf[link]); + bss_conf = rcu_dereference(vif->link_conf[link]); if (sta) link_sta = rcu_dereference(sta->link[link]); } else { @@ -2065,13 +2071,13 @@ static void mac80211_hwsim_tx(struct ieee80211_hw *hw, return; } - if (txi->control.vif) - hwsim_check_magic(txi->control.vif); + if (vif) + hwsim_check_magic(vif); if (control->sta) hwsim_check_sta_magic(control->sta); if (ieee80211_hw_check(hw, SUPPORTS_RC_TABLE)) - ieee80211_get_tx_rates(txi->control.vif, control->sta, skb, + ieee80211_get_tx_rates(vif, control->sta, skb, txi->control.rates, ARRAY_SIZE(txi->control.rates)); From c4e1ac09ee1c750890e36cb1f841f25518f23589 Mon Sep 17 00:00:00 2001 From: Pawel Dembicki Date: Tue, 11 Nov 2025 11:07:29 +0100 Subject: [PATCH 296/543] wifi: mwl8k: inject DSSS Parameter Set element into beacons if missing Some Marvell AP firmware used with mwl8k 
misbehaves when beacon frames do not contain a WLAN_EID_DS_PARAMS element with the current channel. It was reported on OpenWrt Github issues [0]. When hostapd/mac80211 omits DSSS Parameter Set from the beacon (which is valid on some bands), the firmware stops transmitting sane frames and RX status starts reporting bogus channel information. This makes AP mode unusable. Newer Marvell drivers (mwlwifi [1]) hard-code DSSS Parameter Set into AP beacons for all chips, which suggests this is a firmware requirement rather than a mwl8k-specific quirk. Mirror that behaviour in mwl8k: when setting the beacon, check if WLAN_EID_DS_PARAMS is present, and if not, extend the beacon and inject a DSSS Parameter Set element, using the current channel from hw->conf.chandef.chan. Tested on Linksys EA4500 (88W8366). [0] https://github.com/openwrt/openwrt/issues/19088 [1] https://github.com/kaloz/mwlwifi/blob/db97edf20fadea2617805006f5230665fadc6a8c/hif/fwcmd.c#L675 Fixes: b64fe619e371 ("mwl8k: basic AP interface support") Tested-by: Antony Kolitsos Signed-off-by: Pawel Dembicki Link: https://patch.msgid.link/20251111100733.2825970-3-paweldembicki@gmail.com Signed-off-by: Johannes Berg --- drivers/net/wireless/marvell/mwl8k.c | 71 ++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/marvell/mwl8k.c b/drivers/net/wireless/marvell/mwl8k.c index 891e125ad30b..54d6d00ecdf1 100644 --- a/drivers/net/wireless/marvell/mwl8k.c +++ b/drivers/net/wireless/marvell/mwl8k.c @@ -2966,6 +2966,51 @@ mwl8k_cmd_rf_antenna(struct ieee80211_hw *hw, int antenna, int mask) /* * CMD_SET_BEACON. 
*/ + +static bool mwl8k_beacon_has_ds_params(const u8 *buf, int len) +{ + const struct ieee80211_mgmt *mgmt = (const void *)buf; + int ies_len; + + if (len <= offsetof(struct ieee80211_mgmt, u.beacon.variable)) + return false; + + ies_len = len - offsetof(struct ieee80211_mgmt, u.beacon.variable); + + return cfg80211_find_ie(WLAN_EID_DS_PARAMS, mgmt->u.beacon.variable, + ies_len) != NULL; +} + +static void mwl8k_beacon_copy_inject_ds_params(struct ieee80211_hw *hw, + u8 *buf_dst, const u8 *buf_src, + int src_len) +{ + const struct ieee80211_mgmt *mgmt = (const void *)buf_src; + static const u8 before_ds_params[] = { + WLAN_EID_SSID, + WLAN_EID_SUPP_RATES, + }; + const u8 *ies; + int hdr_len, left, offs, pos; + + ies = mgmt->u.beacon.variable; + hdr_len = offsetof(struct ieee80211_mgmt, u.beacon.variable); + + offs = ieee80211_ie_split(ies, src_len - hdr_len, before_ds_params, + ARRAY_SIZE(before_ds_params), 0); + + pos = hdr_len + offs; + left = src_len - pos; + + memcpy(buf_dst, buf_src, pos); + + /* Inject a DSSS Parameter Set after SSID + Supp Rates */ + buf_dst[pos + 0] = WLAN_EID_DS_PARAMS; + buf_dst[pos + 1] = 1; + buf_dst[pos + 2] = hw->conf.chandef.chan->hw_value; + + memcpy(buf_dst + pos + 3, buf_src + pos, left); +} struct mwl8k_cmd_set_beacon { struct mwl8k_cmd_pkt_hdr header; __le16 beacon_len; @@ -2975,17 +3020,33 @@ struct mwl8k_cmd_set_beacon { static int mwl8k_cmd_set_beacon(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u8 *beacon, int len) { + bool ds_params_present = mwl8k_beacon_has_ds_params(beacon, len); struct mwl8k_cmd_set_beacon *cmd; - int rc; + int rc, final_len = len; - cmd = kzalloc(sizeof(*cmd) + len, GFP_KERNEL); + if (!ds_params_present) { + /* + * mwl8k firmware requires a DS Params IE with the current + * channel in AP beacons. If mac80211/hostapd does not + * include it, inject one here. IE ID + length + channel + * number = 3 bytes. 
+ */ + final_len += 3; + } + + cmd = kzalloc(sizeof(*cmd) + final_len, GFP_KERNEL); if (cmd == NULL) return -ENOMEM; cmd->header.code = cpu_to_le16(MWL8K_CMD_SET_BEACON); - cmd->header.length = cpu_to_le16(sizeof(*cmd) + len); - cmd->beacon_len = cpu_to_le16(len); - memcpy(cmd->beacon, beacon, len); + cmd->header.length = cpu_to_le16(sizeof(*cmd) + final_len); + cmd->beacon_len = cpu_to_le16(final_len); + + if (ds_params_present) + memcpy(cmd->beacon, beacon, len); + else + mwl8k_beacon_copy_inject_ds_params(hw, cmd->beacon, beacon, + len); rc = mwl8k_post_pervif_cmd(hw, vif, &cmd->header); kfree(cmd); From ebd729fef31620e0bf74cbf8a4c7fda73a2a4e7e Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 20 Oct 2025 02:11:49 +0100 Subject: [PATCH 297/543] MIPS: Malta: Fix !EVA SOC-it PCI MMIO Fix a regression that has caused accesses to the PCI MMIO window to complete unclaimed in non-EVA configurations with the SOC-it family of system controllers, preventing PCI devices from working that use MMIO. In the non-EVA case PHYS_OFFSET is set to 0, meaning that PCI_BAR0 is set with an empty mask (and PCI_HEAD4 matches addresses starting from 0 accordingly). Consequently all addresses are matched for incoming DMA accesses from PCI. This seems to confuse the system controller's logic and outgoing bus cycles targeting the PCI MMIO window seem not to make it to the intended devices. This happens as well when a wider mask is used with PCI_BAR0, such as 0x80000000 or 0xe0000000, that makes addresses match that overlap with the PCI MMIO window, which starts at 0x10000000 in our configuration. Set the mask in PCI_BAR0 to 0xf0000000 for non-EVA then, covering the non-EVA maximum 256 MiB of RAM, which is what YAMON does and which used to work correctly up to the offending commit. Set PCI_P2SCMSKL to match PCI_BAR0 as required by the system controller's specification, and match PCI_P2SCMAPL to PCI_HEAD4 for identity mapping. 
Verified with: Core board type/revision = 0x0d (Core74K) / 0x01 System controller/revision = MIPS SOC-it 101 OCP / 1.3 SDR-FW-4:1 Processor Company ID/options = 0x01 (MIPS Technologies, Inc.) / 0x1c Processor ID/revision = 0x97 (MIPS 74Kf) / 0x4c for non-EVA and with: Core board type/revision = 0x0c (CoreFPGA-5) / 0x00 System controller/revision = MIPS ROC-it2 / 0.0 FW-1:1 (CLK_unknown) GIC Processor Company ID/options = 0x01 (MIPS Technologies, Inc.) / 0x00 Processor ID/revision = 0xa0 (MIPS interAptiv UP) / 0x20 for EVA/non-EVA, fixing: defxx 0000:00:12.0: assign IRQ: got 10 defxx: v1.12 2021/03/10 Lawrence V. Stefani and others 0000:00:12.0: Could not read adapter factory MAC address! vs: defxx 0000:00:12.0: assign IRQ: got 10 defxx: v1.12 2021/03/10 Lawrence V. Stefani and others 0000:00:12.0: DEFPA at MMIO addr = 0x10142000, IRQ = 10, Hardware addr = 00-00-f8-xx-xx-xx 0000:00:12.0: registered as fddi0 for non-EVA and causing no change for EVA. Signed-off-by: Maciej W. Rozycki Fixes: 422dd256642b ("MIPS: Malta: Allow PCI devices DMA to lower 2GB physical") Cc: stable@vger.kernel.org # v4.9+ Signed-off-by: Thomas Bogendoerfer --- arch/mips/mti-malta/malta-init.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/mips/mti-malta/malta-init.c b/arch/mips/mti-malta/malta-init.c index 000d6d50520a..82b0fd8576a2 100644 --- a/arch/mips/mti-malta/malta-init.c +++ b/arch/mips/mti-malta/malta-init.c @@ -241,16 +241,22 @@ void __init prom_init(void) #endif /* - * Setup the Malta max (2GB) memory for PCI DMA in host bridge - * in transparent addressing mode. + * Set up memory mapping in host bridge for PCI DMA masters, + * in transparent addressing mode. For EVA use the Malta + * maximum of 2 GiB memory in the alias space at 0x80000000 + * as per PHYS_OFFSET. 
Otherwise use 256 MiB of memory in + * the regular space, avoiding mapping the PCI MMIO window + * for DMA as it seems to confuse the system controller's + * logic, causing PCI MMIO to stop working. */ - mask = PHYS_OFFSET | PCI_BASE_ADDRESS_MEM_PREFETCH; - MSC_WRITE(MSC01_PCI_BAR0, mask); - MSC_WRITE(MSC01_PCI_HEAD4, mask); + mask = PHYS_OFFSET ? PHYS_OFFSET : 0xf0000000; + MSC_WRITE(MSC01_PCI_BAR0, + mask | PCI_BASE_ADDRESS_MEM_PREFETCH); + MSC_WRITE(MSC01_PCI_HEAD4, + PHYS_OFFSET | PCI_BASE_ADDRESS_MEM_PREFETCH); - mask &= MSC01_PCI_BAR0_SIZE_MSK; MSC_WRITE(MSC01_PCI_P2SCMSKL, mask); - MSC_WRITE(MSC01_PCI_P2SCMAPL, mask); + MSC_WRITE(MSC01_PCI_P2SCMAPL, PHYS_OFFSET); /* Don't handle target retries indefinitely. */ if ((data & MSC01_PCI_CFG_MAXRTRY_MSK) == From 09782e72eec451fa14d327595f86cdc338ebe53c Mon Sep 17 00:00:00 2001 From: Aleksander Jan Bajkowski Date: Fri, 17 Oct 2025 20:01:19 +0200 Subject: [PATCH 298/543] mips: dts: econet: fix EN751221 core type In fact, it is a multi-threaded MIPS34Kc, not a single-threaded MIPS24Kc. Fixes: 0ec488700972 ("mips: dts: Add EcoNet DTS with EN751221 and SmartFiber XP8421-B board") Signed-off-by: Aleksander Jan Bajkowski Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/econet/en751221.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/boot/dts/econet/en751221.dtsi b/arch/mips/boot/dts/econet/en751221.dtsi index 66197e73d4f0..2abeef5b744a 100644 --- a/arch/mips/boot/dts/econet/en751221.dtsi +++ b/arch/mips/boot/dts/econet/en751221.dtsi @@ -18,7 +18,7 @@ cpus: cpus { cpu@0 { device_type = "cpu"; - compatible = "mips,mips24KEc"; + compatible = "mips,mips34Kc"; reg = <0>; }; }; From 0f559cd91e37b7978e4198ca2fbf7eb95df11361 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 10 Nov 2025 17:30:10 +0000 Subject: [PATCH 299/543] KVM: arm64: Finalize ID registers only once per VM Owing to the ID registers being global to the VM, there is no point in computing them more than once. 
However, recent changes making use of kvm_set_vm_id_reg() outlined that we repeatedly hammer the ID registers when we shouldn't. Gate the ID reg update on the VM having never run. Fixes: 50e7cce81b9b2 ("KVM: arm64: Limit clearing of ID_{AA64PFR0,PFR1}_EL1.GIC to userspace irqchip") Fixes: 5cb57a1aff755 ("KVM: arm64: Zero ID_AA64PFR0_EL1.GIC when no GICv3 is presented to the guest") Closes: https://lore.kernel.org/r/aRHf6x5umkTYhYJ3@finisterre.sirena.org.uk Reported-by: Mark Brown Tested-by: Mark Brown Link: https://patch.msgid.link/20251110173010.1918424-1-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 8ae2bca81614..ec3fbe0b8d52 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -5609,7 +5609,11 @@ int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu) guard(mutex)(&kvm->arch.config_lock); - if (!irqchip_in_kernel(kvm)) { + /* + * This hacks into the ID registers, so only perform it when the + * first vcpu runs, or the kvm_set_vm_id_reg() helper will scream. + */ + if (!irqchip_in_kernel(kvm) && !kvm_vm_has_ran_once(kvm)) { u64 val; val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; From 60e6489f8e3b086bd1130ad4450a2c112e863791 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sun, 9 Nov 2025 02:52:22 +0000 Subject: [PATCH 300/543] af_unix: Initialise scc_index in unix_add_edge(). Quang Le reported that the AF_UNIX GC could garbage-collect a receive queue of an alive in-flight socket, with a nice repro. The repro consists of three stages. 1) 1-a. Create a single cyclic reference with many sockets 1-b. close() all sockets 1-c. Trigger GC 2) 2-a. Pass sk-A to an embryo sk-B 2-b. Pass sk-X to sk-X 2-c. Trigger GC 3) 3-a. accept() the embryo sk-B 3-b. Pass sk-B to sk-C 3-c. close() the in-flight sk-A 3-d. 
Trigger GC As of 2-c, sk-A and sk-X are linked to unix_unvisited_vertices, and unix_walk_scc() groups them into two different SCCs: unix_sk(sk-A)->vertex->scc_index = 2 (UNIX_VERTEX_INDEX_START) unix_sk(sk-X)->vertex->scc_index = 3 Once GC completes, unix_graph_grouped is set to true. Also, unix_graph_maybe_cyclic is set to true due to sk-X's cyclic self-reference, which makes close() trigger GC. At 3-b, unix_add_edge() allocates unix_sk(sk-B)->vertex and links it to unix_unvisited_vertices. unix_update_graph() is called at 3-a. and 3-b., but neither unix_graph_grouped nor unix_graph_maybe_cyclic is changed because both sk-B's listener and sk-C are not in-flight. 3-c decrements sk-A's file refcnt to 1. Since unix_graph_grouped is true at 3-d, unix_walk_scc_fast() is finally called and iterates 3 sockets sk-A, sk-B, and sk-X: sk-A -> sk-B (-> sk-C) sk-X -> sk-X This is totally fine. All of them are not yet close()d and should be grouped into different SCCs. However, unix_vertex_dead() misjudges that sk-A and sk-B are in the same SCC and sk-A is dead. unix_sk(sk-A)->scc_index == unix_sk(sk-B)->scc_index <-- Wrong! && sk-A's file refcnt == unix_sk(sk-A)->vertex->out_degree ^-- 1 in-flight count for sk-B -> sk-A is dead !? The problem is that unix_add_edge() does not initialise scc_index. Stage 1) is used for heap spraying, making a newly allocated vertex have vertex->scc_index == 2 (UNIX_VERTEX_INDEX_START) set by unix_walk_scc() at 1-c. Let's track the max SCC index from the previous unix_walk_scc() call and assign the max + 1 to a new vertex's scc_index. This way, we can continue to avoid Tarjan's algorithm while preventing misjudgments. 
Fixes: ad081928a8b0 ("af_unix: Avoid Tarjan's algorithm if unnecessary.") Reported-by: Quang Le Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20251109025233.3659187-1-kuniyu@google.com Signed-off-by: Paolo Abeni --- net/unix/garbage.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 684ab03137b6..65396a4e1b07 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -145,6 +145,7 @@ enum unix_vertex_index { }; static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1; +static unsigned long unix_vertex_max_scc_index = UNIX_VERTEX_INDEX_START; static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { @@ -153,6 +154,7 @@ static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) if (!vertex) { vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); vertex->index = unix_vertex_unvisited_index; + vertex->scc_index = ++unix_vertex_max_scc_index; vertex->out_degree = 0; INIT_LIST_HEAD(&vertex->edges); INIT_LIST_HEAD(&vertex->scc_entry); @@ -489,10 +491,15 @@ static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_inde scc_dead = unix_vertex_dead(v); } - if (scc_dead) + if (scc_dead) { unix_collect_skb(&scc, hitlist); - else if (!unix_graph_maybe_cyclic) - unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + } else { + if (unix_vertex_max_scc_index < vertex->scc_index) + unix_vertex_max_scc_index = vertex->scc_index; + + if (!unix_graph_maybe_cyclic) + unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + } list_del(&scc); } @@ -507,6 +514,7 @@ static void unix_walk_scc(struct sk_buff_head *hitlist) unsigned long last_index = UNIX_VERTEX_INDEX_START; unix_graph_maybe_cyclic = false; + unix_vertex_max_scc_index = UNIX_VERTEX_INDEX_START; /* Visit every vertex exactly once. * __unix_walk_scc() moves visited vertices to unix_visited_vertices. 
From fd3ecda38fe0cb713d167b5477d25f6b350f0514 Mon Sep 17 00:00:00 2001 From: Niravkumar L Rabara Date: Tue, 11 Nov 2025 16:08:01 +0800 Subject: [PATCH 301/543] EDAC/altera: Handle OCRAM ECC enable after warm reset The OCRAM ECC is always enabled either by the BootROM or by the Secure Device Manager (SDM) during a power-on reset on SoCFPGA. However, during a warm reset, the OCRAM content is retained to preserve data, while the control and status registers are reset to their default values. As a result, ECC must be explicitly re-enabled after a warm reset. Fixes: 17e47dc6db4f ("EDAC/altera: Add Stratix10 OCRAM ECC support") Signed-off-by: Niravkumar L Rabara Signed-off-by: Borislav Petkov (AMD) Acked-by: Dinh Nguyen Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20251111080801.1279401-1-niravkumarlaxmidas.rabara@altera.com --- drivers/edac/altera_edac.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index 103b2c2eba2a..a776d61027f2 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -1184,10 +1184,22 @@ altr_check_ocram_deps_init(struct altr_edac_device_dev *device) if (ret) return ret; - /* Verify OCRAM has been initialized */ + /* + * Verify that OCRAM has been initialized. + * During a warm reset, OCRAM contents are retained, but the control + * and status registers are reset to their default values. Therefore, + * ECC must be explicitly re-enabled in the control register. + * Error condition: if INITCOMPLETEA is clear and ECC_EN is already set. 
+ */ if (!ecc_test_bits(ALTR_A10_ECC_INITCOMPLETEA, - (base + ALTR_A10_ECC_INITSTAT_OFST))) - return -ENODEV; + (base + ALTR_A10_ECC_INITSTAT_OFST))) { + if (!ecc_test_bits(ALTR_A10_ECC_EN, + (base + ALTR_A10_ECC_CTRL_OFST))) + ecc_set_bits(ALTR_A10_ECC_EN, + (base + ALTR_A10_ECC_CTRL_OFST)); + else + return -ENODEV; + } /* Enable IRQ on Single Bit Error */ writel(ALTR_A10_ECC_SERRINTEN, (base + ALTR_A10_ECC_ERRINTENS_OFST)); From 485e0626e58768f3c53ba61ab9e09d6b60a455f4 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 6 Nov 2025 13:05:35 -0500 Subject: [PATCH 302/543] Bluetooth: hci_event: Fix not handling PA Sync Lost event This handles the PA Sync Lost event, which was previously assumed to be handled together with BIG Sync Lost. Their lifetimes are not the same, which is why there are two different events to report when each sync is lost. Fixes: b2a5f2e1c127 ("Bluetooth: hci_event: Add support for handling LE BIG Sync Lost event") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 5 ++++ net/bluetooth/hci_event.c | 49 ++++++++++++++++++++++++++----------- 2 files changed, 40 insertions(+), 14 deletions(-) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 8d0e703bc929..cb4c02d00759 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -2783,6 +2783,11 @@ struct hci_ev_le_per_adv_report { __u8 data[]; } __packed; +#define HCI_EV_LE_PA_SYNC_LOST 0x10 +struct hci_ev_le_pa_sync_lost { + __le16 handle; +} __packed; + #define LE_PA_DATA_COMPLETE 0x00 #define LE_PA_DATA_MORE_TO_COME 0x01 #define LE_PA_DATA_TRUNCATED 0x02 diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 03328c1dd090..3838b90343d9 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5843,6 +5843,29 @@ static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev, void *data, le16_to_cpu(ev->supervision_timeout)); } +static void hci_le_pa_sync_lost_evt(struct hci_dev *hdev, void *data, + 
struct sk_buff *skb) +{ + struct hci_ev_le_pa_sync_lost *ev = data; + u16 handle = le16_to_cpu(ev->handle); + struct hci_conn *conn; + + bt_dev_dbg(hdev, "sync handle 0x%4.4x", handle); + + hci_dev_lock(hdev); + + /* Delete the pa sync connection */ + conn = hci_conn_hash_lookup_pa_sync_handle(hdev, handle); + if (conn) { + clear_bit(HCI_CONN_BIG_SYNC, &conn->flags); + clear_bit(HCI_CONN_PA_SYNC, &conn->flags); + hci_disconn_cfm(conn, HCI_ERROR_REMOTE_USER_TERM); + hci_conn_del(conn); + } + + hci_dev_unlock(hdev); +} + static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -7046,29 +7069,24 @@ static void hci_le_big_sync_lost_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { struct hci_evt_le_big_sync_lost *ev = data; - struct hci_conn *bis, *conn; - bool mgmt_conn; + struct hci_conn *bis; + bool mgmt_conn = false; bt_dev_dbg(hdev, "big handle 0x%2.2x", ev->handle); hci_dev_lock(hdev); - /* Delete the pa sync connection */ - bis = hci_conn_hash_lookup_pa_sync_big_handle(hdev, ev->handle); - if (bis) { - conn = hci_conn_hash_lookup_pa_sync_handle(hdev, - bis->sync_handle); - if (conn) - hci_conn_del(conn); - } - /* Delete each bis connection */ while ((bis = hci_conn_hash_lookup_big_state(hdev, ev->handle, BT_CONNECTED, HCI_ROLE_SLAVE))) { - mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &bis->flags); - mgmt_device_disconnected(hdev, &bis->dst, bis->type, bis->dst_type, - ev->reason, mgmt_conn); + if (!mgmt_conn) { + mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, + &bis->flags); + mgmt_device_disconnected(hdev, &bis->dst, bis->type, + bis->dst_type, ev->reason, + mgmt_conn); + } clear_bit(HCI_CONN_BIG_SYNC, &bis->flags); hci_disconn_cfm(bis, ev->reason); @@ -7182,6 +7200,9 @@ static const struct hci_le_ev { hci_le_per_adv_report_evt, sizeof(struct hci_ev_le_per_adv_report), HCI_MAX_EVENT_SIZE), + /* [0x10 = HCI_EV_LE_PA_SYNC_LOST] */ + HCI_LE_EV(HCI_EV_LE_PA_SYNC_LOST, hci_le_pa_sync_lost_evt, + 
sizeof(struct hci_ev_le_pa_sync_lost)), /* [0x12 = HCI_EV_LE_EXT_ADV_SET_TERM] */ HCI_LE_EV(HCI_EV_LE_EXT_ADV_SET_TERM, hci_le_ext_adv_term_evt, sizeof(struct hci_evt_le_ext_adv_set_term)), From 281326be67252ac5794d1383f67526606b1d6b13 Mon Sep 17 00:00:00 2001 From: Niravkumar L Rabara Date: Tue, 11 Nov 2025 16:13:33 +0800 Subject: [PATCH 303/543] EDAC/altera: Use INTTEST register for Ethernet and USB SBE injection The current single-bit error injection mechanism flips bits directly in ECC RAM by performing write and read operations. When the ECC RAM is actively used by the Ethernet or USB controller, this approach sometimes triggers a false double-bit error. Switch both Ethernet and USB EDAC devices to use the INTTEST register (altr_edac_a10_device_inject_fops) for single-bit error injection, similar to the existing double-bit error injection method. Fixes: 064acbd4f4ab ("EDAC, altera: Add Stratix10 peripheral support") Signed-off-by: Niravkumar L Rabara Signed-off-by: Borislav Petkov (AMD) Acked-by: Dinh Nguyen Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20251111081333.1279635-1-niravkumarlaxmidas.rabara@altera.com --- drivers/edac/altera_edac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index a776d61027f2..0c5b94e64ea1 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -1369,7 +1369,7 @@ static const struct edac_device_prv_data a10_enetecc_data = { .ue_set_mask = ALTR_A10_ECC_TDERRA, .set_err_ofst = ALTR_A10_ECC_INTTEST_OFST, .ecc_irq_handler = altr_edac_a10_ecc_irq, - .inject_fops = &altr_edac_a10_device_inject2_fops, + .inject_fops = &altr_edac_a10_device_inject_fops, }; #endif /* CONFIG_EDAC_ALTERA_ETHERNET */ @@ -1459,7 +1459,7 @@ static const struct edac_device_prv_data a10_usbecc_data = { .ue_set_mask = ALTR_A10_ECC_TDERRA, .set_err_ofst = ALTR_A10_ECC_INTTEST_OFST, .ecc_irq_handler = altr_edac_a10_ecc_irq, - .inject_fops = 
&altr_edac_a10_device_inject2_fops, + .inject_fops = &altr_edac_a10_device_inject_fops, }; #endif /* CONFIG_EDAC_ALTERA_USB */ From 62b656e43eaeae445a39cd8021a4f47065af4389 Mon Sep 17 00:00:00 2001 From: Ranganath V N Date: Sun, 9 Nov 2025 14:43:35 +0530 Subject: [PATCH 304/543] net: sched: act_connmark: initialize struct tc_connmark to fix kernel leak In tcf_connmark_dump(), the variable 'opt' was partially initialized using a designated initializer, while the padding bytes remained uninitialized. Because nla_put() copies the entire structure into a netlink message, these uninitialized bytes leaked to userspace. Initialize the structure with memset before assigning its fields to ensure all members and padding are cleared prior to being copied. Reported-by: syzbot+0c85cae3350b7d486aee@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=0c85cae3350b7d486aee Tested-by: syzbot+0c85cae3350b7d486aee@syzkaller.appspotmail.com Fixes: 22a5dc0e5e3e ("net: sched: Introduce connmark action") Signed-off-by: Ranganath V N Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20251109091336.9277-2-vnranganath.20@gmail.com Acked-by: Cong Wang Signed-off-by: Paolo Abeni --- net/sched/act_connmark.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 3e89927d7116..26ba8c2d20ab 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -195,13 +195,15 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, const struct tcf_connmark_info *ci = to_connmark(a); unsigned char *b = skb_tail_pointer(skb); const struct tcf_connmark_parms *parms; - struct tc_connmark opt = { - .index = ci->tcf_index, - .refcnt = refcount_read(&ci->tcf_refcnt) - ref, - .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, - }; + struct tc_connmark opt; struct tcf_t t; + memset(&opt, 0, sizeof(opt)); + + opt.index = ci->tcf_index; + opt.refcnt = refcount_read(&ci->tcf_refcnt) 
- ref; + opt.bindcnt = atomic_read(&ci->tcf_bindcnt) - bind; + rcu_read_lock(); parms = rcu_dereference(ci->parms); From ce50039be49eea9b4cd8873ca6eccded1b4a130a Mon Sep 17 00:00:00 2001 From: Ranganath V N Date: Sun, 9 Nov 2025 14:43:36 +0530 Subject: [PATCH 305/543] net: sched: act_ife: initialize struct tc_ife to fix KMSAN kernel-infoleak Fix a KMSAN kernel-infoleak detected by syzbot. [net?] KMSAN: kernel-infoleak in __skb_datagram_iter In tcf_ife_dump(), the variable 'opt' was partially initialized using a designated initializer, while the padding bytes remained uninitialized. Because nla_put() copies the entire structure into a netlink message, these uninitialized bytes leaked to userspace. Initialize the structure with memset before assigning its fields to ensure all members and padding are cleared prior to being copied. This change silences the KMSAN report and prevents potential information leaks from the kernel memory. This fix has been tested and validated by syzbot. This patch closes the bug reported at the following syzkaller link and ensures no infoleak. 
Reported-by: syzbot+0c85cae3350b7d486aee@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=0c85cae3350b7d486aee Tested-by: syzbot+0c85cae3350b7d486aee@syzkaller.appspotmail.com Fixes: ef6980b6becb ("introduce IFE action") Signed-off-by: Ranganath V N Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20251109091336.9277-3-vnranganath.20@gmail.com Acked-by: Cong Wang Signed-off-by: Paolo Abeni --- net/sched/act_ife.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 107c6d83dc5c..7c6975632fc2 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -644,13 +644,15 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, unsigned char *b = skb_tail_pointer(skb); struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; - struct tc_ife opt = { - .index = ife->tcf_index, - .refcnt = refcount_read(&ife->tcf_refcnt) - ref, - .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind, - }; + struct tc_ife opt; struct tcf_t t; + memset(&opt, 0, sizeof(opt)); + + opt.index = ife->tcf_index, + opt.refcnt = refcount_read(&ife->tcf_refcnt) - ref, + opt.bindcnt = atomic_read(&ife->tcf_bindcnt) - bind, + spin_lock_bh(&ife->tcf_lock); opt.action = ife->tcf_action; p = rcu_dereference_protected(ife->params, From 0bcd5b3b50cc1fcbf775479322cc37c15d35a489 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Sun, 9 Nov 2025 11:37:49 +0200 Subject: [PATCH 306/543] net/mlx5e: Fix missing error assignment in mlx5e_xfrm_add_state() Assign the return value of mlx5_eswitch_block_mode() to 'err' before checking it to avoid returning an uninitialized error code. 
Fixes: 22239eb258bc ("net/mlx5e: Prevent tunnel reformat when tunnel mode not allowed") Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202510271649.uwsIxD6O-lkp@intel.com/ Closes: http://lore.kernel.org/linux-rdma/aPIEK4rLB586FdDt@stanley.mountain/ Signed-off-by: Carolina Jubran Reviewed-by: Jianbo Liu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1762681073-1084058-2-git-send-email-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c index 0a4fb8c92268..35d9530037a6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -804,7 +804,8 @@ static int mlx5e_xfrm_add_state(struct net_device *dev, goto err_xfrm; } - if (mlx5_eswitch_block_mode(priv->mdev)) + err = mlx5_eswitch_block_mode(priv->mdev); + if (err) goto unblock_ipsec; if (x->props.mode == XFRM_MODE_TUNNEL && From 2dc768c05217e667f987907a3404926e7ba89ff3 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Sun, 9 Nov 2025 11:37:50 +0200 Subject: [PATCH 307/543] net/mlx5e: Trim the length of the num_doorbell error When trying to set num_doorbells to a value greater than the max number of channels, the error message was going over the netlink limit of 80 chars, truncating the most important part of the message, the number of channels. Fix that by trimming the length a bit. 
Fixes: 11bbcfb7668c ("net/mlx5e: Use the 'num_doorbells' devlink param") Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1762681073-1084058-3-git-send-email-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index fceea83abbd7..887adf4807d1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -541,7 +541,7 @@ static int mlx5_devlink_num_doorbells_validate(struct devlink *devlink, u32 id, max_num_channels = mlx5e_get_max_num_channels(mdev); if (val32 > max_num_channels) { NL_SET_ERR_MSG_FMT_MOD(extack, - "Requested num_doorbells (%u) exceeds maximum number of channels (%u)", + "Requested num_doorbells (%u) exceeds max number of channels (%u)", val32, max_num_channels); return -EINVAL; } From a7bf4d5063c7837096aab2853224eb23628514d9 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 9 Nov 2025 11:37:51 +0200 Subject: [PATCH 308/543] net/mlx5e: Fix maxrate wraparound in threshold between units The previous calculation used roundup() which caused an overflow for rates between 25.5Gbps and 26Gbps. For example, a rate of 25.6Gbps would result in using 100Mbps units with value of 256, which would overflow the 8 bits field. Simplify the upper_limit_mbps calculation by removing the unnecessary roundup, and adjust the comparison to use <= to correctly handle the boundary condition. 
Fixes: d8880795dabf ("net/mlx5e: Implement DCBNL IEEE max rate") Signed-off-by: Gal Pressman Reviewed-by: Nimrod Oren Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1762681073-1084058-4-git-send-email-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index d166c0d5189e..345614471052 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -595,18 +595,19 @@ static int mlx5e_dcbnl_ieee_setmaxrate(struct net_device *netdev, struct mlx5_core_dev *mdev = priv->mdev; u8 max_bw_value[IEEE_8021QAZ_MAX_TCS]; u8 max_bw_unit[IEEE_8021QAZ_MAX_TCS]; - __u64 upper_limit_mbps = roundup(255 * MLX5E_100MB, MLX5E_1GB); + __u64 upper_limit_mbps; int i; memset(max_bw_value, 0, sizeof(max_bw_value)); memset(max_bw_unit, 0, sizeof(max_bw_unit)); + upper_limit_mbps = 255 * MLX5E_100MB; for (i = 0; i <= mlx5_max_tc(mdev); i++) { if (!maxrate->tc_maxrate[i]) { max_bw_unit[i] = MLX5_BW_NO_LIMIT; continue; } - if (maxrate->tc_maxrate[i] < upper_limit_mbps) { + if (maxrate->tc_maxrate[i] <= upper_limit_mbps) { max_bw_value[i] = div_u64(maxrate->tc_maxrate[i], MLX5E_100MB); max_bw_value[i] = max_bw_value[i] ? max_bw_value[i] : 1; From 43b27d1bd88a4bce34ec2437d103acfae9655f9e Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 9 Nov 2025 11:37:52 +0200 Subject: [PATCH 309/543] net/mlx5e: Fix wraparound in rate limiting for values above 255 Gbps Add validation to reject rates exceeding 255 Gbps that would overflow the 8 bits max bandwidth field. 
Fixes: d8880795dabf ("net/mlx5e: Implement DCBNL IEEE max rate") Signed-off-by: Gal Pressman Reviewed-by: Nimrod Oren Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1762681073-1084058-5-git-send-email-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index 345614471052..d88a48210fdc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -596,11 +596,13 @@ static int mlx5e_dcbnl_ieee_setmaxrate(struct net_device *netdev, u8 max_bw_value[IEEE_8021QAZ_MAX_TCS]; u8 max_bw_unit[IEEE_8021QAZ_MAX_TCS]; __u64 upper_limit_mbps; + __u64 upper_limit_gbps; int i; memset(max_bw_value, 0, sizeof(max_bw_value)); memset(max_bw_unit, 0, sizeof(max_bw_unit)); upper_limit_mbps = 255 * MLX5E_100MB; + upper_limit_gbps = 255 * MLX5E_1GB; for (i = 0; i <= mlx5_max_tc(mdev); i++) { if (!maxrate->tc_maxrate[i]) { @@ -612,10 +614,16 @@ static int mlx5e_dcbnl_ieee_setmaxrate(struct net_device *netdev, MLX5E_100MB); max_bw_value[i] = max_bw_value[i] ? max_bw_value[i] : 1; max_bw_unit[i] = MLX5_100_MBPS_UNIT; - } else { + } else if (max_bw_value[i] <= upper_limit_gbps) { max_bw_value[i] = div_u64(maxrate->tc_maxrate[i], MLX5E_1GB); max_bw_unit[i] = MLX5_GBPS_UNIT; + } else { + netdev_err(netdev, + "tc_%d maxrate %llu Kbps exceeds limit %llu\n", + i, maxrate->tc_maxrate[i], + upper_limit_gbps); + return -EINVAL; } } From 9fcc2b6c10523f7e75db6387946c86fcf19dc97e Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 9 Nov 2025 11:37:53 +0200 Subject: [PATCH 310/543] net/mlx5e: Fix potentially misleading debug message Change the debug message to print the correct units instead of always assuming Gbps, as the value can be in either 100 Mbps or 1 Gbps units. 
Fixes: 5da8bc3effb6 ("net/mlx5e: DCBNL, Add debug messages log") Signed-off-by: Gal Pressman Reviewed-by: Nimrod Oren Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1762681073-1084058-6-git-send-email-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- .../net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index d88a48210fdc..9b93da4d52f6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -598,6 +598,19 @@ static int mlx5e_dcbnl_ieee_setmaxrate(struct net_device *netdev, __u64 upper_limit_mbps; __u64 upper_limit_gbps; int i; + struct { + int scale; + const char *units_str; + } units[] = { + [MLX5_100_MBPS_UNIT] = { + .scale = 100, + .units_str = "Mbps", + }, + [MLX5_GBPS_UNIT] = { + .scale = 1, + .units_str = "Gbps", + }, + }; memset(max_bw_value, 0, sizeof(max_bw_value)); memset(max_bw_unit, 0, sizeof(max_bw_unit)); @@ -628,8 +641,9 @@ static int mlx5e_dcbnl_ieee_setmaxrate(struct net_device *netdev, } for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { - netdev_dbg(netdev, "%s: tc_%d <=> max_bw %d Gbps\n", - __func__, i, max_bw_value[i]); + netdev_dbg(netdev, "%s: tc_%d <=> max_bw %u %s\n", __func__, i, + max_bw_value[i] * units[max_bw_unit[i]].scale, + units[max_bw_unit[i]].units_str); } return mlx5_modify_port_ets_rate_limit(mdev, max_bw_value, max_bw_unit); From cd8dbd9ef600435439bb0e70af0a1d9e2193aecb Mon Sep 17 00:00:00 2001 From: Max Chou Date: Wed, 5 Nov 2025 20:02:04 +0800 Subject: [PATCH 311/543] Bluetooth: btrtl: Avoid loading the config file on security chips For chips with security enabled, it's only possible to load firmware with a valid signature pattern. If key_id is not zero, it indicates a security chip, and the driver will not load the config file. - Example log for a security chip. 
Bluetooth: hci0: RTL: examining hci_ver=0c hci_rev=000a lmp_ver=0c lmp_subver=8922 Bluetooth: hci0: RTL: rom_version status=0 version=1 Bluetooth: hci0: RTL: btrtl_initialize: key id 1 Bluetooth: hci0: RTL: loading rtl_bt/rtl8922au_fw.bin Bluetooth: hci0: RTL: cfg_sz 0, total sz 71301 Bluetooth: hci0: RTL: fw version 0x41c0c905 - Example log for a normal chip. Bluetooth: hci0: RTL: examining hci_ver=0c hci_rev=000a lmp_ver=0c lmp_subver=8922 Bluetooth: hci0: RTL: rom_version status=0 version=1 Bluetooth: hci0: RTL: btrtl_initialize: key id 0 Bluetooth: hci0: RTL: loading rtl_bt/rtl8922au_fw.bin Bluetooth: hci0: RTL: loading rtl_bt/rtl8922au_config.bin Bluetooth: hci0: RTL: cfg_sz 6, total sz 71307 Bluetooth: hci0: RTL: fw version 0x41c0c905 Tested-by: Hilda Wu Signed-off-by: Nial Ni Signed-off-by: Max Chou Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btrtl.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/bluetooth/btrtl.c b/drivers/bluetooth/btrtl.c index 1d4a7887abcc..52794db2739b 100644 --- a/drivers/bluetooth/btrtl.c +++ b/drivers/bluetooth/btrtl.c @@ -50,7 +50,7 @@ #define RTL_CHIP_SUBVER (&(struct rtl_vendor_cmd) {{0x10, 0x38, 0x04, 0x28, 0x80}}) #define RTL_CHIP_REV (&(struct rtl_vendor_cmd) {{0x10, 0x3A, 0x04, 0x28, 0x80}}) -#define RTL_SEC_PROJ (&(struct rtl_vendor_cmd) {{0x10, 0xA4, 0x0D, 0x00, 0xb0}}) +#define RTL_SEC_PROJ (&(struct rtl_vendor_cmd) {{0x10, 0xA4, 0xAD, 0x00, 0xb0}}) #define RTL_PATCH_SNIPPETS 0x01 #define RTL_PATCH_DUMMY_HEADER 0x02 @@ -534,7 +534,6 @@ static int rtlbt_parse_firmware_v2(struct hci_dev *hdev, { struct rtl_epatch_header_v2 *hdr; int rc; - u8 reg_val[2]; u8 key_id; u32 num_sections; struct rtl_section *section; @@ -549,14 +548,7 @@ static int rtlbt_parse_firmware_v2(struct hci_dev *hdev, .len = btrtl_dev->fw_len - 7, /* Cut the tail */ }; - rc = btrtl_vendor_read_reg16(hdev, RTL_SEC_PROJ, reg_val); - if (rc < 0) - return -EIO; - key_id = reg_val[0]; - - 
rtl_dev_dbg(hdev, "%s: key id %u", __func__, key_id); - - btrtl_dev->key_id = key_id; + key_id = btrtl_dev->key_id; hdr = rtl_iov_pull_data(&iov, sizeof(*hdr)); if (!hdr) @@ -1070,6 +1062,8 @@ struct btrtl_device_info *btrtl_initialize(struct hci_dev *hdev, u16 hci_rev, lmp_subver; u8 hci_ver, lmp_ver, chip_type = 0; int ret; + int rc; + u8 key_id; u8 reg_val[2]; btrtl_dev = kzalloc(sizeof(*btrtl_dev), GFP_KERNEL); @@ -1180,6 +1174,14 @@ struct btrtl_device_info *btrtl_initialize(struct hci_dev *hdev, goto err_free; } + rc = btrtl_vendor_read_reg16(hdev, RTL_SEC_PROJ, reg_val); + if (rc < 0) + goto err_free; + + key_id = reg_val[0]; + btrtl_dev->key_id = key_id; + rtl_dev_info(hdev, "%s: key id %u", __func__, key_id); + btrtl_dev->fw_len = -EIO; if (lmp_subver == RTL_ROM_LMP_8852A && hci_rev == 0x000c) { snprintf(fw_name, sizeof(fw_name), "%s_v2.bin", @@ -1202,7 +1204,7 @@ struct btrtl_device_info *btrtl_initialize(struct hci_dev *hdev, goto err_free; } - if (btrtl_dev->ic_info->cfg_name) { + if (btrtl_dev->ic_info->cfg_name && !btrtl_dev->key_id) { if (postfix) { snprintf(cfg_name, sizeof(cfg_name), "%s-%s.bin", btrtl_dev->ic_info->cfg_name, postfix); From e5eba42f01340f73888dfe560be2806057c25913 Mon Sep 17 00:00:00 2001 From: Akiva Goldberger Date: Sun, 9 Nov 2025 11:49:03 +0200 Subject: [PATCH 312/543] mlx5: Fix default values in create CQ Currently, CQs without a completion function are assigned the mlx5_add_cq_to_tasklet function by default. This is problematic since only user CQs created through the mlx5_ib driver are intended to use this function. Additionally, all CQs that will use doorbells instead of polling for completions must call mlx5_cq_arm. However, the default CQ creation flow leaves a valid value in the CQ's arm_db field, allowing FW to send interrupts to polling-only CQs in certain corner cases. 
These two factors would allow a polling-only kernel CQ to be triggered by an EQ interrupt and call a completion function intended only for user CQs, causing a null pointer exception. Some areas in the driver have prevented this issue with one-off fixes but did not address the root cause. This patch fixes the described issue by adding defaults to the create CQ flow. It adds a default dummy completion function to protect against null pointer exceptions, and it sets an invalid command sequence number by default in kernel CQs to prevent the FW from sending an interrupt to the CQ until it is armed. User CQs are responsible for their own initialization values. Callers of mlx5_core_create_cq are responsible for changing the completion function and arming the CQ per their needs. Fixes: cdd04f4d4d71 ("net/mlx5: Add support to create SQ and CQ for ASO") Signed-off-by: Akiva Goldberger Reviewed-by: Moshe Shemesh Signed-off-by: Tariq Toukan Acked-by: Leon Romanovsky Link: https://patch.msgid.link/1762681743-1084694-1-git-send-email-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- drivers/infiniband/hw/mlx5/cq.c | 11 +++++--- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 23 +++++++++++++-- .../net/ethernet/mellanox/mlx5/core/en_main.c | 1 - .../ethernet/mellanox/mlx5/core/fpga/conn.c | 15 +++++----- .../mellanox/mlx5/core/steering/hws/send.c | 7 ----- .../mellanox/mlx5/core/steering/sws/dr_send.c | 28 +++++-------------- drivers/vdpa/mlx5/net/mlx5_vnet.c | 6 ++-- include/linux/mlx5/cq.h | 1 + 8 files changed, 44 insertions(+), 48 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index a23b364e24ff..651d76bca114 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -1020,15 +1020,18 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, if (cq->create_flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN) MLX5_SET(cqc, cqc, oi, 1); + if (udata) { + cq->mcq.comp = mlx5_add_cq_to_tasklet; + 
cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp; + } else { + cq->mcq.comp = mlx5_ib_cq_comp; + } + err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen, out, sizeof(out)); if (err) goto err_cqb; mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn); - if (udata) - cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp; - else - cq->mcq.comp = mlx5_ib_cq_comp; cq->mcq.event = mlx5_ib_cq_event; INIT_LIST_HEAD(&cq->wc_list); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index e9f319a9bdd6..60f7ab1d72e7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -66,8 +66,8 @@ void mlx5_cq_tasklet_cb(struct tasklet_struct *t) tasklet_schedule(&ctx->task); } -static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, - struct mlx5_eqe *eqe) +void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, + struct mlx5_eqe *eqe) { unsigned long flags; struct mlx5_eq_tasklet *tasklet_ctx = cq->tasklet_ctx.priv; @@ -95,7 +95,15 @@ static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, if (schedule_tasklet) tasklet_schedule(&tasklet_ctx->task); } +EXPORT_SYMBOL(mlx5_add_cq_to_tasklet); +static void mlx5_core_cq_dummy_cb(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe) +{ + mlx5_core_err(cq->eq->core.dev, + "CQ default completion callback, CQ #%u\n", cq->cqn); +} + +#define MLX5_CQ_INIT_CMD_SN cpu_to_be32(2 << 28) /* Callers must verify outbox status in case of err */ int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen, u32 *out, int outlen) @@ -121,10 +129,19 @@ int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, cq->arm_sn = 0; cq->eq = eq; cq->uid = MLX5_GET(create_cq_in, in, uid); + + /* Kernel CQs must set the arm_db address prior to calling + * this function, allowing for the proper value to be + * initialized. User CQs are responsible for their own + * initialization since they do not use the arm_db field. 
+ */ + if (cq->arm_db) + *cq->arm_db = MLX5_CQ_INIT_CMD_SN; + refcount_set(&cq->refcount, 1); init_completion(&cq->free); if (!cq->comp) - cq->comp = mlx5_add_cq_to_tasklet; + cq->comp = mlx5_core_cq_dummy_cb; /* assuming CQ will be deleted before the EQ */ cq->tasklet_ctx.priv = &eq->tasklet_ctx; INIT_LIST_HEAD(&cq->tasklet_ctx.list); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 6023bbbf3f39..5e17eae81f4b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2219,7 +2219,6 @@ static int mlx5e_alloc_cq_common(struct mlx5_core_dev *mdev, mcq->set_ci_db = cq->wq_ctrl.db.db; mcq->arm_db = cq->wq_ctrl.db.db + 1; *mcq->set_ci_db = 0; - *mcq->arm_db = 0; mcq->vector = param->eq_ix; mcq->comp = mlx5e_completion_event; mcq->event = mlx5e_cq_error_event; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c index cb1319974f83..ccef64fb40b6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -421,6 +421,13 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) __be64 *pas; u32 i; + conn->cq.mcq.cqe_sz = 64; + conn->cq.mcq.set_ci_db = conn->cq.wq_ctrl.db.db; + conn->cq.mcq.arm_db = conn->cq.wq_ctrl.db.db + 1; + *conn->cq.mcq.set_ci_db = 0; + conn->cq.mcq.vector = 0; + conn->cq.mcq.comp = mlx5_fpga_conn_cq_complete; + cq_size = roundup_pow_of_two(cq_size); MLX5_SET(cqc, temp_cqc, log_cq_size, ilog2(cq_size)); @@ -468,15 +475,7 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) if (err) goto err_cqwq; - conn->cq.mcq.cqe_sz = 64; - conn->cq.mcq.set_ci_db = conn->cq.wq_ctrl.db.db; - conn->cq.mcq.arm_db = conn->cq.wq_ctrl.db.db + 1; - *conn->cq.mcq.set_ci_db = 0; - *conn->cq.mcq.arm_db = 0; - conn->cq.mcq.vector = 0; - conn->cq.mcq.comp = 
mlx5_fpga_conn_cq_complete; tasklet_setup(&conn->cq.tasklet, mlx5_fpga_conn_cq_tasklet); - mlx5_fpga_dbg(fdev, "Created CQ #0x%x\n", conn->cq.mcq.cqn); goto out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c index 24ef7d66fa8a..7510c46e58a5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c @@ -873,12 +873,6 @@ static int hws_send_ring_open_sq(struct mlx5hws_context *ctx, return err; } -static void hws_cq_complete(struct mlx5_core_cq *mcq, - struct mlx5_eqe *eqe) -{ - pr_err("CQ completion CQ: #%u\n", mcq->cqn); -} - static int hws_send_ring_alloc_cq(struct mlx5_core_dev *mdev, int numa_node, struct mlx5hws_send_engine *queue, @@ -901,7 +895,6 @@ static int hws_send_ring_alloc_cq(struct mlx5_core_dev *mdev, mcq->cqe_sz = 64; mcq->set_ci_db = cq->wq_ctrl.db.db; mcq->arm_db = cq->wq_ctrl.db.db + 1; - mcq->comp = hws_cq_complete; for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) { cqe = mlx5_cqwq_get_wqe(&cq->wq, i); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c index 077a77fde670..d034372fa047 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c @@ -1049,12 +1049,6 @@ static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn) return 0; } -static void dr_cq_complete(struct mlx5_core_cq *mcq, - struct mlx5_eqe *eqe) -{ - pr_err("CQ completion CQ: #%u\n", mcq->cqn); -} - static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, struct mlx5_uars_page *uar, size_t ncqe) @@ -1089,6 +1083,13 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, cqe->op_own = MLX5_CQE_INVALID << 4 | MLX5_CQE_OWNER_MASK; } + cq->mcq.cqe_sz = 64; + cq->mcq.set_ci_db = cq->wq_ctrl.db.db; + cq->mcq.arm_db = 
cq->wq_ctrl.db.db + 1; + *cq->mcq.set_ci_db = 0; + cq->mcq.vector = 0; + cq->mdev = mdev; + inlen = MLX5_ST_SZ_BYTES(create_cq_in) + sizeof(u64) * cq->wq_ctrl.buf.npages; in = kvzalloc(inlen, GFP_KERNEL); @@ -1112,27 +1113,12 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas); mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, pas); - cq->mcq.comp = dr_cq_complete; - err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out)); kvfree(in); if (err) goto err_cqwq; - cq->mcq.cqe_sz = 64; - cq->mcq.set_ci_db = cq->wq_ctrl.db.db; - cq->mcq.arm_db = cq->wq_ctrl.db.db + 1; - *cq->mcq.set_ci_db = 0; - - /* set no-zero value, in order to avoid the HW to run db-recovery on - * CQ that used in polling mode. - */ - *cq->mcq.arm_db = cpu_to_be32(2 << 28); - - cq->mcq.vector = 0; - cq->mdev = mdev; - return cq; err_cqwq: diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 82034efb74fc..a7936bd1aabe 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -573,6 +573,8 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) vcq->mcq.set_ci_db = vcq->db.db; vcq->mcq.arm_db = vcq->db.db + 1; vcq->mcq.cqe_sz = 64; + vcq->mcq.comp = mlx5_vdpa_cq_comp; + vcq->cqe = num_ent; err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent); if (err) @@ -612,10 +614,6 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) if (err) goto err_vec; - vcq->mcq.comp = mlx5_vdpa_cq_comp; - vcq->cqe = num_ent; - vcq->mcq.set_ci_db = vcq->db.db; - vcq->mcq.arm_db = vcq->db.db + 1; mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index); kfree(in); return 0; diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 7ef2c7c7d803..9d47cdc727ad 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -183,6 +183,7 @@ static inline void mlx5_cq_put(struct mlx5_core_cq *cq) 
complete(&cq->free); } +void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe); int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen, u32 *out, int outlen); int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, From 55286b1e1bf4ce55f61ad2816d4ff8a7861a8cbb Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 10 Nov 2025 16:24:20 +0100 Subject: [PATCH 313/543] smb: server: let smb_direct_disconnect_rdma_connection() turn CREATED into DISCONNECTED When smb_direct_disconnect_rdma_connection() turns SMBDIRECT_SOCKET_CREATED into SMBDIRECT_SOCKET_ERROR, we'll have the situation that smb_direct_disconnect_rdma_work() will set SMBDIRECT_SOCKET_DISCONNECTING and call rdma_disconnect(), which likely fails as we never reached the RDMA_CM_EVENT_ESTABLISHED. it means that wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED) in free_transport() will hang forever in SMBDIRECT_SOCKET_DISCONNECTING never reaching SMBDIRECT_SOCKET_DISCONNECTED. So we directly go from SMBDIRECT_SOCKET_CREATED to SMBDIRECT_SOCKET_DISCONNECTED. 
Fixes: b3fd52a0d85c ("smb: server: let smb_direct_disconnect_rdma_connection() set SMBDIRECT_SOCKET_ERROR...") Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 3d8d8cb456c1..e2be9a496154 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -334,6 +334,9 @@ smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc) break; case SMBDIRECT_SOCKET_CREATED: + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + break; + case SMBDIRECT_SOCKET_CONNECTED: sc->status = SMBDIRECT_SOCKET_ERROR; break; From ed6612165b74f09db00ef0abaf9831895ab28b7f Mon Sep 17 00:00:00 2001 From: Yiqi Sun Date: Tue, 11 Nov 2025 15:05:39 +0800 Subject: [PATCH 314/543] smb: fix invalid username check in smb3_fs_context_parse_param() Since the maximum return value of strnlen(..., CIFS_MAX_USERNAME_LEN) is CIFS_MAX_USERNAME_LEN, length check in smb3_fs_context_parse_param() is always FALSE and invalid. Fix the comparison in if statement. 
Signed-off-by: Yiqi Sun Signed-off-by: Steve French --- fs/smb/client/fs_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index c2d5bb23040c..0f894d09157b 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -1470,7 +1470,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, break; } - if (strnlen(param->string, CIFS_MAX_USERNAME_LEN) > + if (strnlen(param->string, CIFS_MAX_USERNAME_LEN) == CIFS_MAX_USERNAME_LEN) { pr_warn("username too long\n"); goto cifs_parse_mount_err; From c42af83c59b65d01c0f7a074e450bbbb43b22f0d Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 11 Nov 2025 10:00:10 +0900 Subject: [PATCH 315/543] memblock: fix memblock_estimated_nr_free_pages() for soft-reserved memory memblock_estimated_nr_free_pages() returns the difference between the total size of the "memory" memblock type and the "reserved" memblock type. The "soft-reserved" memory regions are added to the "reserved" memblock type, but not to the "memory" memblock type. Therefore, memblock_estimated_nr_free_pages() may return a smaller value than expected, or if it underflows, an extremely large value. /proc/sys/kernel/threads-max is determined by the value of memblock_estimated_nr_free_pages(). This issue was discovered on machines with CXL memory because kernel.threads-max was either smaller than expected or extremely large for the installed DRAM size. This fixes the issue by replacing memblock_reserved_size() with memblock_reserved_kern_size() that tells how much memory was reserved from the actual RAM. 
Suggested-by: Mike Rapoport Signed-off-by: Akinobu Mita Link: https://patch.msgid.link/20251111010010.7800-1-akinobu.mita@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- mm/memblock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memblock.c b/mm/memblock.c index e23e16618e9b..f0f2dc66e9a2 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1826,7 +1826,8 @@ phys_addr_t __init_memblock memblock_reserved_kern_size(phys_addr_t limit, int n */ unsigned long __init memblock_estimated_nr_free_pages(void) { - return PHYS_PFN(memblock_phys_mem_size() - memblock_reserved_size()); + return PHYS_PFN(memblock_phys_mem_size() - + memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE, NUMA_NO_NODE)); } /* lowest address */ From 9e805625218b70d865fcee2105dbf835d473c074 Mon Sep 17 00:00:00 2001 From: Rakuram Eswaran Date: Thu, 23 Oct 2025 20:24:32 +0530 Subject: [PATCH 316/543] mmc: pxamci: Simplify pxamci_probe() error handling using devm APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch refactors pxamci_probe() to use devm-managed resource allocation (e.g. devm_dma_request_chan) and dev_err_probe() for improved readability and automatic cleanup on probe failure. It also removes redundant NULL assignments and manual resource release logic from pxamci_probe(), and eliminates the corresponding release calls from pxamci_remove(). 
Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202510041841.pRlunIfl-lkp@intel.com/ Fixes: 58c40f3faf742c ("mmc: pxamci: Use devm_mmc_alloc_host() helper") Suggested-by: Uwe Kleine-König Signed-off-by: Rakuram Eswaran Reviewed-by: Khalid Aziz Acked-by: Uwe Kleine-König Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/pxamci.c | 56 +++++++++++++-------------------------- 1 file changed, 18 insertions(+), 38 deletions(-) diff --git a/drivers/mmc/host/pxamci.c b/drivers/mmc/host/pxamci.c index 26d03352af63..b5ea058ed467 100644 --- a/drivers/mmc/host/pxamci.c +++ b/drivers/mmc/host/pxamci.c @@ -652,10 +652,9 @@ static int pxamci_probe(struct platform_device *pdev) host->clkrt = CLKRT_OFF; host->clk = devm_clk_get(dev, NULL); - if (IS_ERR(host->clk)) { - host->clk = NULL; - return PTR_ERR(host->clk); - } + if (IS_ERR(host->clk)) + return dev_err_probe(dev, PTR_ERR(host->clk), + "Failed to acquire clock\n"); host->clkrate = clk_get_rate(host->clk); @@ -703,46 +702,37 @@ static int pxamci_probe(struct platform_device *pdev) platform_set_drvdata(pdev, mmc); - host->dma_chan_rx = dma_request_chan(dev, "rx"); - if (IS_ERR(host->dma_chan_rx)) { - host->dma_chan_rx = NULL; + host->dma_chan_rx = devm_dma_request_chan(dev, "rx"); + if (IS_ERR(host->dma_chan_rx)) return dev_err_probe(dev, PTR_ERR(host->dma_chan_rx), "unable to request rx dma channel\n"); - } - host->dma_chan_tx = dma_request_chan(dev, "tx"); - if (IS_ERR(host->dma_chan_tx)) { - dev_err(dev, "unable to request tx dma channel\n"); - ret = PTR_ERR(host->dma_chan_tx); - host->dma_chan_tx = NULL; - goto out; - } + + host->dma_chan_tx = devm_dma_request_chan(dev, "tx"); + if (IS_ERR(host->dma_chan_tx)) + return dev_err_probe(dev, PTR_ERR(host->dma_chan_tx), + "unable to request tx dma channel\n"); if (host->pdata) { host->detect_delay_ms = host->pdata->detect_delay_ms; host->power = devm_gpiod_get_optional(dev, "power", GPIOD_OUT_LOW); - if 
(IS_ERR(host->power)) { - ret = PTR_ERR(host->power); - dev_err(dev, "Failed requesting gpio_power\n"); - goto out; - } + if (IS_ERR(host->power)) + return dev_err_probe(dev, PTR_ERR(host->power), + "Failed requesting gpio_power\n"); /* FIXME: should we pass detection delay to debounce? */ ret = mmc_gpiod_request_cd(mmc, "cd", 0, false, 0); - if (ret && ret != -ENOENT) { - dev_err(dev, "Failed requesting gpio_cd\n"); - goto out; - } + if (ret && ret != -ENOENT) + return dev_err_probe(dev, ret, "Failed requesting gpio_cd\n"); if (!host->pdata->gpio_card_ro_invert) mmc->caps2 |= MMC_CAP2_RO_ACTIVE_HIGH; ret = mmc_gpiod_request_ro(mmc, "wp", 0, 0); - if (ret && ret != -ENOENT) { - dev_err(dev, "Failed requesting gpio_ro\n"); - goto out; - } + if (ret && ret != -ENOENT) + return dev_err_probe(dev, ret, "Failed requesting gpio_ro\n"); + if (!ret) host->use_ro_gpio = true; @@ -759,16 +749,8 @@ static int pxamci_probe(struct platform_device *pdev) if (ret) { if (host->pdata && host->pdata->exit) host->pdata->exit(dev, mmc); - goto out; } - return 0; - -out: - if (host->dma_chan_rx) - dma_release_channel(host->dma_chan_rx); - if (host->dma_chan_tx) - dma_release_channel(host->dma_chan_tx); return ret; } @@ -791,8 +773,6 @@ static void pxamci_remove(struct platform_device *pdev) dmaengine_terminate_all(host->dma_chan_rx); dmaengine_terminate_all(host->dma_chan_tx); - dma_release_channel(host->dma_chan_rx); - dma_release_channel(host->dma_chan_tx); } } From 739f04f4a46237536aff07ff223c231da53ed8ce Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Tue, 4 Nov 2025 11:51:23 +0800 Subject: [PATCH 317/543] mmc: dw_mmc-rockchip: Fix wrong internal phase calculate ciu clock is 2 times of io clock, but the sample clk used is derived from io clock provided to the card. So we should use io clock to calculate the phase. 
Fixes: 59903441f5e4 ("mmc: dw_mmc-rockchip: Add internal phase support") Signed-off-by: Shawn Lin Acked-by: Heiko Stuebner Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc-rockchip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/dw_mmc-rockchip.c b/drivers/mmc/host/dw_mmc-rockchip.c index 82dd906bb002..681354942e97 100644 --- a/drivers/mmc/host/dw_mmc-rockchip.c +++ b/drivers/mmc/host/dw_mmc-rockchip.c @@ -42,7 +42,7 @@ struct dw_mci_rockchip_priv_data { */ static int rockchip_mmc_get_internal_phase(struct dw_mci *host, bool sample) { - unsigned long rate = clk_get_rate(host->ciu_clk); + unsigned long rate = clk_get_rate(host->ciu_clk) / RK3288_CLKGEN_DIV; u32 raw_value; u16 degrees; u32 delay_num = 0; @@ -85,7 +85,7 @@ static int rockchip_mmc_get_phase(struct dw_mci *host, bool sample) static int rockchip_mmc_set_internal_phase(struct dw_mci *host, bool sample, int degrees) { - unsigned long rate = clk_get_rate(host->ciu_clk); + unsigned long rate = clk_get_rate(host->ciu_clk) / RK3288_CLKGEN_DIV; u8 nineties, remainder; u8 delay_num; u32 raw_value; From 632108ec072ad64c8c83db6e16a7efee29ebfb74 Mon Sep 17 00:00:00 2001 From: Haein Lee Date: Wed, 12 Nov 2025 00:37:54 +0900 Subject: [PATCH 318/543] ALSA: usb-audio: Fix NULL pointer dereference in snd_usb_mixer_controls_badd In snd_usb_create_streams(), for UAC version 3 devices, the Interface Association Descriptor (IAD) is retrieved via usb_ifnum_to_if(). If this call fails, a fallback routine attempts to obtain the IAD from the next interface and sets a BADD profile. However, snd_usb_mixer_controls_badd() assumes that the IAD retrieved from usb_ifnum_to_if() is always valid, without performing a NULL check. This can lead to a NULL pointer dereference when usb_ifnum_to_if() fails to find the interface descriptor. This patch adds a NULL pointer check after calling usb_ifnum_to_if() in snd_usb_mixer_controls_badd() to prevent the dereference. 
This issue was discovered by syzkaller, which triggered the bug by sending a crafted USB device descriptor. Fixes: 17156f23e93c ("ALSA: usb: add UAC3 BADD profiles support") Signed-off-by: Haein Lee Link: https://patch.msgid.link/vwhzmoba9j2f.vwhzmob9u9e2.g6@dooray.com Signed-off-by: Takashi Iwai --- sound/usb/mixer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index 6f00e0d52382..72b900505d2c 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -3086,6 +3086,8 @@ static int snd_usb_mixer_controls_badd(struct usb_mixer_interface *mixer, int i; assoc = usb_ifnum_to_if(dev, ctrlif)->intf_assoc; + if (!assoc) + return -EINVAL; /* Detect BADD capture/playback channels from AS EP descriptors */ for (i = 0; i < assoc->bInterfaceCount; i++) { From d93a89684dce949c2ea817b6f07feee9a45241a7 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 10 Nov 2025 16:23:52 +0100 Subject: [PATCH 319/543] smb: client: let smbd_disconnect_rdma_connection() turn CREATED into DISCONNECTED When smbd_disconnect_rdma_connection() turns SMBDIRECT_SOCKET_CREATED into SMBDIRECT_SOCKET_ERROR, we'll have the situation that smbd_disconnect_rdma_work() will set SMBDIRECT_SOCKET_DISCONNECTING and call rdma_disconnect(), which likely fails as we never reached the RDMA_CM_EVENT_ESTABLISHED. it means that wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED) in smbd_destroy() will hang forever in SMBDIRECT_SOCKET_DISCONNECTING never reaching SMBDIRECT_SOCKET_DISCONNECTED. So we directly go from SMBDIRECT_SOCKET_CREATED to SMBDIRECT_SOCKET_DISCONNECTED. 
Fixes: ffbfc73e84eb ("smb: client: let smbd_disconnect_rdma_connection() set SMBDIRECT_SOCKET_ERROR...") Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 85a4c55b61b8..c6c428c2e08d 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -290,6 +290,9 @@ static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc) break; case SMBDIRECT_SOCKET_CREATED: + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + break; + case SMBDIRECT_SOCKET_CONNECTED: sc->status = SMBDIRECT_SOCKET_ERROR; break; From fdf302e6bea1822a9144a0cc2e8e17527e746162 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 10 Nov 2025 14:19:13 +0100 Subject: [PATCH 320/543] gendwarfksyms: Skip files with no exports Starting with Rust 1.91.0 (released 2025-10-30), in upstream commit ab91a63d403b ("Ignore intrinsic calls in cross-crate-inlining cost model") [1][2], `bindings.o` stops containing DWARF debug information because the `Default` implementations contained `write_bytes()` calls which are now ignored in that cost model (note that `CLIPPY=1` does not reproduce it). This means `gendwarfksyms` complains: RUSTC L rust/bindings.o error: gendwarfksyms: process_module: dwarf_get_units failed: no debugging information? There are several alternatives that would work here: conditionally skipping in the cases needed (but that is subtle and brittle), forcing DWARF generation with e.g. a dummy `static` (ugly and we may need to do it in several crates), skipping the call to the tool in the Kbuild command when there are no exports (fine) or teaching the tool to do so itself (simple and clean). Thus do the last one: don't attempt to process files if we have no symbol versions to calculate. 
[ I used the commit log of my patch linked below since it explained the root issue and expanded it a bit more to summarize the alternatives. - Miguel ] Cc: stable@vger.kernel.org # Needed in 6.17.y. Reported-by: Haiyue Wang Closes: https://lore.kernel.org/rust-for-linux/b8c1c73d-bf8b-4bf2-beb1-84ffdcd60547@163.com/ Suggested-by: Miguel Ojeda Link: https://lore.kernel.org/rust-for-linux/CANiq72nKC5r24VHAp9oUPR1HVPqT+=0ab9N0w6GqTF-kJOeiSw@mail.gmail.com/ Link: https://github.com/rust-lang/rust/commit/ab91a63d403b0105cacd72809cd292a72984ed99 [1] Link: https://github.com/rust-lang/rust/pull/145910 [2] Signed-off-by: Sami Tolvanen Tested-by: Haiyue Wang Reviewed-by: Alice Ryhl Link: https://patch.msgid.link/20251110131913.1789896-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- scripts/gendwarfksyms/gendwarfksyms.c | 3 ++- scripts/gendwarfksyms/gendwarfksyms.h | 2 +- scripts/gendwarfksyms/symbols.c | 4 +++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/gendwarfksyms/gendwarfksyms.c b/scripts/gendwarfksyms/gendwarfksyms.c index 08ae61eb327e..f5203d1640ee 100644 --- a/scripts/gendwarfksyms/gendwarfksyms.c +++ b/scripts/gendwarfksyms/gendwarfksyms.c @@ -138,7 +138,8 @@ int main(int argc, char **argv) error("no input files?"); } - symbol_read_exports(stdin); + if (!symbol_read_exports(stdin)) + return 0; if (symtypes_file) { symfile = fopen(symtypes_file, "w"); diff --git a/scripts/gendwarfksyms/gendwarfksyms.h b/scripts/gendwarfksyms/gendwarfksyms.h index d9c06d2cb1df..32cec8f7695a 100644 --- a/scripts/gendwarfksyms/gendwarfksyms.h +++ b/scripts/gendwarfksyms/gendwarfksyms.h @@ -123,7 +123,7 @@ struct symbol { typedef void (*symbol_callback_t)(struct symbol *, void *arg); bool is_symbol_ptr(const char *name); -void symbol_read_exports(FILE *file); +int symbol_read_exports(FILE *file); void symbol_read_symtab(int fd); struct symbol *symbol_get(const char *name); void symbol_set_ptr(struct symbol *sym, Dwarf_Die *ptr); diff --git 
a/scripts/gendwarfksyms/symbols.c b/scripts/gendwarfksyms/symbols.c index 35ed594f0749..ecddcb5ffcdf 100644 --- a/scripts/gendwarfksyms/symbols.c +++ b/scripts/gendwarfksyms/symbols.c @@ -128,7 +128,7 @@ static bool is_exported(const char *name) return for_each(name, NULL, NULL) > 0; } -void symbol_read_exports(FILE *file) +int symbol_read_exports(FILE *file) { struct symbol *sym; char *line = NULL; @@ -159,6 +159,8 @@ void symbol_read_exports(FILE *file) free(line); debug("%d exported symbols", nsym); + + return nsym; } static void get_symbol(struct symbol *sym, void *arg) From bb8336a5163a5839476f27ed1ad69df4a19e13ca Mon Sep 17 00:00:00 2001 From: Kriish Sharma Date: Mon, 10 Nov 2025 18:25:45 +0000 Subject: [PATCH 321/543] ethtool: fix incorrect kernel-doc style comment in ethtool.h Building documentation produced the following warning: WARNING: ./include/linux/ethtool.h:495 This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * IEEE 802.3ck/df defines 16 bins for FEC histogram plus one more for This comment was not intended to be parsed as kernel-doc, so replace the '/**' with '/*' to silence the warning and align with normal comment style in header files. No functional changes. 
Signed-off-by: Kriish Sharma Link: https://patch.msgid.link/20251110182545.2112596-1-kriish.sharma2006@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c2d8b4ec62eb..5c9162193d26 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -492,7 +492,7 @@ struct ethtool_pause_stats { }; #define ETHTOOL_MAX_LANES 8 -/** +/* * IEEE 802.3ck/df defines 16 bins for FEC histogram plus one more for * the end-of-list marker, total 17 items */ From 63c643aa7b7287fdbb0167063785f89ece3f000f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 10 Nov 2025 19:23:40 +0100 Subject: [PATCH 322/543] selftests: mptcp: connect: fix fallback note due to OoO The "fallback due to TCP OoO" was never printed because the stat_ooo_now variable was checked twice: once in the parent if-statement, and one in the child one. The second condition was then always true then, and the 'else' branch was never taken. The idea is that when there are more ACK + MP_CAPABLE than expected, the test either fails if there was no out of order packets, or a notice is printed. 
Fixes: 69ca3d29a755 ("mptcp: update selftest for fallback due to OoO") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251110-net-mptcp-sft-join-unstable-v1-1-a4332c714e10@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 47ecb5b3836e..9b7b93f8eb0c 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -492,7 +492,7 @@ do_transfer() "than expected (${expect_synrx})" retc=1 fi - if [ ${stat_ackrx_now_l} -lt ${expect_ackrx} ] && [ ${stat_ooo_now} -eq 0 ]; then + if [ ${stat_ackrx_now_l} -lt ${expect_ackrx} ]; then if [ ${stat_ooo_now} -eq 0 ]; then mptcp_lib_pr_fail "lower MPC ACK rx (${stat_ackrx_now_l})" \ "than expected (${expect_ackrx})" From aea73bae662a0e184393d6d7d0feb18d2577b9b9 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 10 Nov 2025 19:23:41 +0100 Subject: [PATCH 323/543] selftests: mptcp: join: rm: set backup flag Some of these 'remove' tests rarely fail because a subflow has been reset instead of cleanly removed. This can happen when one extra subflow which has never carried data is being closed (FIN) on one side, while the other is sending data for the first time. To avoid such subflows to be used right at the end, the backup flag has been added. With that, data will be only carried on the initial subflow. 
Fixes: d2c4333a801c ("selftests: mptcp: add testcases for removing addrs") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251110-net-mptcp-sft-join-unstable-v1-2-a4332c714e10@kernel.org Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/mptcp/mptcp_join.sh | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 78a1aa4ecff2..9ed9ec7202d6 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -2532,7 +2532,7 @@ remove_tests() if reset "remove single subflow"; then pm_nl_set_limits $ns1 0 1 pm_nl_set_limits $ns2 0 1 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup addr_nr_ns2=-1 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 1 @@ -2545,8 +2545,8 @@ remove_tests() if reset "remove multiple subflows"; then pm_nl_set_limits $ns1 0 2 pm_nl_set_limits $ns2 0 2 - pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow,backup + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup addr_nr_ns2=-2 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 2 2 2 @@ -2557,7 +2557,7 @@ remove_tests() # single address, remove if reset "remove single address"; then pm_nl_set_limits $ns1 0 1 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup pm_nl_set_limits $ns2 1 1 addr_nr_ns1=-1 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 @@ -2570,9 +2570,9 @@ remove_tests() # subflow and signal, remove if reset "remove subflow and signal"; then pm_nl_set_limits $ns1 0 2 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup pm_nl_set_limits $ns2 1 2 - pm_nl_add_endpoint 
$ns2 10.0.3.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup addr_nr_ns1=-1 addr_nr_ns2=-1 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 2 2 2 @@ -2584,10 +2584,10 @@ remove_tests() # subflows and signal, remove if reset "remove subflows and signal"; then pm_nl_set_limits $ns1 0 3 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup pm_nl_set_limits $ns2 1 3 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow - pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup + pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow,backup addr_nr_ns1=-1 addr_nr_ns2=-2 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 3 3 3 @@ -2599,9 +2599,9 @@ remove_tests() # addresses remove if reset "remove addresses"; then pm_nl_set_limits $ns1 3 3 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal id 250 - pm_nl_add_endpoint $ns1 10.0.3.1 flags signal - pm_nl_add_endpoint $ns1 10.0.4.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup id 250 + pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.4.1 flags signal,backup pm_nl_set_limits $ns2 3 3 addr_nr_ns1=-3 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 @@ -2614,10 +2614,10 @@ remove_tests() # invalid addresses remove if reset "remove invalid addresses"; then pm_nl_set_limits $ns1 3 3 - pm_nl_add_endpoint $ns1 10.0.12.1 flags signal + pm_nl_add_endpoint $ns1 10.0.12.1 flags signal,backup # broadcast IP: no packet for this address will be received on ns1 - pm_nl_add_endpoint $ns1 224.0.0.1 flags signal - pm_nl_add_endpoint $ns1 10.0.3.1 flags signal + pm_nl_add_endpoint $ns1 224.0.0.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup pm_nl_set_limits $ns2 2 2 addr_nr_ns1=-3 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 @@ -2631,10 +2631,10 @@ remove_tests() # subflows and signal, flush if reset "flush subflows and signal"; then pm_nl_set_limits $ns1 0 3 - 
pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup pm_nl_set_limits $ns2 1 3 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow - pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup + pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow,backup addr_nr_ns1=-8 addr_nr_ns2=-8 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 3 3 3 @@ -2647,9 +2647,9 @@ remove_tests() if reset "flush subflows"; then pm_nl_set_limits $ns1 3 3 pm_nl_set_limits $ns2 3 3 - pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow id 150 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow - pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow,backup id 150 + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup + pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow,backup addr_nr_ns1=-8 addr_nr_ns2=-8 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 3 3 3 @@ -2666,9 +2666,9 @@ remove_tests() # addresses flush if reset "flush addresses"; then pm_nl_set_limits $ns1 3 3 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal id 250 - pm_nl_add_endpoint $ns1 10.0.3.1 flags signal - pm_nl_add_endpoint $ns1 10.0.4.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup id 250 + pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.4.1 flags signal,backup pm_nl_set_limits $ns2 3 3 addr_nr_ns1=-8 addr_nr_ns2=-8 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 @@ -2681,9 +2681,9 @@ remove_tests() # invalid addresses flush if reset "flush invalid addresses"; then pm_nl_set_limits $ns1 3 3 - pm_nl_add_endpoint $ns1 10.0.12.1 flags signal - pm_nl_add_endpoint $ns1 10.0.3.1 flags signal - pm_nl_add_endpoint $ns1 10.0.14.1 flags signal + pm_nl_add_endpoint $ns1 10.0.12.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.14.1 flags signal,backup pm_nl_set_limits $ns2 3 3 addr_nr_ns1=-8 speed=slow \ 
run_tests $ns1 $ns2 10.0.1.1 From 6457595db9870298ee30b6d75287b8548e33fe19 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 10 Nov 2025 19:23:42 +0100 Subject: [PATCH 324/543] selftests: mptcp: join: endpoints: longer transfer In rare cases, when the test environment is very slow, some userspace tests can fail because some expected events have not been seen. Because the tests are expecting a long on-going connection, and they are not waiting for the end of the transfer, it is fine to make the connection longer. This connection will be killed at the end, after the verifications, so making it longer doesn't change anything, apart from avoid it to end before the end of the verifications To play it safe, all endpoints tests not waiting for the end of the transfer are now sharing a longer file (128KB) at slow speed. Fixes: 69c6ce7b6eca ("selftests: mptcp: add implicit endpoint test case") Cc: stable@vger.kernel.org Fixes: e274f7154008 ("selftests: mptcp: add subflow limits test-cases") Fixes: b5e2fb832f48 ("selftests: mptcp: add explicit test case for remove/readd") Fixes: e06959e9eebd ("selftests: mptcp: join: test for flush/re-add endpoints") Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251110-net-mptcp-sft-join-unstable-v1-3-a4332c714e10@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 9ed9ec7202d6..97af8d89ac5c 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3943,7 +3943,7 @@ endpoint_tests() pm_nl_set_limits $ns1 2 2 pm_nl_set_limits $ns2 2 2 pm_nl_add_endpoint $ns1 10.0.2.1 flags signal - { speed=slow \ + { test_linkfail=128 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! 
@@ -3970,7 +3970,7 @@ endpoint_tests() pm_nl_set_limits $ns2 0 3 pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow - { test_linkfail=4 speed=5 \ + { test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! @@ -4048,7 +4048,7 @@ endpoint_tests() # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal pm_nl_add_endpoint $ns1 10.0.1.1 id 42 flags signal - { test_linkfail=4 speed=5 \ + { test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! @@ -4121,7 +4121,7 @@ endpoint_tests() # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow - { test_linkfail=4 speed=20 \ + { test_linkfail=128 speed=20 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! From 290493078b96ce2ce3e60f55c23654acb678042a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 10 Nov 2025 19:23:43 +0100 Subject: [PATCH 325/543] selftests: mptcp: join: userspace: longer transfer In rare cases, when the test environment is very slow, some userspace tests can fail because some expected events have not been seen. Because the tests are expecting a long on-going connection, and they are not waiting for the end of the transfer, it is fine to make the connection longer. This connection will be killed at the end, after the verifications, so making it longer doesn't change anything, apart from avoid it to end before the end of the verifications To play it safe, all userspace tests not waiting for the end of the transfer are now sharing a longer file (128KB) at slow speed. 
Fixes: 4369c198e599 ("selftests: mptcp: test userspace pm out of transfer") Cc: stable@vger.kernel.org Fixes: b2e2248f365a ("selftests: mptcp: userspace pm create id 0 subflow") Fixes: e3b47e460b4b ("selftests: mptcp: userspace pm remove initial subflow") Fixes: b9fb176081fb ("selftests: mptcp: userspace pm send RM_ADDR for ID 0") Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251110-net-mptcp-sft-join-unstable-v1-4-a4332c714e10@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 97af8d89ac5c..01273abfdc89 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3806,7 +3806,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns1 pm_nl_set_limits $ns2 2 2 - { speed=5 \ + { test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns1 @@ -3839,7 +3839,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - { speed=5 \ + { test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 @@ -3867,7 +3867,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - { speed=5 \ + { test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! 
wait_mpj $ns2 @@ -3888,7 +3888,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - { speed=5 \ + { test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 @@ -3912,7 +3912,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns1 pm_nl_set_limits $ns2 1 1 - { speed=5 \ + { test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns1 From ee79980f7a428ec299f6261bea4c1084dcbc9631 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 10 Nov 2025 19:23:44 +0100 Subject: [PATCH 326/543] selftests: mptcp: connect: trunc: read all recv data MPTCP Join "fastclose server" selftest is sometimes failing because the client output file doesn't have the expected size, e.g. 296B instead of 1024B. When looking at a packet trace when this happens, the server sent the expected 1024B in two parts -- 100B, then 924B -- then the MP_FASTCLOSE. It is then strange to see the client only receiving 296B, which would mean it only got a part of the second packet. The problem is then not on the networking side, but rather on the data reception side. When mptcp_connect is launched with '-f -1', it means the connection might stop before having sent everything, because a reset has been received. When this happens, the program was directly stopped. But it is also possible there are still some data to read, simply because the previous 'read' step was done with a buffer smaller than the pending data, see do_rnd_read(). In this case, it is important to read what's left in the kernel buffers before stopping without error like before. SIGPIPE is now ignored, not to quit the app before having read everything. 
Fixes: 6bf41020b72b ("selftests: mptcp: update and extend fastclose test-cases") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251110-net-mptcp-sft-join-unstable-v1-5-a4332c714e10@kernel.org Signed-off-by: Jakub Kicinski --- .../selftests/net/mptcp/mptcp_connect.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index b148cadb96d0..fc7e22b503d3 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -710,8 +710,14 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bw = do_rnd_write(peerfd, winfo->buf + winfo->off, winfo->len); if (bw < 0) { - if (cfg_rcv_trunc) - return 0; + /* expected reset, continue to read */ + if (cfg_rcv_trunc && + (errno == ECONNRESET || + errno == EPIPE)) { + fds.events &= ~POLLOUT; + continue; + } + perror("write"); return 111; } @@ -737,8 +743,10 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, } if (fds.revents & (POLLERR | POLLNVAL)) { - if (cfg_rcv_trunc) - return 0; + if (cfg_rcv_trunc) { + fds.events &= ~(POLLERR | POLLNVAL); + continue; + } fprintf(stderr, "Unexpected revents: " "POLLERR/POLLNVAL(%x)\n", fds.revents); return 5; @@ -1433,7 +1441,7 @@ static void parse_opts(int argc, char **argv) */ if (cfg_truncate < 0) { cfg_rcv_trunc = true; - signal(SIGPIPE, handle_signal); + signal(SIGPIPE, SIG_IGN); } break; case 'j': From 852b644acbce1529307a4bb283752c4e77b5cda7 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 10 Nov 2025 19:23:45 +0100 Subject: [PATCH 327/543] selftests: mptcp: join: properly kill background tasks The 'run_tests' function is executed in the background, but killing its associated PID would not kill the children tasks running in the background. 
To properly kill all background tasks, 'kill -- -PID' could be used, but this requires kill from procps-ng. Instead, all children tasks are listed using 'ps', and 'kill' is called with all PIDs of this group. Fixes: 31ee4ad86afd ("selftests: mptcp: join: stop transfer when check is done (part 1)") Cc: stable@vger.kernel.org Fixes: 04b57c9e096a ("selftests: mptcp: join: stop transfer when check is done (part 2)") Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251110-net-mptcp-sft-join-unstable-v1-6-a4332c714e10@kernel.org Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/mptcp/mptcp_join.sh | 18 ++++++++-------- .../testing/selftests/net/mptcp/mptcp_lib.sh | 21 +++++++++++++++++++ 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 01273abfdc89..41503c241989 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3831,7 +3831,7 @@ userspace_tests() chk_mptcp_info subflows 0 subflows 0 chk_subflows_total 1 1 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi # userspace pm create destroy subflow @@ -3859,7 +3859,7 @@ userspace_tests() chk_mptcp_info subflows 0 subflows 0 chk_subflows_total 1 1 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi # userspace pm create id 0 subflow @@ -3880,7 +3880,7 @@ userspace_tests() chk_mptcp_info subflows 1 subflows 1 chk_subflows_total 2 2 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi # userspace pm remove initial subflow @@ -3904,7 +3904,7 @@ userspace_tests() chk_mptcp_info subflows 1 subflows 1 chk_subflows_total 1 1 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi # userspace pm send RM_ADDR for ID 0 @@ -3930,7 +3930,7 @@ userspace_tests() 
chk_mptcp_info subflows 1 subflows 1 chk_subflows_total 1 1 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi } @@ -3960,7 +3960,7 @@ endpoint_tests() pm_nl_add_endpoint $ns2 10.0.2.2 flags signal pm_nl_check_endpoint "modif is allowed" \ $ns2 10.0.2.2 id 1 flags signal - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi if reset_with_tcp_filter "delete and re-add" ns2 10.0.3.2 REJECT OUTPUT && @@ -4015,7 +4015,7 @@ endpoint_tests() chk_mptcp_info subflows 3 subflows 3 done - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid kill_events_pids chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1 @@ -4089,7 +4089,7 @@ endpoint_tests() wait_mpj $ns2 chk_subflow_nr "after re-re-add ID 0" 3 chk_mptcp_info subflows 3 subflows 3 - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid kill_events_pids chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1 @@ -4137,7 +4137,7 @@ endpoint_tests() wait_mpj $ns2 pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal wait_mpj $ns2 - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid join_syn_tx=3 join_connect_err=1 \ chk_join_nr 2 2 2 diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index d62e653d48b0..f4388900016a 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -350,6 +350,27 @@ mptcp_lib_kill_wait() { wait "${1}" 2>/dev/null } +# $1: PID +mptcp_lib_pid_list_children() { + local curr="${1}" + # evoke 'ps' only once + local pids="${2:-"$(ps o pid,ppid)"}" + + echo "${curr}" + + local pid + for pid in $(echo "${pids}" | awk "\$2 == ${curr} { print \$1 }"); do + mptcp_lib_pid_list_children "${pid}" "${pids}" + done +} + +# $1: PID +mptcp_lib_kill_group_wait() { + # Some users might not have procps-ng: cannot use "kill -- -PID" + mptcp_lib_pid_list_children "${1}" | xargs -r kill &>/dev/null + wait 
"${1}" 2>/dev/null +} + # $1: IP address mptcp_lib_is_v6() { [ -z "${1##*:*}" ] From d58041d2c63e09a1c9083e0e9f4151e487c4e16a Mon Sep 17 00:00:00 2001 From: Magnus Lindholm Date: Tue, 4 Nov 2025 11:33:43 +0100 Subject: [PATCH 328/543] MAINTAINERS: Add Magnus Lindholm as maintainer for alpha port Acked-by: John Paul Adrian Glaubitz Acked-by: Matt Turner Signed-off-by: Magnus Lindholm Signed-off-by: Matt Turner --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f0c8b85baa6b..69e27908f74c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -915,6 +915,7 @@ F: drivers/staging/media/sunxi/cedrus/ ALPHA PORT M: Richard Henderson M: Matt Turner +M: Magnus Lindholm L: linux-alpha@vger.kernel.org S: Odd Fixes F: arch/alpha/ From 0345552a653ce5542affeb69ac5aa52177a5199b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 9 Nov 2025 16:12:15 +0000 Subject: [PATCH 329/543] net_sched: limit try_bulk_dequeue_skb() batches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit 100dfa74cad9 ("inet: dev_queue_xmit() llist adoption") I started seeing many qdisc requeues on IDPF under high TX workload. $ tc -s qd sh dev eth1 handle 1: ; sleep 1; tc -s qd sh dev eth1 handle 1: qdisc mq 1: root Sent 43534617319319 bytes 268186451819 pkt (dropped 0, overlimits 0 requeues 3532840114) backlog 1056Kb 6675p requeues 3532840114 qdisc mq 1: root Sent 43554665866695 bytes 268309964788 pkt (dropped 0, overlimits 0 requeues 3537737653) backlog 781164b 4822p requeues 3537737653 This is caused by try_bulk_dequeue_skb() being only limited by BQL budget. perf record -C120-239 -e qdisc:qdisc_dequeue sleep 1 ; perf script ... 
netperf 75332 [146] 2711.138269: qdisc:qdisc_dequeue: dequeue ifindex=5 qdisc handle=0x80150000 parent=0x10013 txq_state=0x0 packets=1292 skbaddr=0xff378005a1e9f200 netperf 75332 [146] 2711.138953: qdisc:qdisc_dequeue: dequeue ifindex=5 qdisc handle=0x80150000 parent=0x10013 txq_state=0x0 packets=1213 skbaddr=0xff378004d607a500 netperf 75330 [144] 2711.139631: qdisc:qdisc_dequeue: dequeue ifindex=5 qdisc handle=0x80150000 parent=0x10013 txq_state=0x0 packets=1233 skbaddr=0xff3780046be20100 netperf 75333 [147] 2711.140356: qdisc:qdisc_dequeue: dequeue ifindex=5 qdisc handle=0x80150000 parent=0x10013 txq_state=0x0 packets=1093 skbaddr=0xff37800514845b00 netperf 75337 [151] 2711.141037: qdisc:qdisc_dequeue: dequeue ifindex=5 qdisc handle=0x80150000 parent=0x10013 txq_state=0x0 packets=1353 skbaddr=0xff37800460753300 netperf 75337 [151] 2711.141877: qdisc:qdisc_dequeue: dequeue ifindex=5 qdisc handle=0x80150000 parent=0x10013 txq_state=0x0 packets=1367 skbaddr=0xff378004e72c7b00 netperf 75330 [144] 2711.142643: qdisc:qdisc_dequeue: dequeue ifindex=5 qdisc handle=0x80150000 parent=0x10013 txq_state=0x0 packets=1202 skbaddr=0xff3780045bd60000 ... This is bad because : 1) Large batches hold one victim cpu for a very long time. 2) Driver often hit their own TX ring limit (all slots are used). 3) We call dev_requeue_skb() 4) Requeues are using a FIFO (q->gso_skb), breaking qdisc ability to implement FQ or priority scheduling. 5) dequeue_skb() gets packets from q->gso_skb one skb at a time with no xmit_more support. This is causing many spinlock games between the qdisc and the device driver. Requeues were supposed to be very rare, lets keep them this way. Limit batch sizes to /proc/sys/net/core/dev_weight (default 64) as __qdisc_run() was designed to use. 
Fixes: 5772e9a3463b ("qdisc: bulk dequeue support for qdiscs with TCQ_F_ONETXQUEUE") Signed-off-by: Eric Dumazet Reviewed-by: Toke Høiland-Jørgensen Acked-by: Jesper Dangaard Brouer Link: https://patch.msgid.link/20251109161215.2574081-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_generic.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 1e008a228ebd..7dee9748a56b 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -180,9 +180,10 @@ static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) static void try_bulk_dequeue_skb(struct Qdisc *q, struct sk_buff *skb, const struct netdev_queue *txq, - int *packets) + int *packets, int budget) { int bytelimit = qdisc_avail_bulklimit(txq) - skb->len; + int cnt = 0; while (bytelimit > 0) { struct sk_buff *nskb = q->dequeue(q); @@ -193,8 +194,10 @@ static void try_bulk_dequeue_skb(struct Qdisc *q, bytelimit -= nskb->len; /* covers GSO len */ skb->next = nskb; skb = nskb; - (*packets)++; /* GSO counts as one pkt */ + if (++cnt >= budget) + break; } + (*packets) += cnt; skb_mark_not_on_list(skb); } @@ -228,7 +231,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, * A requeued skb (via q->gso_skb) can also be a SKB list. */ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, - int *packets) + int *packets, int budget) { const struct netdev_queue *txq = q->dev_queue; struct sk_buff *skb = NULL; @@ -295,7 +298,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, if (skb) { bulk: if (qdisc_may_bulk(q)) - try_bulk_dequeue_skb(q, skb, txq, packets); + try_bulk_dequeue_skb(q, skb, txq, packets, budget); else try_bulk_dequeue_skb_slow(q, skb, packets); } @@ -387,7 +390,7 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, * >0 - queue is not empty. 
* */ -static inline bool qdisc_restart(struct Qdisc *q, int *packets) +static inline bool qdisc_restart(struct Qdisc *q, int *packets, int budget) { spinlock_t *root_lock = NULL; struct netdev_queue *txq; @@ -396,7 +399,7 @@ static inline bool qdisc_restart(struct Qdisc *q, int *packets) bool validate; /* Dequeue packet */ - skb = dequeue_skb(q, &validate, packets); + skb = dequeue_skb(q, &validate, packets, budget); if (unlikely(!skb)) return false; @@ -414,7 +417,7 @@ void __qdisc_run(struct Qdisc *q) int quota = READ_ONCE(net_hotdata.dev_tx_weight); int packets; - while (qdisc_restart(q, &packets)) { + while (qdisc_restart(q, &packets, quota)) { quota -= packets; if (quota <= 0) { if (q->flags & TCQ_F_NOLOCK) From 22a36e660d014925114feb09a2680bb3c2d1e279 Mon Sep 17 00:00:00 2001 From: Vitaly Prosyak Date: Thu, 6 Nov 2025 12:35:53 -0500 Subject: [PATCH 330/543] drm/amdgpu: disable peer-to-peer access for DCC-enabled GC12 VRAM surfaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Certain multi-GPU configurations (especially GFX12) may hit data corruption when a DCC-compressed VRAM surface is shared across GPUs using peer-to-peer (P2P) DMA transfers. Such surfaces rely on device-local metadata and cannot be safely accessed through a remote GPU’s page tables. Attempting to import a DCC-enabled surface through P2P leads to incorrect rendering or GPU faults. This change disables P2P for DCC-enabled VRAM buffers that are contiguous and allocated on GFX12+ hardware. In these cases, the importer falls back to the standard system-memory path, avoiding invalid access to compressed surfaces. Future work could consider optional migration (VRAM→System→VRAM) if a performance regression is observed when `attach->peer2peer = false`. 
Tested on: - Dual RX 9700 XT (Navi4x) setup - GNOME and Wayland compositor scenarios - Confirmed no corruption after disabling P2P under these conditions v2: Remove check TTM_PL_VRAM & TTM_PL_FLAG_CONTIGUOUS. v3: simplify for upstream and fix ip version check (Alex) Suggested-by: Christian König Signed-off-by: Vitaly Prosyak Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit 9dff2bb709e6fbd97e263fd12bf12802d2b5a0cf) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index 8561ad7f6180..ed3bef1edfe4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -82,6 +82,18 @@ static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf, struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); + /* + * Disable peer-to-peer access for DCC-enabled VRAM surfaces on GFX12+. + * Such buffers cannot be safely accessed over P2P due to device-local + * compression metadata. Fallback to system-memory path instead.
+ * Device supports GFX12 (GC 12.x or newer) + * BO was created with the AMDGPU_GEM_CREATE_GFX12_DCC flag + * + */ + if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0) && + bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) + attach->peer2peer = false; + if (!amdgpu_dmabuf_is_xgmi_accessible(attach_adev, bo) && pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0) attach->peer2peer = false; From 9f8fd538e244b87e4556833da51ddd986f50cc81 Mon Sep 17 00:00:00 2001 From: Pierre-Eric Pelloux-Prayer Date: Tue, 4 Nov 2025 10:42:45 +0100 Subject: [PATCH 331/543] drm/amdgpu: jump to the correct label on failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drm_sched_entity_init wasn't called yet, so the only thing to do is to release allocated memory. This doesn't fix any bug since entity is zero allocated and drm_sched_entity_fini does nothing in this case. Signed-off-by: Pierre-Eric Pelloux-Prayer Reviewed-by: Tvrtko Ursulin Acked-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit ec49374ccb8da86b465beaf09c367f3dfd648d8f) --- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index f5d5c45ddc0d..afedea02188d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -236,7 +236,7 @@ static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, u32 hw_ip, r = amdgpu_xcp_select_scheds(adev, hw_ip, hw_prio, fpriv, &num_scheds, &scheds); if (r) - goto cleanup_entity; + goto error_free_entity; } /* disable load balance if the hw engine retains context among dependent jobs */ From 6623c5f9fd877868fba133b4ae4dab0052e82dad Mon Sep 17 00:00:00 2001 From: "Jesse.Zhang" Date: Fri, 24 Oct 2025 16:09:25 +0800 Subject: [PATCH 332/543] drm/amdgpu: fix lock warning in amdgpu_userq_fence_driver_process MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a potential deadlock caused by inconsistent spinlock usage between interrupt and process contexts in the userq fence driver. The issue occurs when amdgpu_userq_fence_driver_process() is called from both: - Interrupt context: gfx_v11_0_eop_irq() -> amdgpu_userq_fence_driver_process() - Process context: amdgpu_eviction_fence_suspend_worker() -> amdgpu_userq_fence_driver_force_completion() -> amdgpu_userq_fence_driver_process() In interrupt context, the spinlock was acquired without disabling interrupts, leaving it in {IN-HARDIRQ-W} state. When the same lock is acquired in process context, the kernel detects inconsistent locking since the process context acquisition would enable interrupts while holding a lock previously acquired in interrupt context. Kernel log shows: [ 4039.310790] inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. [ 4039.310804] kworker/7:2/409 [HC0[0]:SC0[0]:HE1:SE1] takes: [ 4039.310818] ffff9284e1bed000 (&fence_drv->fence_list_lock){?...}-{3:3}, [ 4039.310993] {IN-HARDIRQ-W} state was registered at: [ 4039.311004] lock_acquire+0xc6/0x300 [ 4039.311018] _raw_spin_lock+0x39/0x80 [ 4039.311031] amdgpu_userq_fence_driver_process.part.0+0x30/0x180 [amdgpu] [ 4039.311146] amdgpu_userq_fence_driver_process+0x17/0x30 [amdgpu] [ 4039.311257] gfx_v11_0_eop_irq+0x132/0x170 [amdgpu] Fix by using spin_lock_irqsave()/spin_unlock_irqrestore() to properly manage interrupt state regardless of calling context. 
Reviewed-by: Christian König Signed-off-by: Jesse Zhang Signed-off-by: Alex Deucher (cherry picked from commit ded3ad780cf97a04927773c4600823b84f7f3cc2) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index 761bad98da3e..4d0096d0baa9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -151,15 +151,16 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d { struct amdgpu_userq_fence *userq_fence, *tmp; struct dma_fence *fence; + unsigned long flags; u64 rptr; int i; if (!fence_drv) return; + spin_lock_irqsave(&fence_drv->fence_list_lock, flags); rptr = amdgpu_userq_fence_read(fence_drv); - spin_lock(&fence_drv->fence_list_lock); list_for_each_entry_safe(userq_fence, tmp, &fence_drv->fences, link) { fence = &userq_fence->base; @@ -174,7 +175,7 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d list_del(&userq_fence->link); dma_fence_put(fence); } - spin_unlock(&fence_drv->fence_list_lock); + spin_unlock_irqrestore(&fence_drv->fence_list_lock, flags); } void amdgpu_userq_fence_driver_destroy(struct kref *ref) From 33c995709121a3a29d4567a08c943bf7a5b24b78 Mon Sep 17 00:00:00 2001 From: Ivan Lipski Date: Thu, 23 Oct 2025 10:03:59 -0400 Subject: [PATCH 333/543] drm/amd/display: Allow VRR params change if unsynced with the stream MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Why] When changing resolution (e.g., 4K → FHD) in mirror/clone mode with certain monitors, the monitor blanks and loses connection due to an early exit in vrr_settings_require_update(). The function only checks if VRR state, fixed refresh target, or min/max refresh rate range has changed. 
During mode changes, if the calculated min/max refresh values remain the same even though the stream's v_total changed, the function returns early without updating vrr_params.adjust.v_total_min/max, leaving the monitor's VRR timing parameters unsynced with the new mode, causing it to blank out. [How] Explicitly adjust VRR parameters to the stream's nominal v_total when VRR is supported, but inactive. Fixes: 6d31602a9f57 ("drm/amd/display: more liberal vmin/vmax update for freesync") Reviewed-by: Aurabindo Pillai Signed-off-by: Ivan Lipski Signed-off-by: Fangzhi Zuo Tested-by: Dan Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 607df8248a011524211ee34850345305a1913f9e) --- .../gpu/drm/amd/display/modules/freesync/freesync.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c index ce421bcddcb0..1aae46d703ba 100644 --- a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c +++ b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c @@ -1260,6 +1260,17 @@ void mod_freesync_handle_v_update(struct mod_freesync *mod_freesync, update_v_total_for_static_ramp( core_freesync, stream, in_out_vrr); } + + /* + * If VRR is inactive, set vtotal min and max to nominal vtotal + */ + if (in_out_vrr->state == VRR_STATE_INACTIVE) { + in_out_vrr->adjust.v_total_min = + mod_freesync_calc_v_total_from_refresh(stream, + in_out_vrr->max_refresh_in_uhz); + in_out_vrr->adjust.v_total_max = in_out_vrr->adjust.v_total_min; + return; + } } unsigned long long mod_freesync_calc_nominal_field_rate( From 7132f7e025f9382157543dd86a62d161335b48b9 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Fri, 7 Nov 2025 13:07:13 -0500 Subject: [PATCH 334/543] drm/amd/amdgpu: Ensure isp_kernel_buffer_alloc() creates a new BO When the BO pointer provided to amdgpu_bo_create_kernel() points to non-NULL, amdgpu_bo_create_kernel() takes it as a hint to pin that address 
rather than allocate a new BO. This functionality is never desired for allocating ISP buffers. A new BO should always be created when isp_kernel_buffer_alloc() is called, per the description for isp_kernel_buffer_alloc(). Ensure this by zeroing *bo right before the amdgpu_bo_create_kernel() call. Fixes: 55d42f616976 ("drm/amd/amdgpu: Add helper functions for isp buffers") Reviewed-by: Mario Limonciello (AMD) Reviewed-by: Pratap Nirujogi Signed-off-by: Sultan Alsawaf Signed-off-by: Alex Deucher (cherry picked from commit 73c8c29baac7f0c7e703d92eba009008cbb5228e) --- drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c index 9cddbf50442a..37270c4dab8d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c @@ -280,6 +280,8 @@ int isp_kernel_buffer_alloc(struct device *dev, u64 size, if (ret) return ret; + /* Ensure *bo is NULL so a new BO will be created */ + *bo = NULL; ret = amdgpu_bo_create_kernel(adev, size, ISP_MC_ADDR_ALIGN, From bbe3c115030da431c9ec843c18d5583e59482dd2 Mon Sep 17 00:00:00 2001 From: Sathishkumar S Date: Tue, 7 Oct 2025 13:17:51 +0530 Subject: [PATCH 335/543] drm/amdgpu/jpeg: Add parse_cs for JPEG5_0_1 enable parse_cs callback for JPEG5_0_1. 
Signed-off-by: Sathishkumar S Reviewed-by: Leo Liu Signed-off-by: Alex Deucher (cherry picked from commit 547985579932c1de13f57f8bcf62cd9361b9d3d3) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c index baf097d2e1ac..ab0bf880d3d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c @@ -878,6 +878,7 @@ static const struct amdgpu_ring_funcs jpeg_v5_0_1_dec_ring_vm_funcs = { .get_rptr = jpeg_v5_0_1_dec_ring_get_rptr, .get_wptr = jpeg_v5_0_1_dec_ring_get_wptr, .set_wptr = jpeg_v5_0_1_dec_ring_set_wptr, + .parse_cs = amdgpu_jpeg_dec_parse_cs, .emit_frame_size = SOC15_FLUSH_GPU_TLB_NUM_WREG * 6 + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 8 + From d15deafab5d722afb9e2f83c5edcdef9d9d98bd1 Mon Sep 17 00:00:00 2001 From: Jonathan Kim Date: Thu, 6 Nov 2025 10:17:06 -0500 Subject: [PATCH 336/543] drm/amdkfd: relax checks for over allocation of save area Over allocation of save area is not fatal, only under allocation is. ROCm has various components that independently claim authority over save area size. Unless KFD decides to claim single authority, relax size checks. 
Signed-off-by: Jonathan Kim Reviewed-by: Philip Yang Signed-off-by: Alex Deucher (cherry picked from commit 15bd4958fe38e763bc17b607ba55155254a01f55) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index a65c67cf56ff..f1e7583650c4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -297,16 +297,16 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope goto out_err_unreserve; } - if (properties->ctx_save_restore_area_size != topo_dev->node_props.cwsr_size) { - pr_debug("queue cwsr size 0x%x not equal to node cwsr size 0x%x\n", + if (properties->ctx_save_restore_area_size < topo_dev->node_props.cwsr_size) { + pr_debug("queue cwsr size 0x%x not sufficient for node cwsr size 0x%x\n", properties->ctx_save_restore_area_size, topo_dev->node_props.cwsr_size); err = -EINVAL; goto out_err_unreserve; } - total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size) - * NUM_XCC(pdd->dev->xcc_mask); + total_cwsr_size = (properties->ctx_save_restore_area_size + + topo_dev->node_props.debug_memory_size) * NUM_XCC(pdd->dev->xcc_mask); total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE); err = kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address, @@ -352,8 +352,8 @@ int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_prope topo_dev = kfd_topology_device_by_id(pdd->dev->id); if (!topo_dev) return -EINVAL; - total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size) - * NUM_XCC(pdd->dev->xcc_mask); + total_cwsr_size = (properties->ctx_save_restore_area_size + + topo_dev->node_props.debug_memory_size) * NUM_XCC(pdd->dev->xcc_mask); total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE); kfd_queue_buffer_svm_put(pdd, 
properties->ctx_save_restore_area_address, total_cwsr_size); From eac32ff42393efa6657efc821231b8d802c1d485 Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Tue, 28 Oct 2025 14:37:07 -0400 Subject: [PATCH 337/543] drm/amdkfd: Fix GPU mappings for APU after prefetch Fix the following corner case:- Consider a 2M huge page SVM allocation, followed by prefetch call for the first 4K page. The whole range is initially mapped with single PTE. After the prefetch, this range gets split to first page + rest of the pages. Currently, the first page mapping is not updated on MI300A (APU) since page hasn't migrated. However, after range split PTE mapping is not valid. Fix this by forcing page table update for the whole range when prefetch is called. Calling prefetch on APU doesn't improve performance. If anything, it deteriorates. However, functionality has to be supported. v2: Use apu_prefer_gtt as this issue doesn't apply to APUs with carveout VRAM v3: Simplify by setting the flag for all ASICs as it doesn't affect dGPU v4: Remove v2 and v3 changes.
Force update_mapping when range is split at a size that is not aligned to prange granularity Suggested-by: Philip Yang Signed-off-by: Harish Kasiviswanathan Reviewed-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher (cherry picked from commit 076470b9f6f8d9c7c8ca73a9f054942a686f9ba7) --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 9d72411c3379..74a1d3e1d52b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -3687,6 +3687,8 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm, svm_range_apply_attrs(p, prange, nattr, attrs, &update_mapping); /* TODO: unmap ranges from GPU that lost access */ } + update_mapping |= !p->xnack_enabled && !list_empty(&remap_list); + list_for_each_entry_safe(prange, next, &remove_list, update_list) { pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange, prange->start, From f4c737d44969c38dac2478039d353edddffd120d Mon Sep 17 00:00:00 2001 From: Junjie Cao Date: Thu, 16 Oct 2025 09:49:19 +0800 Subject: [PATCH 338/543] wifi: iwlwifi: fix aux ROC time event iterator usage The list_for_each_entry() iterator must not be used outside the loop. Even though we break and check for NULL, doing so still violates kernel iteration rules and triggers Coccinelle's use_after_iter.cocci warning. Cache the matched entry in aux_roc_te and use it consistently after the loop. This follows iterator best practices, resolves the warning, and makes the code more maintainable. 
Signed-off-by: Junjie Cao Link: https://patch.msgid.link/20251016014919.383565-1-junjie.cao@intel.com Signed-off-by: Miri Korenblit --- .../net/wireless/intel/iwlwifi/mvm/time-event.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c index 0c9c2492d8a7..0b12ee8ad618 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c @@ -463,7 +463,7 @@ static int iwl_mvm_aux_roc_te_handle_notif(struct iwl_mvm *mvm, if (!aux_roc_te) /* Not a Aux ROC time event */ return -EINVAL; - iwl_mvm_te_check_trigger(mvm, notif, te_data); + iwl_mvm_te_check_trigger(mvm, notif, aux_roc_te); IWL_DEBUG_TE(mvm, "Aux ROC time event notification - UID = 0x%x action %d (error = %d)\n", @@ -475,14 +475,14 @@ static int iwl_mvm_aux_roc_te_handle_notif(struct iwl_mvm *mvm, /* End TE, notify mac80211 */ ieee80211_remain_on_channel_expired(mvm->hw); iwl_mvm_roc_finished(mvm); /* flush aux queue */ - list_del(&te_data->list); /* remove from list */ - te_data->running = false; - te_data->vif = NULL; - te_data->uid = 0; - te_data->id = TE_MAX; + list_del(&aux_roc_te->list); /* remove from list */ + aux_roc_te->running = false; + aux_roc_te->vif = NULL; + aux_roc_te->uid = 0; + aux_roc_te->id = TE_MAX; } else if (le32_to_cpu(notif->action) == TE_V2_NOTIF_HOST_EVENT_START) { set_bit(IWL_MVM_STATUS_ROC_AUX_RUNNING, &mvm->status); - te_data->running = true; + aux_roc_te->running = true; ieee80211_ready_on_channel(mvm->hw); /* Start TE */ } else { IWL_DEBUG_TE(mvm, From 3592c0083fb29cca13cd9978b8844d58b4eff548 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 8 Oct 2025 11:20:44 +0200 Subject: [PATCH 339/543] wifi: iwlwifi: mvm: fix beacon template/fixed rate During the development of the rate changes, I evidently made some changes that shouldn't have been there; beacon templates with rate_n_flags are 
only in old versions, so no changes to them should have been necessary, and evidently broke on some devices. This also would have broken fixed (injection) rates, it would seem. Restore the old handling of this. Fixes: dabc88cb3b78 ("wifi: iwlwifi: handle v3 rates") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220558 Reviewed-by: Benjamin Berg Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20251008112044.3bb8ea849d8d.I90f4d2b2c1f62eaedaf304a61d2ab9e50c491c2d@changeid Signed-off-by: Miri Korenblit --- drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c | 13 +++---------- drivers/net/wireless/intel/iwlwifi/mvm/utils.c | 12 +++++++++--- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c index 9c9e0e1c6e1d..867807abde66 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c @@ -938,19 +938,12 @@ u8 iwl_mvm_mac_ctxt_get_lowest_rate(struct iwl_mvm *mvm, u16 iwl_mvm_mac_ctxt_get_beacon_flags(const struct iwl_fw *fw, u8 rate_idx) { + u16 flags = iwl_mvm_mac80211_idx_to_hwrate(fw, rate_idx); bool is_new_rate = iwl_fw_lookup_cmd_ver(fw, BEACON_TEMPLATE_CMD, 0) > 10; - u16 flags, cck_flag; - - if (is_new_rate) { - flags = iwl_mvm_mac80211_idx_to_hwrate(fw, rate_idx); - cck_flag = IWL_MAC_BEACON_CCK; - } else { - cck_flag = IWL_MAC_BEACON_CCK_V1; - flags = iwl_fw_rate_idx_to_plcp(rate_idx); - } if (rate_idx <= IWL_LAST_CCK_RATE) - flags |= cck_flag; + flags |= is_new_rate ? 
IWL_MAC_BEACON_CCK + : IWL_MAC_BEACON_CCK_V1; return flags; } diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c index 22602c32faa5..fa995e235d9b 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c @@ -159,9 +159,15 @@ int iwl_mvm_legacy_rate_to_mac80211_idx(u32 rate_n_flags, u8 iwl_mvm_mac80211_idx_to_hwrate(const struct iwl_fw *fw, int rate_idx) { - return (rate_idx >= IWL_FIRST_OFDM_RATE ? - rate_idx - IWL_FIRST_OFDM_RATE : - rate_idx); + if (iwl_fw_lookup_cmd_ver(fw, TX_CMD, 0) > 8) + /* In the new rate legacy rates are indexed: + * 0 - 3 for CCK and 0 - 7 for OFDM. + */ + return (rate_idx >= IWL_FIRST_OFDM_RATE ? + rate_idx - IWL_FIRST_OFDM_RATE : + rate_idx); + + return iwl_fw_rate_idx_to_plcp(rate_idx); } u8 iwl_mvm_mac80211_ac_to_ucode_ac(enum ieee80211_ac_numbers ac) From 1a222625b468effd13d1ebb662c36a41c28a835a Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Mon, 10 Nov 2025 14:57:00 +0200 Subject: [PATCH 340/543] wifi: iwlwifi: mld: always take beacon ies in link grading One of the factors of a link's grade is the channel load, which is calculated from the AP's bss load element. The current code takes this element from the beacon for an active link, and from bss->ies for an inactive link. bss->ies is set to either the beacon's ies or to the probe response ones, with preference to the probe response (meaning that if there was even one probe response, the ies of it will be stored in bss->ies and won't be overridden by the beacon ies). The probe response can be very old, i.e. from the connection time, where a beacon is updated before each link selection (which is triggered only after a passive scan). In such case, the bss load element in the probe response will not include the channel load caused by the STA, where the beacon will.
This will cause the inactive link to always have a lower channel load, and therefore a higher grade than the active link's one. This causes repeated link switches, causing the throughput to drop. Fix this by always taking the ies from the beacon, as those are for sure new. Fixes: d1e879ec600f ("wifi: iwlwifi: add iwlmld sub-driver") Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20251110145652.b493dbb1853a.I058ba7309c84159f640cc9682d1bda56dd56a536@changeid Signed-off-by: Miri Korenblit --- drivers/net/wireless/intel/iwlwifi/mld/link.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mld/link.c b/drivers/net/wireless/intel/iwlwifi/mld/link.c index 60d814bf5779..f6f52d297a72 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/link.c +++ b/drivers/net/wireless/intel/iwlwifi/mld/link.c @@ -708,18 +708,13 @@ static int iwl_mld_get_chan_load_from_element(struct iwl_mld *mld, struct ieee80211_bss_conf *link_conf) { - struct ieee80211_vif *vif = link_conf->vif; const struct cfg80211_bss_ies *ies; const struct element *bss_load_elem = NULL; const struct ieee80211_bss_load_elem *bss_load; guard(rcu)(); - if (ieee80211_vif_link_active(vif, link_conf->link_id)) - ies = rcu_dereference(link_conf->bss->beacon_ies); - else - ies = rcu_dereference(link_conf->bss->ies); - + ies = rcu_dereference(link_conf->bss->beacon_ies); if (ies) bss_load_elem = cfg80211_find_elem(WLAN_EID_QBSS_LOAD, ies->data, ies->len); From a3f8f8662771285511ae26c4c8d3ba1cd22159b9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 5 Nov 2025 14:39:45 +0100 Subject: [PATCH 341/543] power: always freeze efivarfs The efivarfs filesystems must always be frozen and thawed to resync variable state. Make it so.
Link: https://patch.msgid.link/20251105-vorbild-zutreffen-fe00d1dd98db@brauner Signed-off-by: Christian Brauner --- fs/efivarfs/super.c | 1 + fs/super.c | 13 ++++++++++--- include/linux/fs.h | 3 ++- kernel/power/hibernate.c | 9 +++------ kernel/power/suspend.c | 3 +-- 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 1f4d8ce56667..6de97565d5f7 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -533,6 +533,7 @@ static struct file_system_type efivarfs_type = { .init_fs_context = efivarfs_init_fs_context, .kill_sb = efivarfs_kill_sb, .parameters = efivarfs_parameters, + .fs_flags = FS_POWER_FREEZE, }; static __init int efivarfs_init(void) diff --git a/fs/super.c b/fs/super.c index 5bab94fb7e03..277b84e5c279 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1183,11 +1183,14 @@ static inline bool get_active_super(struct super_block *sb) static const char *filesystems_freeze_ptr = "filesystems_freeze"; -static void filesystems_freeze_callback(struct super_block *sb, void *unused) +static void filesystems_freeze_callback(struct super_block *sb, void *freeze_all_ptr) { if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super) return; + if (freeze_all_ptr && !(sb->s_type->fs_flags & FS_POWER_FREEZE)) + return; + if (!get_active_super(sb)) return; @@ -1201,9 +1204,13 @@ static void filesystems_freeze_callback(struct super_block *sb, void *unused) deactivate_super(sb); } -void filesystems_freeze(void) +void filesystems_freeze(bool freeze_all) { - __iterate_supers(filesystems_freeze_callback, NULL, + void *freeze_all_ptr = NULL; + + if (freeze_all) + freeze_all_ptr = &freeze_all; + __iterate_supers(filesystems_freeze_callback, freeze_all_ptr, SUPER_ITER_UNLOCKED | SUPER_ITER_REVERSE); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 3ea98c6cce81..249a1da8440e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2689,6 +2689,7 @@ struct file_system_type { #define FS_ALLOW_IDMAP 32 /* FS has been 
updated to handle vfs idmappings. */ #define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_LBS 128 /* FS supports LBS */ +#define FS_POWER_FREEZE 256 /* Always freeze on suspend/hibernate */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; @@ -3606,7 +3607,7 @@ extern void drop_super_exclusive(struct super_block *sb); extern void iterate_supers(void (*f)(struct super_block *, void *), void *arg); extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); -void filesystems_freeze(void); +void filesystems_freeze(bool freeze_all); void filesystems_thaw(void); extern int dcache_dir_open(struct inode *, struct file *); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 14e85ff23551..1f250ce036a0 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -825,8 +825,7 @@ int hibernate(void) goto Restore; ksys_sync_helper(); - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) @@ -932,8 +931,7 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) if (error) goto restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) @@ -1083,8 +1081,7 @@ static int software_resume(void) if (error) goto Restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4bb4686c1c08..c933a63a9718 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -375,8 +375,7 @@ static int suspend_prepare(suspend_state_t state) if (error) goto Restore; - if 
(filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); trace_suspend_resume(TPS("freeze_processes"), 0, false); From f6fdd77b3e0d519a2535a1e923558cd07d9acda9 Mon Sep 17 00:00:00 2001 From: Baojun Xu Date: Wed, 12 Nov 2025 17:26:09 +0800 Subject: [PATCH 342/543] ALSA: hda/tas2781: Correct the wrong project ID The project hardware ID should be ALC287_FIXUP_TXNW2781_I2C, not ALC287_FIXUP_TAS2781_I2C for HP Lampass projects. Fixes: 7a39c723b747 ("ALSA: hda/tas2781: Add new quirk for HP new projects") Signed-off-by: Baojun Xu Link: https://patch.msgid.link/20251112092609.15865-1-baojun.xu@ti.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index a9698bf26887..269b6c1e3b6d 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -6700,9 +6700,9 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8ed8, "HP Merino16", ALC245_FIXUP_TAS2781_SPI_2), SND_PCI_QUIRK(0x103c, 0x8ed9, "HP Merino14W", ALC245_FIXUP_TAS2781_SPI_2), SND_PCI_QUIRK(0x103c, 0x8eda, "HP Merino16W", ALC245_FIXUP_TAS2781_SPI_2), - SND_PCI_QUIRK(0x103c, 0x8f40, "HP Lampas14", ALC287_FIXUP_TAS2781_I2C), - SND_PCI_QUIRK(0x103c, 0x8f41, "HP Lampas16", ALC287_FIXUP_TAS2781_I2C), - SND_PCI_QUIRK(0x103c, 0x8f42, "HP LampasW14", ALC287_FIXUP_TAS2781_I2C), + SND_PCI_QUIRK(0x103c, 0x8f40, "HP Lampas14", ALC287_FIXUP_TXNW2781_I2C), + SND_PCI_QUIRK(0x103c, 0x8f41, "HP Lampas16", ALC287_FIXUP_TXNW2781_I2C), + SND_PCI_QUIRK(0x103c, 0x8f42, "HP LampasW14", ALC287_FIXUP_TXNW2781_I2C), SND_PCI_QUIRK(0x1043, 0x1032, "ASUS VivoBook X513EA", ALC256_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1034, "ASUS GU605C", ALC285_FIXUP_ASUS_GU605_SPI_SPEAKER2_TO_DAC1), 
SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), From 78f0e33cd6c939a555aa80dbed2fec6b333a7660 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 11 Nov 2025 06:28:15 +0000 Subject: [PATCH 343/543] fs/namespace: correctly handle errors returned by grab_requested_mnt_ns grab_requested_mnt_ns was changed to return error codes on failure, but its callers were not updated to check for error pointers, still checking only for a NULL return value. This commit updates the callers to use IS_ERR() or IS_ERR_OR_NULL() and PTR_ERR() to correctly check for and propagate errors. This also makes sure that the logic actually works and mount namespace file descriptors can be used to refer to mounts. Christian Brauner says: Rework the patch to be more ergonomic and in line with our overall error handling patterns. Fixes: 7b9d14af8777 ("fs: allow mount namespace fd") Cc: Christian Brauner Signed-off-by: Andrei Vagin Link: https://patch.msgid.link/20251111062815.2546189-1-avagin@google.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 32 ++++++++++++++++---------------- include/uapi/linux/mount.h | 2 +- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index cc6e00e72437..2bad25709b2c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -141,7 +141,8 @@ static void mnt_ns_release(struct mnt_namespace *ns) kfree(ns); } } -DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) +DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, + if (!IS_ERR(_T)) mnt_ns_release(_T)) static void mnt_ns_release_rcu(struct rcu_head *rcu) { @@ -5726,7 +5727,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req, ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); if (ret) return ret; - if (kreq->spare != 0) + if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id) return -EINVAL; /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1.
*/ if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET) @@ -5743,16 +5744,12 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq { struct mnt_namespace *mnt_ns; - if (kreq->mnt_ns_id && kreq->spare) - return ERR_PTR(-EINVAL); - - if (kreq->mnt_ns_id) - return lookup_mnt_ns(kreq->mnt_ns_id); - - if (kreq->spare) { + if (kreq->mnt_ns_id) { + mnt_ns = lookup_mnt_ns(kreq->mnt_ns_id); + } else if (kreq->mnt_ns_fd) { struct ns_common *ns; - CLASS(fd, f)(kreq->spare); + CLASS(fd, f)(kreq->mnt_ns_fd); if (fd_empty(f)) return ERR_PTR(-EBADF); @@ -5767,6 +5764,8 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq } else { mnt_ns = current->nsproxy->mnt_ns; } + if (!mnt_ns) + return ERR_PTR(-ENOENT); refcount_inc(&mnt_ns->passive); return mnt_ns; @@ -5791,8 +5790,8 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, return ret; ns = grab_requested_mnt_ns(&kreq); - if (!ns) - return -ENOENT; + if (IS_ERR(ns)) + return PTR_ERR(ns); if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) @@ -5902,8 +5901,8 @@ static void __free_klistmount_free(const struct klistmount *kls) static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq, size_t nr_mnt_ids) { - u64 last_mnt_id = kreq->param; + struct mnt_namespace *ns; /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. 
*/ if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) @@ -5917,9 +5916,10 @@ static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req * if (!kls->kmnt_ids) return -ENOMEM; - kls->ns = grab_requested_mnt_ns(kreq); - if (!kls->ns) - return -ENOENT; + ns = grab_requested_mnt_ns(kreq); + if (IS_ERR(ns)) + return PTR_ERR(ns); + kls->ns = ns; kls->mnt_parent_id = kreq->mnt_id; return 0; diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 7fa67c2031a5..5d3f8c9e3a62 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -197,7 +197,7 @@ struct statmount { */ struct mnt_id_req { __u32 size; - __u32 spare; + __u32 mnt_ns_fd; __u64 mnt_id; __u64 param; __u64 mnt_ns_id; From 3cd1548a278c7d6a9bdef1f1866e7cf66bfd3518 Mon Sep 17 00:00:00 2001 From: Mike Yuan Date: Sat, 8 Nov 2025 19:09:47 +0000 Subject: [PATCH 344/543] shmem: fix tmpfs reconfiguration (remount) when noswap is set In systemd we're trying to switch the internal credentials setup logic to new mount API [1], and I noticed fsconfig(FSCONFIG_CMD_RECONFIGURE) consistently fails on tmpfs with noswap option. This can be trivially reproduced with the following: ``` int fs_fd = fsopen("tmpfs", 0); fsconfig(fs_fd, FSCONFIG_SET_FLAG, "noswap", NULL, 0); fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); fsmount(fs_fd, 0, 0); fsconfig(fs_fd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0); <------ EINVAL ``` After some digging the culprit is shmem_reconfigure() rejecting !(ctx->seen & SHMEM_SEEN_NOSWAP) && sbinfo->noswap, which is bogus as ctx->seen serves as a mask for whether certain options are touched at all. On top of that, noswap option doesn't use fsparam_flag_no, hence it's not really possible to "reenable" swap to begin with. Drop the check and redundant SHMEM_SEEN_NOSWAP flag. 
[1] https://github.com/systemd/systemd/pull/39637 Fixes: 2c6efe9cf2d7 ("shmem: add support to ignore swap") Signed-off-by: Mike Yuan Link: https://patch.msgid.link/20251108190930.440685-1-me@yhndnzj.com Cc: Luis Chamberlain Cc: Christian Brauner Cc: Hugh Dickins Cc: stable@vger.kernel.org Signed-off-by: Christian Brauner --- mm/shmem.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index b9081b817d28..1b976414d6fa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -131,8 +131,7 @@ struct shmem_options { #define SHMEM_SEEN_INODES 2 #define SHMEM_SEEN_HUGE 4 #define SHMEM_SEEN_INUMS 8 -#define SHMEM_SEEN_NOSWAP 16 -#define SHMEM_SEEN_QUOTA 32 +#define SHMEM_SEEN_QUOTA 16 }; #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -4677,7 +4676,6 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) "Turning off swap in unprivileged tmpfs mounts unsupported"); } ctx->noswap = true; - ctx->seen |= SHMEM_SEEN_NOSWAP; break; case Opt_quota: if (fc->user_ns != &init_user_ns) @@ -4827,14 +4825,15 @@ static int shmem_reconfigure(struct fs_context *fc) err = "Current inum too high to switch to 32-bit inums"; goto out; } - if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) { + + /* + * "noswap" doesn't use fsparam_flag_no, i.e. there's no "swap" + * counterpart for (re-)enabling swap. 
+ */ + if (ctx->noswap && !sbinfo->noswap) { err = "Cannot disable swap on remount"; goto out; } - if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) { - err = "Cannot enable swap on remount if it was disabled on first mount"; - goto out; - } if (ctx->seen & SHMEM_SEEN_QUOTA && !sb_any_quota_loaded(fc->root->d_sb)) { From 12741624645e098b2234a5ae341045a97473caf1 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 5 Nov 2025 22:20:24 +0100 Subject: [PATCH 345/543] fs: add iput_not_last() Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251105212025.807549-1-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/inode.c | 12 ++++++++++++ include/linux/fs.h | 1 + 2 files changed, 13 insertions(+) diff --git a/fs/inode.c b/fs/inode.c index ec9339024ac3..cff1d3af0d57 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1967,6 +1967,18 @@ void iput(struct inode *inode) } EXPORT_SYMBOL(iput); +/** + * iput_not_last - put an inode assuming this is not the last reference + * @inode: inode to put + */ +void iput_not_last(struct inode *inode) +{ + VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 2, inode); + + WARN_ON(atomic_sub_return(1, &inode->i_count) == 0); +} +EXPORT_SYMBOL(iput_not_last); + #ifdef CONFIG_BLOCK /** * bmap - find a block number in a file diff --git a/include/linux/fs.h b/include/linux/fs.h index 249a1da8440e..dd3b57cfadee 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2824,6 +2824,7 @@ extern int current_umask(void); extern void ihold(struct inode * inode); extern void iput(struct inode *); +void iput_not_last(struct inode *); int inode_update_timestamps(struct inode *inode, int flags); int generic_update_time(struct inode *, int); From 56325e8c68c0724d626f665773a5005dcf44e329 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 5 Nov 2025 22:20:25 +0100 Subject: [PATCH 346/543] landlock: fix splats from iput() after it started calling might_sleep() MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At this point it is guaranteed this is not the last reference. However, a recent addition of might_sleep() at top of iput() started generating false-positives as it was executing for all values. Remedy the problem by using the newly introduced iput_not_last(). Reported-by: syzbot+12479ae15958fc3f54ec@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/68d32659.a70a0220.4f78.0012.GAE@google.com/ Fixes: 2ef435a872ab ("fs: add might_sleep() annotation to iput() and more") Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251105212025.807549-2-mjguzik@gmail.com Reviewed-by: Mickaël Salaün Signed-off-by: Christian Brauner --- security/landlock/fs.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/security/landlock/fs.c b/security/landlock/fs.c index 0bade2c5aa1d..d9c12b993fa7 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -1335,11 +1335,10 @@ static void hook_sb_delete(struct super_block *const sb) * At this point, we own the ihold() reference that was * originally set up by get_inode_object() and the * __iget() reference that we just set in this loop - * walk. Therefore the following call to iput() will - * not sleep nor drop the inode because there is now at - * least two references to it. + * walk. Therefore there are at least two references + * on the inode. */ - iput(inode); + iput_not_last(inode); } else { spin_unlock(&object->lock); rcu_read_unlock(); From 85592114ffda568b507bc2b04f5e9afbe7c13b62 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Wed, 12 Nov 2025 10:28:53 +0000 Subject: [PATCH 347/543] KVM: arm64: VHE: Compute fgt traps before activating them On VHE, the Fine Grain Traps registers are written to hardware in kvm_arch_vcpu_load()->..->__activate_traps_hfgxtr(), but the fgt array is computed later, in kvm_vcpu_load_fgt(). This can lead to zero being written to the FGT registers the first time a VCPU is loaded. 
Also, any changes to the fgt array will be visible only after the VCPU is scheduled out, and then back in, which is not the intended behaviour. Fix it by computing the fgt array just before the fgt traps are written to hardware. Fixes: fb10ddf35c1c ("KVM: arm64: Compute per-vCPU FGTs at vcpu_load()") Signed-off-by: Alexandru Elisei Reviewed-by: Oliver Upton Link: https://patch.msgid.link/20251112102853.47759-1-alexandru.elisei@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 870953b4a8a7..052bf0d4d0b0 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -624,6 +624,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_timer_vcpu_load(vcpu); kvm_vgic_load(vcpu); kvm_vcpu_load_debug(vcpu); + kvm_vcpu_load_fgt(vcpu); if (has_vhe()) kvm_vcpu_load_vhe(vcpu); kvm_arch_vcpu_load_fp(vcpu); @@ -642,7 +643,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu->arch.hcr_el2 |= HCR_TWI; vcpu_set_pauth_traps(vcpu); - kvm_vcpu_load_fgt(vcpu); if (is_protected_kvm_enabled()) { kvm_call_hyp_nvhe(__pkvm_vcpu_load, From fa0498f8047536b877819ce4ab154d332b243d43 Mon Sep 17 00:00:00 2001 From: Marcos Vega Date: Sat, 8 Nov 2025 12:47:41 +0100 Subject: [PATCH 348/543] platform/x86: hp-wmi: Add Omen MAX 16-ah0xx fan support and thermal profile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New HP Omen laptops follow the same WMI thermal profile as Victus 16-r1000 and 16-s1000. Add DMI board 8D41 to victus_s_thermal_profile_boards. 
Signed-off-by: Marcos Vega Link: https://patch.msgid.link/20251108114739.9255-3-marcosmola2@gmail.com [ij: changelog taken partially from v1] Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/hp/hp-wmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/hp/hp-wmi.c b/drivers/platform/x86/hp/hp-wmi.c index e10c75d91f24..ad9d9f97960f 100644 --- a/drivers/platform/x86/hp/hp-wmi.c +++ b/drivers/platform/x86/hp/hp-wmi.c @@ -96,6 +96,7 @@ static const char * const victus_thermal_profile_boards[] = { static const char * const victus_s_thermal_profile_boards[] = { "8BBE", "8BD4", "8BD5", "8C78", "8C99", "8C9C", + "8D41", }; enum hp_wmi_radio { From 1c2e70397b4125022dba80f6111271a37fb36bae Mon Sep 17 00:00:00 2001 From: Praveen Talari Date: Mon, 10 Nov 2025 15:40:41 +0530 Subject: [PATCH 349/543] pinctrl: qcom: msm: Fix deadlock in pinmux configuration Replace disable_irq() with disable_irq_nosync() in msm_pinmux_set_mux() to prevent deadlock when wakeup IRQ is triggered on the same GPIO being reconfigured. The issue occurs when a wakeup IRQ is triggered on a GPIO and the IRQ handler attempts to reconfigure the same GPIO's pinmux. In this scenario, msm_pinmux_set_mux() calls disable_irq() which waits for the currently running IRQ handler to complete, creating a circular dependency that results in deadlock. Using disable_irq_nosync() avoids waiting for the IRQ handler to complete, preventing the deadlock condition while still properly disabling the interrupt during pinmux reconfiguration. 
Suggested-by: Prasad Sodagudi Signed-off-by: Praveen Talari Reviewed-by: Bjorn Andersson Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-msm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c index 67525d542c5b..e99871b90ab9 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm.c +++ b/drivers/pinctrl/qcom/pinctrl-msm.c @@ -189,7 +189,7 @@ static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev, */ if (d && i != gpio_func && !test_and_set_bit(d->hwirq, pctrl->disabled_for_mux)) - disable_irq(irq); + disable_irq_nosync(irq); raw_spin_lock_irqsave(&pctrl->lock, flags); From f2687d3cc9f905505d7b510c50970176115066a2 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Fri, 7 Nov 2025 14:41:41 +0200 Subject: [PATCH 350/543] drm/i915/dp_mst: Disable Panel Replay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Disable Panel Replay on MST links until it's properly implemented. For instance the required VSC SDP is not programmed on MST and FEC is not enabled if Panel Replay is enabled. 
Fixes: 3257e55d3ea7 ("drm/i915/panelreplay: enable/disable panel replay") Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/15174 Cc: Jouni Högander Cc: Animesh Manna Cc: stable@vger.kernel.org # v6.8+ Reviewed-by: Jouni Högander Signed-off-by: Imre Deak Link: https://patch.msgid.link/20251107124141.911895-1-imre.deak@intel.com (cherry picked from commit e109f644b871df8440c886a69cdce971ed533088) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_psr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index d5e0a1e66944..4619237f1346 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -585,6 +585,10 @@ static void _panel_replay_init_dpcd(struct intel_dp *intel_dp) struct intel_display *display = to_intel_display(intel_dp); int ret; + /* TODO: Enable Panel Replay on MST once it's properly implemented. */ + if (intel_dp->mst_detect == DRM_DP_MST) + return; + ret = drm_dp_dpcd_read_data(&intel_dp->aux, DP_PANEL_REPLAY_CAP_SUPPORT, &intel_dp->pr_dpcd, sizeof(intel_dp->pr_dpcd)); if (ret < 0) From ac1499fcd40fe06479e9b933347b837ccabc2a40 Mon Sep 17 00:00:00 2001 From: Chuang Wang Date: Tue, 11 Nov 2025 14:43:24 +0800 Subject: [PATCH 351/543] ipv4: route: Prevent rt_bind_exception() from rebinding stale fnhe The sit driver's packet transmission path calls: sit_tunnel_xmit() -> update_or_create_fnhe(), which lead to fnhe_remove_oldest() being called to delete entries exceeding FNHE_RECLAIM_DEPTH+random. The race window is between fnhe_remove_oldest() selecting fnheX for deletion and the subsequent kfree_rcu(). During this time, the concurrent path's __mkroute_output() -> find_exception() can fetch the soon-to-be-deleted fnheX, and rt_bind_exception() then binds it with a new dst using a dst_hold(). When the original fnheX is freed via RCU, the dst reference remains permanently leaked. 
CPU 0 CPU 1 __mkroute_output() find_exception() [fnheX] update_or_create_fnhe() fnhe_remove_oldest() [fnheX] rt_bind_exception() [bind dst] RCU callback [fnheX freed, dst leak] This issue manifests as a device reference count leak and a warning in dmesg when unregistering the net device: unregister_netdevice: waiting for sitX to become free. Usage count = N Ido Schimmel provided the simple test validation method [1]. The fix clears 'oldest->fnhe_daddr' before calling fnhe_flush_routes(). Since rt_bind_exception() checks this field, setting it to zero prevents the stale fnhe from being reused and bound to a new dst just before it is freed. [1] ip netns add ns1 ip -n ns1 link set dev lo up ip -n ns1 address add 192.0.2.1/32 dev lo ip -n ns1 link add name dummy1 up type dummy ip -n ns1 route add 192.0.2.2/32 dev dummy1 ip -n ns1 link add name gretap1 up arp off type gretap \ local 192.0.2.1 remote 192.0.2.2 ip -n ns1 route add 198.51.0.0/16 dev gretap1 taskset -c 0 ip netns exec ns1 mausezahn gretap1 \ -A 198.51.100.1 -B 198.51.0.0/16 -t udp -p 1000 -c 0 -q & taskset -c 2 ip netns exec ns1 mausezahn gretap1 \ -A 198.51.100.1 -B 198.51.0.0/16 -t udp -p 1000 -c 0 -q & sleep 10 ip netns pids ns1 | xargs kill ip netns del ns1 Cc: stable@vger.kernel.org Fixes: 67d6d681e15b ("ipv4: make exception cache less predictible") Signed-off-by: Chuang Wang Reviewed-by: Ido Schimmel Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20251111064328.24440-1-nashuiliang@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/route.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6d27d3610c1c..b549d6a57307 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -607,6 +607,11 @@ static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash) oldest_p = fnhe_p; } } + + /* Clear oldest->fnhe_daddr to prevent this fnhe from being + * rebound with new dsts in rt_bind_exception(). 
+ */ + oldest->fnhe_daddr = 0; fnhe_flush_routes(oldest); *oldest_p = oldest->fnhe_next; kfree_rcu(oldest, rcu); From 7c44656ab3ea6f8429027ed14c23b314502e2541 Mon Sep 17 00:00:00 2001 From: Alex Mastro Date: Tue, 11 Nov 2025 10:48:24 -0800 Subject: [PATCH 352/543] vfio: selftests: add iova range query helpers VFIO selftests need to map IOVAs from legally accessible ranges, which could vary between hardware. Tests in vfio_dma_mapping_test.c are making excessively strong assumptions about which IOVAs can be mapped. Add vfio_iommu_iova_ranges(), which queries IOVA ranges from the IOMMUFD or VFIO container associated with the device. The queried ranges are normalized to IOMMUFD's iommu_iova_range representation so that handling of IOVA ranges up the stack can be implementation-agnostic. iommu_iova_range and vfio_iova_range are equivalent, so bias to using the new interface's struct. Query IOMMUFD's ranges with IOMMU_IOAS_IOVA_RANGES. Query VFIO container's ranges with VFIO_IOMMU_GET_INFO and VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE. The underlying vfio_iommu_type1_info buffer-related functionality has been kept generic so the same helpers can be used to query other capability chain information, if needed. 
Reviewed-by: David Matlack Tested-by: David Matlack Signed-off-by: Alex Mastro Link: https://lore.kernel.org/r/20251111-iova-ranges-v3-1-7960244642c5@fb.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/include/vfio_util.h | 8 +- .../selftests/vfio/lib/vfio_pci_device.c | 172 ++++++++++++++++++ 2 files changed, 179 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index 240409bf5f8a..ef8f06ef0c13 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -4,9 +4,12 @@ #include #include -#include + +#include +#include #include #include +#include #include "../../../kselftest.h" @@ -206,6 +209,9 @@ struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_ void vfio_pci_device_cleanup(struct vfio_pci_device *device); void vfio_pci_device_reset(struct vfio_pci_device *device); +struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, + u32 *nranges); + int __vfio_pci_dma_map(struct vfio_pci_device *device, struct vfio_dma_region *region); int __vfio_pci_dma_unmap(struct vfio_pci_device *device, diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index a381fd253aa7..11749348f53f 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -29,6 +29,178 @@ VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \ } while (0) +static struct vfio_info_cap_header *next_cap_hdr(void *buf, u32 bufsz, + u32 *cap_offset) +{ + struct vfio_info_cap_header *hdr; + + if (!*cap_offset) + return NULL; + + VFIO_ASSERT_LT(*cap_offset, bufsz); + VFIO_ASSERT_GE(bufsz - *cap_offset, sizeof(*hdr)); + + hdr = (struct vfio_info_cap_header *)((u8 *)buf + *cap_offset); + *cap_offset = hdr->next; + + return hdr; +} + 
+static struct vfio_info_cap_header *vfio_iommu_info_cap_hdr(struct vfio_iommu_type1_info *info, + u16 cap_id) +{ + struct vfio_info_cap_header *hdr; + u32 cap_offset = info->cap_offset; + u32 max_depth; + u32 depth = 0; + + if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) + return NULL; + + if (cap_offset) + VFIO_ASSERT_GE(cap_offset, sizeof(*info)); + + max_depth = (info->argsz - sizeof(*info)) / sizeof(*hdr); + + while ((hdr = next_cap_hdr(info, info->argsz, &cap_offset))) { + depth++; + VFIO_ASSERT_LE(depth, max_depth, "Capability chain contains a cycle\n"); + + if (hdr->id == cap_id) + return hdr; + } + + return NULL; +} + +/* Return buffer including capability chain, if present. Free with free() */ +static struct vfio_iommu_type1_info *vfio_iommu_get_info(struct vfio_pci_device *device) +{ + struct vfio_iommu_type1_info *info; + + info = malloc(sizeof(*info)); + VFIO_ASSERT_NOT_NULL(info); + + *info = (struct vfio_iommu_type1_info) { + .argsz = sizeof(*info), + }; + + ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info); + VFIO_ASSERT_GE(info->argsz, sizeof(*info)); + + info = realloc(info, info->argsz); + VFIO_ASSERT_NOT_NULL(info); + + ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info); + VFIO_ASSERT_GE(info->argsz, sizeof(*info)); + + return info; +} + +/* + * Return iova ranges for the device's container. Normalize vfio_iommu_type1 to + * report iommufd's iommu_iova_range. Free with free(). 
+ */ +static struct iommu_iova_range *vfio_iommu_iova_ranges(struct vfio_pci_device *device, + u32 *nranges) +{ + struct vfio_iommu_type1_info_cap_iova_range *cap_range; + struct vfio_iommu_type1_info *info; + struct vfio_info_cap_header *hdr; + struct iommu_iova_range *ranges = NULL; + + info = vfio_iommu_get_info(device); + hdr = vfio_iommu_info_cap_hdr(info, VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE); + VFIO_ASSERT_NOT_NULL(hdr); + + cap_range = container_of(hdr, struct vfio_iommu_type1_info_cap_iova_range, header); + VFIO_ASSERT_GT(cap_range->nr_iovas, 0); + + ranges = calloc(cap_range->nr_iovas, sizeof(*ranges)); + VFIO_ASSERT_NOT_NULL(ranges); + + for (u32 i = 0; i < cap_range->nr_iovas; i++) { + ranges[i] = (struct iommu_iova_range){ + .start = cap_range->iova_ranges[i].start, + .last = cap_range->iova_ranges[i].end, + }; + } + + *nranges = cap_range->nr_iovas; + + free(info); + return ranges; +} + +/* Return iova ranges of the device's IOAS. Free with free() */ +static struct iommu_iova_range *iommufd_iova_ranges(struct vfio_pci_device *device, + u32 *nranges) +{ + struct iommu_iova_range *ranges; + int ret; + + struct iommu_ioas_iova_ranges query = { + .size = sizeof(query), + .ioas_id = device->ioas_id, + }; + + ret = ioctl(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); + VFIO_ASSERT_EQ(ret, -1); + VFIO_ASSERT_EQ(errno, EMSGSIZE); + VFIO_ASSERT_GT(query.num_iovas, 0); + + ranges = calloc(query.num_iovas, sizeof(*ranges)); + VFIO_ASSERT_NOT_NULL(ranges); + + query.allowed_iovas = (uintptr_t)ranges; + + ioctl_assert(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); + *nranges = query.num_iovas; + + return ranges; +} + +static int iova_range_comp(const void *a, const void *b) +{ + const struct iommu_iova_range *ra = a, *rb = b; + + if (ra->start < rb->start) + return -1; + + if (ra->start > rb->start) + return 1; + + return 0; +} + +/* Return sorted IOVA ranges of the device. Free with free(). 
*/ +struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, + u32 *nranges) +{ + struct iommu_iova_range *ranges; + + if (device->iommufd) + ranges = iommufd_iova_ranges(device, nranges); + else + ranges = vfio_iommu_iova_ranges(device, nranges); + + if (!ranges) + return NULL; + + VFIO_ASSERT_GT(*nranges, 0); + + /* Sort and check that ranges are sane and non-overlapping */ + qsort(ranges, *nranges, sizeof(*ranges), iova_range_comp); + VFIO_ASSERT_LT(ranges[0].start, ranges[0].last); + + for (u32 i = 1; i < *nranges; i++) { + VFIO_ASSERT_LT(ranges[i].start, ranges[i].last); + VFIO_ASSERT_LT(ranges[i - 1].last, ranges[i].start); + } + + return ranges; +} + iova_t __to_iova(struct vfio_pci_device *device, void *vaddr) { struct vfio_dma_region *region; From a77fa0b9222d2f23a764061a3be18e6bc738672e Mon Sep 17 00:00:00 2001 From: Alex Mastro Date: Tue, 11 Nov 2025 10:48:25 -0800 Subject: [PATCH 353/543] vfio: selftests: fix map limit tests to use last available iova Use the newly available vfio_pci_iova_ranges() to determine the last legal IOVA, and use this as the basis for vfio_dma_map_limit_test tests. 
Fixes: de8d1f2fd5a5 ("vfio: selftests: add end of address space DMA map/unmap tests") Reviewed-by: David Matlack Tested-by: David Matlack Signed-off-by: Alex Mastro Link: https://lore.kernel.org/r/20251111-iova-ranges-v3-2-7960244642c5@fb.com Signed-off-by: Alex Williamson --- .../selftests/vfio/vfio_dma_mapping_test.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index 4f1ea79a200c..e1374aab96bd 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -3,6 +3,8 @@ #include #include +#include +#include #include #include #include @@ -219,7 +221,10 @@ FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(); FIXTURE_SETUP(vfio_dma_map_limit_test) { struct vfio_dma_region *region = &self->region; + struct iommu_iova_range *ranges; u64 region_size = getpagesize(); + iova_t last_iova; + u32 nranges; /* * Over-allocate mmap by double the size to provide enough backing vaddr @@ -232,8 +237,13 @@ FIXTURE_SETUP(vfio_dma_map_limit_test) MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); ASSERT_NE(region->vaddr, MAP_FAILED); - /* One page prior to the end of address space */ - region->iova = ~(iova_t)0 & ~(region_size - 1); + ranges = vfio_pci_iova_ranges(self->device, &nranges); + VFIO_ASSERT_NOT_NULL(ranges); + last_iova = ranges[nranges - 1].last; + free(ranges); + + /* One page prior to the last iova */ + region->iova = last_iova & ~(region_size - 1); region->size = region_size; } @@ -276,6 +286,7 @@ TEST_F(vfio_dma_map_limit_test, overflow) struct vfio_dma_region *region = &self->region; int rc; + region->iova = ~(iova_t)0 & ~(region->size - 1); region->size = self->mmap_size; rc = __vfio_pci_dma_map(self->device, region); From ce0e3c403e00e9e03e80aca6570bf936a44279e2 Mon Sep 17 00:00:00 2001 From: Alex Mastro Date: Tue, 11 Nov 2025 10:48:26 -0800 Subject: [PATCH 354/543] vfio: selftests: 
add iova allocator Add struct iova_allocator, which gives tests a convenient way to generate legally-accessible IOVAs to map. This allocator traverses the sorted available IOVA ranges linearly, requires power-of-two size allocations, and does not support freeing iova allocations. The assumption is that tests are not IOVA space-bounded, and will not need to recycle IOVAs. This is based on Alex Williamson's patch series for adding an IOVA allocator [1]. [1] https://lore.kernel.org/all/20251108212954.26477-1-alex@shazbot.org/ Reviewed-by: David Matlack Tested-by: David Matlack Signed-off-by: Alex Mastro Link: https://lore.kernel.org/r/20251111-iova-ranges-v3-3-7960244642c5@fb.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/include/vfio_util.h | 11 +++ .../selftests/vfio/lib/vfio_pci_device.c | 74 ++++++++++++++++++- 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index ef8f06ef0c13..69ec0c856481 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -188,6 +188,13 @@ struct vfio_pci_device { struct vfio_pci_driver driver; }; +struct iova_allocator { + struct iommu_iova_range *ranges; + u32 nranges; + u32 range_idx; + u64 range_offset; +}; + /* * Return the BDF string of the device that the test should use. 
* @@ -212,6 +219,10 @@ void vfio_pci_device_reset(struct vfio_pci_device *device); struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, u32 *nranges); +struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device); +void iova_allocator_cleanup(struct iova_allocator *allocator); +iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size); + int __vfio_pci_dma_map(struct vfio_pci_device *device, struct vfio_dma_region *region); int __vfio_pci_dma_unmap(struct vfio_pci_device *device, diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index 11749348f53f..b479a359da12 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -12,11 +12,12 @@ #include #include +#include #include #include +#include #include #include -#include #include "../../../kselftest.h" #include @@ -201,6 +202,77 @@ struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, return ranges; } +struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device) +{ + struct iova_allocator *allocator; + struct iommu_iova_range *ranges; + u32 nranges; + + ranges = vfio_pci_iova_ranges(device, &nranges); + VFIO_ASSERT_NOT_NULL(ranges); + + allocator = malloc(sizeof(*allocator)); + VFIO_ASSERT_NOT_NULL(allocator); + + *allocator = (struct iova_allocator){ + .ranges = ranges, + .nranges = nranges, + .range_idx = 0, + .range_offset = 0, + }; + + return allocator; +} + +void iova_allocator_cleanup(struct iova_allocator *allocator) +{ + free(allocator->ranges); + free(allocator); +} + +iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size) +{ + VFIO_ASSERT_GT(size, 0, "Invalid size arg, zero\n"); + VFIO_ASSERT_EQ(size & (size - 1), 0, "Invalid size arg, non-power-of-2\n"); + + for (;;) { + struct iommu_iova_range *range; + iova_t iova, last; + + VFIO_ASSERT_LT(allocator->range_idx, 
allocator->nranges, + "IOVA allocator out of space\n"); + + range = &allocator->ranges[allocator->range_idx]; + iova = range->start + allocator->range_offset; + + /* Check for sufficient space at the current offset */ + if (check_add_overflow(iova, size - 1, &last) || + last > range->last) + goto next_range; + + /* Align iova to size */ + iova = last & ~(size - 1); + + /* Check for sufficient space at the aligned iova */ + if (check_add_overflow(iova, size - 1, &last) || + last > range->last) + goto next_range; + + if (last == range->last) { + allocator->range_idx++; + allocator->range_offset = 0; + } else { + allocator->range_offset = last - range->start + 1; + } + + return iova; + +next_range: + allocator->range_idx++; + allocator->range_offset = 0; + } +} + iova_t __to_iova(struct vfio_pci_device *device, void *vaddr) { struct vfio_dma_region *region; From d323ad739666761646048fca587734f4ae64f2c8 Mon Sep 17 00:00:00 2001 From: Alex Mastro Date: Tue, 11 Nov 2025 10:48:27 -0800 Subject: [PATCH 355/543] vfio: selftests: replace iova=vaddr with allocated iovas vfio_dma_mapping_test and vfio_pci_driver_test currently use iova=vaddr as part of DMA mapping operations. However, not all IOMMUs support the same virtual address width as the processor. For instance, older Intel consumer platforms only support 39-bits of IOMMU address space. On such platforms, using the virtual address as the IOVA fails. Make the tests more robust by using iova_allocator to vend IOVAs, which queries legally accessible IOVAs from the underlying IOMMUFD or VFIO container. 
Reviewed-by: David Matlack Tested-by: David Matlack Signed-off-by: Alex Mastro Link: https://lore.kernel.org/r/20251111-iova-ranges-v3-4-7960244642c5@fb.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/vfio_dma_mapping_test.c | 5 ++++- tools/testing/selftests/vfio/vfio_pci_driver_test.c | 12 ++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index e1374aab96bd..102603d4407d 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -95,6 +95,7 @@ static int iommu_mapping_get(const char *bdf, u64 iova, FIXTURE(vfio_dma_mapping_test) { struct vfio_pci_device *device; + struct iova_allocator *iova_allocator; }; FIXTURE_VARIANT(vfio_dma_mapping_test) { @@ -119,10 +120,12 @@ FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(anonymous_hugetlb_1gb, SZ_1G, MAP_HUGETLB | FIXTURE_SETUP(vfio_dma_mapping_test) { self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); + self->iova_allocator = iova_allocator_init(self->device); } FIXTURE_TEARDOWN(vfio_dma_mapping_test) { + iova_allocator_cleanup(self->iova_allocator); vfio_pci_device_cleanup(self->device); } @@ -144,7 +147,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) else ASSERT_NE(region.vaddr, MAP_FAILED); - region.iova = (u64)region.vaddr; + region.iova = iova_allocator_alloc(self->iova_allocator, size); region.size = size; vfio_pci_dma_map(self->device, &region); diff --git a/tools/testing/selftests/vfio/vfio_pci_driver_test.c b/tools/testing/selftests/vfio/vfio_pci_driver_test.c index 2dbd70b7db62..f69eec8b928d 100644 --- a/tools/testing/selftests/vfio/vfio_pci_driver_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_driver_test.c @@ -19,6 +19,7 @@ static const char *device_bdf; } while (0) static void region_setup(struct vfio_pci_device *device, + struct iova_allocator *iova_allocator, struct
vfio_dma_region *region, u64 size) { const int flags = MAP_SHARED | MAP_ANONYMOUS; @@ -29,7 +30,7 @@ static void region_setup(struct vfio_pci_device *device, VFIO_ASSERT_NE(vaddr, MAP_FAILED); region->vaddr = vaddr; - region->iova = (u64)vaddr; + region->iova = iova_allocator_alloc(iova_allocator, size); region->size = size; vfio_pci_dma_map(device, region); @@ -44,6 +45,7 @@ static void region_teardown(struct vfio_pci_device *device, FIXTURE(vfio_pci_driver_test) { struct vfio_pci_device *device; + struct iova_allocator *iova_allocator; struct vfio_dma_region memcpy_region; void *vaddr; int msi_fd; @@ -72,14 +74,15 @@ FIXTURE_SETUP(vfio_pci_driver_test) struct vfio_pci_driver *driver; self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); + self->iova_allocator = iova_allocator_init(self->device); driver = &self->device->driver; - region_setup(self->device, &self->memcpy_region, SZ_1G); - region_setup(self->device, &driver->region, SZ_2M); + region_setup(self->device, self->iova_allocator, &self->memcpy_region, SZ_1G); + region_setup(self->device, self->iova_allocator, &driver->region, SZ_2M); /* Any IOVA that doesn't overlap memcpy_region and driver->region. */ - self->unmapped_iova = 8UL * SZ_1G; + self->unmapped_iova = iova_allocator_alloc(self->iova_allocator, SZ_1G); vfio_pci_driver_init(self->device); self->msi_fd = self->device->msi_eventfds[driver->msi]; @@ -108,6 +111,7 @@ FIXTURE_TEARDOWN(vfio_pci_driver_test) region_teardown(self->device, &self->memcpy_region); region_teardown(self->device, &driver->region); + iova_allocator_cleanup(self->iova_allocator); vfio_pci_device_cleanup(self->device); } From 2d0e88f3fd1dcb37072d499c36162baf5b009d41 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 11 Nov 2025 12:15:29 -0700 Subject: [PATCH 356/543] io_uring/rsrc: don't use blk_rq_nr_phys_segments() as number of bvecs io_buffer_register_bvec() currently uses blk_rq_nr_phys_segments() as the number of bvecs in the request. 
However, bvecs may be split into multiple segments depending on the queue limits. Thus, the number of segments may overestimate the number of bvecs. For ublk devices, the only current users of io_buffer_register_bvec(), virt_boundary_mask, seg_boundary_mask, max_segments, and max_segment_size can all be set arbitrarily by the ublk server process. Set imu->nr_bvecs based on the number of bvecs the rq_for_each_bvec() loop actually yields. However, continue using blk_rq_nr_phys_segments() as an upper bound on the number of bvecs when allocating imu to avoid needing to iterate the bvecs a second time. Link: https://lore.kernel.org/io-uring/20251111191530.1268875-1-csander@purestorage.com/ Signed-off-by: Caleb Sander Mateos Fixes: 27cb27b6d5ea ("io_uring: add support for kernel registered bvecs") Reviewed-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 2602d76d5ff0..0010c4992490 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -943,8 +943,8 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, struct req_iterator rq_iter; struct io_mapped_ubuf *imu; struct io_rsrc_node *node; - struct bio_vec bv, *bvec; - u16 nr_bvecs; + struct bio_vec bv; + unsigned int nr_bvecs = 0; int ret = 0; io_ring_submit_lock(ctx, issue_flags); @@ -965,8 +965,11 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, goto unlock; } - nr_bvecs = blk_rq_nr_phys_segments(rq); - imu = io_alloc_imu(ctx, nr_bvecs); + /* + * blk_rq_nr_phys_segments() may overestimate the number of bvecs + * but avoids needing to iterate over the bvecs + */ + imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq)); if (!imu) { kfree(node); ret = -ENOMEM; @@ -977,16 +980,15 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, imu->len = blk_rq_bytes(rq); imu->acct_pages = 0; 
imu->folio_shift = PAGE_SHIFT; - imu->nr_bvecs = nr_bvecs; refcount_set(&imu->refs, 1); imu->release = release; imu->priv = rq; imu->is_kbuf = true; imu->dir = 1 << rq_data_dir(rq); - bvec = imu->bvec; rq_for_each_bvec(bv, rq, rq_iter) - *bvec++ = bv; + imu->bvec[nr_bvecs++] = bv; + imu->nr_bvecs = nr_bvecs; node->buf = imu; data->nodes[index] = node; From 5f02151c411dda46efcc5dc57b0845efcdcfc26d Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 12 Nov 2025 15:33:28 +0800 Subject: [PATCH 357/543] sched_ext: Fix unsafe locking in the scx_dump_state() For built with CONFIG_PREEMPT_RT=y kernels, the dump_lock will be converted sleepable spinlock and not disable-irq, so the following scenarios occur: inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. irq_work/0/27 [HC0[0]:SC0[0]:HE1:SE1] takes: (&rq->__lock){?...}-{2:2}, at: raw_spin_rq_lock_nested+0x2b/0x40 {IN-HARDIRQ-W} state was registered at: lock_acquire+0x1e1/0x510 _raw_spin_lock_nested+0x42/0x80 raw_spin_rq_lock_nested+0x2b/0x40 sched_tick+0xae/0x7b0 update_process_times+0x14c/0x1b0 tick_periodic+0x62/0x1f0 tick_handle_periodic+0x48/0xf0 timer_interrupt+0x55/0x80 __handle_irq_event_percpu+0x20a/0x5c0 handle_irq_event_percpu+0x18/0xc0 handle_irq_event+0xb5/0x150 handle_level_irq+0x220/0x460 __common_interrupt+0xa2/0x1e0 common_interrupt+0xb0/0xd0 asm_common_interrupt+0x2b/0x40 _raw_spin_unlock_irqrestore+0x45/0x80 __setup_irq+0xc34/0x1a30 request_threaded_irq+0x214/0x2f0 hpet_time_init+0x3e/0x60 x86_late_time_init+0x5b/0xb0 start_kernel+0x308/0x410 x86_64_start_reservations+0x1c/0x30 x86_64_start_kernel+0x96/0xa0 common_startup_64+0x13e/0x148 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&rq->__lock); lock(&rq->__lock); *** DEADLOCK *** stack backtrace: CPU: 0 UID: 0 PID: 27 Comm: irq_work/0 Call Trace: dump_stack_lvl+0x8c/0xd0 dump_stack+0x14/0x20 print_usage_bug+0x42e/0x690 mark_lock.part.44+0x867/0xa70 ? __pfx_mark_lock.part.44+0x10/0x10 ? 
string_nocheck+0x19c/0x310 ? number+0x739/0x9f0 ? __pfx_string_nocheck+0x10/0x10 ? __pfx_check_pointer+0x10/0x10 ? kvm_sched_clock_read+0x15/0x30 ? sched_clock_noinstr+0xd/0x20 ? local_clock_noinstr+0x1c/0xe0 __lock_acquire+0xc4b/0x62b0 ? __pfx_format_decode+0x10/0x10 ? __pfx_string+0x10/0x10 ? __pfx___lock_acquire+0x10/0x10 ? __pfx_vsnprintf+0x10/0x10 lock_acquire+0x1e1/0x510 ? raw_spin_rq_lock_nested+0x2b/0x40 ? __pfx_lock_acquire+0x10/0x10 ? dump_line+0x12e/0x270 ? raw_spin_rq_lock_nested+0x20/0x40 _raw_spin_lock_nested+0x42/0x80 ? raw_spin_rq_lock_nested+0x2b/0x40 raw_spin_rq_lock_nested+0x2b/0x40 scx_dump_state+0x3b3/0x1270 ? finish_task_switch+0x27e/0x840 scx_ops_error_irq_workfn+0x67/0x80 irq_work_single+0x113/0x260 irq_work_run_list.part.3+0x44/0x70 run_irq_workd+0x6b/0x90 ? __pfx_run_irq_workd+0x10/0x10 smpboot_thread_fn+0x529/0x870 ? __pfx_smpboot_thread_fn+0x10/0x10 kthread+0x305/0x3f0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x40/0x70 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 This commit therefore use rq_lock_irqsave/irqrestore() to replace rq_lock/unlock() in the scx_dump_state(). 
Fixes: 07814a9439a3 ("sched_ext: Print debug dump after an error exit") Signed-off-by: Zqiang Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1a019a7728fb..184d562bda5e 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4276,7 +4276,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) size_t avail, used; bool idle; - rq_lock(rq, &rf); + rq_lock_irqsave(rq, &rf); idle = list_empty(&rq->scx.runnable_list) && rq->curr->sched_class == &idle_sched_class; @@ -4345,7 +4345,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) scx_dump_task(&s, &dctx, p, ' '); next: - rq_unlock(rq, &rf); + rq_unlock_irqrestore(rq, &rf); } dump_newline(&s); From 4b747cc628d8f500d56cf1338280eacc66362ff3 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 10 Nov 2025 17:08:40 -0800 Subject: [PATCH 358/543] cpufreq: intel_pstate: Check IDA only before MSR_IA32_PERF_CTL writes Commit ac4e04d9e378 ("cpufreq: intel_pstate: Unchecked MSR aceess in legacy mode") introduced a check for feature X86_FEATURE_IDA to verify turbo mode support. Although this is the correct way to check for turbo mode support, it causes issues on some platforms that disable turbo during OS boot, but enable it later [1]. Before adding this feature check, users were able to get turbo mode frequencies by writing 0 to /sys/devices/system/cpu/intel_pstate/no_turbo post-boot. To restore the old behavior on the affected systems while still addressing the unchecked MSR issue on some Skylake-X systems, check X86_FEATURE_IDA only immediately before updates of MSR_IA32_PERF_CTL that may involve setting the Turbo Engage Bit (bit 32). 
Fixes: ac4e04d9e378 ("cpufreq: intel_pstate: Unchecked MSR aceess in legacy mode") Reported-by: Aaron Rainbolt Closes: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/2122531 [1] Tested-by: Aaron Rainbolt Signed-off-by: Srinivas Pandruvada [ rjw: Subject adjustment, changelog edits ] Link: https://patch.msgid.link/20251111010840.141490-1-srinivas.pandruvada@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 38897bb14a2c..492a10f1bdbf 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -603,9 +603,6 @@ static bool turbo_is_disabled(void) { u64 misc_en; - if (!cpu_feature_enabled(X86_FEATURE_IDA)) - return true; - rdmsrq(MSR_IA32_MISC_ENABLE, misc_en); return !!(misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); @@ -2106,7 +2103,8 @@ static u64 atom_get_val(struct cpudata *cpudata, int pstate) u32 vid; val = (u64)pstate << 8; - if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled)) + if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled) && + cpu_feature_enabled(X86_FEATURE_IDA)) val |= (u64)1 << 32; vid_fp = cpudata->vid.min + mul_fp( @@ -2271,7 +2269,8 @@ static u64 core_get_val(struct cpudata *cpudata, int pstate) u64 val; val = (u64)pstate << 8; - if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled)) + if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled) && + cpu_feature_enabled(X86_FEATURE_IDA)) val |= (u64)1 << 32; return val; From c87488a12393a23f8a1b9850b989b386c58cac3f Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Wed, 12 Nov 2025 08:42:02 -1000 Subject: [PATCH 359/543] sched/ext: convert scx_tasks_lock to raw spinlock Update scx_tasks_lock so that it's safe to lock/unlock in a non-sleepable context in PREEMPT_RT kernels.
scx_tasks_lock is a (non-raw) spinlock used to protect the list of tasks under SCX. This list is updated from finish_task_switch(), which cannot sleep. Regular spinlocks can be locked in such a context in non-RT kernels, but are sleepable when CONFIG_PREEMPT_RT=y. Convert scx_tasks_lock into a raw spinlock, which is not sleepable even on RT kernels. Sample backtrace: dump_stack_lvl+0x83/0xa0 __might_resched+0x14a/0x200 rt_spin_lock+0x61/0x1c0 ? sched_ext_dead+0x2d/0xf0 ? lock_release+0xc6/0x280 sched_ext_dead+0x2d/0xf0 ? srso_alias_return_thunk+0x5/0xfbef5 finish_task_switch.isra.0+0x254/0x360 __schedule+0x584/0x11d0 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 ? tick_nohz_idle_exit+0x7e/0x120 schedule_idle+0x23/0x40 cpu_startup_entry+0x29/0x30 start_secondary+0xf8/0x100 common_startup_64+0x13e/0x148 Signed-off-by: Emil Tsalapatis Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 184d562bda5e..03d05ea20006 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -25,7 +25,7 @@ static struct scx_sched __rcu *scx_root; * guarantee system safety. Maintain a dedicated task list which contains every * task between its fork and eventual free.
*/ -static DEFINE_SPINLOCK(scx_tasks_lock); +static DEFINE_RAW_SPINLOCK(scx_tasks_lock); static LIST_HEAD(scx_tasks); /* ops enable/disable */ @@ -476,7 +476,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter) BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); @@ -507,14 +507,14 @@ static void scx_task_iter_unlock(struct scx_task_iter *iter) __scx_task_iter_rq_unlock(iter); if (iter->list_locked) { iter->list_locked = false; - spin_unlock_irq(&scx_tasks_lock); + raw_spin_unlock_irq(&scx_tasks_lock); } } static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) { if (!iter->list_locked) { - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); iter->list_locked = true; } } @@ -2940,9 +2940,9 @@ void scx_post_fork(struct task_struct *p) } } - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); list_add_tail(&p->scx.tasks_node, &scx_tasks); - spin_unlock_irq(&scx_tasks_lock); + raw_spin_unlock_irq(&scx_tasks_lock); percpu_up_read(&scx_fork_rwsem); } @@ -2966,9 +2966,9 @@ void sched_ext_free(struct task_struct *p) { unsigned long flags; - spin_lock_irqsave(&scx_tasks_lock, flags); + raw_spin_lock_irqsave(&scx_tasks_lock, flags); list_del_init(&p->scx.tasks_node); - spin_unlock_irqrestore(&scx_tasks_lock, flags); + raw_spin_unlock_irqrestore(&scx_tasks_lock, flags); /* * @p is off scx_tasks and wholly ours. 
scx_enable()'s READY -> ENABLED From 9efb297c520f392ab04bc45544a03770c98c3798 Mon Sep 17 00:00:00 2001 From: Gopi Krishna Menon Date: Sat, 25 Oct 2025 01:50:40 +0530 Subject: [PATCH 360/543] hwmon: (gpd-fan) Fix compilation error in non-ACPI builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building gpd-fan driver without CONFIG_ACPI results in the following build errors: drivers/hwmon/gpd-fan.c: In function ‘gpd_ecram_read’: drivers/hwmon/gpd-fan.c:228:9: error: implicit declaration of function ‘outb’ [-Werror=implicit-function-declaration] 228 | outb(0x2E, addr_port); | ^~~~ drivers/hwmon/gpd-fan.c:241:16: error: implicit declaration of function ‘inb’ [-Werror=implicit-function-declaration] 241 | *val = inb(data_port); The definitions for inb() and outb() come from <linux/io.h> (specifically through <asm/io.h>), which is implicitly included via <acpi/acpi_io.h>. When CONFIG_ACPI is not set, <acpi/acpi_io.h> is not included resulting in <linux/io.h> to be omitted as well. Since the driver does not depend on ACPI, remove <linux/acpi.h> and add <linux/io.h> directly to fix the compilation errors. Signed-off-by: Gopi Krishna Menon Link: https://lore.kernel.org/r/20251024202042.752160-1-krishnagopi487@gmail.com Signed-off-by: Guenter Roeck --- drivers/hwmon/gpd-fan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/gpd-fan.c b/drivers/hwmon/gpd-fan.c index 321794807e8d..48c84e3e9939 100644 --- a/drivers/hwmon/gpd-fan.c +++ b/drivers/hwmon/gpd-fan.c @@ -12,9 +12,9 @@ * Copyright (c) 2024 Cryolitia PukNgae */ -#include #include #include +#include #include #include #include From c55a8e24cd129b6d8fed20e3d63c10c2263e2fc8 Mon Sep 17 00:00:00 2001 From: Cryolitia PukNgae Date: Thu, 30 Oct 2025 22:30:06 +0800 Subject: [PATCH 361/543] hwmon: (gpd-fan) initialize EC on driver load for Win 4 The original implementation re-initializes the EC when it reports a zero value, which is a workaround for the black-box buggy firmware.
Now a contributor has tested and reported that the bug is that the firmware won't initialize the EC on boot, so the EC remains in an unusable state. And it won't need to re-init it during runtime. The original implementation is not perfect: any write command will be ignored until we first read it. Just re-initializing it unconditionally when the driver loads works. Fixes: 0ab88e239439 ("hwmon: add GPD devices sensor driver") Co-developed-by: kylon <3252255+kylon@users.noreply.github.com> Signed-off-by: kylon <3252255+kylon@users.noreply.github.com> Link: https://github.com/Cryolitia/gpd-fan-driver/pull/20 Signed-off-by: Cryolitia PukNgae Link: https://lore.kernel.org/r/20251030-win4-v1-1-c374dcb86985@uniontech.com Signed-off-by: Guenter Roeck --- drivers/hwmon/gpd-fan.c | 52 ++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/drivers/hwmon/gpd-fan.c b/drivers/hwmon/gpd-fan.c index 48c84e3e9939..f81c3bc422f4 100644 --- a/drivers/hwmon/gpd-fan.c +++ b/drivers/hwmon/gpd-fan.c @@ -276,31 +276,6 @@ static int gpd_generic_read_rpm(void) return (u16)high << 8 | low; } -static void gpd_win4_init_ec(void) -{ - u8 chip_id, chip_ver; - - gpd_ecram_read(0x2000, &chip_id); - - if (chip_id == 0x55) { - gpd_ecram_read(0x1060, &chip_ver); - gpd_ecram_write(0x1060, chip_ver | 0x80); - } -} - -static int gpd_win4_read_rpm(void) -{ - int ret; - - ret = gpd_generic_read_rpm(); - - if (ret == 0) - // Re-init EC when speed is 0 - gpd_win4_init_ec(); - - return ret; -} - static int gpd_wm2_read_rpm(void) { for (u16 pwm_ctr_offset = GPD_PWM_CTR_OFFSET; @@ -320,11 +295,10 @@ static int gpd_wm2_read_rpm(void) static int gpd_read_rpm(void) { switch (gpd_driver_priv.drvdata->board) { + case win4_6800u: case win_mini: case duo: return gpd_generic_read_rpm(); - case win4_6800u: - return gpd_win4_read_rpm(); case win_max_2: return gpd_wm2_read_rpm(); } @@ -607,6 +581,28 @@ static struct hwmon_chip_info gpd_fan_chip_info = { .info = gpd_fan_hwmon_channel_info
}; +static void gpd_win4_init_ec(void) +{ + u8 chip_id, chip_ver; + + gpd_ecram_read(0x2000, &chip_id); + + if (chip_id == 0x55) { + gpd_ecram_read(0x1060, &chip_ver); + gpd_ecram_write(0x1060, chip_ver | 0x80); + } +} + +static void gpd_init_ec(void) +{ + // The buggy firmware won't initialize EC properly on boot. + // Before its initialization, reading RPM will always return 0, + // and writing PWM will have no effect. + // Initialize it manually on driver load. + if (gpd_driver_priv.drvdata->board == win4_6800u) + gpd_win4_init_ec(); +} + static int gpd_fan_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -634,6 +630,8 @@ static int gpd_fan_probe(struct platform_device *pdev) return dev_err_probe(dev, PTR_ERR(hwdev), "Failed to register hwmon device\n"); + gpd_init_ec(); + return 0; } From 214291cbaaceeb28debd773336642b1fca393ae0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 5 Nov 2025 16:51:15 -0700 Subject: [PATCH 362/543] acpi/hmat: Fix lockdep warning for hmem_register_resource() The following lockdep splat was observed while kernel auto-online a CXL memory region: ====================================================== WARNING: possible circular locking dependency detected 6.17.0djtest+ #53 Tainted: G W ------------------------------------------------------ systemd-udevd/3334 is trying to acquire lock: ffffffff90346188 (hmem_resource_lock){+.+.}-{4:4}, at: hmem_register_resource+0x31/0x50 but task is already holding lock: ffffffff90338890 ((node_chain).rwsem){++++}-{4:4}, at: blocking_notifier_call_chain+0x2e/0x70 which lock already depends on the new lock. [..] Chain exists of: hmem_resource_lock --> mem_hotplug_lock --> (node_chain).rwsem Possible unsafe locking scenario: CPU0 CPU1 ---- ---- rlock((node_chain).rwsem); lock(mem_hotplug_lock); lock((node_chain).rwsem); lock(hmem_resource_lock); The lock ordering can cause potential deadlock. 
There are instances where hmem_resource_lock is taken after (node_chain).rwsem, and vice versa. Split out the target update section of hmat_register_target() so that hmat_callback() only invokes that section instead of attempting to register hmem devices that it does not need to. [ dj: Fix up comment to be closer to 80cols. (Jonathan) ] Fixes: cf8741ac57ed ("ACPI: NUMA: HMAT: Register "soft reserved" memory as an "hmem" device") Reviewed-by: Jonathan Cameron Tested-by: Smita Koralahalli Reviewed-by: Smita Koralahalli Reviewed-by: Dan Williams Link: https://patch.msgid.link/20251105235115.85062-3-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/acpi/numa/hmat.c | 46 ++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 5a36d57289b4..11e4483685c9 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -874,10 +874,32 @@ static void hmat_register_target_devices(struct memory_target *target) } } -static void hmat_register_target(struct memory_target *target) +static void hmat_hotplug_target(struct memory_target *target) { int nid = pxm_to_node(target->memory_pxm); + /* + * Skip offline nodes. This can happen when memory marked EFI_MEMORY_SP, + * "specific purpose", is applied to all the memory in a proximity + * domain leading to * the node being marked offline / unplugged, or if + * memory-only "hotplug" node is offline. + */ + if (nid == NUMA_NO_NODE || !node_online(nid)) + return; + + guard(mutex)(&target_lock); + if (target->registered) + return; + + hmat_register_target_initiators(target); + hmat_register_target_cache(target); + hmat_register_target_perf(target, ACCESS_COORDINATE_LOCAL); + hmat_register_target_perf(target, ACCESS_COORDINATE_CPU); + target->registered = true; +} + +static void hmat_register_target(struct memory_target *target) +{ /* * Devices may belong to either an offline or online * node, so unconditionally add them.
@@ -895,25 +917,7 @@ static void hmat_register_target(struct memory_target *target) } mutex_unlock(&target_lock); - /* - * Skip offline nodes. This can happen when memory - * marked EFI_MEMORY_SP, "specific purpose", is applied - * to all the memory in a proximity domain leading to - * the node being marked offline / unplugged, or if - * memory-only "hotplug" node is offline. - */ - if (nid == NUMA_NO_NODE || !node_online(nid)) - return; - - mutex_lock(&target_lock); - if (!target->registered) { - hmat_register_target_initiators(target); - hmat_register_target_cache(target); - hmat_register_target_perf(target, ACCESS_COORDINATE_LOCAL); - hmat_register_target_perf(target, ACCESS_COORDINATE_CPU); - target->registered = true; - } - mutex_unlock(&target_lock); + hmat_hotplug_target(target); } static void hmat_register_targets(void) @@ -939,7 +943,7 @@ static int hmat_callback(struct notifier_block *self, if (!target) return NOTIFY_OK; - hmat_register_target(target); + hmat_hotplug_target(target); return NOTIFY_OK; } From 360b3730f8eab6c4467c6cca4cb0e30902174a63 Mon Sep 17 00:00:00 2001 From: Haotian Zhang Date: Wed, 12 Nov 2025 14:57:09 +0800 Subject: [PATCH 363/543] ASoC: rsnd: fix OF node reference leak in rsnd_ssiu_probe() rsnd_ssiu_probe() leaks an OF node reference obtained by rsnd_ssiu_of_node(). The node reference is acquired but never released across all return paths. Fix it by declaring the device node with the __free(device_node) cleanup construct to ensure automatic release when the variable goes out of scope. 
Fixes: 4e7788fb8018 ("ASoC: rsnd: add SSIU BUSIF support") Signed-off-by: Haotian Zhang Acked-by: Kuninori Morimoto Link: https://patch.msgid.link/20251112065709.1522-1-vulab@iscas.ac.cn Signed-off-by: Mark Brown --- sound/soc/renesas/rcar/ssiu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/soc/renesas/rcar/ssiu.c b/sound/soc/renesas/rcar/ssiu.c index faf351126d57..244fb833292a 100644 --- a/sound/soc/renesas/rcar/ssiu.c +++ b/sound/soc/renesas/rcar/ssiu.c @@ -509,7 +509,7 @@ void rsnd_parse_connect_ssiu(struct rsnd_dai *rdai, int rsnd_ssiu_probe(struct rsnd_priv *priv) { struct device *dev = rsnd_priv_to_dev(priv); - struct device_node *node; + struct device_node *node __free(device_node) = rsnd_ssiu_of_node(priv); struct rsnd_ssiu *ssiu; struct rsnd_mod_ops *ops; const int *list = NULL; @@ -522,7 +522,6 @@ int rsnd_ssiu_probe(struct rsnd_priv *priv) * see * rsnd_ssiu_bufsif_to_id() */ - node = rsnd_ssiu_of_node(priv); if (node) nr = rsnd_node_count(priv, node, SSIU_NAME); else From 4495bffd86ba0fdabfaef0c41d12f68ec2a1e05b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 10 Nov 2025 16:22:25 -0600 Subject: [PATCH 364/543] PCI/ASPM: Cache L0s/L1 Supported so advertised link states can be overridden Defective devices sometimes advertise support for ASPM L0s or L1 states even if they don't work correctly. Cache the L0s Supported and L1 Supported bits early in enumeration so HEADER quirks can override the ASPM states advertised in Link Capabilities before pcie_aspm_cap_init() enables ASPM. 
Signed-off-by: Bjorn Helgaas Tested-by: Shawn Lin Reviewed-by: Lukas Wunner Link: https://patch.msgid.link/20251110222929.2140564-2-helgaas@kernel.org --- drivers/pci/pcie/aspm.c | 12 ++++-------- drivers/pci/probe.c | 7 +++++++ include/linux/pci.h | 2 ++ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 7cc8281e7011..15d50c089070 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -830,7 +830,6 @@ static void pcie_aspm_override_default_link_state(struct pcie_link_state *link) static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) { struct pci_dev *child = link->downstream, *parent = link->pdev; - u32 parent_lnkcap, child_lnkcap; u16 parent_lnkctl, child_lnkctl; struct pci_bus *linkbus = parent->subordinate; @@ -845,9 +844,8 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) * If ASPM not supported, don't mess with the clocks and link, * bail out now. */ - pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &parent_lnkcap); - pcie_capability_read_dword(child, PCI_EXP_LNKCAP, &child_lnkcap); - if (!(parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPMS)) + if (!(parent->aspm_l0s_support && child->aspm_l0s_support) && + !(parent->aspm_l1_support && child->aspm_l1_support)) return; /* Configure common clock before checking latencies */ @@ -859,8 +857,6 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) * read-only Link Capabilities may change depending on common clock * configuration (PCIe r5.0, sec 7.5.3.6). 
*/ - pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &parent_lnkcap); - pcie_capability_read_dword(child, PCI_EXP_LNKCAP, &child_lnkcap); pcie_capability_read_word(parent, PCI_EXP_LNKCTL, &parent_lnkctl); pcie_capability_read_word(child, PCI_EXP_LNKCTL, &child_lnkctl); @@ -880,7 +876,7 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) * given link unless components on both sides of the link each * support L0s. */ - if (parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPM_L0S) + if (parent->aspm_l0s_support && child->aspm_l0s_support) link->aspm_support |= PCIE_LINK_STATE_L0S; if (child_lnkctl & PCI_EXP_LNKCTL_ASPM_L0S) @@ -889,7 +885,7 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) link->aspm_enabled |= PCIE_LINK_STATE_L0S_DW; /* Setup L1 state */ - if (parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPM_L1) + if (parent->aspm_l1_support && child->aspm_l1_support) link->aspm_support |= PCIE_LINK_STATE_L1; if (parent_lnkctl & child_lnkctl & PCI_EXP_LNKCTL_ASPM_L1) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index c83e75a0ec12..de72ceaea285 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1663,6 +1663,13 @@ void set_pcie_port_type(struct pci_dev *pdev) if (reg32 & PCI_EXP_LNKCAP_DLLLARC) pdev->link_active_reporting = 1; +#ifdef CONFIG_PCIEASPM + if (reg32 & PCI_EXP_LNKCAP_ASPM_L0S) + pdev->aspm_l0s_support = 1; + if (reg32 & PCI_EXP_LNKCAP_ASPM_L1) + pdev->aspm_l1_support = 1; +#endif + parent = pci_upstream_bridge(pdev); if (!parent) return; diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..bf97d49c23cf 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -412,6 +412,8 @@ struct pci_dev { u16 l1ss; /* L1SS Capability pointer */ #ifdef CONFIG_PCIEASPM struct pcie_link_state *link_state; /* ASPM link state */ + unsigned int aspm_l0s_support:1; /* ASPM L0s support */ + unsigned int aspm_l1_support:1; /* ASPM L1 support */ unsigned int ltr_path:1; /* 
Latency Tolerance Reporting supported from root to here */ #endif From 575b98e39d817537afdc6c39d35c1de484a64d42 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 10 Nov 2025 16:22:26 -0600 Subject: [PATCH 365/543] PCI/ASPM: Add pcie_aspm_remove_cap() to override advertised link states Add pcie_aspm_remove_cap(). A quirk can use this to prevent use of ASPM L0s or L1 link states, even if the device advertised support for them. Signed-off-by: Bjorn Helgaas Tested-by: Shawn Lin Reviewed-by: Lukas Wunner Link: https://patch.msgid.link/20251110222929.2140564-3-helgaas@kernel.org --- drivers/pci/pci.h | 2 ++ drivers/pci/pcie/aspm.c | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 4492b809094b..36f8c0985430 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -958,6 +958,7 @@ void pci_save_aspm_l1ss_state(struct pci_dev *dev); void pci_restore_aspm_l1ss_state(struct pci_dev *dev); #ifdef CONFIG_PCIEASPM +void pcie_aspm_remove_cap(struct pci_dev *pdev, u32 lnkcap); void pcie_aspm_init_link_state(struct pci_dev *pdev); void pcie_aspm_exit_link_state(struct pci_dev *pdev); void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked); @@ -965,6 +966,7 @@ void pcie_aspm_powersave_config_link(struct pci_dev *pdev); void pci_configure_ltr(struct pci_dev *pdev); void pci_bridge_reconfigure_ltr(struct pci_dev *pdev); #else +static inline void pcie_aspm_remove_cap(struct pci_dev *pdev, u32 lnkcap) { } static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_exit_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked) { } diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 15d50c089070..f61d98897503 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -1542,6 +1542,19 @@ int pci_enable_link_state_locked(struct pci_dev *pdev, int state) } 
EXPORT_SYMBOL(pci_enable_link_state_locked); +void pcie_aspm_remove_cap(struct pci_dev *pdev, u32 lnkcap) +{ + if (lnkcap & PCI_EXP_LNKCAP_ASPM_L0S) + pdev->aspm_l0s_support = 0; + if (lnkcap & PCI_EXP_LNKCAP_ASPM_L1) + pdev->aspm_l1_support = 0; + + pci_info(pdev, "ASPM: Link Capabilities%s%s treated as unsupported to avoid device defect\n", + lnkcap & PCI_EXP_LNKCAP_ASPM_L0S ? " L0s" : "", + lnkcap & PCI_EXP_LNKCAP_ASPM_L1 ? " L1" : ""); + +} + static int pcie_aspm_set_policy(const char *val, const struct kernel_param *kp) { From 30579eebba6ae52dc7441479aec9dd8d782256d3 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 10 Nov 2025 16:22:27 -0600 Subject: [PATCH 366/543] PCI/ASPM: Convert quirks to override advertised link states Existing quirks to disable ASPM L0s and L1 use pci_disable_link_state(), which disables ASPM states and prevents their use in the future. But since they are FINAL quirks, they happen after ASPM has already been enabled. Here's a typical call path: pci_host_probe pci_scan_root_bus_bridge pci_scan_child_bus pci_scan_slot pci_scan_single_device pci_device_add pci_fixup_device(pci_fixup_header) # HEADER quirks pcie_aspm_init_link_state pcie_config_aspm_path pcie_config_aspm_link pcie_config_aspm_dev # ASPM may be enabled pci_bus_add_devices pci_bus_add_devices pci_fixup_device(pci_fixup_final) # FINAL quirks quirk_disable_aspm_l0s pci_disable_link_state(dev, PCIE_LINK_STATE_L0S) Sometimes enabling ASPM can make the link non-functional, so if we know ASPM is broken on a device, we shouldn't enable it at all, even temporarily. Convert the existing quirks to use pcie_aspm_remove_cap() instead, which overrides the ASPM Support advertised in PCIe Link Capabilities, and make them HEADER quirks so they run before pcie_aspm_init_link_state() has a chance to enable ASPM. 
Signed-off-by: Bjorn Helgaas Tested-by: Shawn Lin Reviewed-by: Lukas Wunner Link: https://patch.msgid.link/20251110222929.2140564-4-helgaas@kernel.org --- drivers/pci/quirks.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 214ed060ca1b..922c77c627a1 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2494,28 +2494,27 @@ DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, */ static void quirk_disable_aspm_l0s(struct pci_dev *dev) { - pci_info(dev, "Disabling L0s\n"); - pci_disable_link_state(dev, PCIE_LINK_STATE_L0S); + pcie_aspm_remove_cap(dev, PCI_EXP_LNKCAP_ASPM_L0S); } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10a7, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10a9, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10b6, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10c6, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10c7, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10c8, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10d6, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10db, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10dd, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10e1, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10ec, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10f1, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10f4, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1508, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10a7, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10a9, quirk_disable_aspm_l0s); 
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10b6, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10c6, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10c7, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10c8, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10d6, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10db, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10dd, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10e1, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10ec, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10f1, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10f4, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1508, quirk_disable_aspm_l0s); static void quirk_disable_aspm_l0s_l1(struct pci_dev *dev) { - pci_info(dev, "Disabling ASPM L0s/L1\n"); - pci_disable_link_state(dev, PCIE_LINK_STATE_L0S | PCIE_LINK_STATE_L1); + pcie_aspm_remove_cap(dev, + PCI_EXP_LNKCAP_ASPM_L0S | PCI_EXP_LNKCAP_ASPM_L1); } /* @@ -2523,7 +2522,7 @@ static void quirk_disable_aspm_l0s_l1(struct pci_dev *dev) * upstream PCIe root port when ASPM is enabled. At least L0s mode is affected; * disable both L0s and L1 for now to be safe. 
*/ -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ASMEDIA, 0x1080, quirk_disable_aspm_l0s_l1); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ASMEDIA, 0x1080, quirk_disable_aspm_l0s_l1); /* * Some Pericom PCIe-to-PCI bridges in reverse mode need the PCIe Retrain From 5b40a5080c39933e7ce28bcb63cd4c1818d6c873 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 10 Nov 2025 16:22:28 -0600 Subject: [PATCH 367/543] PCI/ASPM: Avoid L0s and L1 on Freescale [1957:0451] Root Ports Christian reported that f3ac2ff14834 ("PCI/ASPM: Enable all ClockPM and ASPM states for devicetree platforms") broke booting on the A-EON X5000. Override the L0s and L1 Support advertised in Link Capabilities by the X5000 Root Ports ([1957:0451]) so we don't try to enable those states. Fixes: f3ac2ff14834 ("PCI/ASPM: Enable all ClockPM and ASPM states for devicetree platforms") Fixes: df5192d9bb0e ("PCI/ASPM: Enable only L0s and L1 for devicetree platforms") Reported-by: Christian Zigotzky Link: https://lore.kernel.org/r/db5c95a1-cf3e-46f9-8045-a1b04908051a@xenosoft.de Signed-off-by: Bjorn Helgaas Tested-by: Shawn Lin Reviewed-by: Lukas Wunner Link: https://patch.msgid.link/20251110222929.2140564-5-helgaas@kernel.org --- drivers/pci/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 922c77c627a1..b94264cd3833 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2523,6 +2523,7 @@ static void quirk_disable_aspm_l0s_l1(struct pci_dev *dev) * disable both L0s and L1 for now to be safe. 
*/ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ASMEDIA, 0x1080, quirk_disable_aspm_l0s_l1); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, 0x0451, quirk_disable_aspm_l0s_l1); /* * Some Pericom PCIe-to-PCI bridges in reverse mode need the PCIe Retrain From 823576c894d73255d35c0d0dabbb6ffecf1f2667 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 12 Nov 2025 18:36:24 -0600 Subject: [PATCH 368/543] PCI/ASPM: Avoid L0s and L1 on PA Semi [1959:a002] Root Ports Christian reported that f3ac2ff14834 ("PCI/ASPM: Enable all ClockPM and ASPM states for devicetree platforms") broke booting on the A-EON AmigaOne X1000. Override the L0s and L1 Support advertised in Link Capabilities by the X1000 Root Ports ([1959:a002]) so we don't try to enable those states. Fixes: f3ac2ff14834 ("PCI/ASPM: Enable all ClockPM and ASPM states for devicetree platforms") Fixes: df5192d9bb0e ("PCI/ASPM: Enable only L0s and L1 for devicetree platforms") Reported-by: Christian Zigotzky Link: https://lore.kernel.org/r/a41d2ca1-fcd9-c416-b111-a958e92e94bf@xenosoft.de Signed-off-by: Bjorn Helgaas --- drivers/pci/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index b94264cd3833..90f6abdb77f4 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2524,6 +2524,7 @@ static void quirk_disable_aspm_l0s_l1(struct pci_dev *dev) */ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ASMEDIA, 0x1080, quirk_disable_aspm_l0s_l1); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, 0x0451, quirk_disable_aspm_l0s_l1); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_PASEMI, 0xa002, quirk_disable_aspm_l0s_l1); /* * Some Pericom PCIe-to-PCI bridges in reverse mode need the PCIe Retrain From 0eff2eaa5322b5b141ff5d5ded26fac4a52b5f7b Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Tue, 11 Nov 2025 17:08:28 +0800 Subject: [PATCH 369/543] virtio-net: fix incorrect flags recording in big mode The purpose of commit 703eec1b2422 ("virtio_net: fixing XDP for fully checksummed packets 
handling") is to record the flags in advance, as their value may be overwritten in the XDP case. However, the flags recorded under big mode are incorrect, because in big mode, the passed buf does not point to the rx buffer, but rather to the page of the submitted buffer. This commit fixes this issue. For the small mode, the commit c11a49d58ad2 ("virtio_net: Fix mismatched buf address when unmapping for small packets") fixed it. Tested-by: Alyssa Ross Fixes: 703eec1b2422 ("virtio_net: fixing XDP for fully checksummed packets handling") Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Link: https://patch.msgid.link/20251111090828.23186-1-xuanzhuo@linux.alibaba.com Signed-off-by: Paolo Abeni --- drivers/net/virtio_net.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 8855a994e12b..0369dda5ed60 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2631,22 +2631,28 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, return; } - /* 1. Save the flags early, as the XDP program might overwrite them. + /* About the flags below: + * 1. Save the flags early, as the XDP program might overwrite them. * These flags ensure packets marked as VIRTIO_NET_HDR_F_DATA_VALID * stay valid after XDP processing. * 2. XDP doesn't work with partially checksummed packets (refer to * virtnet_xdp_set()), so packets marked as * VIRTIO_NET_HDR_F_NEEDS_CSUM get dropped during XDP processing. 
*/ - flags = ((struct virtio_net_common_hdr *)buf)->hdr.flags; - if (vi->mergeable_rx_bufs) + if (vi->mergeable_rx_bufs) { + flags = ((struct virtio_net_common_hdr *)buf)->hdr.flags; skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit, stats); - else if (vi->big_packets) + } else if (vi->big_packets) { + void *p = page_address((struct page *)buf); + + flags = ((struct virtio_net_common_hdr *)p)->hdr.flags; skb = receive_big(dev, vi, rq, buf, len, stats); - else + } else { + flags = ((struct virtio_net_common_hdr *)buf)->hdr.flags; skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats); + } if (unlikely(!skb)) return; From 921b3f59b7b00cd7067ab775b0e0ca4eca436c2f Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Wed, 12 Nov 2025 18:53:18 -0600 Subject: [PATCH 370/543] PCI/ASPM: Avoid L0s and L1 on Hi1105 [19e5:1105] Wi-Fi This Wi-Fi advertises the L0s and L1 capabilities but actually it doesn't support them. This is confirmed by HiSilicon team in actual productization. Signed-off-by: Shawn Lin Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/1762916319-139532-1-git-send-email-shawn.lin@rock-chips.com --- drivers/pci/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 90f6abdb77f4..b9c252aa6fe0 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2525,6 +2525,7 @@ static void quirk_disable_aspm_l0s_l1(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ASMEDIA, 0x1080, quirk_disable_aspm_l0s_l1); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, 0x0451, quirk_disable_aspm_l0s_l1); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_PASEMI, 0xa002, quirk_disable_aspm_l0s_l1); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_HUAWEI, 0x1105, quirk_disable_aspm_l0s_l1); /* * Some Pericom PCIe-to-PCI bridges in reverse mode need the PCIe Retrain From 0a4a18e888ae8c8004582f665c5792c84a681668 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 11 Nov 2025 17:09:20 -0800 Subject: [PATCH 
371/543] drm/client: fix MODULE_PARM_DESC string for "active" The MODULE_PARM_DESC string for the "active" parameter is missing a space and has an extraneous trailing ']' character. Correct these. Before patch: $ modinfo -p ./drm_client_lib.ko active:Choose which drm client to start, default isfbdev] (string) After patch: $ modinfo -p ./drm_client_lib.ko active:Choose which drm client to start, default is fbdev (string) Fixes: f7b42442c4ac ("drm/log: Introduce a new boot logger to draw the kmsg on the screen") Signed-off-by: Randy Dunlap Reviewed-by: Thomas Zimmermann Reviewed-by: Jocelyn Falempe Signed-off-by: Thomas Zimmermann Link: https://patch.msgid.link/20251112010920.2355712-1-rdunlap@infradead.org --- drivers/gpu/drm/clients/drm_client_setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/clients/drm_client_setup.c b/drivers/gpu/drm/clients/drm_client_setup.c index 72480db1f00d..515aceac22b1 100644 --- a/drivers/gpu/drm/clients/drm_client_setup.c +++ b/drivers/gpu/drm/clients/drm_client_setup.c @@ -13,8 +13,8 @@ static char drm_client_default[16] = CONFIG_DRM_CLIENT_DEFAULT; module_param_string(active, drm_client_default, sizeof(drm_client_default), 0444); MODULE_PARM_DESC(active, - "Choose which drm client to start, default is" - CONFIG_DRM_CLIENT_DEFAULT "]"); + "Choose which drm client to start, default is " + CONFIG_DRM_CLIENT_DEFAULT); /** * drm_client_setup() - Setup in-kernel DRM clients From 96a3a03abf3d8cc38cd9cb0d280235fbcf7c3f7f Mon Sep 17 00:00:00 2001 From: Felix Maurer Date: Tue, 11 Nov 2025 17:29:32 +0100 Subject: [PATCH 372/543] hsr: Fix supervision frame sending on HSRv0 On HSRv0, no supervision frames were sent. The supervision frames were generated successfully, but failed the check for a sufficiently long mac header, i.e., at least sizeof(struct hsr_ethhdr), in hsr_fill_frame_info() because the mac header only contained the ethernet header.
Fix this by including the HSR header in the mac header when generating HSR supervision frames. Note that the mac header now also includes the TLV fields. This matches how we set the headers on rx and also the size of struct hsrv0_ethhdr_sp. Reported-by: Hangbin Liu Closes: https://lore.kernel.org/netdev/aMONxDXkzBZZRfE5@fedora/ Fixes: 9cfb5e7f0ded ("net: hsr: fix hsr_init_sk() vs network/transport headers.") Signed-off-by: Felix Maurer Reviewed-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/4354114fea9a642fe71f49aeeb6c6159d1d61840.1762876095.git.fmaurer@redhat.com Tested-by: Hangbin Liu Signed-off-by: Paolo Abeni --- net/hsr/hsr_device.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index fbbc3ccf9df6..1235abb2d79f 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -320,6 +320,9 @@ static void send_hsr_supervision_frame(struct hsr_port *port, } hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag)); + skb_set_network_header(skb, ETH_HLEN + HSR_HLEN); + skb_reset_mac_len(skb); + set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf)); set_hsr_stag_HSR_ver(hsr_stag, hsr->prot_version); From b2c26c82f7a94ec4da096f370e3612ee14424450 Mon Sep 17 00:00:00 2001 From: Felix Maurer Date: Tue, 11 Nov 2025 17:29:33 +0100 Subject: [PATCH 373/543] hsr: Follow standard for HSRv0 supervision frames For HSRv0, the path_id has the following meaning: - 0000: PRP supervision frame - 0001-1001: HSR ring identifier - 1010-1011: Frames from PRP network (A/B, with RedBoxes) - 1111: HSR supervision frame Follow the IEC 62439-3:2010 standard more closely by setting the right path_id for HSRv0 supervision frames (actually, it is correctly set when the frame is constructed, but hsr_set_path_id() overwrites it) and set a fixed HSR ring identifier of 1. 
The ring identifier seems to be generally unused and we ignore it anyways on reception, but some fixed identifier is definitely better than using one identifier in one direction and a wrong identifier in the other. This was also the behavior before commit f266a683a480 ("net/hsr: Better frame dispatch") which introduced the alternating path_id. This was later moved to hsr_set_path_id() in commit 451d8123f897 ("net: prp: add packet handling support"). The IEC 62439-3:2010 also contains 6 unused bytes after the MacAddressA in the HSRv0 supervision frames. Adjust a TODO comment accordingly. Fixes: f266a683a480 ("net/hsr: Better frame dispatch") Fixes: 451d8123f897 ("net: prp: add packet handling support") Signed-off-by: Felix Maurer Reviewed-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/ea0d5133cd593856b2fa673d6e2067bf1d4d1794.1762876095.git.fmaurer@redhat.com Tested-by: Hangbin Liu Signed-off-by: Paolo Abeni --- net/hsr/hsr_device.c | 2 +- net/hsr/hsr_forward.c | 22 +++++++++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 1235abb2d79f..492cbc78ab75 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -337,7 +337,7 @@ static void send_hsr_supervision_frame(struct hsr_port *port, } hsr_stag->tlv.HSR_TLV_type = type; - /* TODO: Why 12 in HSRv0? */ + /* HSRv0 has 6 unused bytes after the MAC */ hsr_stag->tlv.HSR_TLV_length = hsr->prot_version ? 
sizeof(struct hsr_sup_payload) : 12; diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index c67c0d35921d..339f0d220212 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -262,15 +262,23 @@ static struct sk_buff *prp_fill_rct(struct sk_buff *skb, return skb; } -static void hsr_set_path_id(struct hsr_ethhdr *hsr_ethhdr, +static void hsr_set_path_id(struct hsr_frame_info *frame, + struct hsr_ethhdr *hsr_ethhdr, struct hsr_port *port) { int path_id; - if (port->type == HSR_PT_SLAVE_A) - path_id = 0; - else - path_id = 1; + if (port->hsr->prot_version) { + if (port->type == HSR_PT_SLAVE_A) + path_id = 0; + else + path_id = 1; + } else { + if (frame->is_supervision) + path_id = 0xf; + else + path_id = 1; + } set_hsr_tag_path(&hsr_ethhdr->hsr_tag, path_id); } @@ -304,7 +312,7 @@ static struct sk_buff *hsr_fill_tag(struct sk_buff *skb, else hsr_ethhdr = (struct hsr_ethhdr *)pc; - hsr_set_path_id(hsr_ethhdr, port); + hsr_set_path_id(frame, hsr_ethhdr, port); set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size); hsr_ethhdr->hsr_tag.sequence_nr = htons(frame->sequence_nr); hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto; @@ -330,7 +338,7 @@ struct sk_buff *hsr_create_tagged_frame(struct hsr_frame_info *frame, (struct hsr_ethhdr *)skb_mac_header(frame->skb_hsr); /* set the lane id properly */ - hsr_set_path_id(hsr_ethhdr, port); + hsr_set_path_id(frame, hsr_ethhdr, port); return skb_clone(frame->skb_hsr, GFP_ATOMIC); } else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) { return skb_clone(frame->skb_std, GFP_ATOMIC); From ebd4469e7af61019daaf904fdcba07a9ecd18440 Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Wed, 5 Nov 2025 14:40:32 +1100 Subject: [PATCH 374/543] entry: Fix ifndef around arch_xfer_to_guest_mode_handle_work() stub The stub implementation of arch_xfer_to_guest_mode_handle_work() is guarded by an #ifndef that incorrectly checks for the name arch_xfer_to_guest_mode_work instead. 
It seems the function was renamed to add "_handle" as a late change to the original patch, and the #ifndef wasn't updated to go with it. Change the #ifndef to match the name of the function. No users right now, so no need to update any architecture code. Fixes: 935ace2fb5cc4 ("entry: Provide infrastructure for work before transitioning to guest mode") Signed-off-by: Andrew Donnellan Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251105-entry-fix-ifndef-v1-1-d8d28045b627@linux.ibm.com --- include/linux/entry-virt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/entry-virt.h b/include/linux/entry-virt.h index 42c89e3e5ca7..bfa767702d9a 100644 --- a/include/linux/entry-virt.h +++ b/include/linux/entry-virt.h @@ -32,7 +32,7 @@ */ static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work); -#ifndef arch_xfer_to_guest_mode_work +#ifndef arch_xfer_to_guest_mode_handle_work static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work) { return 0; From 0a8fb03fe7b0abab0ff16522e2625163183e7ae4 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Thu, 13 Nov 2025 12:10:06 +0000 Subject: [PATCH 375/543] MAINTAINERS: Update name spelling Use transliteration from the Belarusian language instead of Russian. Signed-off-by: Kiryl Shutsemau Signed-off-by: Dave Hansen Link: https://patch.msgid.link/20251113121006.651992-1-kas%40kernel.org --- .mailmap | 2 +- MAINTAINERS | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index 369cfe467932..0f74e16e239c 100644 --- a/.mailmap +++ b/.mailmap @@ -426,7 +426,7 @@ Kenneth W Chen Kenneth Westfield Kiran Gunda Kirill Tkhai -Kirill A. 
Shutemov +Kiryl Shutsemau Kishon Vijay Abraham I Konrad Dybcio Konrad Dybcio diff --git a/MAINTAINERS b/MAINTAINERS index 46bd8e033042..ffd964b0a9bc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -27845,7 +27845,7 @@ F: arch/x86/kernel/stacktrace.c F: arch/x86/kernel/unwind_*.c X86 TRUST DOMAIN EXTENSIONS (TDX) -M: Kirill A. Shutemov +M: Kiryl Shutsemau R: Dave Hansen R: Rick Edgecombe L: x86@kernel.org From fbade4bd08ba52cbc74a71c4e86e736f059f99f7 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Tue, 11 Nov 2025 14:02:50 +0800 Subject: [PATCH 376/543] mptcp: Disallow MPTCP subflows from sockmap The sockmap feature allows bpf syscall from userspace, or based on bpf sockops, replacing the sk_prot of sockets during protocol stack processing with sockmap's custom read/write interfaces. ''' tcp_rcv_state_process() subflow_syn_recv_sock() tcp_init_transfer(BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB) bpf_skops_established <== sockops bpf_sock_map_update(sk) <== call bpf helper tcp_bpf_update_proto() <== update sk_prot ''' Consider two scenarios: 1. When the server has MPTCP enabled and the client also requests MPTCP, the sk passed to the BPF program is a subflow sk. Since subflows only handle partial data, replacing their sk_prot is meaningless and will cause traffic disruption. 2. When the server has MPTCP enabled but the client sends a TCP SYN without MPTCP, subflow_syn_recv_sock() performs a fallback on the subflow, replacing the subflow sk's sk_prot with the native sk_prot. ''' subflow_ulp_fallback() subflow_drop_ctx() mptcp_subflow_ops_undo_override() ''' Subsequently, accept::mptcp_stream_accept::mptcp_fallback_tcp_ops() converts the subflow to plain TCP. For the first case, we should prevent it from being combined with sockmap by setting sk_prot->psock_update_sk_prot to NULL, which will be blocked by sockmap's own flow. For the second case, since subflow_syn_recv_sock() has already restored sk_prot to native tcp_prot/tcpv6_prot, no further action is needed. 
Fixes: cec37a6e41aa ("mptcp: Handle MP_CAPABLE options for outgoing connections") Signed-off-by: Jiayuan Chen Signed-off-by: Martin KaFai Lau Reviewed-by: Matthieu Baerts (NGI0) Cc: Link: https://patch.msgid.link/20251111060307.194196-2-jiayuan.chen@linux.dev --- net/mptcp/subflow.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index e8325890a322..af707ce0f624 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -2144,6 +2144,10 @@ void __init mptcp_subflow_init(void) tcp_prot_override = tcp_prot; tcp_prot_override.release_cb = tcp_release_cb_override; tcp_prot_override.diag_destroy = tcp_abort_override; +#ifdef CONFIG_BPF_SYSCALL + /* Disable sockmap processing for subflows */ + tcp_prot_override.psock_update_sk_prot = NULL; +#endif #if IS_ENABLED(CONFIG_MPTCP_IPV6) /* In struct mptcp_subflow_request_sock, we assume the TCP request sock @@ -2180,6 +2184,10 @@ void __init mptcp_subflow_init(void) tcpv6_prot_override = tcpv6_prot; tcpv6_prot_override.release_cb = tcp_release_cb_override; tcpv6_prot_override.diag_destroy = tcp_abort_override; +#ifdef CONFIG_BPF_SYSCALL + /* Disable sockmap processing for subflows */ + tcpv6_prot_override.psock_update_sk_prot = NULL; +#endif #endif mptcp_diag_subflow_init(&subflow_ulp_ops); From a257e974210320ede524f340ffe16bf4bf0dda1e Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 13 Nov 2025 19:43:55 +0800 Subject: [PATCH 377/543] sched_ext: Fix possible deadlock in the deferred_irq_workfn() For PREEMPT_RT=y kernels, deferred_irq_workfn() is executed in the per-cpu irq_work/* task context with interrupts not disabled. If the rq returned by container_of() is the current CPU's rq, the following scenario may occur: lock(&rq->__lock); lock(&rq->__lock); This commit uses IRQ_WORK_INIT_HARD() instead of init_irq_work() to initialize rq->scx.deferred_irq_work, so that deferred_irq_workfn() is always invoked in hard-irq context.
Signed-off-by: Zqiang Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 03d05ea20006..07399210ac2d 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5321,7 +5321,7 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); - init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); + rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); if (cpu_online(cpu)) From cbcff934fa7deb670d9545a3aad4d07e8f1e4f3c Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Tue, 11 Nov 2025 21:53:31 +0900 Subject: [PATCH 378/543] mm/slub: fix memory leak in free_to_pcs_bulk() The commit 989b09b73978 ("slab: skip percpu sheaves for remote object freeing") introduced the remote_objects array in free_to_pcs_bulk() to skip sheaves when objects from a remote node are freed. However, the array is flushed only when: 1) the array becomes full (++remote_nr >= PCS_BATCH_MAX), or 2) slab_free_hook() returns false and size becomes zero. When neither of the conditions is met, objects in the array are leaked. This resulted in a memory leak [1], where 82 GiB of memory was allocated for the maple_node cache. Flush the array after successfully freeing objects to sheaves in the do_free: path. In the meantime, move the snippet if (!size) goto flush_remote; outside the while loop for readability. Let's say all objects in the array are from a remote node: then we acquire s->cpu_sheaves->lock and try to free an object even when size is zero. This doesn't appear to be harmful, but isn't really readable. 
Reported-by: Tytus Rogalewski Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220765 [1] Closes: https://lore.kernel.org/linux-mm/20251107094809.12e9d705b7bf4815783eb184@linux-foundation.org Closes: https://lore.kernel.org/all/aRGDTwbt2EIz2CYn@hyeyoo Fixes: 989b09b73978 ("slab: skip percpu sheaves for remote object freeing") Signed-off-by: Harry Yoo Link: https://patch.msgid.link/20251111125331.12246-1-harry.yoo@oracle.com Acked-by: Liam R. Howlett Tested-by: Darrick J. Wong Tested-by: Tytus Rogalewski Signed-off-by: Vlastimil Babka --- mm/slub.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index f1a5373eee7b..a787687a0d59 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6332,8 +6332,6 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) if (unlikely(!slab_free_hook(s, p[i], init, false))) { p[i] = p[--size]; - if (!size) - goto flush_remote; continue; } @@ -6348,6 +6346,9 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) i++; } + if (!size) + goto flush_remote; + next_batch: if (!local_trylock(&s->cpu_sheaves->lock)) goto fallback; @@ -6402,6 +6403,9 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) goto next_batch; } + if (remote_nr) + goto flush_remote; + return; no_empty: From 85c894a80ac46aa177df04e0a33bcad409b7d64f Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Fri, 7 Nov 2025 11:31:50 -0600 Subject: [PATCH 379/543] perf header: Write bpf_prog (infos|btfs)_cnt to data file With commit f0d0f978f3f5830a ("perf header: Don't write empty BPF/BTF info"), the write_bpf_( prog_info() | btf() ) functions exit without writing anything if env->bpf_prog.(infos| btfs)_cnt is zero. process_bpf_( prog_info() | btf() ), however, still expect a "count" value to exist in the data file. If btf information is empty, for example, process_bpf_btf will read garbage or some other data as the number of btf nodes in the data file. 
As a result, the data file will not be processed correctly. Instead, write the count to the data file and exit if it is zero. Fixes: f0d0f978f3f5830a ("perf header: Don't write empty BPF/BTF info") Reviewed-by: Ian Rogers Signed-off-by: Thomas Falcon Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 4f2a6e10ed5c..4e12be579140 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1022,12 +1022,9 @@ static int write_bpf_prog_info(struct feat_fd *ff, down_read(&env->bpf_progs.lock); - if (env->bpf_progs.infos_cnt == 0) - goto out; - ret = do_write(ff, &env->bpf_progs.infos_cnt, sizeof(env->bpf_progs.infos_cnt)); - if (ret < 0) + if (ret < 0 || env->bpf_progs.infos_cnt == 0) goto out; root = &env->bpf_progs.infos; @@ -1067,13 +1064,10 @@ static int write_bpf_btf(struct feat_fd *ff, down_read(&env->bpf_progs.lock); - if (env->bpf_progs.btfs_cnt == 0) - goto out; - ret = do_write(ff, &env->bpf_progs.btfs_cnt, sizeof(env->bpf_progs.btfs_cnt)); - if (ret < 0) + if (ret < 0 || env->bpf_progs.btfs_cnt == 0) goto out; root = &env->bpf_progs.btfs; From a09e5967ad6819379fd31894634d7aed29c18409 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Nov 2025 21:57:08 -0300 Subject: [PATCH 380/543] perf build: Don't fail fast path feature detection when binutils-devel is not available This is one more remnant of the BUILD_NONDISTRO series to make building with binutils-devel opt-in due to license incompatibility. In this case just the references at link time were still in place, which makes building the test-all.bin file fail, which wasn't detected before probably because the last test was done with binutils-devel available, doh.
Now: $ rpm -q binutils-devel package binutils-devel is not installed $ file /tmp/build/perf-tools/feature/test-all.bin /tmp/build/perf-tools/feature/test-all.bin: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=4b5388a346b51f1b993f0b0dbd49f4570769b03c, for GNU/Linux 3.2.0, not stripped $ Fixes: 970ae86307718c34 ("perf build: The bfd features are opt-in, stop testing for them by default") Reviewed-by: Ian Rogers Cc: Adrian Hunter Cc: James Clark Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/feature/Makefile | 4 ++-- tools/perf/Makefile.config | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 49b0add392b1..95646290cb89 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -107,7 +107,7 @@ all: $(FILES) __BUILD = $(CC) $(CFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(LDFLAGS) BUILD = $(__BUILD) > $(@:.bin=.make.output) 2>&1 BUILD_BFD = $(BUILD) -DPACKAGE='"perf"' -lbfd -ldl - BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma -lzstd + BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -ldl -lz -llzma -lzstd __BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(@F)) $(LDFLAGS) BUILDXX = $(__BUILDXX) > $(@:.bin=.make.output) 2>&1 @@ -115,7 +115,7 @@ __BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$( ############################### $(OUTPUT)test-all.bin: - $(BUILD_ALL) || $(BUILD_ALL) -lopcodes -liberty + $(BUILD_ALL) $(OUTPUT)test-hello.bin: $(BUILD) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 5700516aa84a..2dd5f5a60568 
100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -354,9 +354,6 @@ FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS) FEATURE_CHECK_LDFLAGS-libaio = -lrt -FEATURE_CHECK_LDFLAGS-disassembler-four-args = -lbfd -lopcodes -ldl -FEATURE_CHECK_LDFLAGS-disassembler-init-styled = -lbfd -lopcodes -ldl CORE_CFLAGS += -fno-omit-frame-pointer CORE_CFLAGS += -Wall CORE_CFLAGS += -Wextra @@ -930,6 +927,8 @@ ifdef BUILD_NONDISTRO ifeq ($(feature-libbfd), 1) EXTLIBS += -lbfd -lopcodes + FEATURE_CHECK_LDFLAGS-disassembler-four-args = -lbfd -lopcodes -ldl + FEATURE_CHECK_LDFLAGS-disassembler-init-styled = -lbfd -lopcodes -ldl else # we are on a system that requires -liberty and (maybe) -lz # to link against -lbfd; test each case individually here From 84003ab3d0ca3717e4b36071c3c5f8b3c70e317c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 13 Nov 2025 15:03:07 -0300 Subject: [PATCH 381/543] tools headers UAPI: Sync KVM's vmx.h with the kernel to pick SEAMCALL exit reason To pick the changes in: 9d7dfb95da2cb5c1 ("KVM: VMX: Inject #UD if guest tries to execute SEAMCALL or TDCALL") The 'perf kvm-stat' tool uses the exit reasons that are included in the VMX_EXIT_REASONS define, this new SEAMCALL isn't included there (TDCALL is), so it shouldn't cause any change in behaviour; this patch just addresses the following perf build warning: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/uapi/asm/vmx.h arch/x86/include/uapi/asm/vmx.h Please see tools/include/uapi/README for further details.
Cc: Sean Christopherson Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/uapi/asm/vmx.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h index 9792e329343e..1baa86dfe029 100644 --- a/tools/arch/x86/include/uapi/asm/vmx.h +++ b/tools/arch/x86/include/uapi/asm/vmx.h @@ -93,6 +93,7 @@ #define EXIT_REASON_TPAUSE 68 #define EXIT_REASON_BUS_LOCK 74 #define EXIT_REASON_NOTIFY 75 +#define EXIT_REASON_SEAMCALL 76 #define EXIT_REASON_TDCALL 77 #define EXIT_REASON_MSR_READ_IMM 84 #define EXIT_REASON_MSR_WRITE_IMM 85 From d0206db94b36c998c11458cfdae2f45ba20bc4fb Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 13 Nov 2025 16:01:23 +0000 Subject: [PATCH 382/543] perf lock: Fix segfault due to missing kernel map Kernel maps are encoded in PERF_RECORD_MMAP2 samples but "perf lock report" and "perf lock contention" do not process MMAP2 samples. Because of that, machine->vmlinux_map stays NULL and any later access triggers a segmentation fault. Fix it by adding ->mmap2() callbacks. 
Fixes: 53b00ff358dc75b1 ("perf record: Make --buildid-mmap the default") Reported-by: Tycho Andersen (AMD) Reviewed-by: Ian Rogers Signed-off-by: Ravi Bangoria Tested-by: Tycho Andersen (AMD) Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sandipan Das Cc: Santosh Shukla Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-lock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 078634461df2..e8962c985d34 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -1867,6 +1867,7 @@ static int __cmd_report(bool display_info) eops.sample = process_sample_event; eops.comm = perf_event__process_comm; eops.mmap = perf_event__process_mmap; + eops.mmap2 = perf_event__process_mmap2; eops.namespaces = perf_event__process_namespaces; eops.tracing_data = perf_event__process_tracing_data; session = perf_session__new(&data, &eops); @@ -2023,6 +2024,7 @@ static int __cmd_contention(int argc, const char **argv) eops.sample = process_sample_event; eops.comm = perf_event__process_comm; eops.mmap = perf_event__process_mmap; + eops.mmap2 = perf_event__process_mmap2; eops.tracing_data = perf_event__process_tracing_data; perf_env__init(&host_env); From 3c723f449723db2dc2b75b7efe03c2a76e4c09f0 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 13 Nov 2025 16:01:24 +0000 Subject: [PATCH 383/543] perf test: Fix lock contention test Couple of independent fixes: 1. Wire in SIGSEGV handler that terminates the test with a failure code. 2. Use "--lock-cgroup" instead of "-g"; "-g" was proposed but never merged. See commit 4d1792d0a2564caf ("perf lock contention: Add --lock-cgroup option") 3. Call cleanup() on every normal exit so trap_cleanup() doesn't mistake it for an unexpected signal and emit a false-negative "Unexpected signal in main" message. 
Before patch: # ./perf test -vv "lock contention" 85: kernel lock contention analysis test: --- start --- test child forked, pid 610711 Testing perf lock record and perf lock contention Testing perf lock contention --use-bpf Testing perf lock record and perf lock contention at the same time Testing perf lock contention --threads Testing perf lock contention --lock-addr Testing perf lock contention --lock-cgroup Unexpected signal in test_aggr_cgroup ---- end(0) ---- 85: kernel lock contention analysis test : Ok After patch: # ./perf test -vv "lock contention" 85: kernel lock contention analysis test: --- start --- test child forked, pid 602637 Testing perf lock record and perf lock contention Testing perf lock contention --use-bpf Testing perf lock record and perf lock contention at the same time Testing perf lock contention --threads Testing perf lock contention --lock-addr Testing perf lock contention --lock-cgroup Testing perf lock contention --type-filter (w/ spinlock) Testing perf lock contention --lock-filter (w/ tasklist_lock) Testing perf lock contention --callstack-filter (w/ unix_stream) [Skip] Could not find 'unix_stream' Testing perf lock contention --callstack-filter with task aggregation [Skip] Could not find 'unix_stream' Testing perf lock contention --cgroup-filter Testing perf lock contention CSV output ---- end(0) ---- 85: kernel lock contention analysis test : Ok Reviewed-by: Ian Rogers Signed-off-by: Ravi Bangoria Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sandipan Das Cc: Santosh Shukla Cc: Tycho Andersen Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/lock_contention.sh | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/perf/tests/shell/lock_contention.sh b/tools/perf/tests/shell/lock_contention.sh index 7248a74ca2a3..6dd90519f45c 100755 --- 
a/tools/perf/tests/shell/lock_contention.sh +++ b/tools/perf/tests/shell/lock_contention.sh @@ -13,15 +13,18 @@ cleanup() { rm -f ${perfdata} rm -f ${result} rm -f ${errout} - trap - EXIT TERM INT + trap - EXIT TERM INT ERR } trap_cleanup() { + if (( $? == 139 )); then #SIGSEGV + err=1 + fi echo "Unexpected signal in ${FUNCNAME[1]}" cleanup exit ${err} } -trap trap_cleanup EXIT TERM INT +trap trap_cleanup EXIT TERM INT ERR check() { if [ "$(id -u)" != 0 ]; then @@ -145,7 +148,7 @@ test_aggr_cgroup() fi # the perf lock contention output goes to the stderr - perf lock con -a -b -g -E 1 -q -- perf bench sched messaging -p > /dev/null 2> ${result} + perf lock con -a -b --lock-cgroup -E 1 -q -- perf bench sched messaging -p > /dev/null 2> ${result} if [ "$(cat "${result}" | wc -l)" != "1" ]; then echo "[Fail] BPF result count is not 1:" "$(cat "${result}" | wc -l)" err=1 @@ -271,7 +274,7 @@ test_cgroup_filter() return fi - perf lock con -a -b -g -E 1 -F wait_total -q -- perf bench sched messaging -p > /dev/null 2> ${result} + perf lock con -a -b --lock-cgroup -E 1 -F wait_total -q -- perf bench sched messaging -p > /dev/null 2> ${result} if [ "$(cat "${result}" | wc -l)" != "1" ]; then echo "[Fail] BPF result should have a cgroup result:" "$(cat "${result}")" err=1 @@ -279,7 +282,7 @@ test_cgroup_filter() fi cgroup=$(cat "${result}" | awk '{ print $3 }') - perf lock con -a -b -g -E 1 -G "${cgroup}" -q -- perf bench sched messaging -p > /dev/null 2> ${result} + perf lock con -a -b --lock-cgroup -E 1 -G "${cgroup}" -q -- perf bench sched messaging -p > /dev/null 2> ${result} if [ "$(cat "${result}" | wc -l)" != "1" ]; then echo "[Fail] BPF result should have a result with cgroup filter:" "$(cat "${cgroup}")" err=1 @@ -338,4 +341,5 @@ test_aggr_task_stack_filter test_cgroup_filter test_csv_output +cleanup exit ${err} From b72b8132d8fd2d6bf5b420a03d4fc553980c3a92 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 11 Nov 2025 23:43:11 -0800 Subject: [PATCH 384/543] perf 
libbfd: Ensure libbfd is initialized prior to use Multiple threads may be creating and destroying BFD objects in situations like `perf top`. Without appropriate initialization crashes may occur during libbfd's cache management. BFD's locks require recursive mutexes, add support for these. Committer testing: This happens only when building with 'make BUILD_NONDISTRO=1' and having the binutils-devel package (or equivalent) installed, i.e. linking with binutils devel files, an opt-in perf build. Before: root@x1:~# perf top perf: Segmentation fault -------- backtrace -------- root@x1:~# After this patch it works as before. Closes: https://lore.kernel.org/lkml/aQt66zhfxSA80xwt@gentoo.org/ Fixes: 95931d9a594dd0b5 ("perf libbfd: Move libbfd functionality to its own file") Reported-by: Guilherme Amadio Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/libbfd.c | 38 ++++++++++++++++++++++++++++++++++++++ tools/perf/util/mutex.c | 14 ++++++++++---- tools/perf/util/mutex.h | 2 ++ 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/libbfd.c b/tools/perf/util/libbfd.c index 01147fbf73b3..6434c2dccd4a 100644 --- a/tools/perf/util/libbfd.c +++ b/tools/perf/util/libbfd.c @@ -38,6 +38,39 @@ struct a2l_data { asymbol **syms; }; +static bool perf_bfd_lock(void *bfd_mutex) +{ + mutex_lock(bfd_mutex); + return true; +} + +static bool perf_bfd_unlock(void *bfd_mutex) +{ + mutex_unlock(bfd_mutex); + return true; +} + +static void perf_bfd_init(void) +{ + static struct mutex bfd_mutex; + + mutex_init_recursive(&bfd_mutex); + + if (bfd_init() != BFD_INIT_MAGIC) { + pr_err("Error initializing libbfd\n"); + return; + } + if (!bfd_thread_init(perf_bfd_lock, perf_bfd_unlock, &bfd_mutex)) + pr_err("Error initializing libbfd threading\n"); +} + +static void ensure_bfd_init(void) +{ + static 
pthread_once_t bfd_init_once = PTHREAD_ONCE_INIT; + + pthread_once(&bfd_init_once, perf_bfd_init); +} + static int bfd_error(const char *string) { const char *errmsg; @@ -132,6 +165,7 @@ static struct a2l_data *addr2line_init(const char *path) bfd *abfd; struct a2l_data *a2l = NULL; + ensure_bfd_init(); abfd = bfd_openr(path, NULL); if (abfd == NULL) return NULL; @@ -288,6 +322,7 @@ int dso__load_bfd_symbols(struct dso *dso, const char *debugfile) bfd *abfd; u64 start, len; + ensure_bfd_init(); abfd = bfd_openr(debugfile, NULL); if (!abfd) return -1; @@ -393,6 +428,7 @@ int libbfd__read_build_id(const char *filename, struct build_id *bid, bool block if (fd < 0) return -1; + ensure_bfd_init(); abfd = bfd_fdopenr(filename, /*target=*/NULL, fd); if (!abfd) return -1; @@ -421,6 +457,7 @@ int libbfd_filename__read_debuglink(const char *filename, char *debuglink, asection *section; bfd *abfd; + ensure_bfd_init(); abfd = bfd_openr(filename, NULL); if (!abfd) return -1; @@ -480,6 +517,7 @@ int symbol__disassemble_bpf_libbfd(struct symbol *sym __maybe_unused, memset(tpath, 0, sizeof(tpath)); perf_exe(tpath, sizeof(tpath)); + ensure_bfd_init(); bfdf = bfd_openr(tpath, NULL); if (bfdf == NULL) abort(); diff --git a/tools/perf/util/mutex.c b/tools/perf/util/mutex.c index bca7f0717f35..7aa1f3f55a7d 100644 --- a/tools/perf/util/mutex.c +++ b/tools/perf/util/mutex.c @@ -17,7 +17,7 @@ static void check_err(const char *fn, int err) #define CHECK_ERR(err) check_err(__func__, err) -static void __mutex_init(struct mutex *mtx, bool pshared) +static void __mutex_init(struct mutex *mtx, bool pshared, bool recursive) { pthread_mutexattr_t attr; @@ -27,21 +27,27 @@ static void __mutex_init(struct mutex *mtx, bool pshared) /* In normal builds enable error checking, such as recursive usage. 
*/ CHECK_ERR(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK)); #endif + if (recursive) + CHECK_ERR(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)); if (pshared) CHECK_ERR(pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED)); - CHECK_ERR(pthread_mutex_init(&mtx->lock, &attr)); CHECK_ERR(pthread_mutexattr_destroy(&attr)); } void mutex_init(struct mutex *mtx) { - __mutex_init(mtx, /*pshared=*/false); + __mutex_init(mtx, /*pshared=*/false, /*recursive=*/false); } void mutex_init_pshared(struct mutex *mtx) { - __mutex_init(mtx, /*pshared=*/true); + __mutex_init(mtx, /*pshared=*/true, /*recursive=*/false); +} + +void mutex_init_recursive(struct mutex *mtx) +{ + __mutex_init(mtx, /*pshared=*/false, /*recursive=*/true); } void mutex_destroy(struct mutex *mtx) diff --git a/tools/perf/util/mutex.h b/tools/perf/util/mutex.h index 38458f00846f..70232d8d094f 100644 --- a/tools/perf/util/mutex.h +++ b/tools/perf/util/mutex.h @@ -104,6 +104,8 @@ void mutex_init(struct mutex *mtx); * process-private attribute. */ void mutex_init_pshared(struct mutex *mtx); +/* Initializes a mutex that may be recursively held on the same thread. */ +void mutex_init_recursive(struct mutex *mtx); void mutex_destroy(struct mutex *mtx); void mutex_lock(struct mutex *mtx) EXCLUSIVE_LOCK_FUNCTION(*mtx); From c77b3b79a92e3345aa1ee296180d1af4e7031f8f Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Tue, 11 Nov 2025 14:02:51 +0800 Subject: [PATCH 385/543] mptcp: Fix proto fallback detection with BPF The sockmap feature allows bpf syscall from userspace, or based on bpf sockops, replacing the sk_prot of sockets during protocol stack processing with sockmap's custom read/write interfaces. 
''' tcp_rcv_state_process() syn_recv_sock()/subflow_syn_recv_sock() tcp_init_transfer(BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB) bpf_skops_established <== sockops bpf_sock_map_update(sk) <== call bpf helper tcp_bpf_update_proto() <== update sk_prot ''' When the server has MPTCP enabled but the client sends a TCP SYN without MPTCP, subflow_syn_recv_sock() performs a fallback on the subflow, replacing the subflow sk's sk_prot with the native sk_prot. ''' subflow_syn_recv_sock() subflow_ulp_fallback() subflow_drop_ctx() mptcp_subflow_ops_undo_override() ''' Then, this subflow can be normally used by sockmap, which replaces the native sk_prot with sockmap's custom sk_prot. The issue occurs when the user executes accept::mptcp_stream_accept::mptcp_fallback_tcp_ops(). Here, it uses sk->sk_prot to compare with the native sk_prot, but this is incorrect when sockmap is used, as we may incorrectly set sk->sk_socket->ops. This fix uses the more generic sk_family for the comparison instead. Additionally, this also prevents a WARNING from occurring: result from ./scripts/decode_stacktrace.sh: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 337 at net/mptcp/protocol.c:68 mptcp_stream_accept \ (net/mptcp/protocol.c:4005) Modules linked in: ... 
PKRU: 55555554 Call Trace: do_accept (net/socket.c:1989) __sys_accept4 (net/socket.c:2028 net/socket.c:2057) __x64_sys_accept (net/socket.c:2067) x64_sys_call (arch/x86/entry/syscall_64.c:41) do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) RIP: 0033:0x7f87ac92b83d ---[ end trace 0000000000000000 ]--- Fixes: 0b4f33def7bb ("mptcp: fix tcp fallback crash") Signed-off-by: Jiayuan Chen Signed-off-by: Martin KaFai Lau Reviewed-by: Jakub Sitnicki Reviewed-by: Matthieu Baerts (NGI0) Cc: Link: https://patch.msgid.link/20251111060307.194196-3-jiayuan.chen@linux.dev --- net/mptcp/protocol.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2d6b8de35c44..90b4aeca2596 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -61,11 +61,13 @@ static u64 mptcp_wnd_end(const struct mptcp_sock *msk) static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk) { + unsigned short family = READ_ONCE(sk->sk_family); + #if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (sk->sk_prot == &tcpv6_prot) + if (family == AF_INET6) return &inet6_stream_ops; #endif - WARN_ON_ONCE(sk->sk_prot != &tcp_prot); + WARN_ON_ONCE(family != AF_INET); return &inet_stream_ops; } From cb730e4ac1b4dca09d364fd83464ebd29547a4ef Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Tue, 11 Nov 2025 14:02:52 +0800 Subject: [PATCH 386/543] selftests/bpf: Add mptcp test with sockmap Add test cases to verify that when MPTCP falls back to plain TCP sockets, they can properly work with sockmap. Additionally, add test cases to ensure that sockmap correctly rejects MPTCP sockets as expected. 
Signed-off-by: Jiayuan Chen Signed-off-by: Martin KaFai Lau Acked-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251111060307.194196-4-jiayuan.chen@linux.dev --- .../testing/selftests/bpf/prog_tests/mptcp.c | 140 ++++++++++++++++++ .../selftests/bpf/progs/mptcp_sockmap.c | 43 ++++++ 2 files changed, 183 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/mptcp_sockmap.c diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c index f8eb7f9d4fd2..8fade8bdc451 100644 --- a/tools/testing/selftests/bpf/prog_tests/mptcp.c +++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c @@ -6,11 +6,13 @@ #include #include #include +#include #include "cgroup_helpers.h" #include "network_helpers.h" #include "mptcp_sock.skel.h" #include "mptcpify.skel.h" #include "mptcp_subflow.skel.h" +#include "mptcp_sockmap.skel.h" #define NS_TEST "mptcp_ns" #define ADDR_1 "10.0.1.1" @@ -436,6 +438,142 @@ static void test_subflow(void) close(cgroup_fd); } +/* Test sockmap on MPTCP server handling non-mp-capable clients. 
*/ +static void test_sockmap_with_mptcp_fallback(struct mptcp_sockmap *skel) +{ + int listen_fd = -1, client_fd1 = -1, client_fd2 = -1; + int server_fd1 = -1, server_fd2 = -1, sent, recvd; + char snd[9] = "123456789"; + char rcv[10]; + + /* start server with MPTCP enabled */ + listen_fd = start_mptcp_server(AF_INET, NULL, 0, 0); + if (!ASSERT_OK_FD(listen_fd, "sockmap-fb:start_mptcp_server")) + return; + + skel->bss->trace_port = ntohs(get_socket_local_port(listen_fd)); + skel->bss->sk_index = 0; + /* create client without MPTCP enabled */ + client_fd1 = connect_to_fd_opts(listen_fd, NULL); + if (!ASSERT_OK_FD(client_fd1, "sockmap-fb:connect_to_fd")) + goto end; + + server_fd1 = accept(listen_fd, NULL, 0); + skel->bss->sk_index = 1; + client_fd2 = connect_to_fd_opts(listen_fd, NULL); + if (!ASSERT_OK_FD(client_fd2, "sockmap-fb:connect_to_fd")) + goto end; + + server_fd2 = accept(listen_fd, NULL, 0); + /* test normal redirect behavior: data sent by client_fd1 can be + * received by client_fd2 + */ + skel->bss->redirect_idx = 1; + sent = send(client_fd1, snd, sizeof(snd), 0); + if (!ASSERT_EQ(sent, sizeof(snd), "sockmap-fb:send(client_fd1)")) + goto end; + + /* try to recv more bytes to avoid truncation check */ + recvd = recv(client_fd2, rcv, sizeof(rcv), 0); + if (!ASSERT_EQ(recvd, sizeof(snd), "sockmap-fb:recv(client_fd2)")) + goto end; + +end: + if (client_fd1 >= 0) + close(client_fd1); + if (client_fd2 >= 0) + close(client_fd2); + if (server_fd1 >= 0) + close(server_fd1); + if (server_fd2 >= 0) + close(server_fd2); + close(listen_fd); +} + +/* Test sockmap rejection of MPTCP sockets - both server and client sides. 
*/ +static void test_sockmap_reject_mptcp(struct mptcp_sockmap *skel) +{ + int listen_fd = -1, server_fd = -1, client_fd1 = -1; + int err, zero = 0; + + /* start server with MPTCP enabled */ + listen_fd = start_mptcp_server(AF_INET, NULL, 0, 0); + if (!ASSERT_OK_FD(listen_fd, "start_mptcp_server")) + return; + + skel->bss->trace_port = ntohs(get_socket_local_port(listen_fd)); + skel->bss->sk_index = 0; + /* create client with MPTCP enabled */ + client_fd1 = connect_to_fd(listen_fd, 0); + if (!ASSERT_OK_FD(client_fd1, "connect_to_fd client_fd1")) + goto end; + + /* bpf_sock_map_update() called from sockops should reject MPTCP sk */ + if (!ASSERT_EQ(skel->bss->helper_ret, -EOPNOTSUPP, "should reject")) + goto end; + + server_fd = accept(listen_fd, NULL, 0); + err = bpf_map_update_elem(bpf_map__fd(skel->maps.sock_map), + &zero, &server_fd, BPF_NOEXIST); + if (!ASSERT_EQ(err, -EOPNOTSUPP, "server should be disallowed")) + goto end; + + /* MPTCP client should also be disallowed */ + err = bpf_map_update_elem(bpf_map__fd(skel->maps.sock_map), + &zero, &client_fd1, BPF_NOEXIST); + if (!ASSERT_EQ(err, -EOPNOTSUPP, "client should be disallowed")) + goto end; +end: + if (client_fd1 >= 0) + close(client_fd1); + if (server_fd >= 0) + close(server_fd); + close(listen_fd); +} + +static void test_mptcp_sockmap(void) +{ + struct mptcp_sockmap *skel; + struct netns_obj *netns; + int cgroup_fd, err; + + cgroup_fd = test__join_cgroup("/mptcp_sockmap"); + if (!ASSERT_OK_FD(cgroup_fd, "join_cgroup: mptcp_sockmap")) + return; + + skel = mptcp_sockmap__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_load: mptcp_sockmap")) + goto close_cgroup; + + skel->links.mptcp_sockmap_inject = + bpf_program__attach_cgroup(skel->progs.mptcp_sockmap_inject, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.mptcp_sockmap_inject, "attach sockmap")) + goto skel_destroy; + + err = bpf_prog_attach(bpf_program__fd(skel->progs.mptcp_sockmap_redirect), + bpf_map__fd(skel->maps.sock_map), + 
BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach stream verdict")) + goto skel_destroy; + + netns = netns_new(NS_TEST, true); + if (!ASSERT_OK_PTR(netns, "netns_new: mptcp_sockmap")) + goto skel_destroy; + + if (endpoint_init("subflow") < 0) + goto close_netns; + + test_sockmap_with_mptcp_fallback(skel); + test_sockmap_reject_mptcp(skel); + +close_netns: + netns_free(netns); +skel_destroy: + mptcp_sockmap__destroy(skel); +close_cgroup: + close(cgroup_fd); +} + void test_mptcp(void) { if (test__start_subtest("base")) @@ -444,4 +582,6 @@ void test_mptcp(void) test_mptcpify(); if (test__start_subtest("subflow")) test_subflow(); + if (test__start_subtest("sockmap")) + test_mptcp_sockmap(); } diff --git a/tools/testing/selftests/bpf/progs/mptcp_sockmap.c b/tools/testing/selftests/bpf/progs/mptcp_sockmap.c new file mode 100644 index 000000000000..d4eef0cbadb9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/mptcp_sockmap.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bpf_tracing_net.h" + +char _license[] SEC("license") = "GPL"; + +int sk_index; +int redirect_idx; +int trace_port; +int helper_ret; +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); + __uint(max_entries, 100); +} sock_map SEC(".maps"); + +SEC("sockops") +int mptcp_sockmap_inject(struct bpf_sock_ops *skops) +{ + struct bpf_sock *sk; + + /* only accept specified connection */ + if (skops->local_port != trace_port || + skops->op != BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB) + return 1; + + sk = skops->sk; + if (!sk) + return 1; + + /* update sk handler */ + helper_ret = bpf_sock_map_update(skops, &sock_map, &sk_index, BPF_NOEXIST); + + return 1; +} + +SEC("sk_skb/stream_verdict") +int mptcp_sockmap_redirect(struct __sk_buff *skb) +{ + /* redirect skb to the sk under sock_map[redirect_idx] */ + return bpf_sk_redirect_map(skb, &sock_map, redirect_idx, 0); +} From 264152a97edf9f1b7ed5372e4033e46108e41422 
Mon Sep 17 00:00:00 2001 From: Chukun Pan Date: Sat, 1 Nov 2025 22:01:01 +0800 Subject: [PATCH 387/543] arm64: dts: rockchip: drop reset from rk3576 i2c9 node The reset property is not part of the binding, so drop it. It is also not used by the driver, so it was likely copied from some vendor-kernel node. Fixes: 57b1ce903966 ("arm64: dts: rockchip: Add rk3576 SoC base DT") Signed-off-by: Chukun Pan Link: https://patch.msgid.link/20251101140101.302229-1-amadeus@jmu.edu.cn Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3576.dtsi | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3576.dtsi b/arch/arm64/boot/dts/rockchip/rk3576.dtsi index f0c3ab00a7f3..a86fc6b4e8c4 100644 --- a/arch/arm64/boot/dts/rockchip/rk3576.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3576.dtsi @@ -2549,8 +2549,6 @@ i2c9: i2c@2ae80000 { interrupts = ; pinctrl-names = "default"; pinctrl-0 = <&i2c9m0_xfer>; - resets = <&cru SRST_I2C9>, <&cru SRST_P_I2C9>; - reset-names = "i2c", "apb"; #address-cells = <1>; #size-cells = <0>; status = "disabled"; From baa18d577cd445145039e731d3de0fa49ca57204 Mon Sep 17 00:00:00 2001 From: Quentin Schulz Date: Wed, 12 Nov 2025 16:01:53 +0100 Subject: [PATCH 388/543] arm64: dts: rockchip: disable HS400 on RK3588 Tiger We've had reports from the field that some RK3588 Tiger have random issues with eMMC errors. Applying commit a28352cf2d2f ("mmc: sdhci-of-dwcmshc: Change DLL_STRBIN_TAPNUM_DEFAULT to 0x4") didn't help and seemed to have made things worse for our board. Our HW department checked the eMMC lines and reported that they are too long and don't look great so signal integrity is probably not the best. Note that not all Tigers with the same eMMC chip have errors, so the suspicion is that we're really on the edge in terms of signal integrity and only a handful devices are failing. 
Additionally, we have RK3588 Jaguars with the same eMMC chip but the layout is different and we also haven't received reports about those so far. Lowering the max-frequency to 150MHz from 200MHz instead of simply disabling HS400 was briefly tested and seems to work as well. We've disabled HS400 downstream and haven't received reports since so we'll go with that instead of lowering the max-frequency. Signed-off-by: Quentin Schulz Fixes: 6173ef24b35b ("arm64: dts: rockchip: add RK3588-Q7 (Tiger) SoM") Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20251112-tiger-hs200-v1-1-b50adac107c0@cherry.de [added Fixes tag and stable-cc from 2nd mail] Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi index b44e89e1bb15..365c1d958f2d 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi @@ -382,14 +382,12 @@ &sdhci { cap-mmc-highspeed; mmc-ddr-1_8v; mmc-hs200-1_8v; - mmc-hs400-1_8v; - mmc-hs400-enhanced-strobe; mmc-pwrseq = <&emmc_pwrseq>; no-sdio; no-sd; non-removable; pinctrl-names = "default"; - pinctrl-0 = <&emmc_bus8 &emmc_cmd &emmc_clk &emmc_data_strobe>; + pinctrl-0 = <&emmc_bus8 &emmc_cmd &emmc_clk>; vmmc-supply = <&vcc_3v3_s3>; vqmmc-supply = <&vcc_1v8_s3>; status = "okay"; From b5414520793e68d266fdd97a84989d9831156aad Mon Sep 17 00:00:00 2001 From: Mykola Kvach Date: Mon, 3 Nov 2025 12:27:40 +0200 Subject: [PATCH 389/543] arm64: dts: rockchip: fix PCIe 3.3V regulator voltage on orangepi-5 The vcc3v3_pcie20 fixed regulator powers the PCIe device-side 3.3V rail for pcie2x1l2 via vpcie3v3-supply. The DTS mistakenly set its regulator-min/max-microvolt to 1800000 (1.8 V). Correct both to 3300000 (3.3 V) to match the rail name, the PCIe/M.2 power requirement, and the actual hardware wiring on Orange Pi 5.
Fixes: b6bc755d806e ("arm64: dts: rockchip: Add Orange Pi 5") Cc: stable@vger.kernel.org Signed-off-by: Mykola Kvach Reviewed-by: Michael Riesch Link: https://patch.msgid.link/cf6e08dfdfbf1c540685d12388baab1326f95d2c.1762165324.git.xakep.amatop@gmail.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dts b/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dts index ad6d04793b0a..83b9b6645a1e 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dts @@ -14,8 +14,8 @@ vcc3v3_pcie20: regulator-vcc3v3-pcie20 { gpios = <&gpio0 RK_PC5 GPIO_ACTIVE_HIGH>; regulator-name = "vcc3v3_pcie20"; regulator-boot-on; - regulator-min-microvolt = <1800000>; - regulator-max-microvolt = <1800000>; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; startup-delay-us = <50000>; vin-supply = <&vcc5v0_sys>; }; From 7410c86fc05b1423466c1a579bcc994f87822566 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Wed, 12 Nov 2025 13:17:03 +0200 Subject: [PATCH 390/543] MAINTAINERS: Remove eth bridge website MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ethernet bridge website URL shows "This page isn’t available". 
Signed-off-by: Baruch Siach Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/0a32aaf7fa4473e7574f7327480e8fbc4fef2741.1762946223.git.baruch@tkos.co.il Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 8a4d9c0c7b8a..cffd4effa50a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9266,7 +9266,6 @@ M: Ido Schimmel L: bridge@lists.linux.dev L: netdev@vger.kernel.org S: Maintained -W: http://www.linuxfoundation.org/en/Net:Bridge F: include/linux/if_bridge.h F: include/uapi/linux/if_bridge.h F: include/linux/netfilter_bridge/ From f796a8dec9beafcc0f6f0d3478ed685a15c5e062 Mon Sep 17 00:00:00 2001 From: Jiaming Zhang Date: Wed, 12 Nov 2025 01:36:52 +0800 Subject: [PATCH 391/543] net: core: prevent NULL deref in generic_hwtstamp_ioctl_lower() The ethtool tsconfig Netlink path can trigger a null pointer dereference. A call chain such as: tsconfig_prepare_data() -> dev_get_hwtstamp_phylib() -> vlan_hwtstamp_get() -> generic_hwtstamp_get_lower() -> generic_hwtstamp_ioctl_lower() results in generic_hwtstamp_ioctl_lower() being called with kernel_cfg->ifr as NULL. The generic_hwtstamp_ioctl_lower() function does not expect a NULL ifr and dereferences it, leading to a system crash. Fix this by adding a NULL check for kernel_cfg->ifr in generic_hwtstamp_ioctl_lower(). If ifr is NULL, return -EINVAL. 
Fixes: 6e9e2eed4f39 ("net: ethtool: Add support for tsconfig command to get/set hwtstamp config") Closes: https://lore.kernel.org/cd6a7056-fa6d-43f8-b78a-f5e811247ba8@linux.dev Signed-off-by: Jiaming Zhang Reviewed-by: Kory Maincent Link: https://patch.msgid.link/20251111173652.749159-2-r772577952@gmail.com Signed-off-by: Jakub Kicinski --- net/core/dev_ioctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index ad54b12d4b4c..8bb71a10dba0 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -443,6 +443,9 @@ static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd, struct ifreq ifrr; int err; + if (!kernel_cfg->ifr) + return -EINVAL; + strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ); ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru; From 407a06507c2358554958e8164dc97176feddcafc Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Wed, 12 Nov 2025 05:21:14 +0000 Subject: [PATCH 392/543] mlxsw: spectrum: Fix memory leak in mlxsw_sp_flower_stats() The function mlxsw_sp_flower_stats() calls mlxsw_sp_acl_ruleset_get() to obtain a ruleset reference. If the subsequent call to mlxsw_sp_acl_rule_lookup() fails to find a rule, the function returns an error without releasing the ruleset reference, causing a memory leak. Fix this by using a goto to the existing error handling label, which calls mlxsw_sp_acl_ruleset_put() to properly release the reference. 
Fixes: 7c1b8eb175b69 ("mlxsw: spectrum: Add support for TC flower offload statistics") Signed-off-by: Zilin Guan Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20251112052114.1591695-1-zilin@seu.edu.cn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index 6a4a81c63451..353fd9ca89a6 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -830,8 +830,10 @@ int mlxsw_sp_flower_stats(struct mlxsw_sp *mlxsw_sp, return -EINVAL; rule = mlxsw_sp_acl_rule_lookup(mlxsw_sp, ruleset, f->cookie); - if (!rule) - return -EINVAL; + if (!rule) { + err = -EINVAL; + goto err_rule_get_stats; + } err = mlxsw_sp_acl_rule_get_stats(mlxsw_sp, rule, &packets, &bytes, &drops, &lastuse, &used_hw_stats); From a55ef3bff84f11ee8c84a1ae29b071ffd4ccbbd9 Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Mon, 10 Nov 2025 06:41:25 +0000 Subject: [PATCH 393/543] xfrm: fix memory leak in xfrm_add_acquire() The xfrm_add_acquire() function constructs an xfrm policy by calling xfrm_policy_construct(). This allocates the policy structure and potentially associates a security context and a device policy with it. However, at the end of the function, the policy object is freed using only kfree(). This skips the necessary cleanup for the security context and device policy, leading to a memory leak. To fix this, invoke the proper cleanup functions xfrm_dev_policy_delete(), xfrm_dev_policy_free(), and security_xfrm_policy_free() before freeing the policy object. This approach mirrors the error handling path in xfrm_add_policy(), ensuring that all associated resources are correctly released.
Fixes: 980ebd25794f ("[IPSEC]: Sync series - acquire insert") Signed-off-by: Zilin Guan Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 9d98cc9daa37..403b5ecac2c5 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3038,6 +3038,9 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, } xfrm_state_free(x); + xfrm_dev_policy_delete(xp); + xfrm_dev_policy_free(xp); + security_xfrm_policy_free(xp->security); kfree(xp); return 0; From f84fd5bec502447df145f31734793714690ce27f Mon Sep 17 00:00:00 2001 From: Luke Wang Date: Fri, 14 Nov 2025 14:53:08 +0800 Subject: [PATCH 394/543] pwm: adp5585: Correct mismatched pwm chip info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The register addresses of ADP5585 and ADP5589 are swapped. Fixes: 75024f97e82e ("pwm: adp5585: add support for adp5589") Signed-off-by: Luke Wang Acked-by: Nuno Sá Tested-by: Liu Ying # ADP5585 PWM Link: https://patch.msgid.link/20251114065308.2074893-1-ziniu.wang_1@nxp.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-adp5585.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pwm/pwm-adp5585.c b/drivers/pwm/pwm-adp5585.c index dc2860979e24..806f8d79b0d7 100644 --- a/drivers/pwm/pwm-adp5585.c +++ b/drivers/pwm/pwm-adp5585.c @@ -190,13 +190,13 @@ static int adp5585_pwm_probe(struct platform_device *pdev) return 0; } -static const struct adp5585_pwm_chip adp5589_pwm_chip_info = { +static const struct adp5585_pwm_chip adp5585_pwm_chip_info = { .pwm_cfg = ADP5585_PWM_CFG, .pwm_offt_low = ADP5585_PWM_OFFT_LOW, .pwm_ont_low = ADP5585_PWM_ONT_LOW, }; -static const struct adp5585_pwm_chip adp5585_pwm_chip_info = { +static const struct adp5585_pwm_chip adp5589_pwm_chip_info = { .pwm_cfg = ADP5589_PWM_CFG, .pwm_offt_low = ADP5589_PWM_OFFT_LOW, .pwm_ont_low = 
ADP5589_PWM_ONT_LOW, From e1a97a627cd01d73fac5dd054d8f3de601ef2781 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 13 Nov 2025 16:35:50 -0600 Subject: [PATCH 395/543] x86/CPU/AMD: Add additional fixed RDSEED microcode revisions Microcode that resolves the RDSEED failure (SB-7055 [1]) has been released for additional Zen5 models to linux-firmware [2]. Update the zen5_rdseed_microcode array to cover these new models. Fixes: 607b9fb2ce24 ("x86/CPU/AMD: Add RDSEED fix for Zen5") Signed-off-by: Mario Limonciello Signed-off-by: Borislav Petkov (AMD) Cc: Link: https://www.amd.com/en/resources/product-security/bulletin/amd-sb-7055.html [1] Link: https://gitlab.com/kernel-firmware/linux-firmware/-/commit/6167e5566900cf236f7a69704e8f4c441bc7212a [2] Link: https://patch.msgid.link/20251113223608.1495655-1-mario.limonciello@amd.com --- arch/x86/kernel/cpu/amd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 2ba9f2d42d8c..5d46709c58d0 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1037,7 +1037,14 @@ static void init_amd_zen4(struct cpuinfo_x86 *c) static const struct x86_cpu_id zen5_rdseed_microcode[] = { ZEN_MODEL_STEP_UCODE(0x1a, 0x02, 0x1, 0x0b00215a), + ZEN_MODEL_STEP_UCODE(0x1a, 0x08, 0x1, 0x0b008121), ZEN_MODEL_STEP_UCODE(0x1a, 0x11, 0x0, 0x0b101054), + ZEN_MODEL_STEP_UCODE(0x1a, 0x24, 0x0, 0x0b204037), + ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x0, 0x0b404035), + ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x1, 0x0b404108), + ZEN_MODEL_STEP_UCODE(0x1a, 0x60, 0x0, 0x0b600037), + ZEN_MODEL_STEP_UCODE(0x1a, 0x68, 0x0, 0x0b608038), + ZEN_MODEL_STEP_UCODE(0x1a, 0x70, 0x0, 0x0b700037), {}, }; From dd14022a7ce96963aa923e35cf4bcc8c32f95840 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 14 Nov 2025 14:01:14 +0100 Subject: [PATCH 396/543] x86/microcode/AMD: Add Zen5 model 0x44, stepping 0x1 minrev Add the minimum Entrysign revision for that model+stepping to the list of 
minimum revisions. Fixes: 50cef76d5cb0 ("x86/microcode/AMD: Load only SHA256-checksummed patches") Reported-by: Andrew Cooper Signed-off-by: Borislav Petkov (AMD) Cc: Link: https://lore.kernel.org/r/e94dd76b-4911-482f-8500-5c848a3df026@citrix.com --- arch/x86/kernel/cpu/microcode/amd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index dc82153009da..a881bf4c2011 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -224,6 +224,7 @@ static bool need_sha_check(u32 cur_rev) case 0xb1010: return cur_rev <= 0xb101046; break; case 0xb2040: return cur_rev <= 0xb204031; break; case 0xb4040: return cur_rev <= 0xb404031; break; + case 0xb4041: return cur_rev <= 0xb404101; break; case 0xb6000: return cur_rev <= 0xb600031; break; case 0xb6080: return cur_rev <= 0xb608031; break; case 0xb7000: return cur_rev <= 0xb700031; break; From 21a9ab5b90b3716a631d559e62818029b4e7f5b7 Mon Sep 17 00:00:00 2001 From: Lushih Hsieh Date: Fri, 14 Nov 2025 13:20:53 +0800 Subject: [PATCH 397/543] ALSA: usb-audio: Add native DSD quirks for PureAudio DAC series The PureAudio APA DAC and Lotus DAC5 series are USB Audio 2.0 Class devices that support native Direct Stream Digital (DSD) playback via specific vendor protocols. Without these quirks, the devices may only function in standard PCM mode, or fail to correctly report their DSD format capabilities to the ALSA framework, preventing native DSD playback under Linux. This commit adds new quirk entries for the mentioned DAC models based on their respective Vendor/Product IDs (VID:PID), for example: 0x16d0:0x0ab1 (APA DAC), 0x16d0:0xeca1 (DAC5 series), etc. The quirk ensures correct DSD format handling by setting the required SNDRV_PCM_FMTBIT_DSD_U32_BE format bit and defining the DSD-specific Audio Class 2.0 (AC2.0) endpoint configurations. 
This allows the ALSA DSD API to correctly address the device for high-bitrate DSD streams, bypassing the need for DoP (DSD over PCM). Test on APA DAC and Lotus DAC5 SE under Arch Linux. Tested-by: Lushih Hsieh Signed-off-by: Lushih Hsieh Link: https://patch.msgid.link/20251114052053.54989-1-bruce@mail.kh.edu.tw Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index e5b857129caf..5e30bff69f82 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2022,6 +2022,8 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip, case USB_ID(0x16d0, 0x09d8): /* NuPrime IDA-8 */ case USB_ID(0x16d0, 0x09db): /* NuPrime Audio DAC-9 */ case USB_ID(0x16d0, 0x09dd): /* Encore mDSD */ + case USB_ID(0x16d0, 0x0ab1): /* PureAudio APA DAC */ + case USB_ID(0x16d0, 0xeca1): /* PureAudio Lotus DAC5, DAC5 SE, DAC5 Pro */ case USB_ID(0x1db5, 0x0003): /* Bryston BDA3 */ case USB_ID(0x20a0, 0x4143): /* WaveIO USB Audio 2.0 */ case USB_ID(0x22e1, 0xca01): /* HDTA Serenade DSD */ @@ -2299,6 +2301,10 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_IGNORE_CLOCK_SOURCE), DEVICE_FLG(0x1686, 0x00dd, /* Zoom R16/24 */ QUIRK_FLAG_TX_LENGTH | QUIRK_FLAG_CTL_MSG_DELAY_1M), + DEVICE_FLG(0x16d0, 0x0ab1, /* PureAudio APA DAC */ + QUIRK_FLAG_DSD_RAW), + DEVICE_FLG(0x16d0, 0xeca1, /* PureAudio Lotus DAC5, DAC5 SE and DAC5 Pro */ + QUIRK_FLAG_DSD_RAW), DEVICE_FLG(0x17aa, 0x1046, /* Lenovo ThinkStation P620 Rear Line-in, Line-out and Microphone */ QUIRK_FLAG_DISABLE_AUTOSUSPEND), DEVICE_FLG(0x17aa, 0x104d, /* Lenovo ThinkStation P620 Internal Speaker + Front Headset */ From 37339122a7801660dce11abd817af82cc4bef163 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 14 Nov 2025 23:18:50 +0900 Subject: [PATCH 398/543] firewire: core: Initialize topology_map.lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Lockdep barfs on the new uninitialized spinlock. Initialize it. protip: enable lockdep (CONFIG_PROVE_LOCKING=y) when doing locking changes firewire_ohci 0000:02:01.1: added OHCI v1.10 device as card 0, 4 IR + 4 IT contexts, quirks 0x11 INFO: trying to register non-static key. The code is fine but needs lockdep annotation, or maybe you didn't initialize this object before use? turning off the locking correctness validator. CPU: 0 UID: 0 PID: 1042 Comm: irq/17-firewire Not tainted 6.17.0-rc2-cl-bisect2-00026-g7d138cb269db #136 PREEMPT Hardware name: Dell Inc. Latitude E5400 /0D695C, BIOS A19 06/13/2013 Call Trace: dump_stack_lvl+0x6d/0xa0 register_lock_class+0x783/0x790 ? find_held_lock+0x2b/0x80 ? __mod_timer+0x110/0x320 ? __mod_timer+0x110/0x320 __lock_acquire+0x405/0x2600 lock_acquire+0xca/0x2e0 ? fw_core_handle_bus_reset+0x888/0xca0 [firewire_core] ? fw_core_handle_bus_reset+0x878/0xca0 [firewire_core] ? fw_core_handle_bus_reset+0x878/0xca0 [firewire_core] _raw_spin_lock+0x2e/0x40 ? fw_core_handle_bus_reset+0x888/0xca0 [firewire_core] fw_core_handle_bus_reset+0x888/0xca0 [firewire_core] handle_selfid_complete_event+0x35c/0x7a0 [firewire_ohci] ? irq_thread+0x8d/0x280 irq_thread_fn+0x18/0x50 irq_thread+0x15a/0x280 ? irq_check_status_bit+0x100/0x100 ? lockdep_hardirqs_on+0x78/0x100 ? irq_finalize_oneshot.part.0+0xc0/0xc0 ? irq_forced_thread_fn+0x60/0x60 kthread+0x114/0x200 ? kthreads_online_cpu+0x110/0x110 ret_from_fork+0x158/0x1e0 ? 
kthreads_online_cpu+0x110/0x110 ret_from_fork_asm+0x11/0x20 Reported-by: Erhard Furtner Fixes: 7d138cb269db ("firewire: core: use spin lock specific to topology map") Signed-off-by: Ville Syrjälä Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index e5e0174a0335..66e1106db5e7 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -577,6 +577,8 @@ void fw_card_initialize(struct fw_card *card, INIT_LIST_HEAD(&card->transactions.list); spin_lock_init(&card->transactions.lock); + spin_lock_init(&card->topology_map.lock); + card->split_timeout.hi = DEFAULT_SPLIT_TIMEOUT / 8000; card->split_timeout.lo = (DEFAULT_SPLIT_TIMEOUT % 8000) << 19; card->split_timeout.cycles = DEFAULT_SPLIT_TIMEOUT; From 31475b88110c4725b4f9a79c3a0d9bbf97e69e1c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 13 Nov 2025 13:21:47 +0100 Subject: [PATCH 399/543] s390/mm: Fix __ptep_rdp() inline assembly When a zero ASCE is passed to the __ptep_rdp() inline assembly, the generated instruction should have the R3 field of the instruction set to zero. However the inline assembly is written incorrectly: for such cases a zero is loaded into a register allocated by the compiler and this register is then used by the instruction. This means that selected TLB entries may not be flushed since the specified ASCE does not match the one which was used when the selected TLB entries were created. Fix this by removing the asce and opt parameters of __ptep_rdp(), since all callers always pass zero, and use a hard-coded register zero for the R3 field. 
Fixes: 0807b856521f ("s390/mm: add support for RDP (Reset DAT-Protection)") Cc: stable@vger.kernel.org Reviewed-by: Gerald Schaefer Signed-off-by: Heiko Carstens --- arch/s390/include/asm/pgtable.h | 12 +++++------- arch/s390/mm/pgtable.c | 4 ++-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index b7100c6a4054..6663f1619abb 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1154,17 +1154,15 @@ static inline pte_t pte_mkhuge(pte_t pte) #define IPTE_NODAT 0x400 #define IPTE_GUEST_ASCE 0x800 -static __always_inline void __ptep_rdp(unsigned long addr, pte_t *ptep, - unsigned long opt, unsigned long asce, - int local) +static __always_inline void __ptep_rdp(unsigned long addr, pte_t *ptep, int local) { unsigned long pto; pto = __pa(ptep) & ~(PTRS_PER_PTE * sizeof(pte_t) - 1); - asm volatile(".insn rrf,0xb98b0000,%[r1],%[r2],%[asce],%[m4]" + asm volatile(".insn rrf,0xb98b0000,%[r1],%[r2],%%r0,%[m4]" : "+m" (*ptep) - : [r1] "a" (pto), [r2] "a" ((addr & PAGE_MASK) | opt), - [asce] "a" (asce), [m4] "i" (local)); + : [r1] "a" (pto), [r2] "a" (addr & PAGE_MASK), + [m4] "i" (local)); } static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep, @@ -1348,7 +1346,7 @@ static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma, * A local RDP can be used to do the flush. 
*/ if (cpu_has_rdp() && !(pte_val(*ptep) & _PAGE_PROTECT)) - __ptep_rdp(address, ptep, 0, 0, 1); + __ptep_rdp(address, ptep, 1); } #define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 0fde20bbc50b..05974304d622 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -274,9 +274,9 @@ void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, preempt_disable(); atomic_inc(&mm->context.flush_count); if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __ptep_rdp(addr, ptep, 0, 0, 1); + __ptep_rdp(addr, ptep, 1); else - __ptep_rdp(addr, ptep, 0, 0, 0); + __ptep_rdp(addr, ptep, 0); /* * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That * means it is still valid and active, and must not be changed according From 14473a1f88596fd729e892782efc267c0097dd1d Mon Sep 17 00:00:00 2001 From: Nick Hu Date: Fri, 14 Nov 2025 15:28:44 +0800 Subject: [PATCH 400/543] irqchip/riscv-intc: Add missing free() callback in riscv_intc_domain_ops The irq_domain_free_irqs() helper requires that the irq_domain_ops->free callback is implemented. Otherwise, the kernel reports the warning message "NULL pointer, cannot free irq" when irq_dispose_mapping() is invoked to release the per-HART local interrupts. Set irq_domain_ops->free to irq_domain_free_irqs_top() to cure that. 
Fixes: 832f15f42646 ("RISC-V: Treat IPIs as normal Linux IRQs") Signed-off-by: Nick Hu Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251114-rv-intc-fix-v1-1-a3edd1c1a868@sifive.com --- drivers/irqchip/irq-riscv-intc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-riscv-intc.c b/drivers/irqchip/irq-riscv-intc.c index e5805885394e..70290b35b317 100644 --- a/drivers/irqchip/irq-riscv-intc.c +++ b/drivers/irqchip/irq-riscv-intc.c @@ -166,7 +166,8 @@ static int riscv_intc_domain_alloc(struct irq_domain *domain, static const struct irq_domain_ops riscv_intc_domain_ops = { .map = riscv_intc_domain_map, .xlate = irq_domain_xlate_onecell, - .alloc = riscv_intc_domain_alloc + .alloc = riscv_intc_domain_alloc, + .free = irq_domain_free_irqs_top, }; static struct fwnode_handle *riscv_intc_hwnode(void) From e0fd4d42e27f761e9cc82801b3f183e658dc749d Mon Sep 17 00:00:00 2001 From: Eslam Khafagy Date: Fri, 14 Nov 2025 14:27:39 +0200 Subject: [PATCH 401/543] posix-timers: Plug potential memory leak in do_timer_create() When posix timer creation is set to allocate a given timer ID and the access to the user space value faults, the function terminates without freeing the already allocated posix timer structure. Move the allocation after the user space access to cure that. 
[ tglx: Massaged change log ] Fixes: ec2d0c04624b3 ("posix-timers: Provide a mechanism to allocate a given timer ID") Reported-by: syzbot+9c47ad18f978d4394986@syzkaller.appspotmail.com Suggested-by: Cyrill Gorcunov Signed-off-by: Eslam Khafagy Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://patch.msgid.link/20251114122739.994326-1-eslam.medhat1993@gmail.com Closes: https://lore.kernel.org/all/69155df4.a70a0220.3124cb.0017.GAE@google.com/T/ --- kernel/time/posix-timers.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index aa3120104a51..56e17b625c72 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -475,12 +475,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, if (!kc->timer_create) return -EOPNOTSUPP; - new_timer = alloc_posix_timer(); - if (unlikely(!new_timer)) - return -EAGAIN; - - spin_lock_init(&new_timer->it_lock); - /* Special case for CRIU to restore timers with a given timer ID. */ if (unlikely(current->signal->timer_create_restore_ids)) { if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) @@ -490,6 +484,12 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, return -EINVAL; } + new_timer = alloc_posix_timer(); + if (unlikely(!new_timer)) + return -EAGAIN; + + spin_lock_init(&new_timer->it_lock); + /* * Add the timer to the hash table. The timer is not yet valid * after insertion, but has a unique ID allocated. 
From ec33b59542d96830e3c89845ff833cf7b25ef172 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 13 Nov 2025 19:54:35 +0100 Subject: [PATCH 402/543] mm/mempool: fix poisoning order>0 pages with HIGHMEM The kernel test has reported: BUG: unable to handle page fault for address: fffba000 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page *pde = 03171067 *pte = 00000000 Oops: Oops: 0002 [#1] CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Tainted: G T 6.18.0-rc2-00031-gec7f31b2a2d3 #1 NONE a1d066dfe789f54bc7645c7989957d2bdee593ca Tainted: [T]=RANDSTRUCT Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 EIP: memset (arch/x86/include/asm/string_32.h:168 arch/x86/lib/memcpy_32.c:17) Code: a5 8b 4d f4 83 e1 03 74 02 f3 a4 83 c4 04 5e 5f 5d 2e e9 73 41 01 00 90 90 90 3e 8d 74 26 00 55 89 e5 57 56 89 c6 89 d0 89 f7 aa 89 f0 5e 5f 5d 2e e9 53 41 01 00 cc cc cc 55 89 e5 53 57 56 EAX: 0000006b EBX: 00000015 ECX: 001fefff EDX: 0000006b ESI: fffb9000 EDI: fffba000 EBP: c611fbf0 ESP: c611fbe8 DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068 EFLAGS: 00010287 CR0: 80050033 CR2: fffba000 CR3: 0316e000 CR4: 00040690 Call Trace: poison_element (mm/mempool.c:83 mm/mempool.c:102) mempool_init_node (mm/mempool.c:142 mm/mempool.c:226) mempool_init_noprof (mm/mempool.c:250 (discriminator 1)) ? mempool_alloc_pages (mm/mempool.c:640) bio_integrity_initfn (block/bio-integrity.c:483 (discriminator 8)) ? mempool_alloc_pages (mm/mempool.c:640) do_one_initcall (init/main.c:1283) Christoph found out this is due to the poisoning code not dealing properly with CONFIG_HIGHMEM because only the first page is mapped but then the whole potentially high-order page is accessed. We could give up on HIGHMEM here, but it's straightforward to fix this with a loop that's mapping, poisoning or checking and unmapping individual pages. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202511111411.9ebfa1ba-lkp@intel.com Analyzed-by: Christoph Hellwig Fixes: bdfedb76f4f5 ("mm, mempool: poison elements backed by slab allocator") Cc: stable@vger.kernel.org Tested-by: kernel test robot Reviewed-by: Christoph Hellwig Link: https://patch.msgid.link/20251113-mempool-poison-v1-1-233b3ef984c3@suse.cz Signed-off-by: Vlastimil Babka --- mm/mempool.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index 1c38e873e546..d7bbf1189db9 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -68,10 +68,20 @@ static void check_element(mempool_t *pool, void *element) } else if (pool->free == mempool_free_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; - void *addr = kmap_local_page((struct page *)element); - __check_element(pool, addr, 1UL << (PAGE_SHIFT + order)); - kunmap_local(addr); +#ifdef CONFIG_HIGHMEM + for (int i = 0; i < (1 << order); i++) { + struct page *page = (struct page *)element; + void *addr = kmap_local_page(page + i); + + __check_element(pool, addr, PAGE_SIZE); + kunmap_local(addr); + } +#else + void *addr = page_address((struct page *)element); + + __check_element(pool, addr, PAGE_SIZE << order); +#endif } } @@ -97,10 +107,20 @@ static void poison_element(mempool_t *pool, void *element) } else if (pool->alloc == mempool_alloc_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; - void *addr = kmap_local_page((struct page *)element); - __poison_element(addr, 1UL << (PAGE_SHIFT + order)); - kunmap_local(addr); +#ifdef CONFIG_HIGHMEM + for (int i = 0; i < (1 << order); i++) { + struct page *page = (struct page *)element; + void *addr = kmap_local_page(page + i); + + __poison_element(addr, PAGE_SIZE); + kunmap_local(addr); + } +#else + void *addr = page_address((struct page *)element); + + __poison_element(addr, PAGE_SIZE << order); 
+#endif } } #else /* CONFIG_SLUB_DEBUG_ON */ From 4ef92743625818932b9c320152b58274c05e5053 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Nov 2025 12:55:16 +0000 Subject: [PATCH 403/543] bpf: Add bpf_prog_run_data_pointers() syzbot found that cls_bpf_classify() is able to change tc_skb_cb(skb)->drop_reason triggering a warning in sk_skb_reason_drop(). WARNING: CPU: 0 PID: 5965 at net/core/skbuff.c:1192 __sk_skb_reason_drop net/core/skbuff.c:1189 [inline] WARNING: CPU: 0 PID: 5965 at net/core/skbuff.c:1192 sk_skb_reason_drop+0x76/0x170 net/core/skbuff.c:1214 struct tc_skb_cb has been added in commit ec624fe740b4 ("net/sched: Extend qdisc control block with tc control block"), which added a wrong interaction with db58ba459202 ("bpf: wire in data and data_end for cls_act_bpf"). drop_reason was added later. Add bpf_prog_run_data_pointers() helper to save/restore the net_sched storage colliding with BPF data_meta/data_end. Fixes: ec624fe740b4 ("net/sched: Extend qdisc control block with tc control block") Reported-by: syzbot Closes: https://lore.kernel.org/netdev/6913437c.a70a0220.22f260.013b.GAE@google.com/ Signed-off-by: Eric Dumazet Signed-off-by: Martin KaFai Lau Reviewed-by: Victor Nogueira Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20251112125516.1563021-1-edumazet@google.com --- include/linux/filter.h | 20 ++++++++++++++++++++ net/sched/act_bpf.c | 6 ++---- net/sched/cls_bpf.c | 6 ++---- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index f5c859b8131a..973233b82dc1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -901,6 +901,26 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb) cb->data_end = skb->data + skb_headlen(skb); } +static inline int bpf_prog_run_data_pointers( + const struct bpf_prog *prog, + struct sk_buff *skb) +{ + struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; + void *save_data_meta, *save_data_end; + int 
res; + + save_data_meta = cb->data_meta; + save_data_end = cb->data_end; + + bpf_compute_data_pointers(skb); + res = bpf_prog_run(prog, skb); + + cb->data_meta = save_data_meta; + cb->data_end = save_data_end; + + return res; +} + /* Similar to bpf_compute_data_pointers(), except that save orginal * data in cb->data and cb->meta_data for restore. */ diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 396b576390d0..c2b5bc19e091 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -47,12 +47,10 @@ TC_INDIRECT_SCOPE int tcf_bpf_act(struct sk_buff *skb, filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(filter, skb); + filter_res = bpf_prog_run_data_pointers(filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(filter, skb); + filter_res = bpf_prog_run_data_pointers(filter, skb); } if (unlikely(!skb->tstamp && skb->tstamp_type)) skb->tstamp_type = SKB_CLOCK_REALTIME; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 7fbe42f0e5c2..a32754a2658b 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -97,12 +97,10 @@ TC_INDIRECT_SCOPE int cls_bpf_classify(struct sk_buff *skb, } else if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(prog->filter, skb); + filter_res = bpf_prog_run_data_pointers(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(prog->filter, skb); + filter_res = bpf_prog_run_data_pointers(prog->filter, skb); } if (unlikely(!skb->tstamp && skb->tstamp_type)) skb->tstamp_type = SKB_CLOCK_REALTIME; From 4c5376b4b143c4834ebd392aef2215847752b16a Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Wed, 22 Oct 2025 19:47:20 +0800 Subject: [PATCH 404/543] drm/tegra: dc: Fix reference leak in 
tegra_dc_couple() driver_find_device() calls get_device() to increment the reference count once a matching device is found, but there is no put_device() to balance the reference count. To avoid reference count leakage, add put_device() to decrease the reference count. Found by code review. Cc: stable@vger.kernel.org Fixes: a31500fe7055 ("drm/tegra: dc: Restore coupling of display controllers") Signed-off-by: Ma Ke Acked-by: Mikko Perttunen Signed-off-by: Thierry Reding Link: https://patch.msgid.link/20251022114720.24937-1-make24@iscas.ac.cn --- drivers/gpu/drm/tegra/dc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c index 59d5c1ba145a..6c84bd69b11f 100644 --- a/drivers/gpu/drm/tegra/dc.c +++ b/drivers/gpu/drm/tegra/dc.c @@ -3148,6 +3148,7 @@ static int tegra_dc_couple(struct tegra_dc *dc) dc->client.parent = &parent->client; dev_dbg(dc->dev, "coupled to %s\n", dev_name(companion)); + put_device(companion); } return 0; From 6cbab9f0da72b4dc3c3f9161197aa3b9daa1fa3a Mon Sep 17 00:00:00 2001 From: Prateek Agarwal Date: Fri, 19 Sep 2025 13:25:40 +0900 Subject: [PATCH 405/543] drm/tegra: Add call to put_pid() Add a call to put_pid() corresponding to get_task_pid(). host1x_memory_context_alloc() does not take ownership of the PID so we need to free it here to avoid leaking. 
Signed-off-by: Prateek Agarwal Fixes: e09db97889ec ("drm/tegra: Support context isolation") [mperttunen@nvidia.com: reword commit message] Signed-off-by: Mikko Perttunen Signed-off-by: Thierry Reding Link: https://patch.msgid.link/20250919-host1x-put-pid-v1-1-19c2163dfa87@nvidia.com --- drivers/gpu/drm/tegra/uapi.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/tegra/uapi.c b/drivers/gpu/drm/tegra/uapi.c index 5adab6b22916..d0b6a1fa6efa 100644 --- a/drivers/gpu/drm/tegra/uapi.c +++ b/drivers/gpu/drm/tegra/uapi.c @@ -114,9 +114,12 @@ int tegra_drm_ioctl_channel_open(struct drm_device *drm, void *data, struct drm_ if (err) goto put_channel; - if (supported) + if (supported) { + struct pid *pid = get_task_pid(current, PIDTYPE_TGID); context->memory_context = host1x_memory_context_alloc( - host, client->base.dev, get_task_pid(current, PIDTYPE_TGID)); + host, client->base.dev, pid); + put_pid(pid); + } if (IS_ERR(context->memory_context)) { if (PTR_ERR(context->memory_context) != -EOPNOTSUPP) { From 660b299bed2a2a55a1f9102d029549d0235f881c Mon Sep 17 00:00:00 2001 From: Diogo Ivo Date: Mon, 3 Nov 2025 14:14:15 +0000 Subject: [PATCH 406/543] Revert "drm/tegra: dsi: Clear enable register if powered by bootloader" Commit b6bcbce33596 ("soc/tegra: pmc: Ensure power-domains are in a known state") was introduced so that all power domains get initialized to a known working state when booting and it does this by shutting them down (including asserting resets and disabling clocks) before registering each power domain with the genpd framework, leaving it to each driver to later on power its needed domains. This caused the Google Pixel C to hang when booting due to a workaround in the DSI driver introduced in commit b22fd0b9639e ("drm/tegra: dsi: Clear enable register if powered by bootloader") meant to handle the case where the bootloader enabled the DSI hardware module. 
The workaround relies on reading a hardware register to determine the current status and after b6bcbce33596 that now happens in a powered down state thus leading to the boot hang. Fix this by reverting b22fd0b9639e since currently we are guaranteed that the hardware will be fully reset by the time we start enabling the DSI module. Fixes: b6bcbce33596 ("soc/tegra: pmc: Ensure power-domains are in a known state") Cc: stable@vger.kernel.org Signed-off-by: Diogo Ivo Signed-off-by: Thierry Reding Link: https://patch.msgid.link/20251103-diogo-smaug_ec_typec-v1-1-be656ccda391@tecnico.ulisboa.pt --- drivers/gpu/drm/tegra/dsi.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/gpu/drm/tegra/dsi.c b/drivers/gpu/drm/tegra/dsi.c index b5089b772267..ddfb2858acbf 100644 --- a/drivers/gpu/drm/tegra/dsi.c +++ b/drivers/gpu/drm/tegra/dsi.c @@ -913,15 +913,6 @@ static void tegra_dsi_encoder_enable(struct drm_encoder *encoder) u32 value; int err; - /* If the bootloader enabled DSI it needs to be disabled - * in order for the panel initialization commands to be - * properly sent. - */ - value = tegra_dsi_readl(dsi, DSI_POWER_CONTROL); - - if (value & DSI_POWER_CONTROL_ENABLE) - tegra_dsi_disable(dsi); - err = tegra_dsi_prepare(dsi); if (err < 0) { dev_err(dsi->dev, "failed to prepare: %d\n", err); From b0c8e6d3d866b6a7f73877f71968dbffd27b7785 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 13 Nov 2025 18:57:29 -0800 Subject: [PATCH 407/543] bpf: account for current allocated stack depth in widen_imprecise_scalars() The usage pattern for widen_imprecise_scalars() looks as follows: prev_st = find_prev_entry(env, ...); queued_st = push_stack(...); widen_imprecise_scalars(env, prev_st, queued_st); Where prev_st is an ancestor of the queued_st in the explored states tree. This ancestor is not guaranteed to have same allocated stack depth as queued_st. E.g. 
in the following case: def main(): for i in 1..2: foo(i) // same callsite, different param def foo(i): if i == 1: use 128 bytes of stack iterator based loop Here, for a second 'foo' call prev_st->allocated_stack is 128, while queued_st->allocated_stack is much smaller. widen_imprecise_scalars() needs to take this into account and avoid accessing bpf_verifier_state->frame[*]->stack out of bounds. Fixes: 2793a8b015f7 ("bpf: exact states comparison for iterator convergence checks") Reported-by: Emil Tsalapatis Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251114025730.772723-1-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8314518c8d93..fbe4bb91c564 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8866,7 +8866,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, struct bpf_verifier_state *cur) { struct bpf_func_state *fold, *fcur; - int i, fr; + int i, fr, num_slots; reset_idmap_scratch(env); for (fr = old->curframe; fr >= 0; fr--) { @@ -8879,7 +8879,9 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, &fcur->regs[i], &env->idmap_scratch); - for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) { + num_slots = min(fold->allocated_stack / BPF_REG_SIZE, + fcur->allocated_stack / BPF_REG_SIZE); + for (i = 0; i < num_slots; i++) { if (!is_spilled_reg(&fold->stack[i]) || !is_spilled_reg(&fcur->stack[i])) continue; From 6c762611fed7365790000925f3d14f20037d0061 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 13 Nov 2025 18:57:30 -0800 Subject: [PATCH 408/543] selftests/bpf: Test widen_imprecise_scalars() with different stack depth A test case for a situation when widen_imprecise_scalars() is called with old->allocated_stack > cur->allocated_stack.
Test structure: def widening_stack_size_bug(): r1 = 0 for r6 in 0..1: iterator_with_diff_stack_depth(r1) r1 = 42 def iterator_with_diff_stack_depth(r1): if r1 != 42: use 128 bytes of stack iterator based loop iterator_with_diff_stack_depth() is verified with r1 == 0 first and r1 == 42 next. Causing stack usage of 128 bytes on a first visit and 8 bytes on a second. Such arrangement triggered a KASAN error in widen_imprecise_scalars(). Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251114025730.772723-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/iters_looping.c | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/iters_looping.c b/tools/testing/selftests/bpf/progs/iters_looping.c index 05fa5ce7fc59..d00fd570255a 100644 --- a/tools/testing/selftests/bpf/progs/iters_looping.c +++ b/tools/testing/selftests/bpf/progs/iters_looping.c @@ -161,3 +161,56 @@ int simplest_loop(void *ctx) return 0; } + +__used +static void iterator_with_diff_stack_depth(int x) +{ + struct bpf_iter_num iter; + + asm volatile ( + "if r1 == 42 goto 0f;" + "*(u64 *)(r10 - 128) = 0;" + "0:" + /* create iterator */ + "r1 = %[iter];" + "r2 = 0;" + "r3 = 10;" + "call %[bpf_iter_num_new];" + "1:" + /* consume next item */ + "r1 = %[iter];" + "call %[bpf_iter_num_next];" + "if r0 == 0 goto 2f;" + "goto 1b;" + "2:" + /* destroy iterator */ + "r1 = %[iter];" + "call %[bpf_iter_num_destroy];" + : + : __imm_ptr(iter), ITER_HELPERS + : __clobber_common, "r6" + ); +} + +SEC("socket") +__success +__naked int widening_stack_size_bug(void *ctx) +{ + /* + * Depending on iterator_with_diff_stack_depth() parameter value, + * subprogram stack depth is either 8 or 128 bytes. Arrange values so + * that it is 128 on a first call and 8 on a second. This triggered a + * bug in verifier's widen_imprecise_scalars() logic. 
+ */ + asm volatile ( + "r6 = 0;" + "r1 = 0;" + "1:" + "call iterator_with_diff_stack_depth;" + "r1 = 42;" + "r6 += 1;" + "if r6 < 2 goto 1b;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} From ec0ca4be116ad7efb08cd23acc1ff29b04d9cf52 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 31 Oct 2025 14:50:39 +0100 Subject: [PATCH 409/543] MAINTAINERS: Update Krzysztof Kozlowski's email Update Krzysztof Kozlowski's email address in mailmap to stay reachable. Link: https://patch.msgid.link/20251021095426.86549-2-krzysztof.kozlowski@linaro.org Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20251031135041.78789-2-krzysztof.kozlowski@linaro.org Signed-off-by: Arnd Bergmann --- .mailmap | 1 + MAINTAINERS | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index 369cfe467932..ccec2aeb28f9 100644 --- a/.mailmap +++ b/.mailmap @@ -437,6 +437,7 @@ Krishna Manikandan Krzysztof Kozlowski Krzysztof Kozlowski Krzysztof Kozlowski +Krzysztof Kozlowski Krzysztof Wilczyński Krzysztof Wilczyński Kshitiz Godara diff --git a/MAINTAINERS b/MAINTAINERS index ddecf1ef3bed..29f86c9aa27b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16204,7 +16204,7 @@ MEMORY CONTROLLER DRIVERS M: Krzysztof Kozlowski L: linux-kernel@vger.kernel.org S: Maintained -B: mailto:krzysztof.kozlowski@linaro.org +B: mailto:krzk@kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl.git F: Documentation/devicetree/bindings/memory-controllers/ F: drivers/memory/ @@ -21177,7 +21177,7 @@ F: Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml F: drivers/i2c/busses/i2c-qcom-cci.c QUALCOMM INTERCONNECT BWMON DRIVER -M: Krzysztof Kozlowski +M: Krzysztof Kozlowski L: linux-arm-msm@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/interconnect/qcom,msm8998-bwmon.yaml From e6751b0b19a6baab219a62e1e302b8aa6b5a55b2 Mon Sep 17 00:00:00 2001 From: Pavel Zhigulin Date: Thu, 13 Nov 2025 16:57:44 +0300 Subject: 
[PATCH 410/543] net: dsa: hellcreek: fix missing error handling in LED registration The LED setup routine registered both led_sync_good and led_is_gm devices without checking the return values of led_classdev_register(). If either registration failed, the function continued silently, leaving the driver in a partially-initialized state and leaking a registered LED classdev. Add proper error handling Fixes: 7d9ee2e8ff15 ("net: dsa: hellcreek: Add PTP status LEDs") Signed-off-by: Pavel Zhigulin Reviewed-by: Andrew Lunn Acked-by: Kurt Kanzenbach Link: https://patch.msgid.link/20251113135745.92375-1-Pavel.Zhigulin@kaspersky.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/hirschmann/hellcreek_ptp.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/hirschmann/hellcreek_ptp.c b/drivers/net/dsa/hirschmann/hellcreek_ptp.c index bfe21f9f7dcd..cb23bea9c21b 100644 --- a/drivers/net/dsa/hirschmann/hellcreek_ptp.c +++ b/drivers/net/dsa/hirschmann/hellcreek_ptp.c @@ -376,8 +376,18 @@ static int hellcreek_led_setup(struct hellcreek *hellcreek) hellcreek_set_brightness(hellcreek, STATUS_OUT_IS_GM, 1); /* Register both leds */ - led_classdev_register(hellcreek->dev, &hellcreek->led_sync_good); - led_classdev_register(hellcreek->dev, &hellcreek->led_is_gm); + ret = led_classdev_register(hellcreek->dev, &hellcreek->led_sync_good); + if (ret) { + dev_err(hellcreek->dev, "Failed to register sync_good LED\n"); + goto out; + } + + ret = led_classdev_register(hellcreek->dev, &hellcreek->led_is_gm); + if (ret) { + dev_err(hellcreek->dev, "Failed to register is_gm LED\n"); + led_classdev_unregister(&hellcreek->led_sync_good); + goto out; + } ret = 0; From b0c959fec18f4595a6a6317ffc30615cfa37bf69 Mon Sep 17 00:00:00 2001 From: Pavel Zhigulin Date: Thu, 13 Nov 2025 19:19:21 +0300 Subject: [PATCH 411/543] net: mlxsw: linecards: fix missing error check in mlxsw_linecard_devlink_info_get() The call to devlink_info_version_fixed_put() in 
mlxsw_linecard_devlink_info_get() did not check for errors, although it is checked everywhere in the code. Add missed 'err' check to the mlxsw_linecard_devlink_info_get() Fixes: 3fc0c51905fb ("mlxsw: core_linecards: Expose device PSID over device info") Signed-off-by: Pavel Zhigulin Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20251113161922.813828-1-Pavel.Zhigulin@kaspersky.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/core_linecards.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c b/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c index b032d5a4b3b8..10f5bc4892fc 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c @@ -601,6 +601,8 @@ int mlxsw_linecard_devlink_info_get(struct mlxsw_linecard *linecard, err = devlink_info_version_fixed_put(req, DEVLINK_INFO_VERSION_GENERIC_FW_PSID, info->psid); + if (err) + goto unlock; sprintf(buf, "%u.%u.%u", info->fw_major, info->fw_minor, info->fw_sub_minor); From 035bca3f017ee9dea3a5a756e77a6f7138cc6eea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 13 Nov 2025 10:39:24 +0000 Subject: [PATCH 412/543] mptcp: fix race condition in mptcp_schedule_work() syzbot reported use-after-free in mptcp_schedule_work() [1] Issue here is that mptcp_schedule_work() schedules a work, then gets a refcount on sk->sk_refcnt if the work was scheduled. This refcount will be released by mptcp_worker(). [A] if (schedule_work(...)) { [B] sock_hold(sk); return true; } Problem is that mptcp_worker() can run immediately and complete before [B] We need instead : sock_hold(sk); if (schedule_work(...)) return true; sock_put(sk); [1] refcount_t: addition on 0; use-after-free. 
WARNING: CPU: 1 PID: 29 at lib/refcount.c:25 refcount_warn_saturate+0xfa/0x1d0 lib/refcount.c:25 Call Trace: __refcount_add include/linux/refcount.h:-1 [inline] __refcount_inc include/linux/refcount.h:366 [inline] refcount_inc include/linux/refcount.h:383 [inline] sock_hold include/net/sock.h:816 [inline] mptcp_schedule_work+0x164/0x1a0 net/mptcp/protocol.c:943 mptcp_tout_timer+0x21/0xa0 net/mptcp/protocol.c:2316 call_timer_fn+0x17e/0x5f0 kernel/time/timer.c:1747 expire_timers kernel/time/timer.c:1798 [inline] __run_timers kernel/time/timer.c:2372 [inline] __run_timer_base+0x648/0x970 kernel/time/timer.c:2384 run_timer_base kernel/time/timer.c:2393 [inline] run_timer_softirq+0xb7/0x180 kernel/time/timer.c:2403 handle_softirqs+0x22f/0x710 kernel/softirq.c:622 __do_softirq kernel/softirq.c:656 [inline] run_ktimerd+0xcf/0x190 kernel/softirq.c:1138 smpboot_thread_fn+0x542/0xa60 kernel/smpboot.c:160 kthread+0x711/0x8a0 kernel/kthread.c:463 ret_from_fork+0x4bc/0x870 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 Cc: stable@vger.kernel.org Fixes: 3b1d6210a957 ("mptcp: implement and use MPTCP-level retransmission") Reported-by: syzbot+355158e7e301548a1424@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6915b46f.050a0220.3565dc.0028.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251113103924.3737425-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2d6b8de35c44..e27e0fe2460f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -935,14 +935,19 @@ static void mptcp_reset_rtx_timer(struct sock *sk) bool mptcp_schedule_work(struct sock *sk) { - if (inet_sk_state_load(sk) != TCP_CLOSE && - schedule_work(&mptcp_sk(sk)->work)) { - /* each subflow already holds a reference to the sk, 
and the - * workqueue is invoked by a subflow, so sk can't go away here. - */ - sock_hold(sk); + if (inet_sk_state_load(sk) == TCP_CLOSE) + return false; + + /* Get a reference on this socket, mptcp_worker() will release it. + * As mptcp_worker() might complete before us, we can not avoid + * a sock_hold()/sock_put() if schedule_work() returns false. + */ + sock_hold(sk); + + if (schedule_work(&mptcp_sk(sk)->work)) return true; - } + + sock_put(sk); return false; } From dfe28c4167a9259fc0c372d9f9473e1ac95cff67 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 12 Nov 2025 12:14:03 +0100 Subject: [PATCH 413/543] net: openvswitch: remove never-working support for setting nsh fields The validation of the set(nsh(...)) action is completely wrong. It runs through the nsh_key_put_from_nlattr() function that is the same function that validates NSH keys for the flow match and the push_nsh() action. However, the set(nsh(...)) has a very different memory layout. Nested attributes in there are doubled in size in case of the masked set(). That makes proper validation impossible. There is also confusion in the code between the 'masked' flag, that says that the nested attributes are doubled in size containing both the value and the mask, and the 'is_mask' that says that the value we're parsing is the mask. 
This is causing kernel crash on trying to write into mask part of the match with SW_FLOW_KEY_PUT() during validation, while validate_nsh() doesn't allocate any memory for it: BUG: kernel NULL pointer dereference, address: 0000000000000018 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 1c2383067 P4D 1c2383067 PUD 20b703067 PMD 0 Oops: Oops: 0000 [#1] SMP NOPTI CPU: 8 UID: 0 Kdump: loaded Not tainted 6.17.0-rc4+ #107 PREEMPT(voluntary) RIP: 0010:nsh_key_put_from_nlattr+0x19d/0x610 [openvswitch] Call Trace: validate_nsh+0x60/0x90 [openvswitch] validate_set.constprop.0+0x270/0x3c0 [openvswitch] __ovs_nla_copy_actions+0x477/0x860 [openvswitch] ovs_nla_copy_actions+0x8d/0x100 [openvswitch] ovs_packet_cmd_execute+0x1cc/0x310 [openvswitch] genl_family_rcv_msg_doit+0xdb/0x130 genl_family_rcv_msg+0x14b/0x220 genl_rcv_msg+0x47/0xa0 netlink_rcv_skb+0x53/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x280/0x3b0 netlink_sendmsg+0x1f7/0x430 ____sys_sendmsg+0x36b/0x3a0 ___sys_sendmsg+0x87/0xd0 __sys_sendmsg+0x6d/0xd0 do_syscall_64+0x7b/0x2c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e The third issue with this process is that while trying to convert the non-masked set into masked one, validate_set() copies and doubles the size of the OVS_KEY_ATTR_NSH as if it didn't have any nested attributes. It should be copying each nested attribute and doubling them in size independently. And the process must be properly reversed during the conversion back from masked to a non-masked variant during the flow dump. In the end, the only two outcomes of trying to use this action are either validation failure or a kernel crash. And if somehow someone manages to install a flow with such an action, it will most definitely not do what it is supposed to, since all the keys and the masks are mixed up. Fixing all the issues is a complex task as it requires re-writing most of the validation code. 
Given that and the fact that this functionality never worked since introduction, let's just remove it altogether. It's better to re-introduce it later with a proper implementation instead of trying to fix it in stable releases. Fixes: b2d0f5d5dc53 ("openvswitch: enable NSH support") Reported-by: Junvy Yang Signed-off-by: Ilya Maximets Acked-by: Eelco Chaudron Reviewed-by: Aaron Conole Link: https://patch.msgid.link/20251112112246.95064-1-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- net/openvswitch/actions.c | 68 +--------------------------------- net/openvswitch/flow_netlink.c | 64 ++++---------------------------- net/openvswitch/flow_netlink.h | 2 - 3 files changed, 9 insertions(+), 125 deletions(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 2832e0794197..792ca44a461d 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -572,69 +572,6 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } -static int set_nsh(struct sk_buff *skb, struct sw_flow_key *flow_key, - const struct nlattr *a) -{ - struct nshhdr *nh; - size_t length; - int err; - u8 flags; - u8 ttl; - int i; - - struct ovs_key_nsh key; - struct ovs_key_nsh mask; - - err = nsh_key_from_nlattr(a, &key, &mask); - if (err) - return err; - - /* Make sure the NSH base header is there */ - if (!pskb_may_pull(skb, skb_network_offset(skb) + NSH_BASE_HDR_LEN)) - return -ENOMEM; - - nh = nsh_hdr(skb); - length = nsh_hdr_len(nh); - - /* Make sure the whole NSH header is there */ - err = skb_ensure_writable(skb, skb_network_offset(skb) + - length); - if (unlikely(err)) - return err; - - nh = nsh_hdr(skb); - skb_postpull_rcsum(skb, nh, length); - flags = nsh_get_flags(nh); - flags = OVS_MASKED(flags, key.base.flags, mask.base.flags); - flow_key->nsh.base.flags = flags; - ttl = nsh_get_ttl(nh); - ttl = OVS_MASKED(ttl, key.base.ttl, mask.base.ttl); - flow_key->nsh.base.ttl = ttl; - nsh_set_flags_and_ttl(nh, flags, ttl); - nh->path_hdr 
= OVS_MASKED(nh->path_hdr, key.base.path_hdr, - mask.base.path_hdr); - flow_key->nsh.base.path_hdr = nh->path_hdr; - switch (nh->mdtype) { - case NSH_M_TYPE1: - for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) { - nh->md1.context[i] = - OVS_MASKED(nh->md1.context[i], key.context[i], - mask.context[i]); - } - memcpy(flow_key->nsh.context, nh->md1.context, - sizeof(nh->md1.context)); - break; - case NSH_M_TYPE2: - memset(flow_key->nsh.context, 0, - sizeof(flow_key->nsh.context)); - break; - default: - return -EINVAL; - } - skb_postpush_rcsum(skb, nh, length); - return 0; -} - /* Must follow skb_ensure_writable() since that can move the skb data. */ static void set_tp_port(struct sk_buff *skb, __be16 *port, __be16 new_port, __sum16 *check) @@ -1130,10 +1067,6 @@ static int execute_masked_set_action(struct sk_buff *skb, get_mask(a, struct ovs_key_ethernet *)); break; - case OVS_KEY_ATTR_NSH: - err = set_nsh(skb, flow_key, a); - break; - case OVS_KEY_ATTR_IPV4: err = set_ipv4(skb, flow_key, nla_data(a), get_mask(a, struct ovs_key_ipv4 *)); @@ -1170,6 +1103,7 @@ static int execute_masked_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_CT_LABELS: case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4: case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6: + case OVS_KEY_ATTR_NSH: err = -EINVAL; break; } diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index ad64bb9ab5e2..1cb4f97335d8 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1305,6 +1305,11 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, return 0; } +/* + * Constructs NSH header 'nh' from attributes of OVS_ACTION_ATTR_PUSH_NSH, + * where 'nh' points to a memory block of 'size' bytes. It's assumed that + * attributes were previously validated with validate_push_nsh(). 
+ */ int nsh_hdr_from_nlattr(const struct nlattr *attr, struct nshhdr *nh, size_t size) { @@ -1314,8 +1319,6 @@ int nsh_hdr_from_nlattr(const struct nlattr *attr, u8 ttl = 0; int mdlen = 0; - /* validate_nsh has check this, so we needn't do duplicate check here - */ if (size < NSH_BASE_HDR_LEN) return -ENOBUFS; @@ -1359,46 +1362,6 @@ int nsh_hdr_from_nlattr(const struct nlattr *attr, return 0; } -int nsh_key_from_nlattr(const struct nlattr *attr, - struct ovs_key_nsh *nsh, struct ovs_key_nsh *nsh_mask) -{ - struct nlattr *a; - int rem; - - /* validate_nsh has check this, so we needn't do duplicate check here - */ - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - - switch (type) { - case OVS_NSH_KEY_ATTR_BASE: { - const struct ovs_nsh_key_base *base = nla_data(a); - const struct ovs_nsh_key_base *base_mask = base + 1; - - nsh->base = *base; - nsh_mask->base = *base_mask; - break; - } - case OVS_NSH_KEY_ATTR_MD1: { - const struct ovs_nsh_key_md1 *md1 = nla_data(a); - const struct ovs_nsh_key_md1 *md1_mask = md1 + 1; - - memcpy(nsh->context, md1->context, sizeof(*md1)); - memcpy(nsh_mask->context, md1_mask->context, - sizeof(*md1_mask)); - break; - } - case OVS_NSH_KEY_ATTR_MD2: - /* Not supported yet */ - return -ENOTSUPP; - default: - return -EINVAL; - } - } - - return 0; -} - static int nsh_key_put_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, bool is_push_nsh, bool log) @@ -2839,17 +2802,13 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, return err; } -static bool validate_nsh(const struct nlattr *attr, bool is_mask, - bool is_push_nsh, bool log) +static bool validate_push_nsh(const struct nlattr *attr, bool log) { struct sw_flow_match match; struct sw_flow_key key; - int ret = 0; ovs_match_init(&match, &key, true, NULL); - ret = nsh_key_put_from_nlattr(attr, &match, is_mask, - is_push_nsh, log); - return !ret; + return !nsh_key_put_from_nlattr(attr, &match, false, true, log); } /* Return false 
if there are any non-masked bits set. @@ -2997,13 +2956,6 @@ static int validate_set(const struct nlattr *a, break; - case OVS_KEY_ATTR_NSH: - if (eth_type != htons(ETH_P_NSH)) - return -EINVAL; - if (!validate_nsh(nla_data(a), masked, false, log)) - return -EINVAL; - break; - default: return -EINVAL; } @@ -3437,7 +3389,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, return -EINVAL; } mac_proto = MAC_PROTO_NONE; - if (!validate_nsh(nla_data(a), false, true, true)) + if (!validate_push_nsh(nla_data(a), log)) return -EINVAL; break; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index fe7f77fc5f18..ff8cdecbe346 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -65,8 +65,6 @@ int ovs_nla_put_actions(const struct nlattr *attr, void ovs_nla_free_flow_actions(struct sw_flow_actions *); void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *); -int nsh_key_from_nlattr(const struct nlattr *attr, struct ovs_key_nsh *nsh, - struct ovs_key_nsh *nsh_mask); int nsh_hdr_from_nlattr(const struct nlattr *attr, struct nshhdr *nh, size_t size); From 5442a9da69789741bfda39f34ee7f69552bf0c56 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 12 Nov 2025 14:13:52 +0100 Subject: [PATCH 414/543] veth: more robust handing of race to avoid txq getting stuck Commit dc82a33297fc ("veth: apply qdisc backpressure on full ptr_ring to reduce TX drops") introduced a race condition that can lead to a permanently stalled TXQ. This was observed in production on ARM64 systems (Ampere Altra Max). The race occurs in veth_xmit(). The producer observes a full ptr_ring and stops the queue (netif_tx_stop_queue()). The subsequent conditional logic, intended to re-wake the queue if the consumer had just emptied it (if (__ptr_ring_empty(...)) netif_tx_wake_queue()), can fail. This leads to a "lost wakeup" where the TXQ remains stopped (QUEUE_STATE_DRV_XOFF) and traffic halts. 
This failure is caused by an incorrect use of the __ptr_ring_empty() API from the producer side. As noted in kernel comments, this check is not guaranteed to be correct if a consumer is operating on another CPU. The empty test is based on ptr_ring->consumer_head, making it reliable only for the consumer. Using this check from the producer side is fundamentally racy. This patch fixes the race by adopting the more robust logic from an earlier version (V4) of the patchset, which always flushed the peer: (1) In veth_xmit(), the racy conditional wake-up logic and its memory barrier are removed. Instead, after stopping the queue, we unconditionally call __veth_xdp_flush(rq). This guarantees that the NAPI consumer is scheduled, making it solely responsible for re-waking the TXQ. This handles the race where veth_poll() consumes all packets and completes NAPI *before* veth_xmit() on the producer side has called netif_tx_stop_queue(). The __veth_xdp_flush(rq) will observe rx_notify_masked is false and schedule NAPI. (2) On the consumer side, the logic for waking the peer TXQ is moved out of veth_xdp_rcv() and placed at the end of the veth_poll() function. This placement is part of fixing the race, as the netif_tx_queue_stopped() check must occur after rx_notify_masked is potentially set to false during NAPI completion. This handles the race where veth_poll() has consumed all packets but has not yet finished (rx_notify_masked is still true). The producer veth_xmit() stops the TXQ and __veth_xdp_flush(rq) will observe rx_notify_masked is true, and therefore will not start NAPI. Then veth_poll() changes rx_notify_masked to false and stops NAPI. Before exiting, veth_poll() will observe that the TXQ is stopped and wake it up.
Fixes: dc82a33297fc ("veth: apply qdisc backpressure on full ptr_ring to reduce TX drops") Reviewed-by: Toshiaki Makita Signed-off-by: Jesper Dangaard Brouer Link: https://patch.msgid.link/176295323282.307447.14790015927673763094.stgit@firesoul Signed-off-by: Jakub Kicinski --- drivers/net/veth.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index a3046142cb8e..35dd89aff4a9 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -392,14 +392,12 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) } /* Restore Eth hdr pulled by dev_forward_skb/eth_type_trans */ __skb_push(skb, ETH_HLEN); - /* Depend on prior success packets started NAPI consumer via - * __veth_xdp_flush(). Cancel TXQ stop if consumer stopped, - * paired with empty check in veth_poll(). - */ netif_tx_stop_queue(txq); - smp_mb__after_atomic(); - if (unlikely(__ptr_ring_empty(&rq->xdp_ring))) - netif_tx_wake_queue(txq); + /* Makes sure NAPI peer consumer runs. Consumer is responsible + * for starting txq again, until then ndo_start_xmit (this + * function) will not be invoked by the netstack again. + */ + __veth_xdp_flush(rq); break; case NET_RX_DROP: /* same as NET_XMIT_DROP */ drop: @@ -900,17 +898,9 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { - struct veth_priv *priv = netdev_priv(rq->dev); - int queue_idx = rq->xdp_rxq.queue_index; - struct netdev_queue *peer_txq; - struct net_device *peer_dev; int i, done = 0, n_xdpf = 0; void *xdpf[VETH_XDP_BATCH]; - /* NAPI functions as RCU section */ - peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held()); - peer_txq = peer_dev ? 
netdev_get_tx_queue(peer_dev, queue_idx) : NULL; - for (i = 0; i < budget; i++) { void *ptr = __ptr_ring_consume(&rq->xdp_ring); @@ -959,9 +949,6 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, rq->stats.vs.xdp_packets += done; u64_stats_update_end(&rq->stats.syncp); - if (peer_txq && unlikely(netif_tx_queue_stopped(peer_txq))) - netif_tx_wake_queue(peer_txq); - return done; } @@ -969,12 +956,20 @@ static int veth_poll(struct napi_struct *napi, int budget) { struct veth_rq *rq = container_of(napi, struct veth_rq, xdp_napi); + struct veth_priv *priv = netdev_priv(rq->dev); + int queue_idx = rq->xdp_rxq.queue_index; + struct netdev_queue *peer_txq; struct veth_stats stats = {}; + struct net_device *peer_dev; struct veth_xdp_tx_bq bq; int done; bq.count = 0; + /* NAPI functions as RCU section */ + peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held()); + peer_txq = peer_dev ? netdev_get_tx_queue(peer_dev, queue_idx) : NULL; + xdp_set_return_frame_no_direct(); done = veth_xdp_rcv(rq, budget, &bq, &stats); @@ -996,6 +991,13 @@ static int veth_poll(struct napi_struct *napi, int budget) veth_xdp_flush(rq, &bq); xdp_clear_return_frame_no_direct(); + /* Release backpressure per NAPI poll */ + smp_rmb(); /* Paired with netif_tx_stop_queue set_bit */ + if (peer_txq && netif_tx_queue_stopped(peer_txq)) { + txq_trans_cond_update(peer_txq); + netif_tx_wake_queue(peer_txq); + } + return done; } From 39231e8d6ba7f794b566fd91ebd88c0834a23b98 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Fri, 14 Nov 2025 22:49:20 +0100 Subject: [PATCH 415/543] mm: fix MAX_FOLIO_ORDER on powerpc configs with hugetlb In the past, CONFIG_ARCH_HAS_GIGANTIC_PAGE indicated that we support runtime allocation of gigantic hugetlb folios. In the meantime it evolved into a generic way for the architecture to state that it supports gigantic hugetlb folios. 
In commit fae7d834c43c ("mm: add __dump_folio()") we started using CONFIG_ARCH_HAS_GIGANTIC_PAGE to decide MAX_FOLIO_ORDER: whether we could have folios larger than what the buddy can handle. In the context of that commit, we started using MAX_FOLIO_ORDER to detect page corruptions when dumping tail pages of folios. Before that commit, we assumed that we cannot have folios larger than the highest buddy order, which was obviously wrong. In commit 7b4f21f5e038 ("mm/hugetlb: check for unreasonable folio sizes when registering hstate"), we used MAX_FOLIO_ORDER to detect inconsistencies, and in fact, we found some now. Powerpc allows for configs that can allocate gigantic folio during boot (not at runtime), that do not set CONFIG_ARCH_HAS_GIGANTIC_PAGE and can exceed PUD_ORDER. To fix it, let's make powerpc select CONFIG_ARCH_HAS_GIGANTIC_PAGE with hugetlb on powerpc, and increase the maximum folio size with hugetlb to 16 GiB on 64bit (possible on arm64 and powerpc) and 1 GiB on 32 bit (powerpc). Note that on some powerpc configurations, whether we actually have gigantic pages depends on the setting of CONFIG_ARCH_FORCE_MAX_ORDER, but there is nothing really problematic about setting it unconditionally: we just try to keep the value small so we can better detect problems in __dump_folio() and inconsistencies around the expected largest folio in the system. Ideally, we'd have a better way to obtain the maximum hugetlb folio size and detect ourselves whether we really end up with gigantic folios. Let's defer bigger changes and fix the warnings first. While at it, handle gigantic DAX folios more clearly: DAX can only end up creating gigantic folios with HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD. Add a new Kconfig option HAVE_GIGANTIC_FOLIOS to make both cases clearer. In particular, worry about ARCH_HAS_GIGANTIC_PAGE only with HUGETLB_PAGE. 
Note: with enabling CONFIG_ARCH_HAS_GIGANTIC_PAGE on powerpc, we will now also allow for runtime allocations of folios in some more powerpc configs. I don't think this is a problem, but if it is we could handle it through __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED. While __dump_page()/__dump_folio was also problematic (not handling dumping of tail pages of such gigantic folios correctly), it doesn't seem critical enough to mark it as a fix. Link: https://lkml.kernel.org/r/20251114214920.2550676-1-david@kernel.org Fixes: 7b4f21f5e038 ("mm/hugetlb: check for unreasonable folio sizes when registering hstate") Reported-by: Christophe Leroy Closes: https://lore.kernel.org/r/3e043453-3f27-48ad-b987-cc39f523060a@csgroup.eu/ Reported-by: Sourabh Jain Closes: https://lore.kernel.org/r/94377f5c-d4f0-4c0f-b0f6-5bf1cd7305b1@linux.ibm.com/ Signed-off-by: David Hildenbrand (Red Hat) Cc: Ritesh Harjani (IBM) Cc: Madhavan Srinivasan Cc: Donet Tom Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Lorenzo Stoakes Cc: "Liam R. 
Howlett" Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 + arch/powerpc/platforms/Kconfig.cputype | 1 - include/linux/mm.h | 13 ++++++++++--- mm/Kconfig | 7 +++++++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e24f4d88885a..9537a61ebae0 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -137,6 +137,7 @@ config PPC select ARCH_HAS_DMA_OPS if PPC64 select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_GIGANTIC_PAGE if ARCH_SUPPORTS_HUGETLBFS select ARCH_HAS_KCOV select ARCH_HAS_KERNEL_FPU_SUPPORT if PPC64 && PPC_FPU select ARCH_HAS_MEMBARRIER_CALLBACKS diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 7b527d18aa5e..4c321a8ea896 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -423,7 +423,6 @@ config PPC_64S_HASH_MMU config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 - select ARCH_HAS_GIGANTIC_PAGE default y help Enable support for the Power ISA 3.0 Radix style MMU. Currently this diff --git a/include/linux/mm.h b/include/linux/mm.h index d16b33bacc32..7c79b3369b82 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2074,7 +2074,7 @@ static inline unsigned long folio_nr_pages(const struct folio *folio) return folio_large_nr_pages(folio); } -#if !defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) +#if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS) /* * We don't expect any folios that exceed buddy sizes (and consequently * memory sections). @@ -2087,10 +2087,17 @@ static inline unsigned long folio_nr_pages(const struct folio *folio) * pages are guaranteed to be contiguous. */ #define MAX_FOLIO_ORDER PFN_SECTION_SHIFT -#else +#elif defined(CONFIG_HUGETLB_PAGE) /* * There is no real limit on the folio size. 
We limit them to the maximum we - * currently expect (e.g., hugetlb, dax). + * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect + * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. + */ +#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) +#else +/* + * Without hugetlb, gigantic folios that are bigger than a single PUD are + * currently impossible. */ #define MAX_FOLIO_ORDER PUD_ORDER #endif diff --git a/mm/Kconfig b/mm/Kconfig index 0e26f4fc8717..ca3f146bc705 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -908,6 +908,13 @@ config PAGE_MAPCOUNT config PGTABLE_HAS_HUGE_LEAVES def_bool TRANSPARENT_HUGEPAGE || HUGETLB_PAGE +# +# We can end up creating gigantic folio. +# +config HAVE_GIGANTIC_FOLIOS + def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \ + (ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) + # TODO: Allow to be enabled without THP config ARCH_SUPPORTS_HUGE_PFNMAP def_bool n From 00fbff75c5acb4755f06f08bd1071879c63940c5 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Sun, 2 Nov 2025 01:07:41 +0530 Subject: [PATCH 416/543] crash: fix crashkernel resource shrink When crashkernel is configured with a high reservation, shrinking its value below the low crashkernel reservation causes two issues: 1. Invalid crashkernel resource objects 2. Kernel crash if crashkernel shrinking is done twice For example, with crashkernel=200M,high, the kernel reserves 200MB of high memory and some default low memory (say 256MB). 
The reservation appears as: cat /proc/iomem | grep -i crash af000000-beffffff : Crash kernel 433000000-43f7fffff : Crash kernel If crashkernel is then shrunk to 50MB (echo 52428800 > /sys/kernel/kexec_crash_size), /proc/iomem still shows 256MB reserved: af000000-beffffff : Crash kernel Instead, it should show 50MB: af000000-b21fffff : Crash kernel Further shrinking crashkernel to 40MB causes a kernel crash with the following trace (x86): BUG: kernel NULL pointer dereference, address: 0000000000000038 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI Call Trace: ? __die_body.cold+0x19/0x27 ? page_fault_oops+0x15a/0x2f0 ? search_module_extables+0x19/0x60 ? search_bpf_extables+0x5f/0x80 ? exc_page_fault+0x7e/0x180 ? asm_exc_page_fault+0x26/0x30 ? __release_resource+0xd/0xb0 release_resource+0x26/0x40 __crash_shrink_memory+0xe5/0x110 crash_shrink_memory+0x12a/0x190 kexec_crash_size_store+0x41/0x80 kernfs_fop_write_iter+0x141/0x1f0 vfs_write+0x294/0x460 ksys_write+0x6d/0xf0 This happens because __crash_shrink_memory()/kernel/crash_core.c incorrectly updates the crashk_res resource object even when crashk_low_res should be updated. Fix this by ensuring the correct crashkernel resource object is updated when shrinking crashkernel memory. 
Link: https://lkml.kernel.org/r/20251101193741.289252-1-sourabhjain@linux.ibm.com Fixes: 16c6006af4d4 ("kexec: enable kexec_crash_size to support two crash kernel regions") Signed-off-by: Sourabh Jain Acked-by: Baoquan He Cc: Zhen Lei Cc: Signed-off-by: Andrew Morton --- kernel/crash_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 3b1c43382eec..99dac1aa972a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -373,7 +373,7 @@ static int __crash_shrink_memory(struct resource *old_res, old_res->start = 0; old_res->end = 0; } else { - crashk_res.end = ram_res->start - 1; + old_res->end = ram_res->start - 1; } crash_free_reserved_phys_range(ram_res->start, ram_res->end); From 3470715e5c22578c6ea4098b256d5a904e12eef2 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Mon, 3 Nov 2025 11:36:59 +0100 Subject: [PATCH 417/543] MAINTAINERS: update David Hildenbrand's email address Switch to kernel.org email address as I will be leaving Red Hat. The old address will remain active until end of January 2026, so performing the change now should make sure that most mails will reach me. 
Link: https://lkml.kernel.org/r/20251103103659.379335-1-david@kernel.org Signed-off-by: David Hildenbrand Signed-off-by: David Hildenbrand (Red Hat) Signed-off-by: Andrew Morton --- .mailmap | 1 + MAINTAINERS | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/.mailmap b/.mailmap index 369cfe467932..a14166f834a4 100644 --- a/.mailmap +++ b/.mailmap @@ -206,6 +206,7 @@ Danilo Krummrich David Brownell David Collins David Heidelberg +David Hildenbrand David Rheinsberg David Rheinsberg David Rheinsberg diff --git a/MAINTAINERS b/MAINTAINERS index 5b93346f464f..c39701eec3fe 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11526,7 +11526,7 @@ F: include/linux/platform_data/huawei-gaokun-ec.h HUGETLB SUBSYSTEM M: Muchun Song M: Oscar Salvador -R: David Hildenbrand +R: David Hildenbrand L: linux-mm@kvack.org S: Maintained F: Documentation/ABI/testing/sysfs-kernel-mm-hugepages @@ -13733,7 +13733,7 @@ KERNEL VIRTUAL MACHINE for s390 (KVM/s390) M: Christian Borntraeger M: Janosch Frank M: Claudio Imbrenda -R: David Hildenbrand +R: David Hildenbrand L: kvm@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git @@ -16220,7 +16220,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/linux.git F: drivers/devfreq/tegra30-devfreq.c MEMORY HOT(UN)PLUG -M: David Hildenbrand +M: David Hildenbrand M: Oscar Salvador L: linux-mm@kvack.org S: Maintained @@ -16245,7 +16245,7 @@ F: tools/mm/ MEMORY MANAGEMENT - CORE M: Andrew Morton -M: David Hildenbrand +M: David Hildenbrand R: Lorenzo Stoakes R: Liam R. 
Howlett R: Vlastimil Babka @@ -16301,7 +16301,7 @@ F: mm/execmem.c MEMORY MANAGEMENT - GUP (GET USER PAGES) M: Andrew Morton -M: David Hildenbrand +M: David Hildenbrand R: Jason Gunthorpe R: John Hubbard R: Peter Xu @@ -16317,7 +16317,7 @@ F: tools/testing/selftests/mm/gup_test.c MEMORY MANAGEMENT - KSM (Kernel Samepage Merging) M: Andrew Morton -M: David Hildenbrand +M: David Hildenbrand R: Xu Xin R: Chengming Zhou L: linux-mm@kvack.org @@ -16333,7 +16333,7 @@ F: mm/mm_slot.h MEMORY MANAGEMENT - MEMORY POLICY AND MIGRATION M: Andrew Morton -M: David Hildenbrand +M: David Hildenbrand R: Zi Yan R: Matthew Brost R: Joshua Hahn @@ -16373,7 +16373,7 @@ F: mm/workingset.c MEMORY MANAGEMENT - MISC M: Andrew Morton -M: David Hildenbrand +M: David Hildenbrand R: Lorenzo Stoakes R: Liam R. Howlett R: Vlastimil Babka @@ -16461,7 +16461,7 @@ F: mm/shuffle.h MEMORY MANAGEMENT - RECLAIM M: Andrew Morton M: Johannes Weiner -R: David Hildenbrand +R: David Hildenbrand R: Michal Hocko R: Qi Zheng R: Shakeel Butt @@ -16474,7 +16474,7 @@ F: mm/workingset.c MEMORY MANAGEMENT - RMAP (REVERSE MAPPING) M: Andrew Morton -M: David Hildenbrand +M: David Hildenbrand M: Lorenzo Stoakes R: Rik van Riel R: Liam R. Howlett @@ -16519,7 +16519,7 @@ F: mm/swapfile.c MEMORY MANAGEMENT - THP (TRANSPARENT HUGE PAGE) M: Andrew Morton -M: David Hildenbrand +M: David Hildenbrand M: Lorenzo Stoakes R: Zi Yan R: Baolin Wang @@ -16621,7 +16621,7 @@ MEMORY MAPPING - MADVISE (MEMORY ADVICE) M: Andrew Morton M: Liam R. Howlett M: Lorenzo Stoakes -M: David Hildenbrand +M: David Hildenbrand R: Vlastimil Babka R: Jann Horn L: linux-mm@kvack.org @@ -27088,7 +27088,7 @@ F: net/vmw_vsock/virtio_transport_common.c VIRTIO BALLOON M: "Michael S. 
Tsirkin" -M: David Hildenbrand +M: David Hildenbrand L: virtualization@lists.linux.dev S: Maintained F: drivers/virtio/virtio_balloon.c @@ -27243,7 +27243,7 @@ F: drivers/iommu/virtio-iommu.c F: include/uapi/linux/virtio_iommu.h VIRTIO MEM DRIVER -M: David Hildenbrand +M: David Hildenbrand L: virtualization@lists.linux.dev S: Maintained W: https://virtio-mem.gitlab.io/ From f1d47cafe513b5552a5b20a7af0936d9070a8a78 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 5 Nov 2025 11:29:10 -0500 Subject: [PATCH 418/543] mm/huge_memory: fix folio split check for anon folios in swapcache Both the uniform and non-uniform split checks missed the check to prevent splitting anon folios in swapcache to non-zero order. Splitting anon folios in swapcache to non-zero order can cause data corruption since swapcache only supports PMD order and order-0 entries. This can happen when one uses split_huge_pages under debugfs to split anon folios in swapcache. In-tree callers do not perform such an illegal operation. Only the debugfs interface could trigger it. I will put adding a test case on my TODO list. Fix the check. Link: https://lkml.kernel.org/r/20251105162910.752266-1-ziy@nvidia.com Fixes: 58729c04cf10 ("mm/huge_memory: add buddy allocator like (non-uniform) folio_split()") Signed-off-by: Zi Yan Reported-by: "David Hildenbrand (Red Hat)" Closes: https://lore.kernel.org/all/dc0ecc2c-4089-484f-917f-920fdca4c898@kernel.org/ Acked-by: David Hildenbrand (Red Hat) Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Nico Pache Cc: Ryan Roberts Cc: Wei Yang Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 323654fb4f8c..2f2a521e5d68 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3522,7 +3522,8 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, /* order-1 is not supported for anonymous THP.
*/ VM_WARN_ONCE(warns && new_order == 1, "Cannot split to order-1 folio"); - return new_order != 1; + if (new_order == 1) + return false; } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !mapping_large_folio_support(folio->mapping)) { /* @@ -3553,7 +3554,8 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order, if (folio_test_anon(folio)) { VM_WARN_ONCE(warns && new_order == 1, "Cannot split to order-1 folio"); - return new_order != 1; + if (new_order == 1) + return false; } else if (new_order) { if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !mapping_large_folio_support(folio->mapping)) { From a26ec8f3d4e56d4a7ffa301e8032dca9df0bbc05 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 6 Nov 2025 17:06:35 -0500 Subject: [PATCH 419/543] lib/test_kho: check if KHO is enabled We must check whether KHO is enabled prior to issuing KHO commands, otherwise KHO internal data structures are not initialized. Link: https://lkml.kernel.org/r/20251106220635.2608494-1-pasha.tatashin@soleen.com Fixes: b753522bed0b ("kho: add test for kexec handover") Signed-off-by: Pasha Tatashin Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202511061629.e242724-lkp@intel.com Reviewed-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Signed-off-by: Andrew Morton --- lib/test_kho.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/test_kho.c b/lib/test_kho.c index 60cd899ea745..fff018e5548d 100644 --- a/lib/test_kho.c +++ b/lib/test_kho.c @@ -301,6 +301,9 @@ static int __init kho_test_init(void) phys_addr_t fdt_phys; int err; + if (!kho_is_enabled()) + return 0; + err = kho_retrieve_subtree(KHO_TEST_FDT, &fdt_phys); if (!err) return kho_test_restore(fdt_phys); From 216158f063fe24fb003bd7da0cd92cd6e2c4d48b Mon Sep 17 00:00:00 2001 From: Ankit Khushwaha Date: Thu, 6 Nov 2025 15:25:32 +0530 Subject: [PATCH 420/543] selftests/user_events: fix type cast for write_index packed member in perf_test Accessing 
'reg.write_index' directly triggers a -Waddress-of-packed-member warning due to potential unaligned pointer access: perf_test.c:239:38: warning: taking address of packed member 'write_index' of class or structure 'user_reg' may result in an unaligned pointer value [-Waddress-of-packed-member] 239 | ASSERT_NE(-1, write(self->data_fd, &reg.write_index, | ^~~~~~~~~~~~~~~ Since write(2) works with any alignment, cast '&reg.write_index' explicitly to 'void *' to suppress this warning. Link: https://lkml.kernel.org/r/20251106095532.15185-1-ankitkhushwaha.linux@gmail.com Fixes: 42187bdc3ca4 ("selftests/user_events: Add perf self-test for empty arguments events") Signed-off-by: Ankit Khushwaha Cc: Beau Belgrave Cc: "Masami Hiramatsu (Google)" Cc: Steven Rostedt Cc: sunliming Cc: Wei Yang Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/user_events/perf_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/user_events/perf_test.c b/tools/testing/selftests/user_events/perf_test.c index 5288e768b207..68625362add2 100644 --- a/tools/testing/selftests/user_events/perf_test.c +++ b/tools/testing/selftests/user_events/perf_test.c @@ -236,7 +236,7 @@ TEST_F(user, perf_empty_events) { ASSERT_EQ(1 << reg.enable_bit, self->check); /* Ensure write shows up at correct offset */ - ASSERT_NE(-1, write(self->data_fd, &reg.write_index, + ASSERT_NE(-1, write(self->data_fd, (void *)&reg.write_index, sizeof(reg.write_index))); val = (void *)(((char *)perf_page) + perf_page->data_offset); ASSERT_EQ(PERF_RECORD_SAMPLE, *val); From 1c2a936edd71e133f2806e68324ec81a4eb07588 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 11 Nov 2025 21:36:08 +0800 Subject: [PATCH 421/543] mm, swap: fix potential UAF issue for VMA readahead Since commit 78524b05f1a3 ("mm, swap: avoid redundant swap device pinning"), the common helper for allocating and preparing a folio in the swap cache layer no longer tries to get a swap device reference
internally, because all callers of __read_swap_cache_async are already holding a swap entry reference. The repeated swap device pinning isn't needed on the same swap device. Caller of VMA readahead is also holding a reference to the target entry's swap device, but VMA readahead walks the page table, so it might encounter swap entries from other devices, and call __read_swap_cache_async on another device without holding a reference to it. So it is possible to cause a UAF when swapoff of device A raced with swapin on device B, and VMA readahead tries to read swap entries from device A. It's not easy to trigger, but in theory, it could cause real issues. Make VMA readahead try to get the device reference first if the swap device is a different one from the target entry. Link: https://lkml.kernel.org/r/20251111-swap-fix-vma-uaf-v1-1-41c660e58562@tencent.com Fixes: 78524b05f1a3 ("mm, swap: avoid redundant swap device pinning") Suggested-by: Huang Ying Signed-off-by: Kairui Song Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: Kemeng Shi Cc: Nhat Pham Cc: Signed-off-by: Andrew Morton --- mm/swap_state.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mm/swap_state.c b/mm/swap_state.c index b13e9c4baa90..f4980dde5394 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -748,6 +748,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, blk_start_plug(&plug); for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) { + struct swap_info_struct *si = NULL; + if (!pte++) { pte = pte_offset_map(vmf->pmd, addr); if (!pte) @@ -761,8 +763,19 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, continue; pte_unmap(pte); pte = NULL; + /* + * Readahead entry may come from a device that we are not + * holding a reference to, try to grab a reference, or skip. 
+ */ + if (swp_type(entry) != swp_type(targ_entry)) { + si = get_swap_device(entry); + if (!si) + continue; + } folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); + if (si) + put_swap_device(si); if (!folio) continue; if (page_allocated) { From 91f815b7073d8f1abb5f3a3a7bf6b9466a6e5e8f Mon Sep 17 00:00:00 2001 From: Chao-ying Fu Date: Thu, 13 Nov 2025 17:10:32 +0100 Subject: [PATCH 422/543] riscv: Update MIPS vendor id to 0x127 [1] defines MIPS vendor id as 0x127. All previous MIPS RISC-V patches were tested on QEMU, also modified to use 0x722 as MIPS_VENDOR_ID. This new value should reflect real hardware. [1] https://mips.com/wp-content/uploads/2025/06/P8700_Programmers_Reference_Manual_Rev1.84_5-31-2025.pdf Fixes: a8fed1bc03ac ("riscv: Add xmipsexectl as a vendor extension") Signed-off-by: Chao-ying Fu Signed-off-by: Aleksa Paunovic Link: https://patch.msgid.link/20251113-mips-vendorid-v2-1-3279489b7f84@htecgroup.com Cc: Signed-off-by: Paul WAlmsley --- arch/riscv/include/asm/vendorid_list.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/vendorid_list.h b/arch/riscv/include/asm/vendorid_list.h index 3b09874d7a6d..7f5030ee1fcf 100644 --- a/arch/riscv/include/asm/vendorid_list.h +++ b/arch/riscv/include/asm/vendorid_list.h @@ -7,8 +7,8 @@ #define ANDES_VENDOR_ID 0x31e #define MICROCHIP_VENDOR_ID 0x029 +#define MIPS_VENDOR_ID 0x127 #define SIFIVE_VENDOR_ID 0x489 #define THEAD_VENDOR_ID 0x5b7 -#define MIPS_VENDOR_ID 0x722 #endif From 1107aac1ad7f445a83604b14af7be47f1a795c66 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Fri, 14 Nov 2025 23:44:21 +0900 Subject: [PATCH 423/543] firewire: core: fix to update generation field in topology map The generation field of topology map is updated after initialized by zero. The updated value of generation field is always zero, and is against specification. This commit fixes the bug. 
Fixes: 7d138cb269db ("firewire: core: use spin lock specific to topology map") Link: https://lore.kernel.org/r/20251114144421.415278-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-topology.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 2f73bcd5696f..ed3ae8cdb0cd 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -441,12 +441,13 @@ static void update_topology_map(__be32 *buffer, size_t buffer_size, int root_nod const u32 *self_ids, int self_id_count) { __be32 *map = buffer; + u32 next_generation = be32_to_cpu(buffer[1]) + 1; int node_count = (root_node_id & 0x3f) + 1; memset(map, 0, buffer_size); *map++ = cpu_to_be32((self_id_count + 2) << 16); - *map++ = cpu_to_be32(be32_to_cpu(buffer[1]) + 1); + *map++ = cpu_to_be32(next_generation); *map++ = cpu_to_be32((node_count << 16) | self_id_count); while (self_id_count--) From 7b090e7b910cafd245f23e56f3257a151ca0a289 Mon Sep 17 00:00:00 2001 From: Mayuresh Chitale Date: Fri, 14 Nov 2025 06:58:06 +0000 Subject: [PATCH 424/543] riscv: sbi: Prefer SRST shutdown over legacy Currently, the sbi_init() always attempts to register the legacy shutdown function as the sys-off handler which is fine when RISCV_SBI_V01 is not enabled. However, if RISCV_SBI_V01 is enabled in the kernel and the SBI v0.1 is not supported by the underlying SBI implementation then the legacy shutdown fails. Fix this by not registering the legacy shutdown when SRST shutdown is available. 
Fixes: 70ddf86d76c1 ("riscv: sbi: Switch to new sys-off handler API") Signed-off-by: Mayuresh Chitale Reviewed-by: Anup Patel Link: https://patch.msgid.link/20251114065808.304430-1-mchitale@ventanamicro.com Signed-off-by: Paul Walmsley --- arch/riscv/kernel/sbi.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 5e8cde055264..c443337056ab 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -648,9 +648,9 @@ int sbi_debug_console_read(char *bytes, unsigned int num_bytes) void __init sbi_init(void) { + bool srst_power_off = false; int ret; - sbi_set_power_off(); ret = sbi_get_spec_version(); if (ret > 0) sbi_spec_version = ret; @@ -683,6 +683,7 @@ void __init sbi_init(void) sbi_probe_extension(SBI_EXT_SRST)) { pr_info("SBI SRST extension detected\n"); register_platform_power_off(sbi_srst_power_off); + srst_power_off = true; sbi_srst_reboot_nb.notifier_call = sbi_srst_reboot; sbi_srst_reboot_nb.priority = 192; register_restart_handler(&sbi_srst_reboot_nb); @@ -702,4 +703,7 @@ void __init sbi_init(void) __sbi_send_ipi = __sbi_send_ipi_v01; __sbi_rfence = __sbi_rfence_v01; } + + if (!srst_power_off) + sbi_set_power_off(); } From e2cb69263e797c0aa6676bcef23e9e27e44c83b0 Mon Sep 17 00:00:00 2001 From: Chen Pei Date: Fri, 14 Nov 2025 15:12:15 +0800 Subject: [PATCH 425/543] tools: riscv: Fixed misalignment of CSR related definitions The file tools/arch/riscv/include/asm/csr.h borrows from arch/riscv/include/asm/csr.h, and subsequent modifications related to CSR should maintain consistency. 
Signed-off-by: Chen Pei Link: https://patch.msgid.link/20251114071215.816-1-cp0613@linux.alibaba.com [pjw@kernel.org: dropped Fixes: lines for patches that weren't broken; removed superfluous blank line] Signed-off-by: Paul Walmsley --- drivers/perf/riscv_pmu_sbi.c | 2 +- tools/arch/riscv/include/asm/csr.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index e255c1b069ec..7dd282da67ce 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -1109,7 +1109,7 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev) /* compute hardware counter index */ hidx = info->csr - CSR_CYCLE; - /* check if the corresponding bit is set in sscountovf or overflow mask in shmem */ + /* check if the corresponding bit is set in scountovf or overflow mask in shmem */ if (!(overflow & BIT(hidx))) continue; diff --git a/tools/arch/riscv/include/asm/csr.h b/tools/arch/riscv/include/asm/csr.h index 56d7367ee344..21d8cee04638 100644 --- a/tools/arch/riscv/include/asm/csr.h +++ b/tools/arch/riscv/include/asm/csr.h @@ -167,7 +167,8 @@ #define VSIP_TO_HVIP_SHIFT (IRQ_VS_SOFT - IRQ_S_SOFT) #define VSIP_VALID_MASK ((_AC(1, UL) << IRQ_S_SOFT) | \ (_AC(1, UL) << IRQ_S_TIMER) | \ - (_AC(1, UL) << IRQ_S_EXT)) + (_AC(1, UL) << IRQ_S_EXT) | \ + (_AC(1, UL) << IRQ_PMU_OVF)) /* AIA CSR bits */ #define TOPI_IID_SHIFT 16 @@ -280,7 +281,7 @@ #define CSR_HPMCOUNTER30H 0xc9e #define CSR_HPMCOUNTER31H 0xc9f -#define CSR_SSCOUNTOVF 0xda0 +#define CSR_SCOUNTOVF 0xda0 #define CSR_SSTATUS 0x100 #define CSR_SIE 0x104 From 6a23ae0a96a600d1d12557add110e0bb6e32730c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 16 Nov 2025 14:25:38 -0800 Subject: [PATCH 426/543] Linux 6.18-rc6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index fb4389aa5d5f..d763c2c75cdb 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 18 SUBLEVEL = 
0 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Baby Opossum Posse # *DOCUMENTATION* From 36c6f3c03d104faf1aa90922f2310549c175420f Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 17 Nov 2025 20:53:10 +0800 Subject: [PATCH 427/543] sched_ext: Use IRQ_WORK_INIT_HARD() to initialize rq->scx.kick_cpus_irq_work For PREEMPT_RT kernels, the kick_cpus_irq_workfn() be invoked in the per-cpu irq_work/* task context and there is no rcu-read critical section to protect. this commit therefore use IRQ_WORK_INIT_HARD() to initialize the per-cpu rq->scx.kick_cpus_irq_work in the init_sched_ext_class(). Signed-off-by: Zqiang Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 07399210ac2d..7aae1d0ce37e 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5322,7 +5322,7 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); - init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); + rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); if (cpu_online(cpu)) cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; From 159de7a825aea4242d3f8d32de5853d269dbe72f Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Fri, 7 Nov 2025 09:17:11 +1000 Subject: [PATCH 428/543] nvmet-auth: update sc_c in target host hash calculation Commit 7e091add9c43 "nvme-auth: update sc_c in host response" added the sc_c variable to the dhchap queue context structure which is appropriately set during negotiate and then used in the host response. This breaks secure concat connections with a Linux target as the target code wasn't updated at the same time. This patch fixes this by adding a new sc_c variable to the host hash calculations. 
Fixes: 7e091add9c43 ("nvme-auth: update sc_c in host response") Tested-by: Shin'ichiro Kawasaki Tested-by: Yi Zhang Reviewed-by: Martin George Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Alistair Francis Signed-off-by: Keith Busch --- drivers/nvme/target/auth.c | 4 ++-- drivers/nvme/target/fabrics-cmd-auth.c | 1 + drivers/nvme/target/nvmet.h | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index ceba21684e82..300d5e032f6d 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -298,7 +298,7 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, const char *hash_name; u8 *challenge = req->sq->dhchap_c1; struct nvme_dhchap_key *transformed_key; - u8 buf[4], sc_c = ctrl->concat ? 1 : 0; + u8 buf[4]; int ret; hash_name = nvme_auth_hmac_name(ctrl->shash_id); @@ -367,7 +367,7 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, ret = crypto_shash_update(shash, buf, 2); if (ret) goto out; - *buf = sc_c; + *buf = req->sq->sc_c; ret = crypto_shash_update(shash, buf, 1); if (ret) goto out; diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index bf01ec414c55..5946681cb0e3 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -43,6 +43,7 @@ static u8 nvmet_auth_negotiate(struct nvmet_req *req, void *d) data->auth_protocol[0].dhchap.halen, data->auth_protocol[0].dhchap.dhlen); req->sq->dhchap_tid = le16_to_cpu(data->t_id); + req->sq->sc_c = data->sc_c; if (data->sc_c != NVME_AUTH_SECP_NOSC) { if (!IS_ENABLED(CONFIG_NVME_TARGET_TCP_TLS)) return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 51df72f5e89b..f3b09f4099f0 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -159,6 +159,7 @@ struct nvmet_sq { bool authenticated; struct delayed_work 
auth_expired_work; u16 dhchap_tid; + u8 sc_c; u8 dhchap_status; u8 dhchap_step; u8 *dhchap_c1; From 6d87cd5335784351280f82c47cc8a657271929c3 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Mon, 17 Nov 2025 11:23:39 +0900 Subject: [PATCH 429/543] nvme-multipath: fix lockdep WARN due to partition scan work Blktests test cases nvme/014, 057 and 058 fail occasionally due to a lockdep WARN. As reported in the Closes tag URL, the WARN indicates that a deadlock can happen due to the dependency among disk->open_mutex, kblockd workqueue completion and partition_scan_work completion. To avoid the lockdep WARN and the potential deadlock, cut the dependency by running the partition_scan_work not by kblockd workqueue but by nvme_wq. Reported-by: Yi Zhang Closes: https://lore.kernel.org/linux-block/CAHj4cs8mJ+R_GmQm9R8ebResKAWUE8kF5+_WVg0v8zndmqd6BQ@mail.gmail.com/ Link: https://lore.kernel.org/linux-block/oeyzci6ffshpukpfqgztsdeke5ost5hzsuz4rrsjfmvpqcevax@5nhnwbkzbrpa/ Fixes: 1f021341eef4 ("nvme-multipath: defer partition scanning") Signed-off-by: Shin'ichiro Kawasaki Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 543e17aead12..e35eccacee8c 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -793,7 +793,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) return; } nvme_add_ns_head_cdev(head); - kblockd_schedule_work(&head->partition_scan_work); + queue_work(nvme_wq, &head->partition_scan_work); } nvme_mpath_add_sysfs_link(ns->head); From ea3442efabd0aa3930c5bab73c3901ef38ef6ac3 Mon Sep 17 00:00:00 2001 From: "Ewan D. Milne" Date: Mon, 10 Nov 2025 16:20:00 -0500 Subject: [PATCH 430/543] nvme: nvme-fc: move tagset removal to nvme_fc_delete_ctrl() Now target is removed from nvme_fc_ctrl_free() which is the ctrl->ref release handler. 
And even admin queue is unquiesced there, this way is definitely wrong because the ctrl->ref is grabbed when submitting command. And Marco observed that nvme_fc_ctrl_free() can be called from request completion code path, and trigger kernel warning since request completes from softirq context. Fix the issue by moving target removal into nvme_fc_delete_ctrl(), which is also aligned with nvme-tcp and nvme-rdma. Patch originally proposed by Ming Lei, then modified to move the tagset removal down to after nvme_fc_delete_association() after further testing. Cc: Marco Patalano Cc: Ewan Milne Cc: James Smart Cc: Sagi Grimberg Signed-off-by: Ming Lei Cc: stable@vger.kernel.org Tested-by: Marco Patalano Reviewed-by: Justin Tee Signed-off-by: Ewan D. Milne Signed-off-by: Keith Busch --- drivers/nvme/host/fc.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 03987f497a5b..2dd8adf1139e 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2355,17 +2355,11 @@ nvme_fc_ctrl_free(struct kref *ref) container_of(ref, struct nvme_fc_ctrl, ref); unsigned long flags; - if (ctrl->ctrl.tagset) - nvme_remove_io_tag_set(&ctrl->ctrl); - /* remove from rport list */ spin_lock_irqsave(&ctrl->rport->lock, flags); list_del(&ctrl->ctrl_list); spin_unlock_irqrestore(&ctrl->rport->lock, flags); - nvme_unquiesce_admin_queue(&ctrl->ctrl); - nvme_remove_admin_tag_set(&ctrl->ctrl); - kfree(ctrl->queues); put_device(ctrl->dev); @@ -3261,11 +3255,18 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) cancel_work_sync(&ctrl->ioerr_work); cancel_delayed_work_sync(&ctrl->connect_work); + /* * kill the association on the link side.
this will block * waiting for io to terminate */ nvme_fc_delete_association(ctrl); + + if (ctrl->ctrl.tagset) + nvme_remove_io_tag_set(&ctrl->ctrl); + + nvme_unquiesce_admin_queue(&ctrl->ctrl); + nvme_remove_admin_tag_set(&ctrl->ctrl); } static void From 0a2c5495b6d1ecb0fa18ef6631450f391a888256 Mon Sep 17 00:00:00 2001 From: "Ewan D. Milne" Date: Mon, 10 Nov 2025 16:20:01 -0500 Subject: [PATCH 431/543] nvme: nvme-fc: Ensure ->ioerr_work is cancelled in nvme_fc_delete_ctrl() nvme_fc_delete_association() waits for pending I/O to complete before returning, and an error can cause ->ioerr_work to be queued after cancel_work_sync() had been called. Move the call to cancel_work_sync() to be after nvme_fc_delete_association() to ensure ->ioerr_work is not running when the nvme_fc_ctrl object is freed. Otherwise the following can occur: [ 1135.911754] list_del corruption, ff2d24c8093f31f8->next is NULL [ 1135.917705] ------------[ cut here ]------------ [ 1135.922336] kernel BUG at lib/list_debug.c:52! [ 1135.926784] Oops: invalid opcode: 0000 [#1] SMP NOPTI [ 1135.931851] CPU: 48 UID: 0 PID: 726 Comm: kworker/u449:23 Kdump: loaded Not tainted 6.12.0 #1 PREEMPT(voluntary) [ 1135.943490] Hardware name: Dell Inc.
PowerEdge R660/0HGTK9, BIOS 2.5.4 01/16/2025 [ 1135.950969] Workqueue: 0x0 (nvme-wq) [ 1135.954673] RIP: 0010:__list_del_entry_valid_or_report.cold+0xf/0x6f [ 1135.961041] Code: c7 c7 98 68 72 94 e8 26 45 fe ff 0f 0b 48 c7 c7 70 68 72 94 e8 18 45 fe ff 0f 0b 48 89 fe 48 c7 c7 80 69 72 94 e8 07 45 fe ff <0f> 0b 48 89 d1 48 c7 c7 a0 6a 72 94 48 89 c2 e8 f3 44 fe ff 0f 0b [ 1135.979788] RSP: 0018:ff579b19482d3e50 EFLAGS: 00010046 [ 1135.985015] RAX: 0000000000000033 RBX: ff2d24c8093f31f0 RCX: 0000000000000000 [ 1135.992148] RDX: 0000000000000000 RSI: ff2d24d6bfa1d0c0 RDI: ff2d24d6bfa1d0c0 [ 1135.999278] RBP: ff2d24c8093f31f8 R08: 0000000000000000 R09: ffffffff951e2b08 [ 1136.006413] R10: ffffffff95122ac8 R11: 0000000000000003 R12: ff2d24c78697c100 [ 1136.013546] R13: fffffffffffffff8 R14: 0000000000000000 R15: ff2d24c78697c0c0 [ 1136.020677] FS: 0000000000000000(0000) GS:ff2d24d6bfa00000(0000) knlGS:0000000000000000 [ 1136.028765] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1136.034510] CR2: 00007fd207f90b80 CR3: 000000163ea22003 CR4: 0000000000f73ef0 [ 1136.041641] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1136.048776] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 [ 1136.055910] PKRU: 55555554 [ 1136.058623] Call Trace: [ 1136.061074] [ 1136.063179] ? show_trace_log_lvl+0x1b0/0x2f0 [ 1136.067540] ? show_trace_log_lvl+0x1b0/0x2f0 [ 1136.071898] ? move_linked_works+0x4a/0xa0 [ 1136.075998] ? __list_del_entry_valid_or_report.cold+0xf/0x6f [ 1136.081744] ? __die_body.cold+0x8/0x12 [ 1136.085584] ? die+0x2e/0x50 [ 1136.088469] ? do_trap+0xca/0x110 [ 1136.091789] ? do_error_trap+0x65/0x80 [ 1136.095543] ? __list_del_entry_valid_or_report.cold+0xf/0x6f [ 1136.101289] ? exc_invalid_op+0x50/0x70 [ 1136.105127] ? __list_del_entry_valid_or_report.cold+0xf/0x6f [ 1136.110874] ? asm_exc_invalid_op+0x1a/0x20 [ 1136.115059] ? 
__list_del_entry_valid_or_report.cold+0xf/0x6f [ 1136.120806] move_linked_works+0x4a/0xa0 [ 1136.124733] worker_thread+0x216/0x3a0 [ 1136.128485] ? __pfx_worker_thread+0x10/0x10 [ 1136.132758] kthread+0xfa/0x240 [ 1136.135904] ? __pfx_kthread+0x10/0x10 [ 1136.139657] ret_from_fork+0x31/0x50 [ 1136.143236] ? __pfx_kthread+0x10/0x10 [ 1136.146988] ret_from_fork_asm+0x1a/0x30 [ 1136.150915] Fixes: 19fce0470f05 ("nvme-fc: avoid calling _nvme_fc_abort_outstanding_ios from interrupt context") Cc: stable@vger.kernel.org Tested-by: Marco Patalano Reviewed-by: Justin Tee Signed-off-by: Ewan D. Milne Signed-off-by: Keith Busch --- drivers/nvme/host/fc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 2dd8adf1139e..2c903729b0b9 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3253,7 +3253,6 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) { struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); - cancel_work_sync(&ctrl->ioerr_work); cancel_delayed_work_sync(&ctrl->connect_work); /* @@ -3261,6 +3260,7 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) * waiting for io to terminate */ nvme_fc_delete_association(ctrl); + cancel_work_sync(&ctrl->ioerr_work); if (ctrl->ctrl.tagset) nvme_remove_io_tag_set(&ctrl->ctrl); From fa766e759ff7b128ab77323d9d9c232434621bb6 Mon Sep 17 00:00:00 2001 From: Dnyaneshwar Bhadane Date: Thu, 16 Oct 2025 18:45:17 +0530 Subject: [PATCH 432/543] drm/i915/xe3lpd: Load DMC for Xe3_LPD version 30.02 Load the DMC for Xe3_LPD version 30.02. 
Fixes: 3c0f211bc8fc ("drm/xe: Add Wildcat Lake device IDs to PTL list") Signed-off-by: Dnyaneshwar Bhadane Reviewed-by: Gustavo Sousa Reviewed-by: Chaitanya Kumar Borah Link: https://lore.kernel.org/r/20251016131517.2032684-1-dnyaneshwar.bhadane@intel.com Signed-off-by: Gustavo Sousa (cherry picked from commit a63db39a578b543f5e5719b9f14dd82d3b8648d1) Signed-off-by: Rodrigo Vivi [Rodrigo added the Fixes tag while cherry-picking to fixes] --- drivers/gpu/drm/i915/display/intel_dmc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dmc.c b/drivers/gpu/drm/i915/display/intel_dmc.c index 4a4cace1f879..e1455fd7277f 100644 --- a/drivers/gpu/drm/i915/display/intel_dmc.c +++ b/drivers/gpu/drm/i915/display/intel_dmc.c @@ -127,6 +127,9 @@ static bool dmc_firmware_param_disabled(struct intel_display *display) #define DISPLAY_VER13_DMC_MAX_FW_SIZE 0x20000 #define DISPLAY_VER12_DMC_MAX_FW_SIZE ICL_DMC_MAX_FW_SIZE +#define XE3LPD_3002_DMC_PATH DMC_PATH(xe3lpd_3002) +MODULE_FIRMWARE(XE3LPD_3002_DMC_PATH); + #define XE3LPD_DMC_PATH DMC_PATH(xe3lpd) MODULE_FIRMWARE(XE3LPD_DMC_PATH); @@ -183,9 +186,10 @@ static const char *dmc_firmware_default(struct intel_display *display, u32 *size { const char *fw_path = NULL; u32 max_fw_size = 0; - - if (DISPLAY_VERx100(display) == 3002 || - DISPLAY_VERx100(display) == 3000) { + if (DISPLAY_VERx100(display) == 3002) { + fw_path = XE3LPD_3002_DMC_PATH; + max_fw_size = XE2LPD_DMC_MAX_FW_SIZE; + } else if (DISPLAY_VERx100(display) == 3000) { fw_path = XE3LPD_DMC_PATH; max_fw_size = XE2LPD_DMC_MAX_FW_SIZE; } else if (DISPLAY_VERx100(display) == 2000) { From 949f1fd2225baefbea2995afa807dba5cbdb6bd3 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Mon, 17 Nov 2025 08:42:31 +0000 Subject: [PATCH 433/543] nouveau/firmware: Add missing kfree() of nvkm_falcon_fw::boot nvkm_falcon_fw::boot is allocated, but no one frees it. This causes a kmemleak warning. Make sure this data is deallocated. 
Fixes: 2541626cfb79 ("drm/nouveau/acr: use common falcon HS FW code for ACR FWs") Signed-off-by: Nam Cao Cc: stable@vger.kernel.org Reviewed-by: Lyude Paul Signed-off-by: Lyude Paul Link: https://patch.msgid.link/20251117084231.2910561-1-namcao@linutronix.de --- drivers/gpu/drm/nouveau/nvkm/falcon/fw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c b/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c index cac6d64ab67d..4e8b3f1c7e25 100644 --- a/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c +++ b/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c @@ -159,6 +159,8 @@ nvkm_falcon_fw_dtor(struct nvkm_falcon_fw *fw) nvkm_memory_unref(&fw->inst); nvkm_falcon_fw_dtor_sigs(fw); nvkm_firmware_dtor(&fw->fw); + kfree(fw->boot); + fw->boot = NULL; } static const struct nvkm_firmware_func From da02a1824884d6c84c5e5b5ac373b0c9e3288ec2 Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov Date: Wed, 12 Nov 2025 19:27:24 +0100 Subject: [PATCH 434/543] s390/ctcm: Fix double-kfree The function 'mpc_rcvd_sweep_req(mpcginfo)' is called conditionally from function 'ctcmpc_unpack_skb'. It frees passed mpcginfo. After that a call to function 'kfree' in function 'ctcmpc_unpack_skb' frees it again. Remove 'kfree' call in function 'mpc_rcvd_sweep_req(mpcginfo)'. Bug detected by the clang static analyzer. 
Fixes: 0c0b20587b9f25a2 ("s390/ctcm: fix potential memory leak") Reviewed-by: Aswin Karuvally Signed-off-by: Aleksei Nikiforov Signed-off-by: Aswin Karuvally Reviewed-by: Simon Horman Link: https://patch.msgid.link/20251112182724.1109474-1-aswin@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/s390/net/ctcm_mpc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/s390/net/ctcm_mpc.c b/drivers/s390/net/ctcm_mpc.c index 0aeafa772fb1..407b7c516658 100644 --- a/drivers/s390/net/ctcm_mpc.c +++ b/drivers/s390/net/ctcm_mpc.c @@ -701,7 +701,6 @@ static void mpc_rcvd_sweep_req(struct mpcg_info *mpcginfo) grp->sweep_req_pend_num--; ctcmpc_send_sweep_resp(ch); - kfree(mpcginfo); return; } From bed22c7b90af732978715a1789bca1c3cfa245a6 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 16 Nov 2025 10:10:29 +0200 Subject: [PATCH 435/543] selftests: net: lib: Do not overwrite error messages ret_set_ksft_status() calls ksft_status_merge() with the current return status and the last one. It treats a non-zero return code from ksft_status_merge() as an indication that the return status was overwritten by the last one and therefore overwrites the return message with the last one. Currently, ksft_status_merge() returns a non-zero return code even if the current return status and the last one are equal. This results in return messages being overwritten which is counter-productive since we are more interested in the first failure message and not the last one. Fix by changing ksft_status_merge() to only return a non-zero return code if the current return status was actually changed. Add a test case which checks that the first error message is not overwritten. Before: # ./lib_sh_test.sh [...] TEST: RET tfail2 tfail -> fail [FAIL] retmsg=tfail expected tfail2 [...] # echo $? 1 After: # ./lib_sh_test.sh [...] TEST: RET tfail2 tfail -> fail [ OK ] [...] # echo $? 
0 Fixes: 596c8819cb78 ("selftests: forwarding: Have RET track kselftest framework constants") Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20251116081029.69112-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib_sh_test.sh | 7 +++++++ tools/testing/selftests/net/lib.sh | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/lib_sh_test.sh b/tools/testing/selftests/net/forwarding/lib_sh_test.sh index ff2accccaf4d..b4eda6c6199e 100755 --- a/tools/testing/selftests/net/forwarding/lib_sh_test.sh +++ b/tools/testing/selftests/net/forwarding/lib_sh_test.sh @@ -30,6 +30,11 @@ tfail() do_test "tfail" false } +tfail2() +{ + do_test "tfail2" false +} + txfail() { FAIL_TO_XFAIL=yes do_test "txfail" false @@ -132,6 +137,8 @@ test_ret() ret_subtest $ksft_fail "tfail" txfail tfail ret_subtest $ksft_xfail "txfail" txfail txfail + + ret_subtest $ksft_fail "tfail2" tfail2 tfail } exit_status_tests_run() diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index feba4ef69a54..f448bafb3f20 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -43,7 +43,7 @@ __ksft_status_merge() weights[$i]=$((weight++)) done - if [[ ${weights[$a]} > ${weights[$b]} ]]; then + if [[ ${weights[$a]} -ge ${weights[$b]} ]]; then echo "$a" return 0 else From 8e0a754b0836d996802713bbebc87bc1cc17925c Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 13 Nov 2025 18:19:38 +0100 Subject: [PATCH 436/543] net: airoha: Do not loopback traffic to GDM2 if it is available on the device Airoha_eth driver forwards offloaded uplink traffic (packets received on GDM1 and forwarded to GDM{3,4}) to GDM2 in order to apply hw QoS. This is correct if the device does not support a dedicated GDM2 port. In this case, in order to enable hw offloading for uplink traffic, the packets should be sent to GDM{3,4} directly. 
Fixes: 9cd451d414f6 ("net: airoha: Add loopback support for GDM2") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20251113-airoha-hw-offload-gdm2-fix-v1-1-7e4ca300872f@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_ppe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index 691361b25407..c0e17035db18 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -282,7 +282,7 @@ static int airoha_ppe_foe_entry_prepare(struct airoha_eth *eth, if (!airoha_is_valid_gdm_port(eth, port)) return -EINVAL; - if (dsa_port >= 0) + if (dsa_port >= 0 || eth->ports[1]) pse_port = port->id == 4 ? FE_PSE_PORT_GDM4 : port->id; else From ae8966b7b5bd69b86209cc34bcca1ba9f18b68e6 Mon Sep 17 00:00:00 2001 From: Peter Hutterer Date: Thu, 6 Nov 2025 21:45:34 +1000 Subject: [PATCH 437/543] Input: rename INPUT_PROP_HAPTIC_TOUCHPAD to INPUT_PROP_PRESSUREPAD And expand it to encompass all pressure pads. Definition: "pressure pad" as used here as includes all touchpads that use physical pressure to convert to click, without physical hinges. Also called haptic touchpads in general parlance, Synaptics calls them ForcePads. Most (all?) pressure pads are currently advertised as INPUT_PROP_BUTTONPAD. The suggestion to identify them as pressure pads by defining the resolution on ABS_MT_PRESSURE has been in the docs since commit 20ccc8dd38a3 ("Documentation: input: define ABS_PRESSURE/ABS_MT_PRESSURE resolution as grams") but few devices provide this information. In userspace it's thus impossible to determine whether a device is a true pressure pad (pressure equals pressure) or a normal clickpad with (pressure equals finger size). 
Commit 7075ae4ac9db ("Input: add INPUT_PROP_HAPTIC_TOUCHPAD") introduces INPUT_PROP_HAPTIC_TOUCHPAD but restricted it to those touchpads that have support for userspace-controlled effects. Let's expand and rename that definition to include all pressure pad touchpads since those that do support FF effects can be identified by the presence of the FF_HAPTIC bit. This means: - clickpad: INPUT_PROP_BUTTONPAD - pressurepad: INPUT_PROP_BUTTONPAD + INPUT_PROP_PRESSUREPAD - pressurepad with configurable haptics: INPUT_PROP_BUTTONPAD + INPUT_PROP_PRESSUREPAD + FF_HAPTIC Signed-off-by: Peter Hutterer Acked-by: Benjamin Tissoires Link: https://patch.msgid.link/20251106114534.GA405512@tassie Signed-off-by: Dmitry Torokhov --- Documentation/input/event-codes.rst | 25 ++++++++++++++++++------- drivers/hid/hid-haptic.c | 2 +- include/uapi/linux/input-event-codes.h | 2 +- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/Documentation/input/event-codes.rst b/Documentation/input/event-codes.rst index 1ead9bb8d9c6..4424cbff251f 100644 --- a/Documentation/input/event-codes.rst +++ b/Documentation/input/event-codes.rst @@ -400,19 +400,30 @@ can report through the rotational axes (absolute and/or relative rx, ry, rz). All other axes retain their meaning. A device must not mix regular directional axes and accelerometer axes on the same event node. -INPUT_PROP_HAPTIC_TOUCHPAD --------------------------- +INPUT_PROP_PRESSUREPAD +---------------------- + +The INPUT_PROP_PRESSUREPAD property indicates that the device provides +simulated haptic feedback (e.g. a vibrator motor situated below the surface) +instead of physical haptic feedback (e.g. a hinge). 
This property is only set +if the device: -The INPUT_PROP_HAPTIC_TOUCHPAD property indicates that device: -- supports simple haptic auto and manual triggering - can differentiate between at least 5 fingers - uses correct resolution for the X/Y (units and value) -- reports correct force per touch, and correct units for them (newtons or grams) - follows the MT protocol type B +If the simulated haptic feedback is controllable by userspace the device must: + +- support simple haptic auto and manual triggering, and +- report correct force per touch, and correct units for them (newtons or grams), and +- provide the EV_FF FF_HAPTIC force feedback effect. + Summing up, such devices follow the MS spec for input devices in -Win8 and Win8.1, and in addition support the Simple haptic controller HID table, -and report correct units for the pressure. +Win8 and Win8.1, and in addition may support the Simple haptic controller HID +table, and report correct units for the pressure. + +Where applicable, this property is set in addition to INPUT_PROP_BUTTONPAD, it +does not replace that property. 
Guidelines ========== diff --git a/drivers/hid/hid-haptic.c b/drivers/hid/hid-haptic.c index aa090684c1f2..fc8a9997f815 100644 --- a/drivers/hid/hid-haptic.c +++ b/drivers/hid/hid-haptic.c @@ -86,7 +86,7 @@ int hid_haptic_input_configured(struct hid_device *hdev, if (hi->application == HID_DG_TOUCHPAD) { if (haptic->auto_trigger_report && haptic->manual_trigger_report) { - __set_bit(INPUT_PROP_HAPTIC_TOUCHPAD, hi->input->propbit); + __set_bit(INPUT_PROP_PRESSUREPAD, hi->input->propbit); return 1; } return 0; diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 9cd89bcc1d9c..30f3c9eaafaa 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -27,7 +27,7 @@ #define INPUT_PROP_TOPBUTTONPAD 0x04 /* softbuttons at top of pad */ #define INPUT_PROP_POINTING_STICK 0x05 /* is a pointing stick */ #define INPUT_PROP_ACCELEROMETER 0x06 /* has accelerometer */ -#define INPUT_PROP_HAPTIC_TOUCHPAD 0x07 /* is a haptic touchpad */ +#define INPUT_PROP_PRESSUREPAD 0x07 /* pressure triggers clicks */ #define INPUT_PROP_MAX 0x1f #define INPUT_PROP_CNT (INPUT_PROP_MAX + 1) From 6f91ad24c6639220f2edb0ad8edb199b43cc3b22 Mon Sep 17 00:00:00 2001 From: Anthony Wong Date: Mon, 17 Nov 2025 02:53:11 +0800 Subject: [PATCH 438/543] platform/x86: alienware-wmi-wmax: Add AWCC support to Alienware 16 Aurora MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add AWCC support to Alienware 16 Aurora Cc: stable@vger.kernel.org Signed-off-by: Anthony Wong Reviewed-by: Kurt Borja Link: https://patch.msgid.link/20251116185311.18074-1-anthony.wong@canonical.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/alienware-wmi-wmax.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/platform/x86/dell/alienware-wmi-wmax.c b/drivers/platform/x86/dell/alienware-wmi-wmax.c index 1c92db1ac087..fadf7aac6779 100644 --- 
a/drivers/platform/x86/dell/alienware-wmi-wmax.c +++ b/drivers/platform/x86/dell/alienware-wmi-wmax.c @@ -89,6 +89,14 @@ static struct awcc_quirks generic_quirks = { static struct awcc_quirks empty_quirks; static const struct dmi_system_id awcc_dmi_table[] __initconst = { + { + .ident = "Alienware 16 Aurora", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware 16 Aurora"), + }, + .driver_data = &g_series_quirks, + }, { .ident = "Alienware Area-51m", .matches = { From ddf5ffff3a5fe95bed178f5554596b93c52afbc9 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 10 Nov 2025 15:50:41 -0800 Subject: [PATCH 439/543] platform/x86/intel/hid: Add Nova Lake support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ACPI ID for Nova Lake. Signed-off-by: Srinivas Pandruvada Link: https://patch.msgid.link/20251110235041.123685-1-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/hid.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/intel/hid.c b/drivers/platform/x86/intel/hid.c index f25a427cccda..9c07a7faf18f 100644 --- a/drivers/platform/x86/intel/hid.c +++ b/drivers/platform/x86/intel/hid.c @@ -55,6 +55,7 @@ static const struct acpi_device_id intel_hid_ids[] = { { "INTC10CB" }, { "INTC10CC" }, { "INTC10F1" }, + { "INTC10F2" }, { } }; MODULE_DEVICE_TABLE(acpi, intel_hid_ids); From d8bb447efc5622577994287dc77c684fa8840b30 Mon Sep 17 00:00:00 2001 From: Haotian Zhang Date: Mon, 17 Nov 2025 11:33:54 +0800 Subject: [PATCH 440/543] platform/x86/intel/speed_select_if: Convert PCIBIOS_* return codes to errnos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit isst_if_probe() uses pci_read_config_dword() that returns PCIBIOS_* codes. The return code is returned from the probe function as is but probe functions should return normal errnos. 
A proper implementation can be found in drivers/leds/leds-ss4200.c. Convert PCIBIOS_* return codes using pcibios_err_to_errno() into normal errno before returning. Fixes: d3a23584294c ("platform/x86: ISST: Add Intel Speed Select mmio interface") Signed-off-by: Haotian Zhang Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20251117033354.132-1-vulab@iscas.ac.cn Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/speed_select_if/isst_if_mmio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel/speed_select_if/isst_if_mmio.c b/drivers/platform/x86/intel/speed_select_if/isst_if_mmio.c index 3f4343147dad..950ede5eab76 100644 --- a/drivers/platform/x86/intel/speed_select_if/isst_if_mmio.c +++ b/drivers/platform/x86/intel/speed_select_if/isst_if_mmio.c @@ -108,11 +108,11 @@ static int isst_if_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ret = pci_read_config_dword(pdev, 0xD0, &mmio_base); if (ret) - return ret; + return pcibios_err_to_errno(ret); ret = pci_read_config_dword(pdev, 0xFC, &pcu_base); if (ret) - return ret; + return pcibios_err_to_errno(ret); pcu_base &= GENMASK(10, 0); base_addr = (u64)mmio_base << 23 | (u64) pcu_base << 12; From 444a9256f8d106e08a6bc2dc8ef28a8699e4b3ba Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Mon, 17 Nov 2025 16:59:38 +0100 Subject: [PATCH 441/543] platform/x86: acer-wmi: Ignore backlight event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On the Acer Nitro AN515-58, the event 4 - 0 is sent by the ACPI firmware when the backlight up/down keys are pressed. Ignore this event to avoid spamming the kernel log with error messages, as the acpi-video driver already handles brightness up/down events.
Reported-by: Bugaddr Closes: https://bugaddr.tech/posts/2025-11-16-debugging-the-acer-nitro-5-an515-58-fn-f10-keyboard-backlight-bug-on-linux/#wmi-interface-issues Tested-by: Bugaddr Signed-off-by: Armin Wolf Link: https://patch.msgid.link/20251117155938.3030-1-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/acer-wmi.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c index 13eb22b35aa8..d848afc91f87 100644 --- a/drivers/platform/x86/acer-wmi.c +++ b/drivers/platform/x86/acer-wmi.c @@ -102,6 +102,7 @@ MODULE_ALIAS("wmi:676AA15E-6A47-4D9F-A2CC-1E6D18D14026"); enum acer_wmi_event_ids { WMID_HOTKEY_EVENT = 0x1, + WMID_BACKLIGHT_EVENT = 0x4, WMID_ACCEL_OR_KBD_DOCK_EVENT = 0x5, WMID_GAMING_TURBO_KEY_EVENT = 0x7, WMID_AC_EVENT = 0x8, @@ -2369,6 +2370,9 @@ static void acer_wmi_notify(union acpi_object *obj, void *context) sparse_keymap_report_event(acer_wmi_input_dev, scancode, 1, true); } break; + case WMID_BACKLIGHT_EVENT: + /* Already handled by acpi-video */ + break; case WMID_ACCEL_OR_KBD_DOCK_EVENT: acer_gsensor_event(); acer_kbd_dock_event(&return_value); From db30233361f94e1a84450c607989bdb671100fb6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 10 Nov 2025 22:09:34 -0800 Subject: [PATCH 442/543] platform/x86: intel-uncore-freq: fix all header kernel-doc warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In file uncore-frequency/uncore-frequency-common.h, correct all kernel-doc warnings by adding missing leading " *" to some lines, adding a missing kernel-doc entry, and fixing a name typo. 
Warning: uncore-frequency-common.h:50 bad line: Storage for kobject attribute elc_low_threshold_percent Warning: uncore-frequency-common.h:52 bad line: Storage for kobject attribute elc_high_threshold_percent Warning: uncore-frequency-common.h:54 bad line: Storage for kobject attribute elc_high_threshold_enable Warning: uncore-frequency-common.h:92 struct member 'min_freq_khz_kobj_attr' not described in 'uncore_data' Warning: uncore-frequency-common.h:92 struct member 'die_id_kobj_attr' not described in 'uncore_data' Fixes: 24b6616355f7 ("platform/x86/intel-uncore-freq: Add efficiency latency control to sysfs interface") Fixes: 416de0246f35 ("platform/x86: intel-uncore-freq: Fix types in sysfs callbacks") Fixes: 247b43fcd872 ("platform/x86/intel-uncore-freq: Add attributes to show die_id") Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20251111060938.1998542-1-rdunlap@infradead.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- .../x86/intel/uncore-frequency/uncore-frequency-common.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-common.h b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-common.h index 70ae11519837..0abe850ef54e 100644 --- a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-common.h +++ b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-common.h @@ -40,7 +40,7 @@ * @agent_type_mask: Bit mask of all hardware agents for this domain * @uncore_attr_group: Attribute group storage * @max_freq_khz_kobj_attr: Storage for kobject attribute max_freq_khz - * @mix_freq_khz_kobj_attr: Storage for kobject attribute min_freq_khz + * @min_freq_khz_kobj_attr: Storage for kobject attribute min_freq_khz * @initial_max_freq_khz_kobj_attr: Storage for kobject attribute initial_max_freq_khz * @initial_min_freq_khz_kobj_attr: Storage for kobject attribute initial_min_freq_khz * @current_freq_khz_kobj_attr: Storage for 
kobject attribute current_freq_khz @@ -48,13 +48,14 @@ * @fabric_cluster_id_kobj_attr: Storage for kobject attribute fabric_cluster_id * @package_id_kobj_attr: Storage for kobject attribute package_id * @elc_low_threshold_percent_kobj_attr: - Storage for kobject attribute elc_low_threshold_percent + * Storage for kobject attribute elc_low_threshold_percent * @elc_high_threshold_percent_kobj_attr: - Storage for kobject attribute elc_high_threshold_percent + * Storage for kobject attribute elc_high_threshold_percent * @elc_high_threshold_enable_kobj_attr: - Storage for kobject attribute elc_high_threshold_enable + * Storage for kobject attribute elc_high_threshold_enable * @elc_floor_freq_khz_kobj_attr: Storage for kobject attribute elc_floor_freq_khz * @agent_types_kobj_attr: Storage for kobject attribute agent_type + * @die_id_kobj_attr: Attribute storage for die_id information * @uncore_attrs: Attribute storage for group creation * * This structure is used to encapsulate all data related to uncore sysfs From 896f1a2493b59beb2b5ccdf990503dbb16cb2256 Mon Sep 17 00:00:00 2001 From: Pavel Zhigulin Date: Thu, 13 Nov 2025 14:27:56 +0300 Subject: [PATCH 443/543] net: qlogic/qede: fix potential out-of-bounds read in qede_tpa_cont() and qede_tpa_end() The loops in 'qede_tpa_cont()' and 'qede_tpa_end()', iterate over 'cqe->len_list[]' using only a zero-length terminator as the stopping condition. If the terminator was missing or malformed, the loop could run past the end of the fixed-size array. Add an explicit bound check using ARRAY_SIZE() in both loops to prevent a potential out-of-bounds access. Found by Linux Verification Center (linuxtesting.org) with SVACE. 
Fixes: 55482edc25f0 ("qede: Add slowpath/fastpath support and enable hardware GRO") Signed-off-by: Pavel Zhigulin Link: https://patch.msgid.link/20251113112757.4166625-1-Pavel.Zhigulin@kaspersky.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/qlogic/qede/qede_fp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 847fa62c80df..e338bfc8b7b2 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -4,6 +4,7 @@ * Copyright (c) 2019-2020 Marvell International Ltd. */ +#include #include #include #include @@ -960,7 +961,7 @@ static inline void qede_tpa_cont(struct qede_dev *edev, { int i; - for (i = 0; cqe->len_list[i]; i++) + for (i = 0; cqe->len_list[i] && i < ARRAY_SIZE(cqe->len_list); i++) qede_fill_frag_skb(edev, rxq, cqe->tpa_agg_index, le16_to_cpu(cqe->len_list[i])); @@ -985,7 +986,7 @@ static int qede_tpa_end(struct qede_dev *edev, dma_unmap_page(rxq->dev, tpa_info->buffer.mapping, PAGE_SIZE, rxq->data_direction); - for (i = 0; cqe->len_list[i]; i++) + for (i = 0; cqe->len_list[i] && i < ARRAY_SIZE(cqe->len_list); i++) qede_fill_frag_skb(edev, rxq, cqe->tpa_agg_index, le16_to_cpu(cqe->len_list[i])); if (unlikely(i > 1)) From 0f08f0b0fb5e674b48f30e86c103760204a1d3f3 Mon Sep 17 00:00:00 2001 From: Florian Fuchs Date: Thu, 13 Nov 2025 19:10:00 +0100 Subject: [PATCH 444/543] net: ps3_gelic_net: handle skb allocation failures Handle skb allocation failures in RX path, to avoid NULL pointer dereference and RX stalls under memory pressure. If the refill fails with -ENOMEM, complete napi polling and wake up later to retry via timer. Also explicitly re-enable RX DMA after oom, so the dmac doesn't remain stopped in this situation. 
Previously, memory pressure could lead to skb allocation failures and subsequent Oops like: Oops: Kernel access of bad area, sig: 11 [#2] Hardware name: SonyPS3 Cell Broadband Engine 0x701000 PS3 NIP [c0003d0000065900] gelic_net_poll+0x6c/0x2d0 [ps3_gelic] (unreliable) LR [c0003d00000659c4] gelic_net_poll+0x130/0x2d0 [ps3_gelic] Call Trace: gelic_net_poll+0x130/0x2d0 [ps3_gelic] (unreliable) __napi_poll+0x44/0x168 net_rx_action+0x178/0x290 Steps to reproduce the issue: 1. Start a continuous network traffic, like scp of a 20GB file 2. Inject failslab errors using the kernel fault injection: echo -1 > /sys/kernel/debug/failslab/times echo 30 > /sys/kernel/debug/failslab/interval echo 100 > /sys/kernel/debug/failslab/probability 3. After some time, traces start to appear, kernel Oopses and the system stops Step 2 is not always necessary, as it is usually already triggered by the transfer of a big enough file. Fixes: 02c1889166b4 ("ps3: gigabit ethernet driver for PS3, take3") Signed-off-by: Florian Fuchs Link: https://patch.msgid.link/20251113181000.3914980-1-fuchsfl@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/toshiba/ps3_gelic_net.c | 45 +++++++++++++++----- drivers/net/ethernet/toshiba/ps3_gelic_net.h | 1 + 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.c b/drivers/net/ethernet/toshiba/ps3_gelic_net.c index 5ee8e8980393..591866fc9055 100644 --- a/drivers/net/ethernet/toshiba/ps3_gelic_net.c +++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.c @@ -260,6 +260,7 @@ void gelic_card_down(struct gelic_card *card) if (atomic_dec_if_positive(&card->users) == 0) { pr_debug("%s: real do\n", __func__); napi_disable(&card->napi); + timer_delete_sync(&card->rx_oom_timer); /* * Disable irq. 
Wireless interrupts will * be disabled later if any @@ -970,7 +971,8 @@ static void gelic_net_pass_skb_up(struct gelic_descr *descr, * gelic_card_decode_one_descr - processes an rx descriptor * @card: card structure * - * returns 1 if a packet has been sent to the stack, otherwise 0 + * returns 1 if a packet has been sent to the stack, -ENOMEM on skb alloc + * failure, otherwise 0 * * processes an rx descriptor by iommu-unmapping the data buffer and passing * the packet up to the stack @@ -981,16 +983,18 @@ static int gelic_card_decode_one_descr(struct gelic_card *card) struct gelic_descr_chain *chain = &card->rx_chain; struct gelic_descr *descr = chain->head; struct net_device *netdev = NULL; - int dmac_chain_ended; + int dmac_chain_ended = 0; + int prepare_rx_ret; status = gelic_descr_get_status(descr); if (status == GELIC_DESCR_DMA_CARDOWNED) return 0; - if (status == GELIC_DESCR_DMA_NOT_IN_USE) { + if (status == GELIC_DESCR_DMA_NOT_IN_USE || !descr->skb) { dev_dbg(ctodev(card), "dormant descr? %p\n", descr); - return 0; + dmac_chain_ended = 1; + goto refill; } /* netdevice select */ @@ -1048,9 +1052,10 @@ static int gelic_card_decode_one_descr(struct gelic_card *card) refill: /* is the current descriptor terminated with next_descr == NULL? 
*/ - dmac_chain_ended = - be32_to_cpu(descr->hw_regs.dmac_cmd_status) & - GELIC_DESCR_RX_DMA_CHAIN_END; + if (!dmac_chain_ended) + dmac_chain_ended = + be32_to_cpu(descr->hw_regs.dmac_cmd_status) & + GELIC_DESCR_RX_DMA_CHAIN_END; /* * So that always DMAC can see the end * of the descriptor chain to avoid @@ -1062,10 +1067,11 @@ static int gelic_card_decode_one_descr(struct gelic_card *card) gelic_descr_set_status(descr, GELIC_DESCR_DMA_NOT_IN_USE); /* - * this call can fail, but for now, just leave this - * descriptor without skb + * this call can fail, propagate the error */ - gelic_descr_prepare_rx(card, descr); + prepare_rx_ret = gelic_descr_prepare_rx(card, descr); + if (prepare_rx_ret) + return prepare_rx_ret; chain->tail = descr; chain->head = descr->next; @@ -1087,6 +1093,13 @@ static int gelic_card_decode_one_descr(struct gelic_card *card) return 1; } +static void gelic_rx_oom_timer(struct timer_list *t) +{ + struct gelic_card *card = timer_container_of(card, t, rx_oom_timer); + + napi_schedule(&card->napi); +} + /** * gelic_net_poll - NAPI poll function called by the stack to return packets * @napi: napi structure @@ -1099,14 +1112,22 @@ static int gelic_net_poll(struct napi_struct *napi, int budget) { struct gelic_card *card = container_of(napi, struct gelic_card, napi); int packets_done = 0; + int work_result = 0; while (packets_done < budget) { - if (!gelic_card_decode_one_descr(card)) + work_result = gelic_card_decode_one_descr(card); + if (work_result != 1) break; packets_done++; } + if (work_result == -ENOMEM) { + napi_complete_done(napi, packets_done); + mod_timer(&card->rx_oom_timer, jiffies + 1); + return packets_done; + } + if (packets_done < budget) { napi_complete_done(napi, packets_done); gelic_card_rx_irq_on(card); @@ -1576,6 +1597,8 @@ static struct gelic_card *gelic_alloc_card_net(struct net_device **netdev) mutex_init(&card->updown_lock); atomic_set(&card->users, 0); + timer_setup(&card->rx_oom_timer, gelic_rx_oom_timer, 0); + return card; 
} diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.h b/drivers/net/ethernet/toshiba/ps3_gelic_net.h index f7d7931e51b7..c10f1984a5a1 100644 --- a/drivers/net/ethernet/toshiba/ps3_gelic_net.h +++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.h @@ -268,6 +268,7 @@ struct gelic_vlan_id { struct gelic_card { struct napi_struct napi; struct net_device *netdev[GELIC_PORT_MAX]; + struct timer_list rx_oom_timer; /* * hypervisor requires irq_status should be * 8 bytes aligned, but u64 member is From 6eb2e056b0e418718fc5a3cfe79bdb41d9a2851d Mon Sep 17 00:00:00 2001 From: Dnyaneshwar Bhadane Date: Mon, 22 Sep 2025 20:33:15 +0530 Subject: [PATCH 445/543] drm/pcids: Split PTL pciids group to make wcl subplatform To form the WCL platform as a subplatform of PTL in definition, WCL pci ids are split into a separate group from PTL. So update the pciidlist struct to cover all the pci ids. v2: - Squash wcl description in single patch for display and xe.(jani,gustavo) Fixes: 3c0f211bc8fc ("drm/xe: Add Wildcat Lake device IDs to PTL list") Signed-off-by: Dnyaneshwar Bhadane Reviewed-by: Gustavo Sousa Signed-off-by: Suraj Kandpal Link: https://lore.kernel.org/r/20250922150317.2334680-2-dnyaneshwar.bhadane@intel.com (cherry picked from commit 32620e176443bf23ec81bfe8f177c6721a904864) Signed-off-by: Rodrigo Vivi [Rodrigo added the Fixes tag when porting it to fixes] --- drivers/gpu/drm/i915/display/intel_display_device.c | 1 + drivers/gpu/drm/xe/xe_pci.c | 1 + include/drm/intel/pciids.h | 5 ++++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_display_device.c b/drivers/gpu/drm/i915/display/intel_display_device.c index a002bc6ce7b0..a9a36176096f 100644 --- a/drivers/gpu/drm/i915/display/intel_display_device.c +++ b/drivers/gpu/drm/i915/display/intel_display_device.c @@ -1482,6 +1482,7 @@ static const struct { INTEL_LNL_IDS(INTEL_DISPLAY_DEVICE, &lnl_desc), INTEL_BMG_IDS(INTEL_DISPLAY_DEVICE, &bmg_desc),
INTEL_PTL_IDS(INTEL_DISPLAY_DEVICE, &ptl_desc), + INTEL_WCL_IDS(INTEL_DISPLAY_DEVICE, &ptl_desc), }; static const struct { diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index 9a6df79fc5b6..89cc6d32f041 100644 --- a/drivers/gpu/drm/xe/xe_pci.c +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -375,6 +375,7 @@ static const struct pci_device_id pciidlist[] = { INTEL_LNL_IDS(INTEL_VGA_DEVICE, &lnl_desc), INTEL_BMG_IDS(INTEL_VGA_DEVICE, &bmg_desc), INTEL_PTL_IDS(INTEL_VGA_DEVICE, &ptl_desc), + INTEL_WCL_IDS(INTEL_VGA_DEVICE, &ptl_desc), { } }; MODULE_DEVICE_TABLE(pci, pciidlist); diff --git a/include/drm/intel/pciids.h b/include/drm/intel/pciids.h index da6301a6fcea..69d4ae92d822 100644 --- a/include/drm/intel/pciids.h +++ b/include/drm/intel/pciids.h @@ -877,7 +877,10 @@ MACRO__(0xB08F, ## __VA_ARGS__), \ MACRO__(0xB090, ## __VA_ARGS__), \ MACRO__(0xB0A0, ## __VA_ARGS__), \ - MACRO__(0xB0B0, ## __VA_ARGS__), \ + MACRO__(0xB0B0, ## __VA_ARGS__) + +/* WCL */ +#define INTEL_WCL_IDS(MACRO__, ...) \ MACRO__(0xFD80, ## __VA_ARGS__), \ MACRO__(0xFD81, ## __VA_ARGS__) From 913253ed47b9925454cbb17faa3e350015b3d67a Mon Sep 17 00:00:00 2001 From: Dnyaneshwar Bhadane Date: Mon, 22 Sep 2025 20:33:16 +0530 Subject: [PATCH 446/543] drm/i915/display: Add definition for wcl as subplatform We will need to differentiate between WCL and PTL in intel_encoder_is_c10phy(). Since WCL and PTL use the same display architecture, let's define WCL as a subplatform of PTL to allow the differentiation. 
v2: Update commit message and reorder wcl define (Gustavo) Fixes: 3c0f211bc8fc ("drm/xe: Add Wildcat Lake device IDs to PTL list") Signed-off-by: Dnyaneshwar Bhadane Reviewed-by: Gustavo Sousa Signed-off-by: Suraj Kandpal Link: https://lore.kernel.org/r/20250922150317.2334680-3-dnyaneshwar.bhadane@intel.com (cherry picked from commit 4dfaae643e59cf3ab71b88689dce1b874f036f00) Signed-off-by: Rodrigo Vivi [Rodrigo added Fixes tag when porting it to fixes] --- drivers/gpu/drm/i915/display/intel_display_device.c | 12 ++++++++++++ drivers/gpu/drm/i915/display/intel_display_device.h | 4 +++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_display_device.c b/drivers/gpu/drm/i915/display/intel_display_device.c index a9a36176096f..f3f1f25b0f38 100644 --- a/drivers/gpu/drm/i915/display/intel_display_device.c +++ b/drivers/gpu/drm/i915/display/intel_display_device.c @@ -1404,8 +1404,20 @@ static const struct platform_desc bmg_desc = { PLATFORM_GROUP(dgfx), }; +static const u16 wcl_ids[] = { + INTEL_WCL_IDS(ID), + 0 +}; + static const struct platform_desc ptl_desc = { PLATFORM(pantherlake), + .subplatforms = (const struct subplatform_desc[]) { + { + SUBPLATFORM(pantherlake, wildcatlake), + .pciidlist = wcl_ids, + }, + {}, + } }; __diag_pop(); diff --git a/drivers/gpu/drm/i915/display/intel_display_device.h b/drivers/gpu/drm/i915/display/intel_display_device.h index f329f1beafef..a910642d589c 100644 --- a/drivers/gpu/drm/i915/display/intel_display_device.h +++ b/drivers/gpu/drm/i915/display/intel_display_device.h @@ -101,7 +101,9 @@ struct pci_dev; /* Display ver 14.1 (based on GMD ID) */ \ func(battlemage) \ /* Display ver 30 (based on GMD ID) */ \ - func(pantherlake) + func(pantherlake) \ + func(pantherlake_wildcatlake) + #define __MEMBER(name) unsigned long name:1; #define __COUNT(x) 1 + From 5474560381775bc70cc90ed2acefad48ffd6ee07 Mon Sep 17 00:00:00 2001 From: Dnyaneshwar Bhadane Date: Mon, 22 Sep 2025 20:33:17 +0530 
Subject: [PATCH 447/543] drm/i915/xe3: Restrict PTL intel_encoder_is_c10phy() to only PHY A On PTL, no combo PHY is connected to PORT B. However, PORT B can still be used for Type-C and will utilize the C20 PHY for eDP over Type-C. In such configurations, VBTs also enumerate PORT B. This leads to issues where PORT B is incorrectly identified as using the C10 PHY, due to the assumption that returning true for PORT B in intel_encoder_is_c10phy() would not cause problems. From PTL's perspective, only PORT A/PHY A uses the C10 PHY. Update the helper intel_encoder_is_c10phy() to return true only for PORT A/PHY on PTL. v2: Change the condition code style for ptl/wcl Bspec: 72571,73944 Fixes: 9d10de78a37f ("drm/i915/wcl: C10 phy connected to port A and B") Signed-off-by: Dnyaneshwar Bhadane Reviewed-by: Gustavo Sousa Signed-off-by: Suraj Kandpal Link: https://lore.kernel.org/r/20250922150317.2334680-4-dnyaneshwar.bhadane@intel.com (cherry picked from commit 8147f7a1c083fd565fb958824f7c552de3b2dc46) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_cx0_phy.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_cx0_phy.c b/drivers/gpu/drm/i915/display/intel_cx0_phy.c index 801235a5bc0a..a2d2cecf7121 100644 --- a/drivers/gpu/drm/i915/display/intel_cx0_phy.c +++ b/drivers/gpu/drm/i915/display/intel_cx0_phy.c @@ -39,14 +39,12 @@ bool intel_encoder_is_c10phy(struct intel_encoder *encoder) struct intel_display *display = to_intel_display(encoder); enum phy phy = intel_encoder_to_phy(encoder); - /* PTL doesn't have a PHY connected to PORT B; as such, - * there will never be a case where PTL uses PHY B. - * WCL uses PORT A and B with the C10 PHY. - * Reusing the condition for WCL and extending it for PORT B - * should not cause any issues for PTL. 
- */ - if (display->platform.pantherlake && phy < PHY_C) - return true; + if (display->platform.pantherlake) { + if (display->platform.pantherlake_wildcatlake) + return phy <= PHY_B; + else + return phy == PHY_A; + } if ((display->platform.lunarlake || display->platform.meteorlake) && phy < PHY_C) return true; From f384497a76ed9539f70f6e8fe81a193441c943d2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 18 Nov 2025 15:16:04 +0100 Subject: [PATCH 448/543] PM: sleep: core: Fix runtime PM enabling in device_resume_early() Runtime PM should only be enabled in device_resume_early() if it has been disabled for the given device by device_suspend_late(). Otherwise, it may cause runtime PM callbacks to run prematurely in some cases which leads to further functional issues. Make two changes to address this problem. First, reorder device_suspend_late() to only disable runtime PM for a device when it is going to look for the device's callback or if the device is a "syscore" one. In all of the other cases, disabling runtime PM for the device is not in fact necessary. However, if the device's callback returns an error and the power.is_late_suspended flag is not going to be set, enable runtime PM so it only remains disabled when power.is_late_suspended is set. Second, make device_resume_early() only enable runtime PM for the devices with the power.is_late_suspended flag set. Fixes: 443046d1ad66 ("PM: sleep: Make suspend of devices more asynchronous") Reported-by: Rose Wu Closes: https://lore.kernel.org/linux-pm/70b25dca6f8c2756d78f076f4a7dee7edaaffc33.camel@mediatek.com/ Cc: 6.16+ # 6.16+ Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. 
Wysocki Link: https://patch.msgid.link/12784270.O9o76ZdvQC@rafael.j.wysocki --- drivers/base/power/main.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index e83503bdc1fd..1de1cd72b616 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -888,12 +888,15 @@ static void device_resume_early(struct device *dev, pm_message_t state, bool asy TRACE_DEVICE(dev); TRACE_RESUME(0); - if (dev->power.syscore || dev->power.direct_complete) + if (dev->power.direct_complete) goto Out; if (!dev->power.is_late_suspended) goto Out; + if (dev->power.syscore) + goto Skip; + if (!dpm_wait_for_superior(dev, async)) goto Out; @@ -926,11 +929,11 @@ static void device_resume_early(struct device *dev, pm_message_t state, bool asy Skip: dev->power.is_late_suspended = false; + pm_runtime_enable(dev); Out: TRACE_RESUME(error); - pm_runtime_enable(dev); complete_all(&dev->power.completion); if (error) { @@ -1615,12 +1618,6 @@ static void device_suspend_late(struct device *dev, pm_message_t state, bool asy TRACE_DEVICE(dev); TRACE_SUSPEND(0); - /* - * Disable runtime PM for the device without checking if there is a - * pending resume request for it. - */ - __pm_runtime_disable(dev, false); - dpm_wait_for_subordinate(dev, async); if (READ_ONCE(async_error)) @@ -1631,9 +1628,18 @@ static void device_suspend_late(struct device *dev, pm_message_t state, bool asy goto Complete; } - if (dev->power.syscore || dev->power.direct_complete) + if (dev->power.direct_complete) goto Complete; + /* + * Disable runtime PM for the device without checking if there is a + * pending resume request for it. 
+ */ + __pm_runtime_disable(dev, false); + + if (dev->power.syscore) + goto Skip; + if (dev->pm_domain) { info = "late power domain "; callback = pm_late_early_op(&dev->pm_domain->ops, state); @@ -1664,6 +1670,7 @@ static void device_suspend_late(struct device *dev, pm_message_t state, bool asy WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async late" : " late", error); + pm_runtime_enable(dev); goto Complete; } dpm_propagate_wakeup_to_parent(dev); From 5bebe8de19264946d398ead4e6c20c229454a552 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 18 Nov 2025 08:21:27 -0800 Subject: [PATCH 449/543] mm/huge_memory: Fix initialization of huge zero folio The recent fix to properly initialize the tags of the huge zero folio had an unfortunate not-so-subtle side effect: it caused the actual *contents* of the huge zero folio to not be initialized at all when the hardware didn't support the memory tagging. The reason was the unfortunate semantics of tag_clear_highpage(): on hardware that didn't do the tagging, it would silently just not do anything at all. And since this is done only on arm64 with MTE support, that basically meant most hardware. It wasn't necessarily immediately obvious since the huge zero page isn't necessarily very heavily used - or because it might already be zero because all-zeroes is the most common pattern. But it ends up causing random odd user space failures when you do hit it. The unfortunate semantics have been around for a while, but became a real bug only when we started actively using __GFP_ZEROTAGS in the generic get_huge_zero_folio() function - before that, it had only ever been used in code that checked that the hardware supported it. Fix this by simply changing the semantics of tag_clear_highpage() to return whether it actually successfully did something or not. 
While at it, also make it initialize multiple pages in one go, since that's actually what the only caller wants it to do and it simplifies the whole logic. Fixes: adfb6609c680 ("mm/huge_memory: initialise the tags of the huge zero folio") Link: https://lore.kernel.org/all/20251117082023.90176-1-00107082@163.com/ Reviewed-by: David Hildenbrand (Red Hat) Reported-and-tested-by: David Wang <00107082@163.com> Reported-and-tested-by: Carlos Llamas Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/page.h | 4 ++-- arch/arm64/mm/fault.c | 21 +++++++++++---------- include/linux/highmem.h | 6 ++++-- mm/page_alloc.c | 9 ++------- 4 files changed, 19 insertions(+), 21 deletions(-) diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 2312e6ee595f..258cca4b4873 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -33,8 +33,8 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr); #define vma_alloc_zeroed_movable_folio vma_alloc_zeroed_movable_folio -void tag_clear_highpage(struct page *to); -#define __HAVE_ARCH_TAG_CLEAR_HIGHPAGE +bool tag_clear_highpages(struct page *to, int numpages); +#define __HAVE_ARCH_TAG_CLEAR_HIGHPAGES #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 125dfa6c613b..a193b6a5d1e6 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -967,20 +967,21 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, return vma_alloc_folio(flags, 0, vma, vaddr); } -void tag_clear_highpage(struct page *page) +bool tag_clear_highpages(struct page *page, int numpages) { /* * Check if MTE is supported and fall back to clear_highpage(). * get_huge_zero_folio() unconditionally passes __GFP_ZEROTAGS and - * post_alloc_hook() will invoke tag_clear_highpage(). 
+ * post_alloc_hook() will invoke tag_clear_highpages(). */ - if (!system_supports_mte()) { - clear_highpage(page); - return; - } + if (!system_supports_mte()) + return false; - /* Newly allocated page, shouldn't have been tagged yet */ - WARN_ON_ONCE(!try_page_mte_tagging(page)); - mte_zero_clear_page_tags(page_address(page)); - set_page_mte_tagged(page); + /* Newly allocated pages, shouldn't have been tagged yet */ + for (int i = 0; i < numpages; i++, page++) { + WARN_ON_ONCE(!try_page_mte_tagging(page)); + mte_zero_clear_page_tags(page_address(page)); + set_page_mte_tagged(page); + } + return true; } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 105cc4c00cc3..abc20f9810fd 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -249,10 +249,12 @@ static inline void clear_highpage_kasan_tagged(struct page *page) kunmap_local(kaddr); } -#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE +#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGES -static inline void tag_clear_highpage(struct page *page) +/* Return false to let people know we did not initialize the pages */ +static inline bool tag_clear_highpages(struct page *page, int numpages) { + return false; } #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 600d9e981c23..ed82ee55e66a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1822,14 +1822,9 @@ inline void post_alloc_hook(struct page *page, unsigned int order, * If memory tags should be zeroed * (which happens only when memory should be initialized as well). */ - if (zero_tags) { - /* Initialize both memory and memory tags. */ - for (i = 0; i != 1 << order; ++i) - tag_clear_highpage(page + i); + if (zero_tags) + init = !tag_clear_highpages(page, 1 << order); - /* Take note that memory was initialized by the loop above. */ - init = false; - } if (!should_skip_kasan_unpoison(gfp_flags) && kasan_unpoison_pages(page, order, init)) { /* Take note that memory was initialized by KASAN. 
*/ From cfa0904a35fd0231f4d05da0190f0a22ed881cce Mon Sep 17 00:00:00 2001 From: Fangzhi Zuo Date: Thu, 18 Sep 2025 16:25:45 -0400 Subject: [PATCH 450/543] drm/amd/display: Prevent Gating DTBCLK before It Is Properly Latched [why] 1. With allow_0_dtb_clk enabled, the time required to latch DTBCLK to 600 MHz depends on the SMU. If DTBCLK is not latched to 600 MHz before set_mode completes, gating DTBCLK causes the DP2 sink to lose its clock source. 2. The existing DTBCLK gating sequence ungates DTBCLK based on both pix_clk and ref_dtbclk, but gates DTBCLK when either pix_clk or ref_dtbclk is zero. pix_clk can be zero outside the set_mode sequence before DTBCLK is properly latched, which can lead to DTBCLK being gated by mistake. [how] Consider both pixel_clk and ref_dtbclk when determining when it is safe to gate DTBCLK; this is more accurate. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4701 Fixes: 5949e7c4890c ("drm/amd/display: Enable Dynamic DTBCLK Switch") Reviewed-by: Charlene Liu Reviewed-by: Aurabindo Pillai Signed-off-by: Fangzhi Zuo Signed-off-by: Roman Li Tested-by: Dan Wheeler Signed-off-by: Alex Deucher (cherry picked from commit d04eb0c402780ca037b62a6aecf23b863545ebca) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c | 4 +++- drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c index b11383fba35f..1eb04772f5da 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c @@ -394,6 +394,8 @@ void dcn35_update_clocks(struct clk_mgr *clk_mgr_base, display_count = dcn35_get_active_display_cnt_wa(dc, context, &all_active_disps); if (new_clocks->dtbclk_en && !new_clocks->ref_dtbclk_khz) new_clocks->ref_dtbclk_khz = 600000; + else if 
(!new_clocks->dtbclk_en && new_clocks->ref_dtbclk_khz > 590000) + new_clocks->ref_dtbclk_khz = 0; /* * if it is safe to lower, but we are already in the lower state, we don't have to do anything @@ -435,7 +437,7 @@ void dcn35_update_clocks(struct clk_mgr *clk_mgr_base, actual_dtbclk = REG_READ(CLK1_CLK4_CURRENT_CNT); - if (actual_dtbclk) { + if (actual_dtbclk > 590000) { clk_mgr_base->clks.ref_dtbclk_khz = new_clocks->ref_dtbclk_khz; clk_mgr_base->clks.dtbclk_en = new_clocks->dtbclk_en; } diff --git a/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c b/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c index de6d62401362..c899c09ea31b 100644 --- a/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c +++ b/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c @@ -1411,7 +1411,7 @@ static void dccg35_set_dtbclk_dto( __func__, params->otg_inst, params->pixclk_khz, params->ref_dtbclk_khz, req_dtbclk_khz, phase, modulo); - } else { + } else if (!params->ref_dtbclk_khz && !req_dtbclk_khz) { switch (params->otg_inst) { case 0: REG_UPDATE(DCCG_GATE_DISABLE_CNTL5, DTBCLK_P0_GATE_DISABLE, 0); From 3fa05f96fc08dff5e846c2cc283a249c1bf029a1 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 12 Nov 2025 01:30:17 +0000 Subject: [PATCH 451/543] KVM: SVM: Fix redundant updates of LBR MSR intercepts Don't update the LBR MSR intercept bitmaps if they're already up-to-date, as unconditionally updating the intercepts forces KVM to recalculate the MSR bitmaps for vmcb02 on every nested VMRUN. The redundant updates are functionally okay; however, they neuter an optimization in Hyper-V nested virtualization enlightenments and this manifests as a self-test failure. In particular, Hyper-V lets L1 mark "nested enlightenments" as clean, i.e. tell KVM that no changes were made to the MSR bitmap since the last VMRUN. 
The hyperv_svm_test KVM selftest intentionally changes the MSR bitmap "without telling KVM about it" to verify that KVM honors the clean hint, and correctly fails because KVM notices the changed bitmap anyway: ==== Test Assertion Failure ==== x86/hyperv_svm_test.c:120: vmcb->control.exit_code == 0x081 pid=193558 tid=193558 errno=4 - Interrupted system call 1 0x0000000000411361: assert_on_unhandled_exception at processor.c:659 2 0x0000000000406186: _vcpu_run at kvm_util.c:1699 3 (inlined by) vcpu_run at kvm_util.c:1710 4 0x0000000000401f2a: main at hyperv_svm_test.c:175 5 0x000000000041d0d3: __libc_start_call_main at libc-start.o:? 6 0x000000000041f27c: __libc_start_main_impl at ??:? 7 0x00000000004021a0: _start at ??:? vmcb->control.exit_code == SVM_EXIT_VMMCALL Do *not* fix this by skipping svm_hv_vmcb_dirty_nested_enlightenments() when svm_set_intercept_for_msr() performs a no-op change. Changes to the L0 MSR interception bitmap are only triggered by full CPUID updates and MSR filter updates, both of which should be rare. Changing svm_set_intercept_for_msr() risks hiding unintended pessimizations like this one, and is actually more complex than this change. Fixes: fbe5e5f030c2 ("KVM: nSVM: Always recalculate LBR MSR intercepts in svm_update_lbrv()") Cc: stable@vger.kernel.org Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251112013017.1836863-1-yosry.ahmed@linux.dev [Rewritten commit message based on mailing list discussion.
- Paolo] Reviewed-by: Sean Christopherson Tested-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 9 ++++++++- arch/x86/kvm/svm/svm.h | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 10c21e4c5406..9d29b2e7e855 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -705,7 +705,11 @@ void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask) static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) { - bool intercept = !(to_svm(vcpu)->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); + struct vcpu_svm *svm = to_svm(vcpu); + bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); + + if (intercept == svm->lbr_msrs_intercepted) + return; svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept); svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept); @@ -714,6 +718,8 @@ static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) if (sev_es_guest(vcpu->kvm)) svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept); + + svm->lbr_msrs_intercepted = intercept; } void svm_vcpu_free_msrpm(void *msrpm) @@ -1221,6 +1227,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) } svm->x2avic_msrs_intercepted = true; + svm->lbr_msrs_intercepted = true; svm->vmcb01.ptr = page_address(vmcb01_page); svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index c856d8e0f95e..dd78e6402345 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -336,6 +336,7 @@ struct vcpu_svm { bool guest_state_loaded; bool x2avic_msrs_intercepted; + bool lbr_msrs_intercepted; /* Guest GIF value, used when vGIF is not enabled */ bool guest_gif; From cdcbb8e8d10f656642380ee13516290437b52b36 Mon Sep 17 00:00:00 2001 From: Naoki Ueki Date: Mon, 3 Nov 2025 21:16:45 +0900 Subject: [PATCH 452/543] HID: 
elecom: Add support for ELECOM M-XT3URBK (018F) The ELECOM M-XT3URBK trackball has an additional device ID (0x018F), which shares the same report descriptor as the existing device (0x00FB). However, the driver does not currently recognize this new ID, resulting in only five buttons being functional. This patch adds the new device ID so that all six buttons work properly. Signed-off-by: Naoki Ueki Signed-off-by: Jiri Kosina --- drivers/hid/hid-elecom.c | 6 ++++-- drivers/hid/hid-ids.h | 3 ++- drivers/hid/hid-quirks.c | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/hid/hid-elecom.c b/drivers/hid/hid-elecom.c index 69771fd35006..981d1b6e9658 100644 --- a/drivers/hid/hid-elecom.c +++ b/drivers/hid/hid-elecom.c @@ -75,7 +75,8 @@ static const __u8 *elecom_report_fixup(struct hid_device *hdev, __u8 *rdesc, */ mouse_button_fixup(hdev, rdesc, *rsize, 20, 28, 22, 14, 8); break; - case USB_DEVICE_ID_ELECOM_M_XT3URBK: + case USB_DEVICE_ID_ELECOM_M_XT3URBK_00FB: + case USB_DEVICE_ID_ELECOM_M_XT3URBK_018F: case USB_DEVICE_ID_ELECOM_M_XT3DRBK: case USB_DEVICE_ID_ELECOM_M_XT4DRBK: /* @@ -119,7 +120,8 @@ static const __u8 *elecom_report_fixup(struct hid_device *hdev, __u8 *rdesc, static const struct hid_device_id elecom_devices[] = { { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_BM084) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XGL20DLBK) }, - { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3URBK) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3URBK_00FB) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3URBK_018F) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3DRBK) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT4DRBK) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT1URBK) }, diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 85db279baa72..c4589075a5ed 100644 --- 
a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -449,7 +449,8 @@ #define USB_VENDOR_ID_ELECOM 0x056e #define USB_DEVICE_ID_ELECOM_BM084 0x0061 #define USB_DEVICE_ID_ELECOM_M_XGL20DLBK 0x00e6 -#define USB_DEVICE_ID_ELECOM_M_XT3URBK 0x00fb +#define USB_DEVICE_ID_ELECOM_M_XT3URBK_00FB 0x00fb +#define USB_DEVICE_ID_ELECOM_M_XT3URBK_018F 0x018f #define USB_DEVICE_ID_ELECOM_M_XT3DRBK 0x00fc #define USB_DEVICE_ID_ELECOM_M_XT4DRBK 0x00fd #define USB_DEVICE_ID_ELECOM_M_DT1URBK 0x00fe diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index 22760ac50f2d..c89a015686c0 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -410,7 +410,8 @@ static const struct hid_device_id hid_have_special_driver[] = { #if IS_ENABLED(CONFIG_HID_ELECOM) { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_BM084) }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XGL20DLBK) }, - { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3URBK) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3URBK_00FB) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3URBK_018F) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3DRBK) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT4DRBK) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT1URBK) }, From 71ad9054c1f241be63f9d11df8cbd0aa0352fe16 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Mon, 3 Nov 2025 11:17:44 -0600 Subject: [PATCH 453/543] drm/amd/display: Move sleep into each retry for retrieve_link_cap() [Why] When a monitor is booting it's possible that it isn't ready to retrieve link caps and this can lead to an EDID read failure: ``` [drm:retrieve_link_cap [amdgpu]] *ERROR* retrieve_link_cap: Read receiver caps dpcd data failed. amdgpu 0000:c5:00.0: [drm] *ERROR* No EDID read. ``` [How] Rather than msleep once and try a few times, msleep each time. 
Should be no changes for existing working monitors, but should correct reading caps on a monitor that is slow to boot. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4672 Reviewed-by: Alex Hung Signed-off-by: Mario Limonciello (AMD) Signed-off-by: Ivan Lipski Tested-by: Dan Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 669dca37b3348a447db04bbdcbb3def94d5997cc) Cc: stable@vger.kernel.org --- .../amd/display/dc/link/protocols/link_dp_capability.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c index b12c11bd6a14..4d0079b44de1 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c @@ -1734,12 +1734,13 @@ static bool retrieve_link_cap(struct dc_link *link) } dpcd_set_source_specific_data(link); - /* Sink may need to configure internals based on vendor, so allow some - * time before proceeding with possibly vendor specific transactions - */ - msleep(post_oui_delay); for (i = 0; i < read_dpcd_retry_cnt; i++) { + /* + * Sink may need to configure internals based on vendor, so allow some + * time before proceeding with possibly vendor specific transactions + */ + msleep(post_oui_delay); status = core_link_read_dpcd( link, DP_DPCD_REV, From 4e127a74786fa9573a32c8aa4bbf69ef78c3232a Mon Sep 17 00:00:00 2001 From: Stuart Hayhurst Date: Mon, 3 Nov 2025 14:21:13 +0000 Subject: [PATCH 454/543] HID: corsair-void: Use %pe for printing PTR_ERR Use %pe to print a PTR_ERR to silence a cocci warning Reported-by: kernel test robot Reported-by: Julia Lawall Closes: https://lore.kernel.org/r/202510300342.WtPn2jF3-lkp@intel.com/ Signed-off-by: Stuart Hayhurst Signed-off-by: Jiri Kosina --- drivers/hid/hid-corsair-void.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git 
a/drivers/hid/hid-corsair-void.c b/drivers/hid/hid-corsair-void.c index fee134a7eba3..5e9a5b8f7f16 100644 --- a/drivers/hid/hid-corsair-void.c +++ b/drivers/hid/hid-corsair-void.c @@ -553,9 +553,8 @@ static void corsair_void_add_battery(struct corsair_void_drvdata *drvdata) if (IS_ERR(new_supply)) { hid_err(drvdata->hid_dev, - "failed to register battery '%s' (reason: %ld)\n", - drvdata->battery_desc.name, - PTR_ERR(new_supply)); + "failed to register battery '%s' (reason: %pe)\n", + drvdata->battery_desc.name, new_supply); return; } From 9d7b89a1028230315a8999cfea7795fbe84f62cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Paku=C5=82a?= Date: Mon, 3 Nov 2025 21:02:43 +0100 Subject: [PATCH 455/543] HID: pidff: Fix needs_playback check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A small bug made its way here when rewriting code to Linux quality. Currently, if an effect is not infinite and a program requests its playback with the same number of loops, the play command won't be fired and if an effect is infinite, the spam will continue. We want every playback update for non-infinite effects and only some for infinite (detecting when a program requests stop with 0 which will be different than previous value which is usually 1 or 255).
Signed-off-by: Tomasz Pakuła Signed-off-by: Jiri Kosina --- drivers/hid/usbhid/hid-pidff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hid/usbhid/hid-pidff.c b/drivers/hid/usbhid/hid-pidff.c index edd61ef50e16..95377c5f6335 100644 --- a/drivers/hid/usbhid/hid-pidff.c +++ b/drivers/hid/usbhid/hid-pidff.c @@ -806,8 +806,8 @@ static int pidff_request_effect_upload(struct pidff_device *pidff, int efnum) static int pidff_needs_playback(struct pidff_device *pidff, int effect_id, int n) { - return pidff->effect[effect_id].is_infinite || - pidff->effect[effect_id].loop_count != n; + return !pidff->effect[effect_id].is_infinite || + pidff->effect[effect_id].loop_count != n; } /* From 8612badc331bcab2068baefa69e1458085ed89e3 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Mon, 3 Nov 2025 12:11:31 -0600 Subject: [PATCH 456/543] drm/amd/display: Increase DPCD read retries [Why] Empirical measurement of some monitors that fail to read EDID while booting shows that the number of retries with a 30ms delay between tries is as high as 16. [How] Increase number of retries to 20. 
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4672 Reviewed-by: Alex Hung Signed-off-by: Mario Limonciello (AMD) Signed-off-by: Ivan Lipski Tested-by: Dan Wheeler Signed-off-by: Alex Deucher (cherry picked from commit ad1c59ad7cf74ec06e32fe2c330ac1e957222288) Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c index 4d0079b44de1..eb262ce42e2d 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c @@ -1691,7 +1691,7 @@ static bool retrieve_link_cap(struct dc_link *link) union edp_configuration_cap edp_config_cap; union dp_downstream_port_present ds_port = { 0 }; enum dc_status status = DC_ERROR_UNEXPECTED; - uint32_t read_dpcd_retry_cnt = 3; + uint32_t read_dpcd_retry_cnt = 20; int i; struct dp_sink_hw_fw_revision dp_hw_fw_revision; const uint32_t post_oui_delay = 30; // 30ms From c97da4785b3bbc60c24cfd1ffea1d7c8b90ed743 Mon Sep 17 00:00:00 2001 From: Ivan Lipski Date: Thu, 30 Oct 2025 11:25:33 -0400 Subject: [PATCH 457/543] drm/amd/display: Add an HPD filter for HDMI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Why] Some monitors perform rapid “autoscan” HPD re‑assertions right after a disconnect or powersaving mode enablement. These appear as a quick disconnect→reconnect with an identical EDID. Since Linux has no HDMI hotplug detection (HPD) filter, these quick reconnects are seen as hotplug events, which can unintentionally wake a system with DPMS off. 
An example: https://gitlab.freedesktop.org/drm/amd/-/issues/2876 Such 'fake reconnects' are considered when the interval between a disconnect and a connect is within 1500ms (experimentally chosen using several monitors), and the two connections have the same EDID. [How] Implement a time-based debounce mechanism: 1. On HDMI disconnect detection, instead of immediately processing the HPD event, save the current sink and schedule delayed work (default 1500ms) 2. If another HDMI disconnect HPD event arrives during the debounce period, it reschedules the pending work, ensuring only the final state is processed. 3. When the debounce timer expires, re-detect the display and compare the new sink with the cached one using EDID comparison. 4. If sinks match (same EDID), this was a spontaneous HPD toggle: - Update connector state internally - Skip hotplug event to prevent desktop rearrangement If sinks differ, this was a real display change: - Process normally with the hotplug event The debounce delay is configurable via module parameter 'hdmi_hpd_debounce_delay_ms'. 
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/2876 Reviewed-by: Sun peng (Leo) Li Signed-off-by: Ivan Lipski Tested-by: Dan Wheeler Signed-off-by: Alex Deucher (cherry picked from commit c918e75e1ed95be76f8e3156a411188f650fe03f) --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 138 ++++++++++++++++++ .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 6 + 2 files changed, 144 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 91c0188a29b2..2a7a491a62e0 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -3859,6 +3859,97 @@ void amdgpu_dm_update_connector_after_detect( update_subconnector_property(aconnector); } +static bool are_sinks_equal(const struct dc_sink *sink1, const struct dc_sink *sink2) +{ + if (!sink1 || !sink2) + return false; + if (sink1->sink_signal != sink2->sink_signal) + return false; + + if (sink1->dc_edid.length != sink2->dc_edid.length) + return false; + + if (memcmp(sink1->dc_edid.raw_edid, sink2->dc_edid.raw_edid, + sink1->dc_edid.length) != 0) + return false; + return true; +} + + +/** + * DOC: hdmi_hpd_debounce_work + * + * HDMI HPD debounce delay in milliseconds. When an HDMI display toggles HPD + * (such as during power save transitions), this delay determines how long to + * wait before processing the HPD event. 
This allows distinguishing between a + * physical unplug (>hdmi_hpd_debounce_delay) + * and a spontaneous RX HPD toggle (base; + struct drm_device *dev = connector->dev; + struct amdgpu_device *adev = drm_to_adev(dev); + struct dc *dc = aconnector->dc_link->ctx->dc; + bool fake_reconnect = false; + bool reallow_idle = false; + bool ret = false; + guard(mutex)(&aconnector->hpd_lock); + + /* Re-detect the display */ + scoped_guard(mutex, &adev->dm.dc_lock) { + if (dc->caps.ips_support && dc->ctx->dmub_srv->idle_allowed) { + dc_allow_idle_optimizations(dc, false); + reallow_idle = true; + } + ret = dc_link_detect(aconnector->dc_link, DETECT_REASON_HPD); + } + + if (ret) { + /* Apply workaround delay for certain panels */ + apply_delay_after_dpcd_poweroff(adev, aconnector->dc_sink); + /* Compare sinks to determine if this was a spontaneous HPD toggle */ + if (are_sinks_equal(aconnector->dc_link->local_sink, aconnector->hdmi_prev_sink)) { + /* + * Sinks match - this was a spontaneous HDMI HPD toggle. 
+ */ + drm_dbg_kms(dev, "HDMI HPD: Sink unchanged after debounce, internal re-enable\n"); + fake_reconnect = true; + } + + /* Update connector state */ + amdgpu_dm_update_connector_after_detect(aconnector); + + drm_modeset_lock_all(dev); + dm_restore_drm_connector_state(dev, connector); + drm_modeset_unlock_all(dev); + + /* Only notify OS if sink actually changed */ + if (!fake_reconnect && aconnector->base.force == DRM_FORCE_UNSPECIFIED) + drm_kms_helper_hotplug_event(dev); + } + + /* Release the cached sink reference */ + if (aconnector->hdmi_prev_sink) { + dc_sink_release(aconnector->hdmi_prev_sink); + aconnector->hdmi_prev_sink = NULL; + } + + scoped_guard(mutex, &adev->dm.dc_lock) { + if (reallow_idle && dc->caps.ips_support) + dc_allow_idle_optimizations(dc, true); + } +} + static void handle_hpd_irq_helper(struct amdgpu_dm_connector *aconnector) { struct drm_connector *connector = &aconnector->base; @@ -3868,6 +3959,7 @@ static void handle_hpd_irq_helper(struct amdgpu_dm_connector *aconnector) struct dm_connector_state *dm_con_state = to_dm_connector_state(connector->state); struct dc *dc = aconnector->dc_link->ctx->dc; bool ret = false; + bool debounce_required = false; if (adev->dm.disable_hpd_irq) return; @@ -3890,6 +3982,14 @@ static void handle_hpd_irq_helper(struct amdgpu_dm_connector *aconnector) if (!dc_link_detect_connection_type(aconnector->dc_link, &new_connection_type)) drm_err(adev_to_drm(adev), "KMS: Failed to detect connector\n"); + /* + * Check for HDMI disconnect with debounce enabled. 
+ */ + debounce_required = (aconnector->hdmi_hpd_debounce_delay_ms > 0 && + dc_is_hdmi_signal(aconnector->dc_link->connector_signal) && + new_connection_type == dc_connection_none && + aconnector->dc_link->local_sink != NULL); + if (aconnector->base.force && new_connection_type == dc_connection_none) { emulated_link_detect(aconnector->dc_link); @@ -3899,7 +3999,34 @@ static void handle_hpd_irq_helper(struct amdgpu_dm_connector *aconnector) if (aconnector->base.force == DRM_FORCE_UNSPECIFIED) drm_kms_helper_connector_hotplug_event(connector); + } else if (debounce_required) { + /* + * HDMI disconnect detected - schedule delayed work instead of + * processing immediately. This allows us to coalesce spurious + * HDMI signals from physical unplugs. + */ + drm_dbg_kms(dev, "HDMI HPD: Disconnect detected, scheduling debounce work (%u ms)\n", + aconnector->hdmi_hpd_debounce_delay_ms); + + /* Cache the current sink for later comparison */ + if (aconnector->hdmi_prev_sink) + dc_sink_release(aconnector->hdmi_prev_sink); + aconnector->hdmi_prev_sink = aconnector->dc_link->local_sink; + if (aconnector->hdmi_prev_sink) + dc_sink_retain(aconnector->hdmi_prev_sink); + + /* Schedule delayed detection. 
*/ + if (mod_delayed_work(system_wq, + &aconnector->hdmi_hpd_debounce_work, + msecs_to_jiffies(aconnector->hdmi_hpd_debounce_delay_ms))) + drm_dbg_kms(dev, "HDMI HPD: Re-scheduled debounce work\n"); + } else { + + /* If the aconnector->hdmi_hpd_debounce_work is scheduled, exit early */ + if (delayed_work_pending(&aconnector->hdmi_hpd_debounce_work)) + return; + scoped_guard(mutex, &adev->dm.dc_lock) { dc_exit_ips_for_hw_access(dc); ret = dc_link_detect(aconnector->dc_link, DETECT_REASON_HPD); @@ -7388,6 +7515,13 @@ static void amdgpu_dm_connector_destroy(struct drm_connector *connector) if (aconnector->mst_mgr.dev) drm_dp_mst_topology_mgr_destroy(&aconnector->mst_mgr); + /* Cancel and flush any pending HDMI HPD debounce work */ + cancel_delayed_work_sync(&aconnector->hdmi_hpd_debounce_work); + if (aconnector->hdmi_prev_sink) { + dc_sink_release(aconnector->hdmi_prev_sink); + aconnector->hdmi_prev_sink = NULL; + } + if (aconnector->bl_idx != -1) { backlight_device_unregister(dm->backlight_dev[aconnector->bl_idx]); dm->backlight_dev[aconnector->bl_idx] = NULL; @@ -8549,6 +8683,10 @@ void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm, mutex_init(&aconnector->hpd_lock); mutex_init(&aconnector->handle_mst_msg_ready); + aconnector->hdmi_hpd_debounce_delay_ms = AMDGPU_DM_HDMI_HPD_DEBOUNCE_MS; + INIT_DELAYED_WORK(&aconnector->hdmi_hpd_debounce_work, hdmi_hpd_debounce_work); + aconnector->hdmi_prev_sink = NULL; + /* * configure support HPD hot plug connector_>polled default value is 0 * which means HPD hot plug not supported diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h index db75e991ac7b..8ca738957598 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h @@ -59,6 +59,7 @@ #define AMDGPU_HDR_MULT_DEFAULT (0x100000000LL) +#define AMDGPU_DM_HDMI_HPD_DEBOUNCE_MS 1500 /* #include "include/amdgpu_dal_power_if.h" #include 
"amdgpu_dm_irq.h" @@ -819,6 +820,11 @@ struct amdgpu_dm_connector { bool pack_sdp_v1_3; enum adaptive_sync_type as_type; struct amdgpu_hdmi_vsdb_info vsdb_info; + + /* HDMI HPD debounce support */ + unsigned int hdmi_hpd_debounce_delay_ms; + struct delayed_work hdmi_hpd_debounce_work; + struct dc_sink *hdmi_prev_sink; }; static inline void amdgpu_dm_set_mst_status(uint8_t *status, From 8513c154f8ad7097653dd9bf43d6155e5aad4ab3 Mon Sep 17 00:00:00 2001 From: Abdun Nihaal Date: Mon, 10 Nov 2025 22:45:50 +0530 Subject: [PATCH 458/543] HID: playstation: Fix memory leak in dualshock4_get_calibration_data() The memory allocated for buf is not freed in the error paths when ps_get_report() fails. Free buf before jumping to transfer_failed label Fixes: 947992c7fa9e ("HID: playstation: DS4: Fix calibration workaround for clone devices") Signed-off-by: Abdun Nihaal Reviewed-by: Silvan Jegen Signed-off-by: Jiri Kosina --- drivers/hid/hid-playstation.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/hid/hid-playstation.c b/drivers/hid/hid-playstation.c index 63f6eb9030d1..128aa6abd10b 100644 --- a/drivers/hid/hid-playstation.c +++ b/drivers/hid/hid-playstation.c @@ -1942,6 +1942,7 @@ static int dualshock4_get_calibration_data(struct dualshock4 *ds4) "Failed to retrieve DualShock4 calibration info: %d\n", ret); ret = -EILSEQ; + kfree(buf); goto transfer_failed; } else { break; @@ -1959,6 +1960,7 @@ static int dualshock4_get_calibration_data(struct dualshock4 *ds4) if (ret) { hid_warn(hdev, "Failed to retrieve DualShock4 calibration info: %d\n", ret); + kfree(buf); goto transfer_failed; } } From 5bab4c89390f32b2f491f49a151948cd226dd909 Mon Sep 17 00:00:00 2001 From: Ivan Lipski Date: Wed, 5 Nov 2025 15:27:42 -0500 Subject: [PATCH 459/543] drm/amd/display: Clear the CUR_ENABLE register on DCN20 on DPP5 [Why] On DCN20 & DCN30, the 6th DPP's & HUBP's are powered on permanently and cannot be power gated. 
Thus, when dpp_reset() is invoked for the DPP5, while it's still powered on, the cached cursor_state (dpp_base->pos.cur0_ctl.bits.cur0_enable) and the actual state (CUR0_ENABLE) bit are unsynced. This can cause a double cursor in full screen with non-native scaling. [How] Force disable cursor on DPP5 on plane powerdown for ASICs w/ 6 DPPs/HUBPs. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4673 Reviewed-by: Aric Cyr Signed-off-by: Ivan Lipski Tested-by: Dan Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 79b3c037f972dcb13e325a8eabfb8da835764e15) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c index 9477c9f9e196..59c42db5382e 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c @@ -614,6 +614,14 @@ void dcn20_dpp_pg_control( * DOMAIN11_PGFSM_PWR_STATUS, pwr_status, * 1, 1000); */ + + /* Force disable cursor on plane powerdown on DPP 5 using dpp_force_disable_cursor */ + if (!power_on) { + struct dpp *dpp5 = hws->ctx->dc->res_pool->dpps[dpp_inst]; + if (dpp5 && dpp5->funcs->dpp_force_disable_cursor) + dpp5->funcs->dpp_force_disable_cursor(dpp5); + } + break; default: BREAK_TO_DEBUGGER(); From a78eb69d60ce893de48dd75f725ba21309131fc2 Mon Sep 17 00:00:00 2001 From: Abdun Nihaal Date: Mon, 10 Nov 2025 22:59:41 +0530 Subject: [PATCH 460/543] HID: uclogic: Fix potential memory leak in error path In uclogic_params_ugee_v2_init_event_hooks(), the memory allocated for event_hook is not freed in the next error path. Fix that by freeing it.
Fixes: a251d6576d2a ("HID: uclogic: Handle wireless device reconnection") Signed-off-by: Abdun Nihaal Signed-off-by: Jiri Kosina --- drivers/hid/hid-uclogic-params.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hid/hid-uclogic-params.c b/drivers/hid/hid-uclogic-params.c index ffa14a4621ef..4c4bac6f792b 100644 --- a/drivers/hid/hid-uclogic-params.c +++ b/drivers/hid/hid-uclogic-params.c @@ -1369,8 +1369,10 @@ static int uclogic_params_ugee_v2_init_event_hooks(struct hid_device *hdev, event_hook->hdev = hdev; event_hook->size = ARRAY_SIZE(reconnect_event); event_hook->event = kmemdup(reconnect_event, event_hook->size, GFP_KERNEL); - if (!event_hook->event) + if (!event_hook->event) { + kfree(event_hook); return -ENOMEM; + } list_add_tail(&event_hook->list, &p->event_hooks->list); From 1788ef30725da53face7e311cdf62ad65fababcd Mon Sep 17 00:00:00 2001 From: Fangzhi Zuo Date: Fri, 7 Nov 2025 15:01:30 -0500 Subject: [PATCH 461/543] drm/amd/display: Fix pbn to kbps Conversion [Why] Existing routine has two conversion sequences, pbn_to_kbps and kbps_to_pbn with margin. Neither of those has a without-margin calculation. kbps_to_pbn with margin conversion includes fec overhead which has already been included in pbn_div calculation with 0.994 factor considered. It is a double counted fec overhead factor that causes potential bw loss. [How] Add without-margin calculation. Fix fec overhead double counted issue.
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3735 Reviewed-by: Aurabindo Pillai Signed-off-by: Fangzhi Zuo Signed-off-by: Ivan Lipski Tested-by: Dan Wheeler Signed-off-by: Alex Deucher (cherry picked from commit e0dec00f3d05e8c0eceaaebfdca217f8d10d380c) Cc: stable@vger.kernel.org --- .../display/amdgpu_dm/amdgpu_dm_mst_types.c | 59 ++++++++----------- 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 5e92eaa67aa3..dbd1da4d85d3 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -884,26 +884,28 @@ struct dsc_mst_fairness_params { }; #if defined(CONFIG_DRM_AMD_DC_FP) -static uint16_t get_fec_overhead_multiplier(struct dc_link *dc_link) +static uint64_t kbps_to_pbn(int kbps, bool is_peak_pbn) { - u8 link_coding_cap; - uint16_t fec_overhead_multiplier_x1000 = PBN_FEC_OVERHEAD_MULTIPLIER_8B_10B; + uint64_t effective_kbps = (uint64_t)kbps; - link_coding_cap = dc_link_dp_mst_decide_link_encoding_format(dc_link); - if (link_coding_cap == DP_128b_132b_ENCODING) - fec_overhead_multiplier_x1000 = PBN_FEC_OVERHEAD_MULTIPLIER_128B_132B; + if (is_peak_pbn) { // add 0.6% (1006/1000) overhead into effective kbps + effective_kbps *= 1006; + effective_kbps = div_u64(effective_kbps, 1000); + } - return fec_overhead_multiplier_x1000; + return (uint64_t) DIV64_U64_ROUND_UP(effective_kbps * 64, (54 * 8 * 1000)); } -static int kbps_to_peak_pbn(int kbps, uint16_t fec_overhead_multiplier_x1000) +static uint32_t pbn_to_kbps(unsigned int pbn, bool with_margin) { - u64 peak_kbps = kbps; + uint64_t pbn_effective = (uint64_t)pbn; - peak_kbps *= 1006; - peak_kbps *= fec_overhead_multiplier_x1000; - peak_kbps = div_u64(peak_kbps, 1000 * 1000); - return (int) DIV64_U64_ROUND_UP(peak_kbps * 64, (54 * 8 * 1000)); + if (with_margin) // deduct 0.6% (994/1000) 
overhead from effective pbn + pbn_effective *= (1000000 / PEAK_FACTOR_X1000); + else + pbn_effective *= 1000; + + return DIV_U64_ROUND_UP(pbn_effective * 8 * 54, 64); } static void set_dsc_configs_from_fairness_vars(struct dsc_mst_fairness_params *params, @@ -974,7 +976,7 @@ static int bpp_x16_from_pbn(struct dsc_mst_fairness_params param, int pbn) dc_dsc_get_default_config_option(param.sink->ctx->dc, &dsc_options); dsc_options.max_target_bpp_limit_override_x16 = drm_connector->display_info.max_dsc_bpp * 16; - kbps = div_u64((u64)pbn * 994 * 8 * 54, 64); + kbps = pbn_to_kbps(pbn, false); dc_dsc_compute_config( param.sink->ctx->dc->res_pool->dscs[0], &param.sink->dsc_caps.dsc_dec_caps, @@ -1003,12 +1005,11 @@ static int increase_dsc_bpp(struct drm_atomic_state *state, int link_timeslots_used; int fair_pbn_alloc; int ret = 0; - uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link); for (i = 0; i < count; i++) { if (vars[i + k].dsc_enabled) { initial_slack[i] = - kbps_to_peak_pbn(params[i].bw_range.max_kbps, fec_overhead_multiplier_x1000) - vars[i + k].pbn; + kbps_to_pbn(params[i].bw_range.max_kbps, false) - vars[i + k].pbn; bpp_increased[i] = false; remaining_to_increase += 1; } else { @@ -1104,7 +1105,6 @@ static int try_disable_dsc(struct drm_atomic_state *state, int next_index; int remaining_to_try = 0; int ret; - uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link); int var_pbn; for (i = 0; i < count; i++) { @@ -1137,7 +1137,7 @@ static int try_disable_dsc(struct drm_atomic_state *state, DRM_DEBUG_DRIVER("MST_DSC index #%d, try no compression\n", next_index); var_pbn = vars[next_index].pbn; - vars[next_index].pbn = kbps_to_peak_pbn(params[next_index].bw_range.stream_kbps, fec_overhead_multiplier_x1000); + vars[next_index].pbn = kbps_to_pbn(params[next_index].bw_range.stream_kbps, true); ret = drm_dp_atomic_find_time_slots(state, params[next_index].port->mgr, params[next_index].port, @@ -1197,7 +1197,6 @@ static int
compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, int count = 0; int i, k, ret; bool debugfs_overwrite = false; - uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link); struct drm_connector_state *new_conn_state; memset(params, 0, sizeof(params)); @@ -1278,7 +1277,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, DRM_DEBUG_DRIVER("MST_DSC Try no compression\n"); for (i = 0; i < count; i++) { vars[i + k].aconnector = params[i].aconnector; - vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps, fec_overhead_multiplier_x1000); + vars[i + k].pbn = kbps_to_pbn(params[i].bw_range.stream_kbps, false); vars[i + k].dsc_enabled = false; vars[i + k].bpp_x16 = 0; ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr, params[i].port, @@ -1300,7 +1299,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, DRM_DEBUG_DRIVER("MST_DSC Try max compression\n"); for (i = 0; i < count; i++) { if (params[i].compression_possible && params[i].clock_force_enable != DSC_CLK_FORCE_DISABLE) { - vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.min_kbps, fec_overhead_multiplier_x1000); + vars[i + k].pbn = kbps_to_pbn(params[i].bw_range.min_kbps, false); vars[i + k].dsc_enabled = true; vars[i + k].bpp_x16 = params[i].bw_range.min_target_bpp_x16; ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr, @@ -1308,7 +1307,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, if (ret < 0) return ret; } else { - vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps, fec_overhead_multiplier_x1000); + vars[i + k].pbn = kbps_to_pbn(params[i].bw_range.stream_kbps, false); vars[i + k].dsc_enabled = false; vars[i + k].bpp_x16 = 0; ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr, @@ -1763,18 +1762,6 @@ int pre_validate_dsc(struct drm_atomic_state *state, return ret; } -static uint32_t kbps_from_pbn(unsigned int pbn) -{ - 
uint64_t kbps = (uint64_t)pbn; - - kbps *= (1000000 / PEAK_FACTOR_X1000); - kbps *= 8; - kbps *= 54; - kbps /= 64; - - return (uint32_t)kbps; -} - static bool is_dsc_common_config_possible(struct dc_stream_state *stream, struct dc_dsc_bw_range *bw_range) { @@ -1873,7 +1860,7 @@ enum dc_status dm_dp_mst_is_port_support_mode( dc_link_get_highest_encoding_format(stream->link)); cur_link_settings = stream->link->verified_link_cap; root_link_bw_in_kbps = dc_link_bandwidth_kbps(aconnector->dc_link, &cur_link_settings); - virtual_channel_bw_in_kbps = kbps_from_pbn(aconnector->mst_output_port->full_pbn); + virtual_channel_bw_in_kbps = pbn_to_kbps(aconnector->mst_output_port->full_pbn, true); /* pick the end to end bw bottleneck */ end_to_end_bw_in_kbps = min(root_link_bw_in_kbps, virtual_channel_bw_in_kbps); @@ -1926,7 +1913,7 @@ enum dc_status dm_dp_mst_is_port_support_mode( immediate_upstream_port = aconnector->mst_output_port->parent->port_parent; if (immediate_upstream_port) { - virtual_channel_bw_in_kbps = kbps_from_pbn(immediate_upstream_port->full_pbn); + virtual_channel_bw_in_kbps = pbn_to_kbps(immediate_upstream_port->full_pbn, true); virtual_channel_bw_in_kbps = min(root_link_bw_in_kbps, virtual_channel_bw_in_kbps); } else { /* For topology LCT 1 case - only one mstb*/ From 9eb00b5f5697bd56baa3222c7a1426fa15bacfb5 Mon Sep 17 00:00:00 2001 From: Robert McClinton Date: Sun, 16 Nov 2025 12:33:21 -0500 Subject: [PATCH 462/543] drm/radeon: delete radeon_fence_process in is_signaled, no deadlock Delete the attempt to progress the queue when checking if fence is signaled. This avoids deadlock. dma-fence_ops::signaled can be called with the fence lock in unknown state. For radeon, the fence lock is also the wait queue lock. This can cause a self deadlock when signaled() tries to make forward progress on the wait queue. But advancing the queue is unneeded because incorrectly returning false from signaled() is perfectly acceptable. 
Link: https://github.com/brave/brave-browser/issues/49182 Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4641 Cc: Alex Deucher Signed-off-by: Robert McClinton Signed-off-by: Alex Deucher (cherry picked from commit 527ba26e50ec2ca2be9c7c82f3ad42998a75d0db) Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/radeon_fence.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index 5b5b54e876d4..167d6f122b8e 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -360,13 +360,6 @@ static bool radeon_fence_is_signaled(struct dma_fence *f) if (atomic64_read(&rdev->fence_drv[ring].last_seq) >= seq) return true; - if (down_read_trylock(&rdev->exclusive_lock)) { - radeon_fence_process(rdev, ring); - up_read(&rdev->exclusive_lock); - - if (atomic64_read(&rdev->fence_drv[ring].last_seq) >= seq) - return true; - } return false; } From d52dea485cd3c98cfeeb474cf66cf95df2ab142f Mon Sep 17 00:00:00 2001 From: Shuicheng Lin Date: Wed, 12 Nov 2025 18:10:06 +0000 Subject: [PATCH 463/543] drm/xe: Prevent BIT() overflow when handling invalid prefetch region If user provides a large value (such as 0x80) for parameter prefetch_mem_region_instance in vm_bind ioctl, it will cause BIT(prefetch_region) overflow as below: " ------------[ cut here ]------------ UBSAN: shift-out-of-bounds in drivers/gpu/drm/xe/xe_vm.c:3414:7 shift exponent 128 is too large for 64-bit type 'long unsigned int' CPU: 8 UID: 0 PID: 53120 Comm: xe_exec_system_ Tainted: G W 6.18.0-rc1-lgci-xe-kernel+ #200 PREEMPT(voluntary) Tainted: [W]=WARN Hardware name: ASUS System Product Name/PRIME Z790-P WIFI, BIOS 0812 02/24/2023 Call Trace: dump_stack_lvl+0xa0/0xc0 dump_stack+0x10/0x20 ubsan_epilogue+0x9/0x40 __ubsan_handle_shift_out_of_bounds+0x10e/0x170 ? mutex_unlock+0x12/0x20 xe_vm_bind_ioctl.cold+0x20/0x3c [xe] ... " Fix it by validating prefetch_region before the BIT() usage. 
v2: Add Closes and Cc stable kernels. (Matt) Reported-by: Koen Koning Reported-by: Peter Senna Tschudin Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6478 Cc: # v6.8+ Reviewed-by: Matthew Auld Signed-off-by: Shuicheng Lin Signed-off-by: Matthew Auld Link: https://patch.msgid.link/20251112181005.2120521-2-shuicheng.lin@intel.com (cherry picked from commit 8f565bdd14eec5611cc041dba4650e42ccdf71d9) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_vm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index ccb09ef4ec9e..cdd1dc540a59 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -3369,8 +3369,10 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe, struct xe_vm *vm, op == DRM_XE_VM_BIND_OP_PREFETCH) || XE_IOCTL_DBG(xe, prefetch_region && op != DRM_XE_VM_BIND_OP_PREFETCH) || - XE_IOCTL_DBG(xe, (prefetch_region != DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC && - !(BIT(prefetch_region) & xe->info.mem_region_mask))) || + XE_IOCTL_DBG(xe, (prefetch_region != DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC && + /* Guard against undefined shift in BIT(prefetch_region) */ + (prefetch_region >= (sizeof(xe->info.mem_region_mask) * 8) || + !(BIT(prefetch_region) & xe->info.mem_region_mask)))) || XE_IOCTL_DBG(xe, obj && op == DRM_XE_VM_BIND_OP_UNMAP) || XE_IOCTL_DBG(xe, (flags & DRM_XE_VM_BIND_FLAG_MADVISE_AUTORESET) && From 905a3468ec679293949438393de7e61310432662 Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Thu, 13 Nov 2025 15:40:39 -0800 Subject: [PATCH 464/543] drm/xe/kunit: Fix forcewake assertion in mocs test The MOCS kunit test calls KUNIT_ASSERT_TRUE_MSG() with a condition of 'true;' this prevents the assertion from ever failing. Replace KUNIT_ASSERT_TRUE_MSG with KUNIT_FAIL_AND_ABORT to get the intended failure behavior in cases where forcewake was not acquired successfully. 
Fixes: 51c0ee84e4dc ("drm/xe/tests/mocs: Hold XE_FORCEWAKE_ALL for LNCF regs") Cc: Tejas Upadhyay Cc: Gustavo Sousa Reviewed-by: Lucas De Marchi Reviewed-by: Gustavo Sousa Link: https://patch.msgid.link/20251113234038.2256106-2-matthew.d.roper@intel.com Signed-off-by: Matt Roper (cherry picked from commit 9be4f0f687048ba77428ceca11994676736507b7) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/tests/xe_mocs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/tests/xe_mocs.c b/drivers/gpu/drm/xe/tests/xe_mocs.c index 0e502feaca81..6bb278167aaf 100644 --- a/drivers/gpu/drm/xe/tests/xe_mocs.c +++ b/drivers/gpu/drm/xe/tests/xe_mocs.c @@ -49,7 +49,7 @@ static void read_l3cc_table(struct xe_gt *gt, fw_ref = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL); if (!xe_force_wake_ref_has_domain(fw_ref, XE_FORCEWAKE_ALL)) { xe_force_wake_put(gt_to_fw(gt), fw_ref); - KUNIT_ASSERT_TRUE_MSG(test, true, "Forcewake Failed.\n"); + KUNIT_FAIL_AND_ABORT(test, "Forcewake Failed.\n"); } for (i = 0; i < info->num_mocs_regs; i++) { From 27c0a54e48c658eb12fa3bcbb2892a3fa17b72af Mon Sep 17 00:00:00 2001 From: Shuicheng Lin Date: Mon, 10 Nov 2025 23:26:58 +0000 Subject: [PATCH 465/543] drm/xe: Remove duplicate DRM_EXEC selection from Kconfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are 2 identical "select DRM_EXEC" lines for DRM_XE. Remove one to clean up the configuration. 
Fixes: d490ecf57790 ("drm/xe: Rework xe_exec and the VM rebind worker to use the drm_exec helper") Cc: Thomas Hellström Cc: Lucas De Marchi Signed-off-by: Shuicheng Lin Reviewed-by: Nitin Gote Reviewed-by: Lucas De Marchi Link: https://patch.msgid.link/20251110232657.1807998-2-shuicheng.lin@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit b1aa02acd03bfef3ed39c511d33c4a4303d2f9b1) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/xe/Kconfig b/drivers/gpu/drm/xe/Kconfig index 7219f6b884b6..4b288eb3f5b0 100644 --- a/drivers/gpu/drm/xe/Kconfig +++ b/drivers/gpu/drm/xe/Kconfig @@ -13,7 +13,6 @@ config DRM_XE select TMPFS select DRM_BUDDY select DRM_CLIENT_SELECTION - select DRM_EXEC select DRM_KMS_HELPER select DRM_KUNIT_TEST_HELPERS if DRM_XE_KUNIT_TEST != n select DRM_PANEL From 5b38c22687d9287d85dd3bef2fa708bf62cf3895 Mon Sep 17 00:00:00 2001 From: Venkata Ramana Nayana Date: Fri, 7 Nov 2025 14:01:41 +0530 Subject: [PATCH 466/543] drm/xe/irq: Handle msix vector0 interrupt Currently the guc2host handler is registered as MSI-X vector 0 and, as per bspec, for an MSI-X vector 0 interrupt the driver must check the legacy registers 190008h (TILE_INT_REG), 190060h (GT INTR Identity Reg 0) and the other registers mentioned in "Interrupt Service Routine Pseudocode"; otherwise it will block the next interrupts. To overcome this issue, replace the guc2host handler with the legacy xe_irq_handler.
Fixes: da889070be7b2 ("drm/xe/irq: Separate MSI and MSI-X flows") Bspec: 62357 Signed-off-by: Venkata Ramana Nayana Reviewed-by: Balasubramani Vivekanandan Link: https://patch.msgid.link/20251107083141.2080189-1-venkata.ramana.nayana@intel.com Signed-off-by: Matt Roper (cherry picked from commit c34a14bce7090862ebe5a64abe8d85df75e62737) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_irq.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c index 870edaf69388..06976cc77918 100644 --- a/drivers/gpu/drm/xe/xe_irq.c +++ b/drivers/gpu/drm/xe/xe_irq.c @@ -847,22 +847,6 @@ static int xe_irq_msix_init(struct xe_device *xe) return 0; } -static irqreturn_t guc2host_irq_handler(int irq, void *arg) -{ - struct xe_device *xe = arg; - struct xe_tile *tile; - u8 id; - - if (!atomic_read(&xe->irq.enabled)) - return IRQ_NONE; - - for_each_tile(tile, xe, id) - xe_guc_irq_handler(&tile->primary_gt->uc.guc, - GUC_INTR_GUC2HOST); - - return IRQ_HANDLED; -} - static irqreturn_t xe_irq_msix_default_hwe_handler(int irq, void *arg) { unsigned int tile_id, gt_id; @@ -979,7 +963,7 @@ int xe_irq_msix_request_irqs(struct xe_device *xe) u16 msix; msix = GUC2HOST_MSIX; - err = xe_irq_msix_request_irq(xe, guc2host_irq_handler, xe, + err = xe_irq_msix_request_irq(xe, xe_irq_handler(xe), xe, DRIVER_NAME "-guc2host", false, &msix); if (err) return err; From 118082368c2b6ddefe6cb607efc312285148f044 Mon Sep 17 00:00:00 2001 From: Emil Tantilov Date: Mon, 13 Oct 2025 08:08:24 -0700 Subject: [PATCH 467/543] idpf: fix possible vport_config NULL pointer deref in remove Attempting to remove the driver will cause a crash in cases where the vport failed to initialize. 
Following trace is from an instance where the driver failed during an attempt to create a VF: [ 1661.543624] idpf 0000:84:00.7: Device HW Reset initiated [ 1722.923726] idpf 0000:84:00.7: Transaction timed-out (op:1 cookie:2900 vc_op:1 salt:29 timeout:60000ms) [ 1723.353263] BUG: kernel NULL pointer dereference, address: 0000000000000028 ... [ 1723.358472] RIP: 0010:idpf_remove+0x11c/0x200 [idpf] ... [ 1723.364973] Call Trace: [ 1723.365475] [ 1723.365972] pci_device_remove+0x42/0xb0 [ 1723.366481] device_release_driver_internal+0x1a9/0x210 [ 1723.366987] pci_stop_bus_device+0x6d/0x90 [ 1723.367488] pci_stop_and_remove_bus_device+0x12/0x20 [ 1723.367971] pci_iov_remove_virtfn+0xbd/0x120 [ 1723.368309] sriov_disable+0x34/0xe0 [ 1723.368643] idpf_sriov_configure+0x58/0x140 [idpf] [ 1723.368982] sriov_numvfs_store+0xda/0x1c0 Avoid the NULL pointer dereference by adding NULL pointer check for vport_config[i], before freeing user_config.q_coalesce. Fixes: e1e3fec3e34b ("idpf: preserve coalescing settings across resets") Signed-off-by: Emil Tantilov Reviewed-by: Chittim Madhu Reviewed-by: Simon Horman Tested-by: Samuel Salin Reviewed-by: Aleksandr Loktionov Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/idpf/idpf_main.c b/drivers/net/ethernet/intel/idpf/idpf_main.c index 8c46481d2e1f..8cf4ff697572 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_main.c +++ b/drivers/net/ethernet/intel/idpf/idpf_main.c @@ -63,6 +63,8 @@ static void idpf_remove(struct pci_dev *pdev) destroy_workqueue(adapter->vc_event_wq); for (i = 0; i < adapter->max_vports; i++) { + if (!adapter->vport_config[i]) + continue; kfree(adapter->vport_config[i]->user_config.q_coalesce); kfree(adapter->vport_config[i]); adapter->vport_config[i] = NULL; From 7a601324ac9828468291151d220edb47a6a82449 Mon Sep 17 00:00:00 2001 From: Andreas Kemnade Date: Tue, 18 Nov 2025 11:26:52 -0800 Subject: [PATCH 468/543] 
MAINTAINERS: sync omap devicetree maintainers with omap platform Both used to go through Tony's branches, so let's keep things together. This was missed at the time when Co-Maintainers were added. Signed-off-by: Andreas Kemnade Acked-by: Aaro Koskinen Acked-by: Tony Lindgren Reviewed-by: Roger Quadros Acked-by: Kevin Hilman Link: https://patch.msgid.link/20240915195321.1071967-1-andreas@kemnade.info Signed-off-by: Kevin Hilman Link: https://lore.kernel.org/r/20251118192652.316198-1-khilman@baylibre.com Signed-off-by: Arnd Bergmann --- MAINTAINERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 29f86c9aa27b..e661829ff723 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18779,6 +18779,10 @@ S: Maintained F: arch/arm/*omap*/*clock* OMAP DEVICE TREE SUPPORT +M: Aaro Koskinen +M: Andreas Kemnade +M: Kevin Hilman +M: Roger Quadros M: Tony Lindgren L: linux-omap@vger.kernel.org L: devicetree@vger.kernel.org From 23a5b9b12de9dcd15ebae4f1abc8814ec1c51ab0 Mon Sep 17 00:00:00 2001 From: Grzegorz Nitka Date: Mon, 20 Oct 2025 12:02:16 +0200 Subject: [PATCH 469/543] ice: fix PTP cleanup on driver removal in error path Improve the cleanup on releasing PTP resources in error path. The error case might happen either at the driver probe and PTP feature initialization or on PTP restart (errors in reset handling, NVM update, etc.). In both cases, calls to PF PTP cleanup (ice_ptp_cleanup_pf function) and 'ps_lock' mutex deinitialization were missed. Additionally, the PTP clock was not unregistered in the latter case. Keep PTP state as 'uninitialized' on init to distinguish between error scenarios and to avoid resource release duplication at driver removal.
The consequence of missing ice_ptp_cleanup_pf call is the following call trace dumped when ice_adapter object is freed (port list is not empty, as it is required at this stage): [ T93022] ------------[ cut here ]------------ [ T93022] WARNING: CPU: 10 PID: 93022 at ice/ice_adapter.c:67 ice_adapter_put+0xef/0x100 [ice] ... [ T93022] RIP: 0010:ice_adapter_put+0xef/0x100 [ice] ... [ T93022] Call Trace: [ T93022] [ T93022] ? ice_adapter_put+0xef/0x100 [ice 33d2647ad4f6d866d41eefff1806df37c68aef0c] [ T93022] ? __warn.cold+0xb0/0x10e [ T93022] ? ice_adapter_put+0xef/0x100 [ice 33d2647ad4f6d866d41eefff1806df37c68aef0c] [ T93022] ? report_bug+0xd8/0x150 [ T93022] ? handle_bug+0xe9/0x110 [ T93022] ? exc_invalid_op+0x17/0x70 [ T93022] ? asm_exc_invalid_op+0x1a/0x20 [ T93022] ? ice_adapter_put+0xef/0x100 [ice 33d2647ad4f6d866d41eefff1806df37c68aef0c] [ T93022] pci_device_remove+0x42/0xb0 [ T93022] device_release_driver_internal+0x19f/0x200 [ T93022] driver_detach+0x48/0x90 [ T93022] bus_remove_driver+0x70/0xf0 [ T93022] pci_unregister_driver+0x42/0xb0 [ T93022] ice_module_exit+0x10/0xdb0 [ice 33d2647ad4f6d866d41eefff1806df37c68aef0c] ... 
[ T93022] ---[ end trace 0000000000000000 ]--- [ T93022] ice: module unloaded Fixes: e800654e85b5 ("ice: Use ice_adapter for PTP shared data instead of auxdev") Signed-off-by: Grzegorz Nitka Reviewed-by: Aleksandr Loktionov Reviewed-by: Paul Menzel Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ptp.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index fb0f6365a6d6..8ec0f7d0fceb 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -3246,7 +3246,7 @@ void ice_ptp_init(struct ice_pf *pf) err = ice_ptp_init_port(pf, &ptp->port); if (err) - goto err_exit; + goto err_clean_pf; /* Start the PHY timestamping block */ ice_ptp_reset_phy_timestamping(pf); @@ -3263,13 +3263,19 @@ void ice_ptp_init(struct ice_pf *pf) dev_info(ice_pf_to_dev(pf), "PTP init successful\n"); return; +err_clean_pf: + mutex_destroy(&ptp->port.ps_lock); + ice_ptp_cleanup_pf(pf); err_exit: /* If we registered a PTP clock, release it */ if (pf->ptp.clock) { ptp_clock_unregister(ptp->clock); pf->ptp.clock = NULL; } - ptp->state = ICE_PTP_ERROR; + /* Keep ICE_PTP_UNINIT state to avoid ambiguity at driver unload + * and to avoid duplicated resources release. 
+ */ + ptp->state = ICE_PTP_UNINIT; dev_err(ice_pf_to_dev(pf), "PTP failed %d\n", err); } @@ -3282,9 +3288,19 @@ void ice_ptp_init(struct ice_pf *pf) */ void ice_ptp_release(struct ice_pf *pf) { - if (pf->ptp.state != ICE_PTP_READY) + if (pf->ptp.state == ICE_PTP_UNINIT) return; + if (pf->ptp.state != ICE_PTP_READY) { + mutex_destroy(&pf->ptp.port.ps_lock); + ice_ptp_cleanup_pf(pf); + if (pf->ptp.clock) { + ptp_clock_unregister(pf->ptp.clock); + pf->ptp.clock = NULL; + } + return; + } + pf->ptp.state = ICE_PTP_UNINIT; /* Disable timestamping for both Tx and Rx */ From 97ea34defbb57bfaf71ce487b1b0865ffd186e81 Mon Sep 17 00:00:00 2001 From: Jared Kangas Date: Tue, 11 Nov 2025 13:54:11 -0800 Subject: [PATCH 470/543] pinctrl: s32cc: fix uninitialized memory in s32_pinctrl_desc s32_pinctrl_desc is allocated with devm_kmalloc(), but not all of its fields are initialized. Notably, num_custom_params is used in pinconf_generic_parse_dt_config(), resulting in intermittent allocation errors, such as the following splat when probing i2c-imx: WARNING: CPU: 0 PID: 176 at mm/page_alloc.c:4795 __alloc_pages_noprof+0x290/0x300 [...] Hardware name: NXP S32G3 Reference Design Board 3 (S32G-VNP-RDB3) (DT) [...] Call trace: __alloc_pages_noprof+0x290/0x300 (P) ___kmalloc_large_node+0x84/0x168 __kmalloc_large_node_noprof+0x34/0x120 __kmalloc_noprof+0x2ac/0x378 pinconf_generic_parse_dt_config+0x68/0x1a0 s32_dt_node_to_map+0x104/0x248 dt_to_map_one_config+0x154/0x1d8 pinctrl_dt_to_map+0x12c/0x280 create_pinctrl+0x6c/0x270 pinctrl_get+0xc0/0x170 devm_pinctrl_get+0x50/0xa0 pinctrl_bind_pins+0x60/0x2a0 really_probe+0x60/0x3a0 [...] __platform_driver_register+0x2c/0x40 i2c_adap_imx_init+0x28/0xff8 [i2c_imx] [...] 
This results in later parse failures that can cause issues in dependent drivers: s32g-siul2-pinctrl 4009c240.pinctrl: /soc@0/pinctrl@4009c240/i2c0-pins/i2c0-grp0: could not parse node property s32g-siul2-pinctrl 4009c240.pinctrl: /soc@0/pinctrl@4009c240/i2c0-pins/i2c0-grp0: could not parse node property [...] pca953x 0-0022: failed writing register: -6 i2c i2c-0: IMX I2C adapter registered s32g-siul2-pinctrl 4009c240.pinctrl: /soc@0/pinctrl@4009c240/i2c2-pins/i2c2-grp0: could not parse node property s32g-siul2-pinctrl 4009c240.pinctrl: /soc@0/pinctrl@4009c240/i2c2-pins/i2c2-grp0: could not parse node property i2c i2c-1: IMX I2C adapter registered s32g-siul2-pinctrl 4009c240.pinctrl: /soc@0/pinctrl@4009c240/i2c4-pins/i2c4-grp0: could not parse node property s32g-siul2-pinctrl 4009c240.pinctrl: /soc@0/pinctrl@4009c240/i2c4-pins/i2c4-grp0: could not parse node property i2c i2c-2: IMX I2C adapter registered Fix this by initializing s32_pinctrl_desc with devm_kzalloc() instead of devm_kmalloc() in s32_pinctrl_probe(), which sets the previously uninitialized fields to zero. 
Fixes: fd84aaa8173d ("pinctrl: add NXP S32 SoC family support") Signed-off-by: Jared Kangas Tested-by: Jan Petrous (OSS) Signed-off-by: Linus Walleij --- drivers/pinctrl/nxp/pinctrl-s32cc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/nxp/pinctrl-s32cc.c b/drivers/pinctrl/nxp/pinctrl-s32cc.c index 501eb296c760..51ecb8d0fb7e 100644 --- a/drivers/pinctrl/nxp/pinctrl-s32cc.c +++ b/drivers/pinctrl/nxp/pinctrl-s32cc.c @@ -951,7 +951,7 @@ int s32_pinctrl_probe(struct platform_device *pdev, spin_lock_init(&ipctl->gpio_configs_lock); s32_pinctrl_desc = - devm_kmalloc(&pdev->dev, sizeof(*s32_pinctrl_desc), GFP_KERNEL); + devm_kzalloc(&pdev->dev, sizeof(*s32_pinctrl_desc), GFP_KERNEL); if (!s32_pinctrl_desc) return -ENOMEM; From 6010d4d8b55b5d3ae1efb5502c54312e15c14f21 Mon Sep 17 00:00:00 2001 From: Jared Kangas Date: Tue, 11 Nov 2025 13:54:12 -0800 Subject: [PATCH 471/543] pinctrl: s32cc: initialize gpio_pin_config::list after kmalloc() s32_pmx_gpio_request_enable() does not initialize the newly-allocated gpio_pin_config::list before adding it to s32_pinctrl::gpio_configs. This could result in a linked list corruption. Initialize the new list_head with INIT_LIST_HEAD() to fix this. 
Fixes: fd84aaa8173d ("pinctrl: add NXP S32 SoC family support") Signed-off-by: Jared Kangas Signed-off-by: Linus Walleij --- drivers/pinctrl/nxp/pinctrl-s32cc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pinctrl/nxp/pinctrl-s32cc.c b/drivers/pinctrl/nxp/pinctrl-s32cc.c index 51ecb8d0fb7e..35511f83d056 100644 --- a/drivers/pinctrl/nxp/pinctrl-s32cc.c +++ b/drivers/pinctrl/nxp/pinctrl-s32cc.c @@ -392,6 +392,7 @@ static int s32_pmx_gpio_request_enable(struct pinctrl_dev *pctldev, gpio_pin->pin_id = offset; gpio_pin->config = config; + INIT_LIST_HEAD(&gpio_pin->list); spin_lock_irqsave(&ipctl->gpio_configs_lock, flags); list_add(&gpio_pin->list, &ipctl->gpio_configs); From f94c1a114ac209977bdf5ca841b98424295ab1f0 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 17 Nov 2025 14:05:49 +0200 Subject: [PATCH 472/543] devlink: rate: Unset parent pointer in devl_rate_nodes_destroy The function devl_rate_nodes_destroy is documented to "Unset parent for all rate objects". However, it was only calling the driver-specific `rate_leaf_parent_set` or `rate_node_parent_set` ops and decrementing the parent's refcount, without actually setting the `devlink_rate->parent` pointer to NULL. This leaves a dangling pointer in the `devlink_rate` struct, which causes refcount errors in netdevsim[1] and mlx5[2]. In addition, this is inconsistent with the behavior of `devlink_nl_rate_parent_node_set`, where the parent pointer is correctly cleared. This patch fixes the issue by explicitly setting `devlink_rate->parent` to NULL after notifying the driver, thus fulfilling the function's documented behavior for all rate objects.
[1] repro steps: echo 1 > /sys/bus/netdevsim/new_device devlink dev eswitch set netdevsim/netdevsim1 mode switchdev echo 1 > /sys/bus/netdevsim/devices/netdevsim1/sriov_numvfs devlink port function rate add netdevsim/netdevsim1/test_node devlink port function rate set netdevsim/netdevsim1/128 parent test_node echo 1 > /sys/bus/netdevsim/del_device dmesg: refcount_t: decrement hit 0; leaking memory. WARNING: CPU: 8 PID: 1530 at lib/refcount.c:31 refcount_warn_saturate+0x42/0xe0 CPU: 8 UID: 0 PID: 1530 Comm: bash Not tainted 6.18.0-rc4+ #1 NONE Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:refcount_warn_saturate+0x42/0xe0 Call Trace: devl_rate_leaf_destroy+0x8d/0x90 __nsim_dev_port_del+0x6c/0x70 [netdevsim] nsim_dev_reload_destroy+0x11c/0x140 [netdevsim] nsim_drv_remove+0x2b/0xb0 [netdevsim] device_release_driver_internal+0x194/0x1f0 bus_remove_device+0xc6/0x130 device_del+0x159/0x3c0 device_unregister+0x1a/0x60 del_device_store+0x111/0x170 [netdevsim] kernfs_fop_write_iter+0x12e/0x1e0 vfs_write+0x215/0x3d0 ksys_write+0x5f/0xd0 do_syscall_64+0x55/0x10f0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 [2] devlink dev eswitch set pci/0000:08:00.0 mode switchdev devlink port add pci/0000:08:00.0 flavour pcisf pfnum 0 sfnum 1000 devlink port function rate add pci/0000:08:00.0/group1 devlink port function rate set pci/0000:08:00.0/32768 parent group1 modprobe -r mlx5_ib mlx5_fwctl mlx5_core dmesg: refcount_t: decrement hit 0; leaking memory. 
WARNING: CPU: 7 PID: 16151 at lib/refcount.c:31 refcount_warn_saturate+0x42/0xe0 CPU: 7 UID: 0 PID: 16151 Comm: bash Not tainted 6.17.0-rc7_for_upstream_min_debug_2025_10_02_12_44 #1 NONE Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 RIP: 0010:refcount_warn_saturate+0x42/0xe0 Call Trace: devl_rate_leaf_destroy+0x8d/0x90 mlx5_esw_offloads_devlink_port_unregister+0x33/0x60 [mlx5_core] mlx5_esw_offloads_unload_rep+0x3f/0x50 [mlx5_core] mlx5_eswitch_unload_sf_vport+0x40/0x90 [mlx5_core] mlx5_sf_esw_event+0xc4/0x120 [mlx5_core] notifier_call_chain+0x33/0xa0 blocking_notifier_call_chain+0x3b/0x50 mlx5_eswitch_disable_locked+0x50/0x110 [mlx5_core] mlx5_eswitch_disable+0x63/0x90 [mlx5_core] mlx5_unload+0x1d/0x170 [mlx5_core] mlx5_uninit_one+0xa2/0x130 [mlx5_core] remove_one+0x78/0xd0 [mlx5_core] pci_device_remove+0x39/0xa0 device_release_driver_internal+0x194/0x1f0 unbind_store+0x99/0xa0 kernfs_fop_write_iter+0x12e/0x1e0 vfs_write+0x215/0x3d0 ksys_write+0x5f/0xd0 do_syscall_64+0x53/0x1f0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Fixes: d75559845078 ("devlink: Allow setting parent node of rate objects") Signed-off-by: Shay Drory Reviewed-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1763381149-1234377-1-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- net/devlink/rate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/devlink/rate.c b/net/devlink/rate.c index 264fb82cba19..d157a8419bca 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -828,13 +828,15 @@ void devl_rate_nodes_destroy(struct devlink *devlink) if (!devlink_rate->parent) continue; - refcount_dec(&devlink_rate->parent->refcnt); if (devlink_rate_is_leaf(devlink_rate)) ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); else if (devlink_rate_is_node(devlink_rate)) ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, 
NULL); + + refcount_dec(&devlink_rate->parent->refcnt); + devlink_rate->parent = NULL; } list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate)) { From 426358d9be7ce3518966422f87b96f1bad27295f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 17 Nov 2025 10:07:44 +0000 Subject: [PATCH 473/543] mptcp: fix a race in mptcp_pm_del_add_timer() mptcp_pm_del_add_timer() can call sk_stop_timer_sync(sk, &entry->add_timer) while another might have free entry already, as reported by syzbot. Add RCU protection to fix this issue. Also change confusing add_timer variable with stop_timer boolean. syzbot report: BUG: KASAN: slab-use-after-free in __timer_delete_sync+0x372/0x3f0 kernel/time/timer.c:1616 Read of size 4 at addr ffff8880311e4150 by task kworker/1:1/44 CPU: 1 UID: 0 PID: 44 Comm: kworker/1:1 Not tainted syzkaller #0 PREEMPT_{RT,(full)} Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/02/2025 Workqueue: events mptcp_worker Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xca/0x240 mm/kasan/report.c:482 kasan_report+0x118/0x150 mm/kasan/report.c:595 __timer_delete_sync+0x372/0x3f0 kernel/time/timer.c:1616 sk_stop_timer_sync+0x1b/0x90 net/core/sock.c:3631 mptcp_pm_del_add_timer+0x283/0x310 net/mptcp/pm.c:362 mptcp_incoming_options+0x1357/0x1f60 net/mptcp/options.c:1174 tcp_data_queue+0xca/0x6450 net/ipv4/tcp_input.c:5361 tcp_rcv_established+0x1335/0x2670 net/ipv4/tcp_input.c:6441 tcp_v4_do_rcv+0x98b/0xbf0 net/ipv4/tcp_ipv4.c:1931 tcp_v4_rcv+0x252a/0x2dc0 net/ipv4/tcp_ipv4.c:2374 ip_protocol_deliver_rcu+0x221/0x440 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x3bb/0x6f0 net/ipv4/ip_input.c:239 NF_HOOK+0x30c/0x3a0 include/linux/netfilter.h:318 NF_HOOK+0x30c/0x3a0 include/linux/netfilter.h:318 __netif_receive_skb_one_core net/core/dev.c:6079 [inline] __netif_receive_skb+0x143/0x380 
net/core/dev.c:6192 process_backlog+0x31e/0x900 net/core/dev.c:6544 __napi_poll+0xb6/0x540 net/core/dev.c:7594 napi_poll net/core/dev.c:7657 [inline] net_rx_action+0x5f7/0xda0 net/core/dev.c:7784 handle_softirqs+0x22f/0x710 kernel/softirq.c:622 __do_softirq kernel/softirq.c:656 [inline] __local_bh_enable_ip+0x1a0/0x2e0 kernel/softirq.c:302 mptcp_pm_send_ack net/mptcp/pm.c:210 [inline] mptcp_pm_addr_send_ack+0x41f/0x500 net/mptcp/pm.c:-1 mptcp_pm_worker+0x174/0x320 net/mptcp/pm.c:1002 mptcp_worker+0xd5/0x1170 net/mptcp/protocol.c:2762 process_one_work kernel/workqueue.c:3263 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3346 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3427 kthread+0x711/0x8a0 kernel/kthread.c:463 ret_from_fork+0x4bc/0x870 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 Allocated by task 44: kasan_save_stack mm/kasan/common.c:56 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:77 poison_kmalloc_redzone mm/kasan/common.c:400 [inline] __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:417 kasan_kmalloc include/linux/kasan.h:262 [inline] __kmalloc_cache_noprof+0x1ef/0x6c0 mm/slub.c:5748 kmalloc_noprof include/linux/slab.h:957 [inline] mptcp_pm_alloc_anno_list+0x104/0x460 net/mptcp/pm.c:385 mptcp_pm_create_subflow_or_signal_addr+0xf9d/0x1360 net/mptcp/pm_kernel.c:355 mptcp_pm_nl_fully_established net/mptcp/pm_kernel.c:409 [inline] __mptcp_pm_kernel_worker+0x417/0x1ef0 net/mptcp/pm_kernel.c:1529 mptcp_pm_worker+0x1ee/0x320 net/mptcp/pm.c:1008 mptcp_worker+0xd5/0x1170 net/mptcp/protocol.c:2762 process_one_work kernel/workqueue.c:3263 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3346 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3427 kthread+0x711/0x8a0 kernel/kthread.c:463 ret_from_fork+0x4bc/0x870 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 Freed by task 6630: kasan_save_stack mm/kasan/common.c:56 [inline] kasan_save_track+0x3e/0x80 
mm/kasan/common.c:77 __kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:587 kasan_save_free_info mm/kasan/kasan.h:406 [inline] poison_slab_object mm/kasan/common.c:252 [inline] __kasan_slab_free+0x5c/0x80 mm/kasan/common.c:284 kasan_slab_free include/linux/kasan.h:234 [inline] slab_free_hook mm/slub.c:2523 [inline] slab_free mm/slub.c:6611 [inline] kfree+0x197/0x950 mm/slub.c:6818 mptcp_remove_anno_list_by_saddr+0x2d/0x40 net/mptcp/pm.c:158 mptcp_pm_flush_addrs_and_subflows net/mptcp/pm_kernel.c:1209 [inline] mptcp_nl_flush_addrs_list net/mptcp/pm_kernel.c:1240 [inline] mptcp_pm_nl_flush_addrs_doit+0x593/0xbb0 net/mptcp/pm_kernel.c:1281 genl_family_rcv_msg_doit+0x215/0x300 net/netlink/genetlink.c:1115 genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0x60e/0x790 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x208/0x470 net/netlink/af_netlink.c:2552 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1320 [inline] netlink_unicast+0x846/0xa10 net/netlink/af_netlink.c:1346 netlink_sendmsg+0x805/0xb30 net/netlink/af_netlink.c:1896 sock_sendmsg_nosec net/socket.c:727 [inline] __sock_sendmsg+0x21c/0x270 net/socket.c:742 ____sys_sendmsg+0x508/0x820 net/socket.c:2630 ___sys_sendmsg+0x21f/0x2a0 net/socket.c:2684 __sys_sendmsg net/socket.c:2716 [inline] __do_sys_sendmsg net/socket.c:2721 [inline] __se_sys_sendmsg net/socket.c:2719 [inline] __x64_sys_sendmsg+0x1a1/0x260 net/socket.c:2719 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0xfa0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Cc: stable@vger.kernel.org Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout") Reported-by: syzbot+2a6fbf0f0530375968df@syzkaller.appspotmail.com Closes: https://lore.kernel.org/691ad3c3.a70a0220.f6df1.0004.GAE@google.com Signed-off-by: Eric Dumazet Cc: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Link: 
https://patch.msgid.link/20251117100745.1913963-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 2ff1b9499568..9604b91902b8 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -18,6 +18,7 @@ struct mptcp_pm_add_entry { u8 retrans_times; struct timer_list add_timer; struct mptcp_sock *sock; + struct rcu_head rcu; }; static DEFINE_SPINLOCK(mptcp_pm_list_lock); @@ -155,7 +156,7 @@ bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, entry = mptcp_pm_del_add_timer(msk, addr, false); ret = entry; - kfree(entry); + kfree_rcu(entry, rcu); return ret; } @@ -345,22 +346,27 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, { struct mptcp_pm_add_entry *entry; struct sock *sk = (struct sock *)msk; - struct timer_list *add_timer = NULL; + bool stop_timer = false; + + rcu_read_lock(); spin_lock_bh(&msk->pm.lock); entry = mptcp_lookup_anno_list_by_saddr(msk, addr); if (entry && (!check_id || entry->addr.id == addr->id)) { entry->retrans_times = ADD_ADDR_RETRANS_MAX; - add_timer = &entry->add_timer; + stop_timer = true; } if (!check_id && entry) list_del(&entry->list); spin_unlock_bh(&msk->pm.lock); - /* no lock, because sk_stop_timer_sync() is calling timer_delete_sync() */ - if (add_timer) - sk_stop_timer_sync(sk, add_timer); + /* Note: entry might have been removed by another thread. + * We hold rcu_read_lock() to ensure it is not freed under us. 
+ */ + if (stop_timer) + sk_stop_timer_sync(sk, &entry->add_timer); + rcu_read_unlock(); return entry; } @@ -415,7 +421,7 @@ static void mptcp_pm_free_anno_list(struct mptcp_sock *msk) list_for_each_entry_safe(entry, tmp, &free_list, list) { sk_stop_timer_sync(sk, &entry->add_timer); - kfree(entry); + kfree_rcu(entry, rcu); } } From d47515af6cccd7484d8b0870376858c9848a18ec Mon Sep 17 00:00:00 2001 From: Pradyumn Rahar Date: Mon, 17 Nov 2025 14:16:08 +0200 Subject: [PATCH 474/543] net/mlx5: Clean up only new IRQ glue on request_irq() failure The mlx5_irq_alloc() function can inadvertently free the entire rmap and end up in a crash[1] when the other threads tries to access this, when request_irq() fails due to exhausted IRQ vectors. This commit modifies the cleanup to remove only the specific IRQ mapping that was just added. This prevents removal of other valid mappings and ensures precise cleanup of the failed IRQ allocation's associated glue object. Note: This error is observed when both fwctl and rds configs are enabled. [1] mlx5_core 0000:05:00.0: Successfully registered panic handler for port 1 mlx5_core 0000:05:00.0: mlx5_irq_alloc:293:(pid 66740): Failed to request irq. err = -28 infiniband mlx5_0: mlx5_ib_test_wc:290:(pid 66740): Error -28 while trying to test write-combining support mlx5_core 0000:05:00.0: Successfully unregistered panic handler for port 1 mlx5_core 0000:06:00.0: Successfully registered panic handler for port 1 mlx5_core 0000:06:00.0: mlx5_irq_alloc:293:(pid 66740): Failed to request irq. err = -28 infiniband mlx5_0: mlx5_ib_test_wc:290:(pid 66740): Error -28 while trying to test write-combining support mlx5_core 0000:06:00.0: Successfully unregistered panic handler for port 1 mlx5_core 0000:03:00.0: mlx5_irq_alloc:293:(pid 28895): Failed to request irq. err = -28 mlx5_core 0000:05:00.0: mlx5_irq_alloc:293:(pid 28895): Failed to request irq. 
err = -28 general protection fault, probably for non-canonical address 0xe277a58fde16f291: 0000 [#1] SMP NOPTI RIP: 0010:free_irq_cpu_rmap+0x23/0x7d Call Trace: ? show_trace_log_lvl+0x1d6/0x2f9 ? show_trace_log_lvl+0x1d6/0x2f9 ? mlx5_irq_alloc.cold+0x5d/0xf3 [mlx5_core] ? __die_body.cold+0x8/0xa ? die_addr+0x39/0x53 ? exc_general_protection+0x1c4/0x3e9 ? dev_vprintk_emit+0x5f/0x90 ? asm_exc_general_protection+0x22/0x27 ? free_irq_cpu_rmap+0x23/0x7d mlx5_irq_alloc.cold+0x5d/0xf3 [mlx5_core] irq_pool_request_vector+0x7d/0x90 [mlx5_core] mlx5_irq_request+0x2e/0xe0 [mlx5_core] mlx5_irq_request_vector+0xad/0xf7 [mlx5_core] comp_irq_request_pci+0x64/0xf0 [mlx5_core] create_comp_eq+0x71/0x385 [mlx5_core] ? mlx5e_open_xdpsq+0x11c/0x230 [mlx5_core] mlx5_comp_eqn_get+0x72/0x90 [mlx5_core] ? xas_load+0x8/0x91 mlx5_comp_irqn_get+0x40/0x90 [mlx5_core] mlx5e_open_channel+0x7d/0x3c7 [mlx5_core] mlx5e_open_channels+0xad/0x250 [mlx5_core] mlx5e_open_locked+0x3e/0x110 [mlx5_core] mlx5e_open+0x23/0x70 [mlx5_core] __dev_open+0xf1/0x1a5 __dev_change_flags+0x1e1/0x249 dev_change_flags+0x21/0x5c do_setlink+0x28b/0xcc4 ? __nla_parse+0x22/0x3d ? inet6_validate_link_af+0x6b/0x108 ? cpumask_next+0x1f/0x35 ? __snmp6_fill_stats64.constprop.0+0x66/0x107 ? __nla_validate_parse+0x48/0x1e6 __rtnl_newlink+0x5ff/0xa57 ? kmem_cache_alloc_trace+0x164/0x2ce rtnl_newlink+0x44/0x6e rtnetlink_rcv_msg+0x2bb/0x362 ? __netlink_sendskb+0x4c/0x6c ? netlink_unicast+0x28f/0x2ce ? rtnl_calcit.isra.0+0x150/0x146 netlink_rcv_skb+0x5f/0x112 netlink_unicast+0x213/0x2ce netlink_sendmsg+0x24f/0x4d9 __sock_sendmsg+0x65/0x6a ____sys_sendmsg+0x28f/0x2c9 ? 
import_iovec+0x17/0x2b ___sys_sendmsg+0x97/0xe0 __sys_sendmsg+0x81/0xd8 do_syscall_64+0x35/0x87 entry_SYSCALL_64_after_hwframe+0x6e/0x0 RIP: 0033:0x7fc328603727 Code: c3 66 90 41 54 41 89 d4 55 48 89 f5 53 89 fb 48 83 ec 10 e8 0b ed ff ff 44 89 e2 48 89 ee 89 df 41 89 c0 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 35 44 89 c7 48 89 44 24 08 e8 44 ed ff ff 48 RSP: 002b:00007ffe8eb3f1a0 EFLAGS: 00000293 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 000000000000000d RCX: 00007fc328603727 RDX: 0000000000000000 RSI: 00007ffe8eb3f1f0 RDI: 000000000000000d RBP: 00007ffe8eb3f1f0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000000 R13: 0000000000000000 R14: 00007ffe8eb3f3c8 R15: 00007ffe8eb3f3bc ---[ end trace f43ce73c3c2b13a2 ]--- RIP: 0010:free_irq_cpu_rmap+0x23/0x7d Code: 0f 1f 80 00 00 00 00 48 85 ff 74 6b 55 48 89 fd 53 66 83 7f 06 00 74 24 31 db 48 8b 55 08 0f b7 c3 48 8b 04 c2 48 85 c0 74 09 <8b> 38 31 f6 e8 c4 0a b8 ff 83 c3 01 66 3b 5d 06 72 de b8 ff ff ff RSP: 0018:ff384881640eaca0 EFLAGS: 00010282 RAX: e277a58fde16f291 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ff2335e2e20b3600 RSI: 0000000000000000 RDI: ff2335e2e20b3400 RBP: ff2335e2e20b3400 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 00000000ffffffe4 R12: ff384881640ead88 R13: ff2335c3760751e0 R14: ff2335e2e1672200 R15: ff2335c3760751f8 FS: 00007fc32ac22480(0000) GS:ff2335e2d6e00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f651ab54000 CR3: 00000029f1206003 CR4: 0000000000771ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Kernel panic - not syncing: Fatal exception Kernel Offset: 0x1dc00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) kvm-guest: disable async PF for cpu 0 Fixes: 3354822cde5a ("net/mlx5: Use dynamic msix vectors 
allocation") Signed-off-by: Mohith Kumar Thummaluru Tested-by: Mohith Kumar Thummaluru Reviewed-by: Moshe Shemesh Reviewed-by: Shay Drori Signed-off-by: Pradyumn Rahar Reviewed-by: Jacob Keller Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1763381768-1234998-1-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index e18a850c615c..aa3b5878e3da 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -324,10 +324,8 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i, free_irq(irq->map.virq, &irq->nh); err_req_irq: #ifdef CONFIG_RFS_ACCEL - if (i && rmap && *rmap) { - free_irq_cpu_rmap(*rmap); - *rmap = NULL; - } + if (i && rmap && *rmap) + irq_cpu_rmap_remove(*rmap, irq->map.virq); err_irq_rmap: #endif if (i && pci_msix_can_alloc_dyn(dev->pdev)) From 7bf3a476ce43833c49fceddbe94ff3472e04e9bc Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Nov 2025 17:47:10 +0000 Subject: [PATCH 475/543] af_unix: Read sk_peek_offset() again after sleeping in unix_stream_read_generic(). Miao Wang reported a bug of SO_PEEK_OFF on AF_UNIX SOCK_STREAM socket. The unexpected behaviour is triggered when the peek offset is larger than the recv queue and the thread is unblocked by new data. Let's assume a socket which has "aaaa" in the recv queue and the peek offset is 4. First, unix_stream_read_generic() reads the offset 4 and skips the skb(s) of "aaaa" with the code below: skip = max(sk_peek_offset(sk, flags), 0); /* @skip is 4. */ do { ... while (skip >= unix_skb_len(skb)) { skip -= unix_skb_len(skb); ... skb = skb_peek_next(skb, &sk->sk_receive_queue); if (!skb) goto again; /* @skip is 0. 
*/ } The thread jumps to the 'again' label and goes to sleep since new data has not arrived yet. Later, new data "bbbb" unblocks the thread, and the thread jumps to the 'redo:' label to restart the entire process from the first skb in the recv queue. do { ... redo: ... last = skb = skb_peek(&sk->sk_receive_queue); ... again: if (skb == NULL) { ... timeo = unix_stream_data_wait(sk, timeo, last, last_len, freezable); ... goto redo; /* @skip is 0 !! */ However, the peek offset is not reset in the path. If the buffer size is 8, recv() will return "aaaabbbb" without skipping any data, and the final offset will be 12 (the original offset 4 + peeked skbs' length 8). After sleeping in unix_stream_read_generic(), we have to fetch the peek offset again. Let's move the redo label before mutex_lock(&u->iolock). Fixes: 9f389e35674f ("af_unix: return data from multiple SKBs on recv() with MSG_PEEK flag") Reported-by: Miao Wang Closes: https://lore.kernel.org/netdev/3B969F90-F51F-4B9D-AB1A-994D9A54D460@gmail.com/ Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20251117174740.3684604-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 768098dec231..833c3616d2a2 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2954,6 +2954,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, u = unix_sk(sk); +redo: /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ @@ -2965,7 +2966,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, struct sk_buff *skb, *last; int chunk; -redo: unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) { err = -ECONNRESET; @@ -3015,7 +3015,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, goto out; } - mutex_lock(&u->iolock); goto redo; unlock: unix_state_unlock(sk); From 
e1bb28bf13f41af5d7cc48359d1755cbcda4d502 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Nov 2025 17:47:11 +0000 Subject: [PATCH 476/543] selftest: af_unix: Add test for SO_PEEK_OFF. The test covers various cases to verify SO_PEEK_OFF behaviour for all AF_UNIX socket types. two_chunks_blocking and two_chunks_overlap_blocking reproduce the issue mentioned in the previous patch. Without the patch, the two tests fail: # RUN so_peek_off.stream.two_chunks_blocking ... # so_peek_off.c:121:two_chunks_blocking:Expected 'bbbb' == 'aaaabbbb'. # two_chunks_blocking: Test terminated by assertion # FAIL so_peek_off.stream.two_chunks_blocking not ok 3 so_peek_off.stream.two_chunks_blocking # RUN so_peek_off.stream.two_chunks_overlap_blocking ... # so_peek_off.c:159:two_chunks_overlap_blocking:Expected 'bbbb' == 'aaaabbbb'. # two_chunks_overlap_blocking: Test terminated by assertion # FAIL so_peek_off.stream.two_chunks_overlap_blocking not ok 5 so_peek_off.stream.two_chunks_overlap_blocking With the patch, all tests pass: # PASSED: 15 / 15 tests passed. 
# Totals: pass:15 fail:0 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20251117174740.3684604-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/af_unix/Makefile | 1 + .../selftests/net/af_unix/so_peek_off.c | 162 ++++++++++++++++++ 3 files changed, 164 insertions(+) create mode 100644 tools/testing/selftests/net/af_unix/so_peek_off.c diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 439101b518ee..8f9850a71f54 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -45,6 +45,7 @@ skf_net_off socket so_incoming_cpu so_netns_cookie +so_peek_off so_txtime so_rcv_listener stress_reuseport_listen diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile index de805cbbdf69..528d14c598bb 100644 --- a/tools/testing/selftests/net/af_unix/Makefile +++ b/tools/testing/selftests/net/af_unix/Makefile @@ -6,6 +6,7 @@ TEST_GEN_PROGS := \ scm_inq \ scm_pidfd \ scm_rights \ + so_peek_off \ unix_connect \ # end of TEST_GEN_PROGS diff --git a/tools/testing/selftests/net/af_unix/so_peek_off.c b/tools/testing/selftests/net/af_unix/so_peek_off.c new file mode 100644 index 000000000000..1a77728128e5 --- /dev/null +++ b/tools/testing/selftests/net/af_unix/so_peek_off.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2025 Google LLC */ + +#include +#include + +#include + +#include "../../kselftest_harness.h" + +FIXTURE(so_peek_off) +{ + int fd[2]; /* 0: sender, 1: receiver */ +}; + +FIXTURE_VARIANT(so_peek_off) +{ + int type; +}; + +FIXTURE_VARIANT_ADD(so_peek_off, stream) +{ + .type = SOCK_STREAM, +}; + +FIXTURE_VARIANT_ADD(so_peek_off, dgram) +{ + .type = SOCK_DGRAM, +}; + +FIXTURE_VARIANT_ADD(so_peek_off, seqpacket) +{ + .type = SOCK_SEQPACKET, +}; + +FIXTURE_SETUP(so_peek_off) +{ + struct timeval timeout 
= { + .tv_sec = 0, + .tv_usec = 3000, + }; + int ret; + + ret = socketpair(AF_UNIX, variant->type, 0, self->fd); + ASSERT_EQ(0, ret); + + ret = setsockopt(self->fd[1], SOL_SOCKET, SO_RCVTIMEO_NEW, + &timeout, sizeof(timeout)); + ASSERT_EQ(0, ret); + + ret = setsockopt(self->fd[1], SOL_SOCKET, SO_PEEK_OFF, + &(int){0}, sizeof(int)); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(so_peek_off) +{ + close_range(self->fd[0], self->fd[1], 0); +} + +#define sendeq(fd, str, flags) \ + do { \ + int bytes, len = strlen(str); \ + \ + bytes = send(fd, str, len, flags); \ + ASSERT_EQ(len, bytes); \ + } while (0) + +#define recveq(fd, str, buflen, flags) \ + do { \ + char buf[(buflen) + 1] = {}; \ + int bytes; \ + \ + bytes = recv(fd, buf, buflen, flags); \ + ASSERT_NE(-1, bytes); \ + ASSERT_STREQ(str, buf); \ + } while (0) + +#define async \ + for (pid_t pid = (pid = fork(), \ + pid < 0 ? \ + __TH_LOG("Failed to start async {}"), \ + _metadata->exit_code = KSFT_FAIL, \ + __bail(1, _metadata), \ + 0xdead : \ + pid); \ + !pid; exit(0)) + +TEST_F(so_peek_off, single_chunk) +{ + sendeq(self->fd[0], "aaaabbbb", 0); + + recveq(self->fd[1], "aaaa", 4, MSG_PEEK); + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); +} + +TEST_F(so_peek_off, two_chunks) +{ + sendeq(self->fd[0], "aaaa", 0); + sendeq(self->fd[0], "bbbb", 0); + + recveq(self->fd[1], "aaaa", 4, MSG_PEEK); + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); +} + +TEST_F(so_peek_off, two_chunks_blocking) +{ + async { + usleep(1000); + sendeq(self->fd[0], "aaaa", 0); + } + + recveq(self->fd[1], "aaaa", 4, MSG_PEEK); + + async { + usleep(1000); + sendeq(self->fd[0], "bbbb", 0); + } + + /* goto again; -> goto redo; in unix_stream_read_generic(). */ + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); +} + +TEST_F(so_peek_off, two_chunks_overlap) +{ + sendeq(self->fd[0], "aaaa", 0); + recveq(self->fd[1], "aa", 2, MSG_PEEK); + + sendeq(self->fd[0], "bbbb", 0); + + if (variant->type == SOCK_STREAM) { + /* SOCK_STREAM tries to fill the buffer. 
*/ + recveq(self->fd[1], "aabb", 4, MSG_PEEK); + recveq(self->fd[1], "bb", 100, MSG_PEEK); + } else { + /* SOCK_DGRAM and SOCK_SEQPACKET returns at the skb boundary. */ + recveq(self->fd[1], "aa", 100, MSG_PEEK); + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); + } +} + +TEST_F(so_peek_off, two_chunks_overlap_blocking) +{ + async { + usleep(1000); + sendeq(self->fd[0], "aaaa", 0); + } + + recveq(self->fd[1], "aa", 2, MSG_PEEK); + + async { + usleep(1000); + sendeq(self->fd[0], "bbbb", 0); + } + + /* Even SOCK_STREAM does not wait if at least one byte is read. */ + recveq(self->fd[1], "aa", 100, MSG_PEEK); + + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); +} + +TEST_HARNESS_MAIN From a24074ca8840cf28fa50c40e957fdc50f29971b3 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Nov 2025 10:15:38 +0100 Subject: [PATCH 477/543] perf/x86/intel/uncore: Remove superfluous check The 'pmu' pointer cannot be NULL, as it is taken as a pointer to an array. Remove the superfluous NULL check. Found by Coverity: CID#1497507. 
Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Ingo Molnar Cc: Liang Kan Cc: Peter Zijlstra Link: https://patch.msgid.link/20251119091538.825307-1-jirislaby@kernel.org --- arch/x86/events/intel/uncore.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index d6c945cc5d07..e228e564b15e 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1325,8 +1325,6 @@ static void uncore_pci_sub_driver_init(void) continue; pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)]; - if (!pmu) - continue; if (uncore_pci_get_dev_die_info(pci_sub_dev, &die)) continue; From d4cd0902c156b2ca60fdda8cd8b5bcb4b0e9ed64 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 17 Nov 2025 16:08:42 +0100 Subject: [PATCH 478/543] gpio: cdev: make sure the cdev fd is still active before emitting events With the final call to fput() on a file descriptor, the release action may be deferred and scheduled on a work queue. The reference count of that descriptor is still zero and it must not be used. It's possible that a GPIO change, we want to notify the user-space about, happens AFTER the reference count on the file descriptor associated with the character device went down to zero but BEFORE the .release() callback was called from the workqueue and so BEFORE we unregistered from the notifier. Using the regular get_file() routine in this situation triggers the following warning: struct file::f_count incremented from zero; use-after-free condition present! So use the get_file_active() variant that will return NULL on file descriptors that have been or are being released. 
Fixes: 40b7c49950bd ("gpio: cdev: put emitting the line state events on a workqueue") Reported-by: Alexander Sverdlin Closes: https://lore.kernel.org/all/5d605f7fc99456804911403102a4fe999a14cc85.camel@siemens.com/ Tested-by: Alexander Sverdlin Link: https://lore.kernel.org/r/20251117-gpio-cdev-get-file-v1-1-28a16b5985b8@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-cdev.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c index 175836467f21..d8d93059ac04 100644 --- a/drivers/gpio/gpiolib-cdev.c +++ b/drivers/gpio/gpiolib-cdev.c @@ -2548,10 +2548,17 @@ static int lineinfo_changed_notify(struct notifier_block *nb, container_of(nb, struct gpio_chardev_data, lineinfo_changed_nb); struct lineinfo_changed_ctx *ctx; struct gpio_desc *desc = data; + struct file *fp; if (!test_bit(gpio_chip_hwgpio(desc), cdev->watched_lines)) return NOTIFY_DONE; + /* Keep the file descriptor alive for the duration of the notification. */ + fp = get_file_active(&cdev->fp); + if (!fp) + /* Chardev file descriptor was or is being released. */ + return NOTIFY_DONE; + /* * If this is called from atomic context (for instance: with a spinlock * taken by the atomic notifier chain), any sleeping calls must be done @@ -2575,8 +2582,6 @@ static int lineinfo_changed_notify(struct notifier_block *nb, /* Keep the GPIO device alive until we emit the event. */ ctx->gdev = gpio_device_get(desc->gdev); ctx->cdev = cdev; - /* Keep the file descriptor alive too. */ - get_file(ctx->cdev->fp); INIT_WORK(&ctx->work, lineinfo_changed_func); queue_work(ctx->gdev->line_state_wq, &ctx->work); From 2b6d546ba83e8332870741eca469aed662d819ff Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 18 Nov 2025 13:18:16 +0100 Subject: [PATCH 479/543] MAINTAINERS: update my email address Due to an upcoming change in my professional situation, I will need to start using my kernel.org address. 
Update all my MAINTAINERS entries. Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20251118121816.23018-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- MAINTAINERS | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index e64b94e6b5a9..55b3b659449d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3926,7 +3926,7 @@ F: crypto/async_tx/ F: include/linux/async_tx.h AT24 EEPROM DRIVER -M: Bartosz Golaszewski +M: Bartosz Golaszewski L: linux-i2c@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git @@ -10679,7 +10679,7 @@ F: tools/gpio/gpio-sloppy-logic-analyzer.sh GPIO SUBSYSTEM M: Linus Walleij -M: Bartosz Golaszewski +M: Bartosz Golaszewski L: linux-gpio@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git @@ -10696,7 +10696,7 @@ K: GPIOD_FLAGS_BIT_NONEXCLUSIVE K: devm_gpiod_unhinge GPIO UAPI -M: Bartosz Golaszewski +M: Bartosz Golaszewski R: Kent Gibson L: linux-gpio@vger.kernel.org S: Maintained @@ -15310,7 +15310,7 @@ F: drivers/pwm/pwm-max7360.c F: include/linux/mfd/max7360.h MAXIM MAX77650 PMIC MFD DRIVER -M: Bartosz Golaszewski +M: Bartosz Golaszewski L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/*/*max77650.yaml @@ -19900,7 +19900,7 @@ F: drivers/pci/p2pdma.c F: include/linux/pci-p2pdma.h PCI POWER CONTROL -M: Bartosz Golaszewski +M: Bartosz Golaszewski L: linux-pci@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git @@ -20497,7 +20497,7 @@ F: include/linux/powercap.h F: kernel/configs/nopm.config POWER SEQUENCING -M: Bartosz Golaszewski +M: Bartosz Golaszewski L: linux-pm@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git @@ -21300,7 +21300,7 @@ F: Documentation/tee/qtee.rst F: drivers/tee/qcomtee/ QUALCOMM TRUST ZONE MEMORY ALLOCATOR -M: Bartosz Golaszewski +M: 
Bartosz Golaszewski L: linux-arm-msm@vger.kernel.org S: Maintained F: drivers/firmware/qcom/qcom_tzmem.c @@ -25668,7 +25668,7 @@ F: Documentation/devicetree/bindings/crypto/ti,am62l-dthev2.yaml F: drivers/crypto/ti/ TI DAVINCI MACHINE SUPPORT -M: Bartosz Golaszewski +M: Bartosz Golaszewski L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git From d2932a59c2d4fb364396f21df58431c44918dd47 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 18 Nov 2025 17:27:12 -0800 Subject: [PATCH 480/543] ACPI: APEI: EINJ: Fix EINJV2 initialization and injection ACPI 6.6 specification for EINJV2 appends an extra structure to the end of the existing struct set_error_type_with_address. Several issues showed up in testing. 1) Initialization was broken by an earlier fix [1] since is_v2 is only set while performing an injection, not during initialization. 2) A buggy BIOS provided invalid "revision" and "length" for the extension structure. Add several sanity checks. 3) When injecting legacy error types on an EINJV2 capable system, don't copy the component arrays. Fixes: 6c7058514991 ("ACPI: APEI: EINJ: Check if user asked for EINJV2 injection") # [1] Fixes: b47610296d17 ("ACPI: APEI: EINJ: Enable EINJv2 error injections") Signed-off-by: Tony Luck [ rjw: Changelog edits ] Cc: 6.17+ # 6.17+ Link: https://patch.msgid.link/20251119012712.178715-1-tony.luck@intel.com Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/apei/einj-core.c | 64 ++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/drivers/acpi/apei/einj-core.c b/drivers/acpi/apei/einj-core.c index 3c87953dbd19..305c240a303f 100644 --- a/drivers/acpi/apei/einj-core.c +++ b/drivers/acpi/apei/einj-core.c @@ -182,6 +182,7 @@ bool einj_initialized __ro_after_init; static void __iomem *einj_param; static u32 v5param_size; +static u32 v66param_size; static bool is_v2; static void einj_exec_ctx_init(struct apei_exec_context *ctx) @@ -283,6 +284,24 @@ static void check_vendor_extension(u64 paddr, acpi_os_unmap_iomem(p, sizeof(v)); } +static u32 einjv2_init(struct einjv2_extension_struct *e) +{ + if (e->revision != 1) { + pr_info("Unknown v2 extension revision %u\n", e->revision); + return 0; + } + if (e->length < sizeof(*e) || e->length > PAGE_SIZE) { + pr_info(FW_BUG "Bad1 v2 extension length %u\n", e->length); + return 0; + } + if ((e->length - sizeof(*e)) % sizeof(e->component_arr[0])) { + pr_info(FW_BUG "Bad2 v2 extension length %u\n", e->length); + return 0; + } + + return (e->length - sizeof(*e)) / sizeof(e->component_arr[0]); +} + static void __iomem *einj_get_parameter_address(void) { int i; @@ -310,28 +329,21 @@ static void __iomem *einj_get_parameter_address(void) v5param_size = sizeof(v5param); p = acpi_os_map_iomem(pa_v5, sizeof(*p)); if (p) { - int offset, len; - memcpy_fromio(&v5param, p, v5param_size); acpi5 = 1; check_vendor_extension(pa_v5, &v5param); - if (is_v2 && available_error_type & ACPI65_EINJV2_SUPP) { - len = v5param.einjv2_struct.length; - offset = offsetof(struct einjv2_extension_struct, component_arr); - max_nr_components = (len - offset) / - sizeof(v5param.einjv2_struct.component_arr[0]); - /* - * The first call to acpi_os_map_iomem above does not include the - * component array, instead it is used to read and calculate maximum - * number of components supported by the system. 
Below, the mapping - * is expanded to include the component array. - */ + if (available_error_type & ACPI65_EINJV2_SUPP) { + struct einjv2_extension_struct *e; + + e = &v5param.einjv2_struct; + max_nr_components = einjv2_init(e); + + /* remap including einjv2_extension_struct */ acpi_os_unmap_iomem(p, v5param_size); - offset = offsetof(struct set_error_type_with_address, einjv2_struct); - v5param_size = offset + struct_size(&v5param.einjv2_struct, - component_arr, max_nr_components); - p = acpi_os_map_iomem(pa_v5, v5param_size); + v66param_size = v5param_size - sizeof(*e) + e->length; + p = acpi_os_map_iomem(pa_v5, v66param_size); } + return p; } } @@ -527,6 +539,7 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, u64 param3, u64 param4) { struct apei_exec_context ctx; + u32 param_size = is_v2 ? v66param_size : v5param_size; u64 val, trigger_paddr, timeout = FIRMWARE_TIMEOUT; int i, rc; @@ -539,11 +552,11 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, if (acpi5) { struct set_error_type_with_address *v5param; - v5param = kmalloc(v5param_size, GFP_KERNEL); + v5param = kmalloc(param_size, GFP_KERNEL); if (!v5param) return -ENOMEM; - memcpy_fromio(v5param, einj_param, v5param_size); + memcpy_fromio(v5param, einj_param, param_size); v5param->type = type; if (type & ACPI5_VENDOR_BIT) { switch (vendor_flags) { @@ -601,7 +614,7 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, break; } } - memcpy_toio(einj_param, v5param, v5param_size); + memcpy_toio(einj_param, v5param, param_size); kfree(v5param); } else { rc = apei_exec_run(&ctx, ACPI_EINJ_SET_ERROR_TYPE); @@ -1132,9 +1145,14 @@ static void einj_remove(struct faux_device *fdev) struct apei_exec_context ctx; if (einj_param) { - acpi_size size = (acpi5) ? 
- v5param_size : - sizeof(struct einj_parameter); + acpi_size size; + + if (v66param_size) + size = v66param_size; + else if (acpi5) + size = v5param_size; + else + size = sizeof(struct einj_parameter); acpi_os_unmap_iomem(einj_param, size); if (vendor_errors.size) From 79afd3c5edac93b684393ec84185b2776d0630ef Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Tue, 18 Nov 2025 15:34:27 -0600 Subject: [PATCH 481/543] dt-bindings: pinctrl: xlnx,versal-pinctrl: Add missing unevaluatedProperties on '^conf' nodes Add the missing unevaluatedProperties to disallow extra properties on the '^conf' nodes. Signed-off-by: Rob Herring (Arm) Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/xlnx,versal-pinctrl.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/pinctrl/xlnx,versal-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/xlnx,versal-pinctrl.yaml index 55ece6a8be5e..81e2164ea98f 100644 --- a/Documentation/devicetree/bindings/pinctrl/xlnx,versal-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/xlnx,versal-pinctrl.yaml @@ -74,6 +74,7 @@ patternProperties: '^conf': type: object + unevaluatedProperties: false description: Pinctrl node's client devices use subnodes for pin configurations, which in turn use the standard properties below. From e31a11be41cd134f245c01d1329e7bc89aba78fb Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Mon, 17 Nov 2025 18:29:43 +0800 Subject: [PATCH 482/543] net: phylink: add missing supported link modes for the fixed-link Pause, Asym_Pause and Autoneg bits are not set when pl->supported is initialized, so these link modes will not work for the fixed-link. This leads to a TCP performance degradation issue observed on the i.MX943 platform. The switch CPU port of i.MX943 is connected to an ENETC MAC, this link is a fixed link and the link speed is 2.5Gbps. And one of the switch user ports is the RGMII interface, and its link speed is 1Gbps. 
If the flow-control of the fixed link is not enabled, we can easily observe the iperf performance of TCP packets is very low. Because the inbound rate on the CPU port is greater than the outbound rate on the user port, the switch is prone to congestion, leading to the loss of some TCP packets and requiring multiple retransmissions. Solving this problem should be as simple as setting the Asym_Pause and Pause bits. As for why the Autoneg bit also needs to be set, Russell has given a very good explanation in the thread [1], see below. "As the advertising and lp_advertising bitmasks have to be non-empty, and the swphy reports aneg capable, aneg complete, and AN enabled, then for consistency with that state, Autoneg should be set. This is how it was prior to the blamed commit." Fixes: de7d3f87be3c ("net: phylink: Use phy_caps_lookup for fixed-link configuration") Link: https://lore.kernel.org/aRjqLN8eQDIQfBjS@shell.armlinux.org.uk # [1] Signed-off-by: Wei Fang Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20251117102943.1862680-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 9d7799ea1c17..918244308215 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -637,6 +637,9 @@ static int phylink_validate(struct phylink *pl, unsigned long *supported, static void phylink_fill_fixedlink_supported(unsigned long *supported) { + linkmode_set_bit(ETHTOOL_LINK_MODE_Pause_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, supported); linkmode_set_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, supported); linkmode_set_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, supported); linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, supported); From cead55e24cf9e092890cf51c0548eccd7569defa Mon Sep 17 00:00:00 2001 From:
=?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Thu, 13 Nov 2025 01:30:28 +0200 Subject: [PATCH 483/543] drm/plane: Fix create_in_format_blob() return value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit create_in_format_blob() is either supposed to return a valid pointer or an error, but never NULL. The caller will dereference the blob when it is not an error, and thus will oops if NULL returned. Return proper error values in the failure cases. Cc: stable@vger.kernel.org Cc: Arun R Murthy Fixes: 0d6dcd741c26 ("drm/plane: modify create_in_formats to acommodate async") Signed-off-by: Ville Syrjälä Link: https://patch.msgid.link/20251112233030.24117-2-ville.syrjala@linux.intel.com Reviewed-by: Arun R Murthy --- drivers/gpu/drm/drm_plane.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/drm_plane.c b/drivers/gpu/drm/drm_plane.c index 38f82391bfda..a30493ed9715 100644 --- a/drivers/gpu/drm/drm_plane.c +++ b/drivers/gpu/drm/drm_plane.c @@ -210,7 +210,7 @@ static struct drm_property_blob *create_in_format_blob(struct drm_device *dev, formats_size = sizeof(__u32) * plane->format_count; if (WARN_ON(!formats_size)) { /* 0 formats are never expected */ - return 0; + return ERR_PTR(-EINVAL); } modifiers_size = @@ -226,7 +226,7 @@ static struct drm_property_blob *create_in_format_blob(struct drm_device *dev, blob = drm_property_create_blob(dev, blob_size, NULL); if (IS_ERR(blob)) - return NULL; + return blob; blob_data = blob->data; blob_data->version = FORMAT_BLOB_CURRENT; From 807e0d187da4c0b22036b5e34000f7a8c52f6e50 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Thu, 20 Nov 2025 01:45:25 +0800 Subject: [PATCH 484/543] tick/sched: Fix bogus condition in report_idle_softirq() In commit 0345691b24c0 ("tick/rcu: Stop allowing RCU_SOFTIRQ in idle") the new function report_idle_softirq() was created by breaking code out of the existing can_stop_idle_tick() for kernels v5.18 and newer. 
In doing so, the code essentially went from this form: if (A) { static int ratelimit; if (ratelimit < 10 && !C && A&D) { pr_warn("NOHZ tick-stop error: ..."); ratelimit++; } return false; } to a new function: static bool report_idle_softirq(void) { static int ratelimit; if (likely(!A)) return false; if (ratelimit < 10) return false; ... pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", pending); ratelimit++; return true; } commit a7e282c77785 ("tick/rcu: Fix bogus ratelimit condition") realized ratelimit was essentially set to zero instead of ten, and hence *no* softirq pending messages would ever be issued, but "fixed" it as: - if (ratelimit < 10) + if (ratelimit >= 10) return false; However, this fix introduced another issue: When ratelimit is greater than or equal to 10, even if A is true, it will directly return false, whereas ratelimit in the original code was only used to control printing and did not affect the return value. Restore the original logic and restrict ratelimit to control the printk and not the return value.
Fixes: 0345691b24c0 ("tick/rcu: Stop allowing RCU_SOFTIRQ in idle") Fixes: a7e282c77785 ("tick/rcu: Fix bogus ratelimit condition") Signed-off-by: Wen Yang Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251119174525.29470-1-wen.yang@linux.dev --- kernel/time/tick-sched.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c527b421c865..466e083c8272 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1152,16 +1152,15 @@ static bool report_idle_softirq(void) return false; } - if (ratelimit >= 10) - return false; - /* On RT, softirq handling may be waiting on some lock */ if (local_bh_blocked()) return false; - pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", - pending); - ratelimit++; + if (ratelimit < 10) { + pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", + pending); + ratelimit++; + } return true; } From 31ab31433c9bd2f255c48dc6cb9a99845c58b1e4 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 18 Nov 2025 07:18:10 -0600 Subject: [PATCH 485/543] drm/amd: Skip power ungate during suspend for VPE During the suspend sequence VPE is already going to be power gated as part of vpe_suspend(). It's unnecessary to call during calls to amdgpu_device_set_pg_state(). It actually can expose a race condition with the firmware if s0i3 sequence starts as well. Drop these calls. 
Cc: Peyton.Lee@amd.com Reviewed-by: Alex Deucher Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 2a6c826cfeedd7714611ac115371a959ead55bda) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 2819aceaab74..076bbc09f30c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3414,10 +3414,11 @@ int amdgpu_device_set_pg_state(struct amdgpu_device *adev, (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) continue; - /* skip CG for VCE/UVD, it's handled specially */ + /* skip CG for VCE/UVD/VPE, it's handled specially */ if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && + adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VPE && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && adev->ip_blocks[i].version->funcs->set_powergating_state) { /* enable powergating to save power */ From 80d8a9ad1587b64c545d515ab6cb7ecb9908e1b3 Mon Sep 17 00:00:00 2001 From: Yifan Zha Date: Fri, 14 Nov 2025 17:48:58 +0800 Subject: [PATCH 486/543] drm/amdgpu: Skip emit de meta data on gfx11 with rs64 enabled [Why] According to CP updated to RS64 on gfx11, WRITE_DATA with PREEMPTION_META_MEMORY(dst_sel=8) is illegal for CP FW. That packet is used for MCBP on F32 based system. So it would lead to incorrect GRBM write and FW is not handling that extra case correctly. [How] With gfx11 rs64 enabled, skip emit de meta data. 
Signed-off-by: Yifan Zha Acked-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 8366cd442d226463e673bed5d199df916f4ecbcf) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index d61eb9f187c6..f2be16e700c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -5872,9 +5872,9 @@ static void gfx_v11_0_ring_emit_ib_gfx(struct amdgpu_ring *ring, if (flags & AMDGPU_IB_PREEMPTED) control |= INDIRECT_BUFFER_PRE_RESUME(1); - if (vmid) + if (vmid && !ring->adev->gfx.rs64_enable) gfx_v11_0_ring_emit_de_meta(ring, - (!amdgpu_sriov_vf(ring->adev) && flags & AMDGPU_IB_PREEMPTED) ? true : false); + !amdgpu_sriov_vf(ring->adev) && (flags & AMDGPU_IB_PREEMPTED)); } amdgpu_ring_write(ring, header); From a44592339397bc6715917997c6869bdedd1a7256 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Wed, 19 Nov 2025 10:25:42 +0100 Subject: [PATCH 487/543] drm/amdgpu/vm: Check PRT uAPI flag instead of PTE flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes sparse mappings (aka. partially resident textures). Check the correct flags. Since a recent refactor, the code works with uAPI flags (for mapping buffer objects), and not PTE (page table entry) flags. 
Fixes: 6716a823d18d ("drm/amdgpu: rework how PTE flags are generated v3") Signed-off-by: Timur Kristóf Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit 8feeab26c80635b802f72b3ed986c693ff8f3212) --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index c1a801203949..b1aaef962ad9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2078,7 +2078,7 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev, struct amdgpu_bo *bo = before->bo_va->base.bo; amdgpu_vm_it_insert(before, &vm->va); - if (before->flags & AMDGPU_PTE_PRT_FLAG(adev)) + if (before->flags & AMDGPU_VM_PAGE_PRT) amdgpu_vm_prt_get(adev); if (amdgpu_vm_is_bo_always_valid(vm, bo) && @@ -2093,7 +2093,7 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev, struct amdgpu_bo *bo = after->bo_va->base.bo; amdgpu_vm_it_insert(after, &vm->va); - if (after->flags & AMDGPU_PTE_PRT_FLAG(adev)) + if (after->flags & AMDGPU_VM_PAGE_PRT) amdgpu_vm_prt_get(adev); if (amdgpu_vm_is_bo_always_valid(vm, bo) && From 21f46f54769c45ac8ca0dbaa977bc1b436ffdee2 Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Tue, 18 Nov 2025 14:28:33 +0530 Subject: [PATCH 488/543] drm/amdgpu/ttm: Fix crash when handling MMIO_REMAP in PDE flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MMIO_REMAP BO is a special 4K IO page that does not have a ttm_tt behind it. However, amdgpu_ttm_tt_pde_flags() was treating it like normal TT/doorbell/preempt memory and unconditionally accessed ttm->caching. For the MMIO_REMAP BO, ttm is NULL, so this leads to a NULL pointer dereference when computing PDE flags. Fix this by checking that ttm is non-NULL before reading ttm->caching. 
This prevents the crash for MMIO_REMAP and also makes the code more defensive if other BOs ever come through without a ttm_tt. Fixes: fb5a52dbe9fe ("drm/amdgpu: Implement TTM handling for MMIO_REMAP placement") Suggested-by: Jesse Zhang Suggested-by: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Reviewed-by: Jesse Zhang Tested-by: Jesse Zhang Signed-off-by: Alex Deucher (cherry picked from commit 0db94da5a0a1cacda080b9ec8425fcbe4babc141) --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index aa9ee5dffa45..9d568c16beb1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1372,7 +1372,7 @@ uint64_t amdgpu_ttm_tt_pde_flags(struct ttm_tt *ttm, struct ttm_resource *mem) mem->mem_type == AMDGPU_PL_MMIO_REMAP)) { flags |= AMDGPU_PTE_SYSTEM; - if (ttm->caching == ttm_cached) + if (ttm && ttm->caching == ttm_cached) flags |= AMDGPU_PTE_SNOOPED; } From c156c7f27ecdb7b89dbbeaaa1f40d9fadc3c1680 Mon Sep 17 00:00:00 2001 From: Shikang Fan Date: Wed, 19 Nov 2025 18:05:10 +0800 Subject: [PATCH 489/543] drm/amdgpu: Add sriov vf check for VCN per queue reset support. Add SRIOV check when setting VCN ring's supported reset mask. 
Signed-off-by: Shikang Fan Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher (cherry picked from commit ee9b603ad43f9870eb75184f9fb0a84f8c3cc852) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 2 +- drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c index eacf4e93ba2f..cb7123ec1a5d 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c @@ -141,7 +141,7 @@ static int vcn_v4_0_3_late_init(struct amdgpu_ip_block *ip_block) adev->vcn.supported_reset = amdgpu_get_soft_full_reset_mask(&adev->vcn.inst[0].ring_enc[0]); - if (amdgpu_dpm_reset_vcn_is_supported(adev)) + if (amdgpu_dpm_reset_vcn_is_supported(adev) && !amdgpu_sriov_vf(adev)) adev->vcn.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c index 714350cabf2f..8bd457dea4cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c @@ -122,7 +122,9 @@ static int vcn_v5_0_1_late_init(struct amdgpu_ip_block *ip_block) switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { case IP_VERSION(13, 0, 12): - if ((adev->psp.sos.fw_version >= 0x00450025) && amdgpu_dpm_reset_vcn_is_supported(adev)) + if ((adev->psp.sos.fw_version >= 0x00450025) && + amdgpu_dpm_reset_vcn_is_supported(adev) && + !amdgpu_sriov_vf(adev)) adev->vcn.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; break; default: From e837b9091b277ae6f309d7e9fc93cb0308cf461f Mon Sep 17 00:00:00 2001 From: Bitterblue Smith Date: Fri, 14 Nov 2025 00:54:48 +0200 Subject: [PATCH 490/543] wifi: rtw89: hw_scan: Don't let the operating channel be last Scanning can be offloaded to the firmware. 
To that end, the driver prepares a list of channels to scan, including periodic visits back to the operating channel, and sends the list to the firmware. When the channel list is too long to fit in a single H2C message, the driver splits the list, sends the first part, and tells the firmware to scan. When the scan is complete, the driver sends the next part of the list and tells the firmware to scan. When the last channel that fit in the H2C message is the operating channel something seems to go wrong in the firmware. It will acknowledge receiving the list of channels but apparently it will not do anything more. The AP can't be pinged anymore. The driver still receives beacons, though. One way to avoid this is to split the list of channels before the operating channel. Affected devices: * RTL8851BU with firmware 0.29.41.3 * RTL8832BU with firmware 0.29.29.8 * RTL8852BE with firmware 0.29.29.8 The commit 57a5fbe39a18 ("wifi: rtw89: refactor flow that hw scan handles channel list") is found by git blame, but it is actually to refine the scan flow, but not a culprit, so skip Fixes tag. 
Reported-by: Bitterblue Smith Closes: https://lore.kernel.org/linux-wireless/0abbda91-c5c2-4007-84c8-215679e652e1@gmail.com/ Cc: stable@vger.kernel.org # 6.16+ Signed-off-by: Bitterblue Smith Acked-by: Ping-Ke Shih Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/c1e61744-8db4-4646-867f-241b47d30386@gmail.com --- drivers/net/wireless/realtek/rtw89/fw.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/wireless/realtek/rtw89/fw.c b/drivers/net/wireless/realtek/rtw89/fw.c index ab904a7def1b..080c4f8a655a 100644 --- a/drivers/net/wireless/realtek/rtw89/fw.c +++ b/drivers/net/wireless/realtek/rtw89/fw.c @@ -7694,6 +7694,13 @@ int rtw89_hw_scan_add_chan_list_ax(struct rtw89_dev *rtwdev, INIT_LIST_HEAD(&list); list_for_each_entry_safe(ch_info, tmp, &scan_info->chan_list, list) { + /* The operating channel (tx_null == true) should + * not be last in the list, to avoid breaking + * RTL8851BU and RTL8832BU. + */ + if (list_len + 1 == RTW89_SCAN_LIST_LIMIT_AX && ch_info->tx_null) + break; + list_move_tail(&ch_info->list, &list); list_len++; From 90449f2d1e1f020835cba5417234636937dd657e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 13 Nov 2025 10:16:43 -0800 Subject: [PATCH 491/543] scsi: sg: Do not sleep in atomic context sg_finish_rem_req() calls blk_rq_unmap_user(). The latter function may sleep. Hence, call sg_finish_rem_req() with interrupts enabled instead of disabled. Reported-by: syzbot+c01f8e6e73f20459912e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-scsi/691560c4.a70a0220.3124cb.001a.GAE@google.com/ Cc: Hannes Reinecke Cc: stable@vger.kernel.org Fixes: 97d27b0dd015 ("scsi: sg: close race condition in sg_remove_sfp_usercontext()") Signed-off-by: Bart Van Assche Link: https://patch.msgid.link/20251113181643.1108973-1-bvanassche@acm.org Signed-off-by: Martin K. 
Petersen --- drivers/scsi/sg.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 4c62c597c7be..b3af9b78fa12 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -2208,9 +2208,17 @@ sg_remove_sfp_usercontext(struct work_struct *work) write_lock_irqsave(&sfp->rq_list_lock, iflags); while (!list_empty(&sfp->rq_list)) { srp = list_first_entry(&sfp->rq_list, Sg_request, entry); - sg_finish_rem_req(srp); list_del(&srp->entry); + write_unlock_irqrestore(&sfp->rq_list_lock, iflags); + + sg_finish_rem_req(srp); + /* + * sg_rq_end_io() uses srp->parentfp. Hence, only clear + * srp->parentfp after blk_mq_free_request() has been called. + */ srp->parentfp = NULL; + + write_lock_irqsave(&sfp->rq_list_lock, iflags); } write_unlock_irqrestore(&sfp->rq_list_lock, iflags); From 5e15395f6d9ec07395866c5511f4b4ac566c0c9b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Nov 2025 08:20:19 +0100 Subject: [PATCH 492/543] mptcp: fix ack generation for fallback msk mptcp_cleanup_rbuf() needs to know the most recent mptcp-level rcv_wnd sent, and such information is tracked in the msk->old_wspace field, updated at ack transmission time by mptcp_write_options(). Fallback sockets do not add any mptcp options, so that helper is never invoked and the msk->old_wspace value remains stale. That in turn makes ack generation at recvmsg() time quite random. Address the issue by ensuring mptcp_write_options() is invoked even for fallback sockets, and just update the needed info in such a case. The issue went unnoticed for a long time, as mptcp currently overshoots the fallback socket receive buffer autotune significantly. It is going to change in the near future. 
Fixes: e3859603ba13 ("mptcp: better msk receive window updates") Cc: stable@vger.kernel.org Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/594 Signed-off-by: Paolo Abeni Reviewed-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251118-net-mptcp-misc-fixes-6-18-rc6-v1-1-806d3781c95f@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 1103b3341a70..8a63bd00807d 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -838,8 +838,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, opts->suboptions = 0; + /* Force later mptcp_write_options(), but do not use any actual + * option space. + */ if (unlikely(__mptcp_check_fallback(msk) && !mptcp_check_infinite_map(skb))) - return false; + return true; if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { if (mptcp_established_options_fastclose(sk, &opt_size, remaining, opts) || @@ -1319,6 +1322,20 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) WRITE_ONCE(msk->old_wspace, tp->rcv_wnd); } +static void mptcp_track_rwin(struct tcp_sock *tp) +{ + const struct sock *ssk = (const struct sock *)tp; + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + + if (!ssk) + return; + + subflow = mptcp_subflow_ctx(ssk); + msk = mptcp_sk(subflow->conn); + WRITE_ONCE(msk->old_wspace, tp->rcv_wnd); +} + __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) { struct csum_pseudo_header header; @@ -1611,6 +1628,10 @@ void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, opts->reset_transient, opts->reset_reason); return; + } else if (unlikely(!opts->suboptions)) { + /* Fallback to TCP */ + mptcp_track_rwin(tp); + return; } if (OPTION_MPTCP_PRIO & opts->suboptions) { From 
4f102d747cadd8f595f2b25882eed9bec1675fb1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Nov 2025 08:20:20 +0100 Subject: [PATCH 493/543] mptcp: avoid unneeded subflow-level drops The rcv window is shared among all the subflows. Currently, MPTCP sync the TCP-level rcv window with the MPTCP one at tcp_transmit_skb() time. The above means that incoming data may sporadically observe outdated TCP-level rcv window and being wrongly dropped by TCP. Address the issue checking for the edge condition before queuing the data at TCP level, and eventually syncing the rcv window as needed. Note that the issue is actually present from the very first MPTCP implementation, but backports older than the blamed commit below will range from impossible to useless. Before: $ nstat -n; sleep 1; nstat -z TcpExtBeyondWindow TcpExtBeyondWindow 14 0.0 After: $ nstat -n; sleep 1; nstat -z TcpExtBeyondWindow TcpExtBeyondWindow 0 0.0 Fixes: fa3fe2b15031 ("mptcp: track window announced to peer") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251118-net-mptcp-misc-fixes-6-18-rc6-v1-2-806d3781c95f@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 31 +++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 1 + 2 files changed, 32 insertions(+) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 8a63bd00807d..f24ae7d40e88 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1044,6 +1044,31 @@ static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una) WRITE_ONCE(msk->snd_una, new_snd_una); } +static void rwin_update(struct mptcp_sock *msk, struct sock *ssk, + struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct tcp_sock *tp = tcp_sk(ssk); + u64 mptcp_rcv_wnd; + + /* Avoid touching extra cachelines if TCP is going to accept this + * skb without filling the TCP-level window even with a 
possibly + * outdated mptcp-level rwin. + */ + if (!skb->len || skb->len < tcp_receive_window(tp)) + return; + + mptcp_rcv_wnd = atomic64_read(&msk->rcv_wnd_sent); + if (!after64(mptcp_rcv_wnd, subflow->rcv_wnd_sent)) + return; + + /* Some other subflow grew the mptcp-level rwin since rcv_wup, + * resync. + */ + tp->rcv_wnd += mptcp_rcv_wnd - subflow->rcv_wnd_sent; + subflow->rcv_wnd_sent = mptcp_rcv_wnd; +} + static void ack_update_msk(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_options_received *mp_opt) @@ -1211,6 +1236,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) */ if (mp_opt.use_ack) ack_update_msk(msk, sk, &mp_opt); + rwin_update(msk, sk, skb); /* Zero-data-length packets are dropped by the caller and not * propagated to the MPTCP layer, so the skb extension does not @@ -1297,6 +1323,10 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) if (rcv_wnd_new != rcv_wnd_old) { raise_win: + /* The msk-level rcv wnd is after the tcp level one, + * sync the latter. 
+ */ + rcv_wnd_new = rcv_wnd_old; win = rcv_wnd_old - ack_seq; tp->rcv_wnd = min_t(u64, win, U32_MAX); new_win = tp->rcv_wnd; @@ -1320,6 +1350,7 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) update_wspace: WRITE_ONCE(msk->old_wspace, tp->rcv_wnd); + subflow->rcv_wnd_sent = rcv_wnd_new; } static void mptcp_track_rwin(struct tcp_sock *tp) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 379a88e14e8d..5575ef64ea31 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -509,6 +509,7 @@ struct mptcp_subflow_context { u64 remote_key; u64 idsn; u64 map_seq; + u64 rcv_wnd_sent; u32 snd_isn; u32 token; u32 rel_write_seq; From 17393fa7b7086664be519e7230cb6ed7ec7d9462 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Nov 2025 08:20:21 +0100 Subject: [PATCH 494/543] mptcp: fix premature close in case of fallback I'm observing very frequent self-test failures in case of fallback when running on a CONFIG_PREEMPT kernel. The root cause is that subflow_sched_work_if_closed() closes any subflow as soon as it is half-closed and has no incoming data pending. That works well for regular subflows - MPTCP needs bi-directional connectivity to operate on a given subflow - but for fallback sockets it is race prone. When TCP peer closes the connection before the MPTCP one, subflow_sched_work_if_closed() will schedule the MPTCP worker to gracefully close the subflow, and shortly after will do another schedule to inject and process a dummy incoming DATA_FIN. On CONFIG_PREEMPT kernel, the MPTCP worker can kick-in and close the fallback subflow before subflow_sched_work_if_closed() is able to create the dummy DATA_FIN, unexpectedly interrupting the transfer. Address the issue by explicitly avoiding closing fallback subflows when the peer is only half-closed. 
Note that, when the subflow is able to create the DATA_FIN before the worker invocation, the worker will change the msk state before trying to close the subflow and will skip the latter operation as the msk will not match anymore the precondition in __mptcp_close_subflow(). Fixes: f09b0ad55a11 ("mptcp: close subflow when receiving TCP+FIN") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251118-net-mptcp-misc-fixes-6-18-rc6-v1-3-806d3781c95f@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e27e0fe2460f..e30e9043a694 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2563,7 +2563,8 @@ static void __mptcp_close_subflow(struct sock *sk) if (ssk_state != TCP_CLOSE && (ssk_state != TCP_CLOSE_WAIT || - inet_sk_state_load(sk) != TCP_ESTABLISHED)) + inet_sk_state_load(sk) != TCP_ESTABLISHED || + __mptcp_check_fallback(msk))) continue; /* 'subflow_data_ready' will re-sched once rx queue is empty */ From 1bba3f219c5e8c29e63afa3c1fc24f875ebec119 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Nov 2025 08:20:22 +0100 Subject: [PATCH 495/543] mptcp: do not fallback when OoO is present In case of DSS corruption, the MPTCP protocol tries to avoid the subflow reset if fallback is possible. Such corruptions happen in the receive path; to ensure fallback is possible the stack additionally needs to check for OoO data, otherwise the fallback will break the data stream. 
Fixes: e32d262c89e2 ("mptcp: handle consistently DSS corruption") Cc: stable@vger.kernel.org Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/598 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251118-net-mptcp-misc-fixes-6-18-rc6-v1-4-806d3781c95f@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e30e9043a694..6f0e8f670d83 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -76,6 +76,13 @@ bool __mptcp_try_fallback(struct mptcp_sock *msk, int fb_mib) if (__mptcp_check_fallback(msk)) return true; + /* The caller possibly is not holding the msk socket lock, but + * in the fallback case only the current subflow is touching + * the OoO queue. + */ + if (!RB_EMPTY_ROOT(&msk->out_of_order_queue)) + return false; + spin_lock_bh(&msk->fallback_lock); if (!msk->allow_infinite_fallback) { spin_unlock_bh(&msk->fallback_lock); From fff0c87996672816a84c3386797a5e69751c5888 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Nov 2025 08:20:23 +0100 Subject: [PATCH 496/543] mptcp: decouple mptcp fastclose from tcp close With the current fastclose implementation, the mptcp_do_fastclose() helper is in charge of two distinct actions: send the fastclose reset and cleanup the subflows. Formally decouple the two steps, ensuring that mptcp explicitly closes all the subflows after the mentioned helper. This will make the upcoming fix simpler, and allows dropping the 2nd argument from mptcp_destroy_common(). The Fixes tag is then the same as in the next commit to help with the backports. 
Fixes: d21f83485518 ("mptcp: use fastclose on more edge scenarios") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251118-net-mptcp-misc-fixes-6-18-rc6-v1-5-806d3781c95f@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 13 +++++++++---- net/mptcp/protocol.h | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 6f0e8f670d83..c59246c1fde6 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2808,7 +2808,11 @@ static void mptcp_worker(struct work_struct *work) __mptcp_close_subflow(sk); if (mptcp_close_tout_expired(sk)) { + struct mptcp_subflow_context *subflow, *tmp; + mptcp_do_fastclose(sk); + mptcp_for_each_subflow_safe(msk, subflow, tmp) + __mptcp_close_ssk(sk, subflow->tcp_sock, subflow, 0); mptcp_close_wake_up(sk); } @@ -3233,7 +3237,8 @@ static int mptcp_disconnect(struct sock *sk, int flags) /* msk->subflow is still intact, the following will not free the first * subflow */ - mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); + mptcp_do_fastclose(sk); + mptcp_destroy_common(msk); /* The first subflow is already in TCP_CLOSE status, the following * can't overlap with a fallback anymore @@ -3412,7 +3417,7 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; } -void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) +void mptcp_destroy_common(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; @@ -3421,7 +3426,7 @@ void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) /* join list will be eventually flushed (with rst) at sock lock release time */ mptcp_for_each_subflow_safe(msk, subflow, tmp) - __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags); + 
__mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, 0); __skb_queue_purge(&sk->sk_receive_queue); skb_rbtree_purge(&msk->out_of_order_queue); @@ -3439,7 +3444,7 @@ static void mptcp_destroy(struct sock *sk) /* allow the following to close even the initial subflow */ msk->free_first = 1; - mptcp_destroy_common(msk, 0); + mptcp_destroy_common(msk); sk_sockets_allocated_dec(sk); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 5575ef64ea31..6ca97096607c 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -977,7 +977,7 @@ static inline void mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk) local_bh_enable(); } -void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags); +void mptcp_destroy_common(struct mptcp_sock *msk); #define MPTCP_TOKEN_MAX_RETRIES 4 From ae155060247be8dcae3802a95bd1bdf93ab3215d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Nov 2025 08:20:24 +0100 Subject: [PATCH 497/543] mptcp: fix duplicate reset on fastclose The CI reports sporadic failures of the fastclose self-tests. The root cause is a duplicate reset, not carrying the relevant MPTCP option. In the failing scenario the bad reset is received by the peer before the fastclose one, preventing the reception of the latter. Indeed there is window of opportunity at fastclose time for the following race: mptcp_do_fastclose __mptcp_close_ssk __tcp_close() tcp_set_state() [1] tcp_send_active_reset() [2] After [1] the stack will send reset to in-flight data reaching the now closed port. Such reset may race with [2]. Address the issue explicitly sending a single reset on fastclose before explicitly moving the subflow to close status. 
Fixes: d21f83485518 ("mptcp: use fastclose on more edge scenarios") Cc: stable@vger.kernel.org Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/596 Signed-off-by: Paolo Abeni Reviewed-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251118-net-mptcp-misc-fixes-6-18-rc6-v1-6-806d3781c95f@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c59246c1fde6..a70267a74e3c 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2409,7 +2409,6 @@ bool __mptcp_retransmit_pending_data(struct sock *sk) /* flags for __mptcp_close_ssk() */ #define MPTCP_CF_PUSH BIT(1) -#define MPTCP_CF_FASTCLOSE BIT(2) /* be sure to send a reset only if the caller asked for it, also * clean completely the subflow status when the subflow reaches @@ -2420,7 +2419,7 @@ static void __mptcp_subflow_disconnect(struct sock *ssk, unsigned int flags) { if (((1 << ssk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || - (flags & MPTCP_CF_FASTCLOSE)) { + subflow->send_fastclose) { /* The MPTCP code never wait on the subflow sockets, TCP-level * disconnect should never fail */ @@ -2467,14 +2466,8 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); - if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) { - /* be sure to force the tcp_close path - * to generate the egress reset - */ - ssk->sk_lingertime = 0; - sock_set_flag(ssk, SOCK_LINGER); - subflow->send_fastclose = 1; - } + if (subflow->send_fastclose && ssk->sk_state != TCP_CLOSE) + tcp_set_state(ssk, TCP_CLOSE); need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); if (!dispose_it) { @@ -2779,9 +2772,26 @@ static void mptcp_do_fastclose(struct sock *sk) struct mptcp_sock *msk = mptcp_sk(sk); 
mptcp_set_state(sk, TCP_CLOSE); - mptcp_for_each_subflow_safe(msk, subflow, tmp) - __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), - subflow, MPTCP_CF_FASTCLOSE); + + /* Explicitly send the fastclose reset as need */ + if (__mptcp_check_fallback(msk)) + return; + + mptcp_for_each_subflow_safe(msk, subflow, tmp) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + + /* Some subflow socket states don't allow/need a reset.*/ + if ((1 << ssk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) + goto unlock; + + subflow->send_fastclose = 1; + tcp_send_active_reset(ssk, ssk->sk_allocation, + SK_RST_REASON_TCP_ABORT_ON_CLOSE); +unlock: + release_sock(ssk); + } } static void mptcp_worker(struct work_struct *work) From efff6cd53ac52827948298043270bb81ff17fdff Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 18 Nov 2025 08:20:25 +0100 Subject: [PATCH 498/543] selftests: mptcp: join: fastclose: remove flaky marks After recent fixes like the parent commit, and "selftests: mptcp: connect: trunc: read all recv data", the two fastclose subtests no longer look flaky any more. It then feels fine to remove these flaky marks, to no longer ignore these subtests in case of errors. 
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251118-net-mptcp-misc-fixes-6-18-rc6-v1-7-806d3781c95f@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 41503c241989..303abbca59fc 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3500,7 +3500,6 @@ fullmesh_tests() fastclose_tests() { if reset_check_counter "fastclose test" "MPTcpExtMPFastcloseTx"; then - MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=client \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 @@ -3509,7 +3508,6 @@ fastclose_tests() fi if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then - MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=server \ run_tests $ns1 $ns2 10.0.1.1 join_rst_nr=1 \ From fb13c6bb810ca871964e062cf91882d1c83db509 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 18 Nov 2025 08:20:26 +0100 Subject: [PATCH 499/543] selftests: mptcp: join: endpoints: longer timeout In rare cases, when the test environment is very slow, some endpoints tests can fail because some expected events have not been seen. Because the tests are expecting a long on-going connection, and they are not waiting for the end of the transfer, it is fine to have a longer timeout, and even go over the default one. This connection will be killed at the end, after the verifications: increasing the timeout doesn't change anything, apart from avoiding it to end before the end of the verifications. To play it safe, all endpoints tests not waiting for the end of the transfer are now having a longer timeout: 2 minutes. 
The Fixes commit was making the connection longer, but still, the default timeout would have stopped it after 1 minute, which might not be enough in very slow environments. Fixes: 6457595db987 ("selftests: mptcp: join: endpoints: longer transfer") Cc: stable@vger.kernel.org Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Geliang Tang Link: https://patch.msgid.link/20251118-net-mptcp-misc-fixes-6-18-rc6-v1-8-806d3781c95f@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 303abbca59fc..93d38ded5e4e 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3941,7 +3941,7 @@ endpoint_tests() pm_nl_set_limits $ns1 2 2 pm_nl_set_limits $ns2 2 2 pm_nl_add_endpoint $ns1 10.0.2.1 flags signal - { test_linkfail=128 speed=slow \ + { timeout_test=120 test_linkfail=128 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! @@ -3968,7 +3968,7 @@ endpoint_tests() pm_nl_set_limits $ns2 0 3 pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow - { test_linkfail=128 speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! @@ -4046,7 +4046,7 @@ endpoint_tests() # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal pm_nl_add_endpoint $ns1 10.0.1.1 id 42 flags signal - { test_linkfail=128 speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! 
@@ -4119,7 +4119,7 @@ endpoint_tests() # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow - { test_linkfail=128 speed=20 \ + { timeout_test=120 test_linkfail=128 speed=20 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! From 0e4ec14dc1ee4b1ec347729c225c3ca950f2bcf6 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 18 Nov 2025 08:20:27 +0100 Subject: [PATCH 500/543] selftests: mptcp: join: userspace: longer timeout In rare cases, when the test environment is very slow, some userspace tests can fail because some expected events have not been seen. Because the tests are expecting a long on-going connection, and they are not waiting for the end of the transfer, it is fine to have a longer timeout, and even go over the default one. This connection will be killed at the end, after the verifications: increasing the timeout doesn't change anything, apart from avoiding it to end before the end of the verifications. To play it safe, all userspace tests not waiting for the end of the transfer are now having a longer timeout: 2 minutes. The Fixes commit was making the connection longer, but still, the default timeout would have stopped it after 1 minute, which might not be enough in very slow environments. 
Fixes: 290493078b96 ("selftests: mptcp: join: userspace: longer transfer") Cc: stable@vger.kernel.org Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Geliang Tang Link: https://patch.msgid.link/20251118-net-mptcp-misc-fixes-6-18-rc6-v1-9-806d3781c95f@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 93d38ded5e4e..74632beae2c6 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3804,7 +3804,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns1 pm_nl_set_limits $ns2 2 2 - { test_linkfail=128 speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns1 @@ -3837,7 +3837,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - { test_linkfail=128 speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 @@ -3865,7 +3865,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - { test_linkfail=128 speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 @@ -3886,7 +3886,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - { test_linkfail=128 speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! 
wait_mpj $ns2 @@ -3910,7 +3910,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns1 pm_nl_set_limits $ns2 1 1 - { test_linkfail=128 speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns1 From 92e239e36d600002559074994a545fcfac9afd2d Mon Sep 17 00:00:00 2001 From: Gang Yan Date: Tue, 18 Nov 2025 08:20:28 +0100 Subject: [PATCH 501/543] mptcp: fix address removal logic in mptcp_pm_nl_rm_addr Fix inverted WARN_ON_ONCE condition that prevented normal address removal counter updates. The current code only executes decrement logic when the counter is already 0 (abnormal state), while normal removals (counter > 0) are ignored. Signed-off-by: Gang Yan Fixes: 636113918508 ("mptcp: pm: remove '_nl' from mptcp_pm_nl_rm_addr_received") Cc: stable@vger.kernel.org Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251118-net-mptcp-misc-fixes-6-18-rc6-v1-10-806d3781c95f@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_kernel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 2ae95476dba3..0a50fd5edc06 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -672,7 +672,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id) { - if (rm_id && WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { + if (rm_id && !WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { u8 limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk); From 0eee0fdf9b7b0baf698f9b426384aa9714d76a51 Mon Sep 17 00:00:00 2001 From: Gang Yan Date: Tue, 18 Nov 2025 08:20:29 +0100 Subject: [PATCH 502/543] selftests: mptcp: add a check for 'add_addr_accepted' The previous patch fixed an issue with the 'add_addr_accepted' counter. This was not spotted by the test suite.
Check this counter and 'add_addr_signal' in MPTCP Join 'delete re-add signal' test. This should help spotting similar regressions later on. These counters are crucial for ensuring the MPTCP path manager correctly handles the subflow creation via 'ADD_ADDR'. Signed-off-by: Gang Yan Reviewed-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251118-net-mptcp-misc-fixes-6-18-rc6-v1-11-806d3781c95f@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 74632beae2c6..43f31f8d587f 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -4055,38 +4055,45 @@ endpoint_tests() $ns1 10.0.2.1 id 1 flags signal chk_subflow_nr "before delete" 2 chk_mptcp_info subflows 1 subflows 1 + chk_mptcp_info add_addr_signal 2 add_addr_accepted 1 pm_nl_del_endpoint $ns1 1 10.0.2.1 pm_nl_del_endpoint $ns1 2 224.0.0.1 sleep 0.5 chk_subflow_nr "after delete" 1 chk_mptcp_info subflows 0 subflows 0 + chk_mptcp_info add_addr_signal 0 add_addr_accepted 0 pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal wait_mpj $ns2 chk_subflow_nr "after re-add" 3 chk_mptcp_info subflows 2 subflows 2 + chk_mptcp_info add_addr_signal 2 add_addr_accepted 2 pm_nl_del_endpoint $ns1 42 10.0.1.1 sleep 0.5 chk_subflow_nr "after delete ID 0" 2 chk_mptcp_info subflows 2 subflows 2 + chk_mptcp_info add_addr_signal 2 add_addr_accepted 2 pm_nl_add_endpoint $ns1 10.0.1.1 id 99 flags signal wait_mpj $ns2 chk_subflow_nr "after re-add ID 0" 3 chk_mptcp_info subflows 3 subflows 3 + chk_mptcp_info add_addr_signal 3 add_addr_accepted 2 pm_nl_del_endpoint $ns1 99 10.0.1.1 sleep 0.5 chk_subflow_nr "after re-delete ID 0" 2 chk_mptcp_info subflows 2 
subflows 2 + chk_mptcp_info add_addr_signal 2 add_addr_accepted 2 pm_nl_add_endpoint $ns1 10.0.1.1 id 88 flags signal wait_mpj $ns2 chk_subflow_nr "after re-re-add ID 0" 3 chk_mptcp_info subflows 3 subflows 3 + chk_mptcp_info add_addr_signal 3 add_addr_accepted 2 mptcp_lib_kill_group_wait $tests_pid kill_events_pids From 20d7338f2d3bcb570068dd6d39b16f1a909fe976 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 20 Nov 2025 14:42:05 +0800 Subject: [PATCH 503/543] LoongArch: Use UAPI types in ptrace UAPI header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kernel UAPI headers already contain fixed-width integer types, there is no need to rely on the libc types. There may not be a libc available or the libc may not provide <stdint.h>, as is the case for example with nolibc. This also aligns the header with the rest of the LoongArch UAPI headers. Fixes: 803b0fc5c3f2 ("LoongArch: Add process management") Signed-off-by: Thomas Weißschuh Signed-off-by: Huacai Chen --- arch/loongarch/include/uapi/asm/ptrace.h | 40 +++++++++++------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/arch/loongarch/include/uapi/asm/ptrace.h b/arch/loongarch/include/uapi/asm/ptrace.h index aafb3cd9e943..215e0f9e8aa3 100644 --- a/arch/loongarch/include/uapi/asm/ptrace.h +++ b/arch/loongarch/include/uapi/asm/ptrace.h @@ -10,10 +10,6 @@ #include <linux/types.h> -#ifndef __KERNEL__ -#include <stdint.h> -#endif - /* * For PTRACE_{POKE,PEEK}USR. 0 - 31 are GPRs, * 32 is syscall's original ARG0, 33 is PC, 34 is BADVADDR. @@ -41,44 +37,44 @@ struct user_pt_regs { } __attribute__((aligned(8))); struct user_fp_state { - uint64_t fpr[32]; - uint64_t fcc; - uint32_t fcsr; + __u64 fpr[32]; + __u64 fcc; + __u32 fcsr; }; struct user_lsx_state { /* 32 registers, 128 bits width per register. */ - uint64_t vregs[32*2]; + __u64 vregs[32*2]; }; struct user_lasx_state { /* 32 registers, 256 bits width per register.
*/ - uint64_t vregs[32*4]; + __u64 vregs[32*4]; }; struct user_lbt_state { - uint64_t scr[4]; - uint32_t eflags; - uint32_t ftop; + __u64 scr[4]; + __u32 eflags; + __u32 ftop; }; struct user_watch_state { - uint64_t dbg_info; + __u64 dbg_info; struct { - uint64_t addr; - uint64_t mask; - uint32_t ctrl; - uint32_t pad; + __u64 addr; + __u64 mask; + __u32 ctrl; + __u32 pad; } dbg_regs[8]; }; struct user_watch_state_v2 { - uint64_t dbg_info; + __u64 dbg_info; struct { - uint64_t addr; - uint64_t mask; - uint32_t ctrl; - uint32_t pad; + __u64 addr; + __u64 mask; + __u32 ctrl; + __u32 pad; } dbg_regs[14]; }; From 1c004609fdefb48888ef98bc6e3b8fe78ae4e088 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Thu, 20 Nov 2025 14:42:05 +0800 Subject: [PATCH 504/543] LoongArch: Consolidate CPU names in /proc/cpuinfo Some processors have no IOCSR.VENDOR and IOCSR.CPUNAME, some processors have these registers but there is no valid information. Consolidate CPU names in /proc/cpuinfo: 1. Add "PRID" to display the PRID & Core-Name; 2. Let "Model Name" display "Unknown" if no valid name. 
Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/cpu.h | 21 +++++++++++++++++++ arch/loongarch/kernel/cpu-probe.c | 34 ++++++++++--------------------- arch/loongarch/kernel/proc.c | 2 ++ 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/arch/loongarch/include/asm/cpu.h b/arch/loongarch/include/asm/cpu.h index d4cd4041bee7..f3efb00b6141 100644 --- a/arch/loongarch/include/asm/cpu.h +++ b/arch/loongarch/include/asm/cpu.h @@ -55,6 +55,27 @@ enum cpu_type_enum { CPU_LAST }; +static inline char *id_to_core_name(unsigned int id) +{ + if ((id & PRID_COMP_MASK) != PRID_COMP_LOONGSON) + return "Unknown"; + + switch (id & PRID_SERIES_MASK) { + case PRID_SERIES_LA132: + return "LA132"; + case PRID_SERIES_LA264: + return "LA264"; + case PRID_SERIES_LA364: + return "LA364"; + case PRID_SERIES_LA464: + return "LA464"; + case PRID_SERIES_LA664: + return "LA664"; + default: + return "Unknown"; + } +} + #endif /* !__ASSEMBLER__ */ /* diff --git a/arch/loongarch/kernel/cpu-probe.c b/arch/loongarch/kernel/cpu-probe.c index 6f943d1391ff..a2060a24b39f 100644 --- a/arch/loongarch/kernel/cpu-probe.c +++ b/arch/loongarch/kernel/cpu-probe.c @@ -277,7 +277,7 @@ static inline void cpu_probe_loongson(struct cpuinfo_loongarch *c, unsigned int uint32_t config; uint64_t *vendor = (void *)(&cpu_full_name[VENDOR_OFFSET]); uint64_t *cpuname = (void *)(&cpu_full_name[CPUNAME_OFFSET]); - const char *core_name = "Unknown"; + const char *core_name = id_to_core_name(c->processor_id); switch (BIT(fls(c->isa_level) - 1)) { case LOONGARCH_CPU_ISA_LA32R: @@ -291,35 +291,23 @@ static inline void cpu_probe_loongson(struct cpuinfo_loongarch *c, unsigned int break; } - switch (c->processor_id & PRID_SERIES_MASK) { - case PRID_SERIES_LA132: - core_name = "LA132"; - break; - case PRID_SERIES_LA264: - core_name = "LA264"; - break; - case PRID_SERIES_LA364: - core_name = "LA364"; - break; - case PRID_SERIES_LA464: - core_name = "LA464"; - break; - case PRID_SERIES_LA664: - core_name = 
"LA664"; - break; - } - pr_info("%s Processor probed (%s Core)\n", __cpu_family[cpu], core_name); - if (!cpu_has_iocsr) + if (!cpu_has_iocsr) { + __cpu_full_name[cpu] = "Unknown"; return; - - if (!__cpu_full_name[cpu]) - __cpu_full_name[cpu] = cpu_full_name; + } *vendor = iocsr_read64(LOONGARCH_IOCSR_VENDOR); *cpuname = iocsr_read64(LOONGARCH_IOCSR_CPUNAME); + if (!__cpu_full_name[cpu]) { + if (((char *)vendor)[0] == 0) + __cpu_full_name[cpu] = "Unknown"; + else + __cpu_full_name[cpu] = cpu_full_name; + } + config = iocsr_read32(LOONGARCH_IOCSR_FEATURES); if (config & IOCSRF_CSRIPI) c->options |= LOONGARCH_CPU_CSRIPI; diff --git a/arch/loongarch/kernel/proc.c b/arch/loongarch/kernel/proc.c index cea30768ae92..63d2b7e7e844 100644 --- a/arch/loongarch/kernel/proc.c +++ b/arch/loongarch/kernel/proc.c @@ -17,6 +17,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) { unsigned long n = (unsigned long) v - 1; unsigned int isa = cpu_data[n].isa_level; + unsigned int prid = cpu_data[n].processor_id; unsigned int version = cpu_data[n].processor_id & 0xff; unsigned int fp_version = cpu_data[n].fpu_vers; @@ -37,6 +38,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "global_id\t\t: %d\n", cpu_data[n].global_id); seq_printf(m, "CPU Family\t\t: %s\n", __cpu_family[n]); seq_printf(m, "Model Name\t\t: %s\n", __cpu_full_name[n]); + seq_printf(m, "PRID\t\t\t: %s (%08x)\n", id_to_core_name(prid), prid); seq_printf(m, "CPU Revision\t\t: 0x%02x\n", version); seq_printf(m, "FPU Revision\t\t: 0x%02x\n", fp_version); seq_printf(m, "CPU MHz\t\t\t: %llu.%02llu\n", From acf5de1b23b0275eb69f235c8e9f2cef19fa39a1 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 20 Nov 2025 14:42:05 +0800 Subject: [PATCH 505/543] LoongArch: Fix NUMA node parsing with numa_memblks On physical machine, NUMA node id comes from high bit 44:48 of physical address. However it is not true on virt machine. With general method, it comes from ACPI SRAT table. 
Here the common function numa_memblks_init() is used to parse NUMA node information with numa_memblks. Cc: Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kernel/numa.c | 60 +++++++++++------------------------- 1 file changed, 18 insertions(+), 42 deletions(-) diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c index ab9c660526a3..8b89898e20df 100644 --- a/arch/loongarch/kernel/numa.c +++ b/arch/loongarch/kernel/numa.c @@ -158,35 +158,9 @@ static void __init node_mem_init(unsigned int node) #ifdef CONFIG_ACPI_NUMA -/* - * add_numamem_region - * - * Add a uasable memory region described by BIOS. The - * routine gets each intersection between BIOS's region - * and node's region, and adds them into node's memblock - * pool. - * - */ -static void __init add_numamem_region(u64 start, u64 end, u32 type) -{ - u32 node = pa_to_nid(start); - u64 size = end - start; - static unsigned long num_physpages; +static unsigned long num_physpages; - if (start >= end) { - pr_debug("Invalid region: %016llx-%016llx\n", start, end); - return; - } - - num_physpages += (size >> PAGE_SHIFT); - pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx Bytes\n", - node, type, start, size); - pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n", - start >> PAGE_SHIFT, end >> PAGE_SHIFT, num_physpages); - memblock_set_node(start, size, &memblock.memory, node); -} - -static void __init init_node_memblock(void) +static void __init info_node_memblock(void) { u32 mem_type; u64 mem_end, mem_start, mem_size; @@ -206,12 +180,20 @@ static void __init init_node_memblock(void) case EFI_BOOT_SERVICES_DATA: case EFI_PERSISTENT_MEMORY: case EFI_CONVENTIONAL_MEMORY: - add_numamem_region(mem_start, mem_end, mem_type); + num_physpages += (mem_size >> PAGE_SHIFT); + pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx Bytes\n", + (u32)pa_to_nid(mem_start), mem_type, mem_start, mem_size); + pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, 
num_physpages:0x%lx\n", + mem_start >> PAGE_SHIFT, mem_end >> PAGE_SHIFT, num_physpages); break; case EFI_PAL_CODE: case EFI_UNUSABLE_MEMORY: case EFI_ACPI_RECLAIM_MEMORY: - add_numamem_region(mem_start, mem_end, mem_type); + num_physpages += (mem_size >> PAGE_SHIFT); + pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx Bytes\n", + (u32)pa_to_nid(mem_start), mem_type, mem_start, mem_size); + pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n", + mem_start >> PAGE_SHIFT, mem_end >> PAGE_SHIFT, num_physpages); fallthrough; case EFI_RESERVED_TYPE: case EFI_RUNTIME_SERVICES_CODE: @@ -249,22 +231,16 @@ int __init init_numa_memory(void) for (i = 0; i < NR_CPUS; i++) set_cpuid_to_node(i, NUMA_NO_NODE); - numa_reset_distance(); - nodes_clear(numa_nodes_parsed); - nodes_clear(node_possible_map); - nodes_clear(node_online_map); - WARN_ON(memblock_clear_hotplug(0, PHYS_ADDR_MAX)); - /* Parse SRAT and SLIT if provided by firmware. */ - ret = acpi_disabled ? fake_numa_init() : acpi_numa_init(); + if (!acpi_disabled) + ret = numa_memblks_init(acpi_numa_init, false); + else + ret = numa_memblks_init(fake_numa_init, false); + if (ret < 0) return ret; - node_possible_map = numa_nodes_parsed; - if (WARN_ON(nodes_empty(node_possible_map))) - return -EINVAL; - - init_node_memblock(); + info_node_memblock(); if (!memblock_validate_numa_coverage(SZ_1M)) return -EINVAL; From 863a320dc6fd7c855f47da4bb82a8de2d9102ea2 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Thu, 20 Nov 2025 14:42:05 +0800 Subject: [PATCH 506/543] LoongArch: Mask all interrupts during kexec/kdump If the default state of the interrupt controllers in the first kernel don't mask any interrupts, it may cause the second kernel to potentially receive interrupts (which were previously allocated by the first kernel) immediately after a CPU becomes online during its boot process. These interrupts cannot be properly routed, leading to bad IRQ issues. 
This patch calls machine_kexec_mask_interrupts() to mask all interrupts during the kexec/kdump process. Signed-off-by: Tianyang Zhang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/machine_kexec.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/loongarch/kernel/machine_kexec.c b/arch/loongarch/kernel/machine_kexec.c index 2d64b7c81e5e..d7fafda1d541 100644 --- a/arch/loongarch/kernel/machine_kexec.c +++ b/arch/loongarch/kernel/machine_kexec.c @@ -237,6 +237,7 @@ void machine_crash_shutdown(struct pt_regs *regs) #ifdef CONFIG_SMP crash_smp_send_stop(); #endif + machine_kexec_mask_interrupts(); cpumask_set_cpu(crashing_cpu, &cpus_in_crash); pr_info("Starting crashdump kernel...\n"); @@ -274,6 +275,7 @@ void machine_kexec(struct kimage *image) /* We do not want to be bothered. */ local_irq_disable(); + machine_kexec_mask_interrupts(); pr_notice("EFI boot flag: 0x%lx\n", efi_boot); pr_notice("Command line addr: 0x%lx\n", cmdline_ptr); From a6b533adfc05ba15360631e019d3e18275080275 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Thu, 20 Nov 2025 14:42:05 +0800 Subject: [PATCH 507/543] LoongArch: Don't panic if no valid cache info for PCI If there is no valid cache info detected (may happen in virtual machine) for pci_dfl_cache_line_size, kernel shouldn't panic. Because in the PCI core it will be evaluated to (L1_CACHE_BYTES >> 2). 
Cc: stable@vger.kernel.org Signed-off-by: Jiaxun Yang Signed-off-by: Huacai Chen --- arch/loongarch/pci/pci.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/pci/pci.c b/arch/loongarch/pci/pci.c index 5bc9627a6cf9..d9fc5d520b37 100644 --- a/arch/loongarch/pci/pci.c +++ b/arch/loongarch/pci/pci.c @@ -50,11 +50,11 @@ static int __init pcibios_init(void) */ lsize = cpu_last_level_cache_line_size(); - BUG_ON(!lsize); + if (lsize) { + pci_dfl_cache_line_size = lsize >> 2; - pci_dfl_cache_line_size = lsize >> 2; - - pr_debug("PCI: pci_cache_line_size set to %d bytes\n", lsize); + pr_debug("PCI: pci_cache_line_size set to %d bytes\n", lsize); + } return 0; } From 677e6123e3d24adaa252697dc89740f2ac07664e Mon Sep 17 00:00:00 2001 From: Vincent Li Date: Thu, 20 Nov 2025 14:42:05 +0800 Subject: [PATCH 508/543] LoongArch: BPF: Disable trampoline for kernel module function trace The current LoongArch BPF trampoline implementation is incompatible with tracing functions in kernel modules. This causes several severe and user-visible problems: * The `bpf_selftests/module_attach` test fails consistently. * Kernel lockup when a BPF program is attached to a module function [1]. * Critical kernel modules like WireGuard experience traffic disruption when their functions are traced with fentry [2]. Given the severity and the potential for other unknown side-effects, it is safest to disable the feature entirely for now. This patch prevents the BPF subsystem from allowing trampoline attachments to kernel module functions on LoongArch. This is a temporary mitigation until the core issues in the trampoline code for kernel module handling can be identified and fixed. [root@fedora bpf]# ./test_progs -a module_attach -v bpf_testmod.ko is already unloaded. Loading bpf_testmod.ko... Successfully loaded bpf_testmod.ko.
test_module_attach:PASS:skel_open 0 nsec test_module_attach:PASS:set_attach_target 0 nsec test_module_attach:PASS:set_attach_target_explicit 0 nsec test_module_attach:PASS:skel_load 0 nsec libbpf: prog 'handle_fentry': failed to attach: -ENOTSUPP libbpf: prog 'handle_fentry': failed to auto-attach: -ENOTSUPP test_module_attach:FAIL:skel_attach skeleton attach failed: -524 Summary: 0/0 PASSED, 0 SKIPPED, 1 FAILED Successfully unloaded bpf_testmod.ko. [1]: https://lore.kernel.org/loongarch/CAK3+h2wDmpC-hP4u4pJY8T-yfKyk4yRzpu2LMO+C13FMT58oqQ@mail.gmail.com/ [2]: https://lore.kernel.org/loongarch/CAK3+h2wYcpc+OwdLDUBvg2rF9rvvyc5amfHT-KcFaK93uoELPg@mail.gmail.com/ Cc: stable@vger.kernel.org Fixes: f9b6b41f0cf3 ("LoongArch: BPF: Add basic bpf trampoline support") Acked-by: Hengqi Chen Signed-off-by: Vincent Li Signed-off-by: Huacai Chen --- arch/loongarch/net/bpf_jit.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c index cbe53d0b7fb0..f97dc9936401 100644 --- a/arch/loongarch/net/bpf_jit.c +++ b/arch/loongarch/net/bpf_jit.c @@ -1624,6 +1624,9 @@ static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_i /* Direct jump skips 5 NOP instructions */ else if (is_bpf_text_address((unsigned long)orig_call)) orig_call += LOONGARCH_BPF_FENTRY_NBYTES; + /* Module tracing not supported - cause kernel lockups */ + else if (is_module_text_address((unsigned long)orig_call)) + return -ENOTSUPP; if (flags & BPF_TRAMP_F_CALL_ORIG) { move_addr(ctx, LOONGARCH_GPR_A0, (const u64)im); From a9d1f38df7ecd0e21233447c9cc6fa1799eddaf3 Mon Sep 17 00:00:00 2001 From: Henrique Carvalho Date: Thu, 13 Nov 2025 15:09:13 -0300 Subject: [PATCH 509/543] smb: client: introduce close_cached_dir_locked() Replace close_cached_dir() calls under cfid_list_lock with a new close_cached_dir_locked() variant that uses kref_put() instead of kref_put_lock() to avoid recursive locking when dropping references. 
While the existing code works if the refcount >= 2 invariant holds, this area has proven error-prone. Make deadlocks impossible and WARN on invariant violations. Cc: stable@vger.kernel.org Reviewed-by: David Howells Signed-off-by: Henrique Carvalho Signed-off-by: Steve French --- fs/smb/client/cached_dir.c | 41 +++++++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 018055fd2cdb..e3ea6fe7edb4 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -16,6 +16,7 @@ static struct cached_fid *init_cached_dir(const char *path); static void free_cached_dir(struct cached_fid *cfid); static void smb2_close_cached_fid(struct kref *ref); static void cfids_laundromat_worker(struct work_struct *work); +static void close_cached_dir_locked(struct cached_fid *cfid); struct cached_dir_dentry { struct list_head entry; @@ -388,7 +389,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, * lease. Release one here, and the second below. */ cfid->has_lease = false; - close_cached_dir(cfid); + close_cached_dir_locked(cfid); } spin_unlock(&cfids->cfid_list_lock); @@ -480,18 +481,52 @@ void drop_cached_dir_by_name(const unsigned int xid, struct cifs_tcon *tcon, spin_lock(&cfid->cfids->cfid_list_lock); if (cfid->has_lease) { cfid->has_lease = false; - close_cached_dir(cfid); + close_cached_dir_locked(cfid); } spin_unlock(&cfid->cfids->cfid_list_lock); close_cached_dir(cfid); } - +/** + * close_cached_dir - drop a reference of a cached dir + * + * The release function will be called with cfid_list_lock held to remove the + * cached dirs from the list before any other thread can take another @cfid + * ref. Must not be called with cfid_list_lock held; use + * close_cached_dir_locked() called instead. 
+ * + * @cfid: cached dir + */ void close_cached_dir(struct cached_fid *cfid) { + lockdep_assert_not_held(&cfid->cfids->cfid_list_lock); kref_put_lock(&cfid->refcount, smb2_close_cached_fid, &cfid->cfids->cfid_list_lock); } +/** + * close_cached_dir_locked - put a reference of a cached dir with + * cfid_list_lock held + * + * Calling close_cached_dir() with cfid_list_lock held has the potential effect + * of causing a deadlock if the invariant of refcount >= 2 is false. + * + * This function is used in paths that hold cfid_list_lock and expect at least + * two references. If that invariant is violated, WARNs and returns without + * dropping a reference; the final put must still go through + * close_cached_dir(). + * + * @cfid: cached dir + */ +static void close_cached_dir_locked(struct cached_fid *cfid) +{ + lockdep_assert_held(&cfid->cfids->cfid_list_lock); + + if (WARN_ON(kref_read(&cfid->refcount) < 2)) + return; + + kref_put(&cfid->refcount, smb2_close_cached_fid); +} + /* * Called from cifs_kill_sb when we unmount a share */ From 7e4d9120cfa413dd34f4f434befc5dbe6c38b2e5 Mon Sep 17 00:00:00 2001 From: Shaurya Rane Date: Tue, 18 Nov 2025 20:32:57 +0530 Subject: [PATCH 510/543] cifs: fix memory leak in smb3_fs_context_parse_param error path Add proper cleanup of ctx->source and fc->source to the cifs_parse_mount_err error handler. This ensures that memory allocated for the source strings is correctly freed on all error paths, matching the cleanup already performed in the success path by smb3_cleanup_fs_context_contents(). Pointers are also set to NULL after freeing to prevent potential double-free issues. This change fixes a memory leak originally detected by syzbot. The leak occurred when processing Opt_source mount options if an error happened after ctx->source and fc->source were successfully allocated but before the function completed. The specific leak sequence was: 1. ctx->source = smb3_fs_context_fullpath(ctx, '/') allocates memory 2. 
fc->source = kstrdup(ctx->source, GFP_KERNEL) allocates more memory 3. A subsequent error jumps to cifs_parse_mount_err 4. The old error handler freed passwords but not the source strings, causing the memory to leak. This issue was not addressed by commit e8c73eb7db0a ("cifs: client: fix memory leak in smb3_fs_context_parse_param"), which only fixed leaks from repeated fsconfig() calls but not this error path. Patch updated with minor change suggested by kernel test robot Reported-by: syzbot+87be6809ed9bf6d718e3@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=87be6809ed9bf6d718e3 Fixes: 24e0a1eff9e2 ("cifs: switch to new mount api") Reviewed-by: David Howells Signed-off-by: Shaurya Rane Signed-off-by: Steve French --- fs/smb/client/fs_context.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 0f894d09157b..2a0d8b87bd8e 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -1834,6 +1834,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->password = NULL; kfree_sensitive(ctx->password2); ctx->password2 = NULL; + kfree(ctx->source); + ctx->source = NULL; + kfree(fc->source); + fc->source = NULL; return -EINVAL; } From d5227c88174c384d83d9176bd4315ef13dce306c Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Oct 2025 16:33:43 +0100 Subject: [PATCH 511/543] cifs: Add the smb3_read_* tracepoints to SMB1 Add the smb3_read_* tracepoints to SMB1's cifs_async_readv() and cifs_readv_callback(). 
Signed-off-by: David Howells cc: Steve French cc: Paulo Alcantara cc: linux-cifs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cifssmb.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 7da194f29fef..dcc50a2bfa4b 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1363,6 +1363,14 @@ cifs_readv_callback(struct mid_q_entry *mid) if (rdata->result == -ENODATA) { rdata->result = 0; __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + trace_smb3_read_err(rdata->rreq->debug_id, + rdata->subreq.debug_index, + rdata->xid, + rdata->req->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + rdata->subreq.start + rdata->subreq.transferred, + rdata->subreq.len - rdata->subreq.transferred, + rdata->result); } else { size_t trans = rdata->subreq.transferred + rdata->got_bytes; if (trans < rdata->subreq.len && @@ -1374,6 +1382,13 @@ cifs_readv_callback(struct mid_q_entry *mid) } if (rdata->got_bytes) __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); + trace_smb3_read_done(rdata->rreq->debug_id, + rdata->subreq.debug_index, + rdata->xid, + rdata->req->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + rdata->subreq.start + rdata->subreq.transferred, + rdata->got_bytes); } trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value, @@ -1445,6 +1460,13 @@ cifs_async_readv(struct cifs_io_subrequest *rdata) rdata->iov[1].iov_base = (char *)smb + 4; rdata->iov[1].iov_len = get_rfc1002_length(smb); + trace_smb3_read_enter(rdata->rreq->debug_id, + rdata->subreq.debug_index, + rdata->xid, + rdata->req->cfile->fid.netfid, + tcon->tid, tcon->ses->Suid, + rdata->subreq.start, rdata->subreq.len); + rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive, cifs_readv_callback, NULL, rdata, 0, NULL); From f1f96511b1c4c33e53f05909dd267878e0643a9a Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 
12 Nov 2025 16:05:26 +0800 Subject: [PATCH 512/543] perf: Fix 0 count issue of cpu-clock Currently cpu-clock event always returns 0 count, e.g., perf stat -e cpu-clock -- sleep 1 Performance counter stats for 'sleep 1': 0 cpu-clock # 0.000 CPUs utilized 1.002308394 seconds time elapsed The root cause is that commit 'bc4394e5e79c ("perf: Fix the throttle error of some clock events")' added a PERF_EF_UPDATE flag check before calling cpu_clock_event_update() to update the count; however, the PERF_EF_UPDATE flag is never set when the cpu-clock event is stopped in counting mode (pmu->del() -> cpu_clock_event_del() -> cpu_clock_event_stop()). As a result, the cpu-clock event count is never updated. To fix this issue, always set the PERF_EF_UPDATE flag for the cpu-clock event, just like task-clock does. Fixes: bc4394e5e79c ("perf: Fix the throttle error of some clock events") Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ian Rogers Acked-by: Namhyung Kim Link: https://patch.msgid.link/20251112080526.3971392-1-dapeng1.mi@linux.intel.com --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 1fd347da9026..2c35acc2722b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11901,7 +11901,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags) static void cpu_clock_event_del(struct perf_event *event, int flags) { - cpu_clock_event_stop(event, flags); + cpu_clock_event_stop(event, PERF_EF_UPDATE); } static void cpu_clock_event_read(struct perf_event *event) From 678e1cc2f482e0985a0613ab4a5bf89c497e5acc Mon Sep 17 00:00:00 2001 From: "Darrick J.
Wong" Date: Wed, 12 Nov 2025 08:35:18 -0800 Subject: [PATCH 513/543] xfs: fix out of bounds memory read error in symlink repair xfs/286 produced this report on my test fleet: ================================================================== BUG: KFENCE: out-of-bounds read in memcpy_orig+0x54/0x110 Out-of-bounds read at 0xffff88843fe9e038 (184B right of kfence-#184): memcpy_orig+0x54/0x110 xrep_symlink_salvage_inline+0xb3/0xf0 [xfs] xrep_symlink_salvage+0x100/0x110 [xfs] xrep_symlink+0x2e/0x80 [xfs] xrep_attempt+0x61/0x1f0 [xfs] xfs_scrub_metadata+0x34f/0x5c0 [xfs] xfs_ioc_scrubv_metadata+0x387/0x560 [xfs] xfs_file_ioctl+0xe23/0x10e0 [xfs] __x64_sys_ioctl+0x76/0xc0 do_syscall_64+0x4e/0x1e0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 kfence-#184: 0xffff88843fe9df80-0xffff88843fe9dfea, size=107, cache=kmalloc-128 allocated by task 3470 on cpu 1 at 263329.131592s (192823.508886s ago): xfs_init_local_fork+0x79/0xe0 [xfs] xfs_iformat_local+0xa4/0x170 [xfs] xfs_iformat_data_fork+0x148/0x180 [xfs] xfs_inode_from_disk+0x2cd/0x480 [xfs] xfs_iget+0x450/0xd60 [xfs] xfs_bulkstat_one_int+0x6b/0x510 [xfs] xfs_bulkstat_iwalk+0x1e/0x30 [xfs] xfs_iwalk_ag_recs+0xdf/0x150 [xfs] xfs_iwalk_run_callbacks+0xb9/0x190 [xfs] xfs_iwalk_ag+0x1dc/0x2f0 [xfs] xfs_iwalk_args.constprop.0+0x6a/0x120 [xfs] xfs_iwalk+0xa4/0xd0 [xfs] xfs_bulkstat+0xfa/0x170 [xfs] xfs_ioc_fsbulkstat.isra.0+0x13a/0x230 [xfs] xfs_file_ioctl+0xbf2/0x10e0 [xfs] __x64_sys_ioctl+0x76/0xc0 do_syscall_64+0x4e/0x1e0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 CPU: 1 UID: 0 PID: 1300113 Comm: xfs_scrub Not tainted 6.18.0-rc4-djwx #rc4 PREEMPT(lazy) 3d744dd94e92690f00a04398d2bd8631dcef1954 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-4.module+el8.8.0+21164+ed375313 04/01/2014 ================================================================== On further analysis, I realized that the second parameter to min() is not correct. xfs_ifork::if_bytes is the size of the xfs_ifork::if_data buffer. 
if_bytes can be smaller than the data fork size because: (a) the forkoff code tries to keep the data area as large as possible (b) for symbolic links, if_bytes is the ondisk file size + 1 (c) forkoff is always a multiple of 8. Case in point: for a single-byte symlink target, forkoff will be 8 but the buffer will only be 2 bytes long. In other words, the logic here is wrong and we walk off the end of the incore buffer. Fix that. Cc: stable@vger.kernel.org # v6.10 Fixes: 2651923d8d8db0 ("xfs: online repair of symbolic links") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/symlink_repair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c index 5902398185a8..df629892462f 100644 --- a/fs/xfs/scrub/symlink_repair.c +++ b/fs/xfs/scrub/symlink_repair.c @@ -184,7 +184,7 @@ xrep_symlink_salvage_inline( sc->ip->i_disk_size == 1 && old_target[0] == '?') return 0; - nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip)); + nr = min(XFS_SYMLINK_MAXLEN, ifp->if_bytes); memcpy(target_buf, ifp->if_data, nr); return nr; } From 3ceb6ac2116ecda1c5d779bb73271479e70fccb4 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 14 Nov 2025 10:09:51 +0100 Subject: [PATCH 514/543] net: dsa: microchip: lan937x: Fix RGMII delay tuning Correct RGMII delay application logic in lan937x_set_tune_adj(). The function was missing `data16 &= ~PORT_TUNE_ADJ` before setting the new delay value. This caused the new value to be bitwise-OR'd with the existing PORT_TUNE_ADJ field instead of replacing it. For example, when setting the RGMII 2 TX delay on port 4, the intended TUNE_ADJUST value of 0 (RGMII_2_TX_DELAY_2NS) was incorrectly OR'd with the default 0x1B (from register value 0xDA3), leaving the delay at the wrong setting. This patch adds the missing mask to clear the field, ensuring the correct delay value is written. 
Physical measurements on the RGMII TX lines confirm the fix, showing the delay changing from ~1ns (before change) to ~2ns. While testing on i.MX 8MP showed this was within the platform's timing tolerance, it did not match the intended hardware-characterized value. Fixes: b19ac41faa3f ("net: dsa: microchip: apply rgmii tx and rx delay in phylink mac config") Cc: stable@vger.kernel.org Signed-off-by: Oleksij Rempel Link: https://patch.msgid.link/20251114090951.4057261-1-o.rempel@pengutronix.de Signed-off-by: Paolo Abeni --- drivers/net/dsa/microchip/lan937x_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/microchip/lan937x_main.c b/drivers/net/dsa/microchip/lan937x_main.c index b1ae3b9de3d1..5a1496fff445 100644 --- a/drivers/net/dsa/microchip/lan937x_main.c +++ b/drivers/net/dsa/microchip/lan937x_main.c @@ -540,6 +540,7 @@ static void lan937x_set_tune_adj(struct ksz_device *dev, int port, ksz_pread16(dev, port, reg, &data16); /* Update tune Adjust */ + data16 &= ~PORT_TUNE_ADJ; data16 |= FIELD_PREP(PORT_TUNE_ADJ, val); ksz_pwrite16(dev, port, reg, data16); From d70b592551ff23747e26e74081205babf8dba9b6 Mon Sep 17 00:00:00 2001 From: David Bauer Date: Tue, 18 Nov 2025 01:16:18 +0100 Subject: [PATCH 515/543] l2tp: reset skb control buffer on xmit The L2TP stack did not reset the skb control buffer before sending the encapsulated packet. In a setup with an ath10k radio and batman-adv over an L2TP tunnel, massive fragmentation happens sporadically if the L2TP tunnel is established over IPv4. L2TP might reset some of the fields in the IP control buffer, but L2TP assumes the type of the control buffer to be of an IPv4 packet. In case the L2TP interface is used as a batadv hardif or the packet is an IPv6 packet, this assumption breaks. Clear the entire control buffer to avoid such mishaps altogether. 
Fixes: f77ae9390438 ("[PPPOL2TP]: Reset meta-data in xmit function") Signed-off-by: David Bauer Link: https://patch.msgid.link/20251118001619.242107-1-mail@david-bauer.net Signed-off-by: Paolo Abeni --- net/l2tp/l2tp_core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 369a2f2e459c..0710281dd95a 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1246,9 +1246,9 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, uns else l2tp_build_l2tpv3_header(session, __skb_push(skb, session->hdr_len)); - /* Reset skb netfilter state */ - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); + /* Reset control buffer */ + memset(skb->cb, 0, sizeof(skb->cb)); + nf_reset_ct(skb); /* L2TP uses its own lockdep subclass to avoid lockdep splats caused by From b32cc17d607e8ae7af037303fe101368cb4dc44c Mon Sep 17 00:00:00 2001 From: Yihang Li Date: Thu, 20 Nov 2025 11:50:23 +0800 Subject: [PATCH 516/543] ata: libata-scsi: Add missing scsi_device_put() in ata_scsi_dev_rescan() Call scsi_device_put() in ata_scsi_dev_rescan() if the device or its queue are not running. 
Fixes: 0c76106cb975 ("scsi: sd: Fix TCG OPAL unlock on system resume") Cc: stable@vger.kernel.org Signed-off-by: Yihang Li Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- drivers/ata/libata-scsi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index b43a3196e2be..3fb84f690644 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -4894,8 +4894,10 @@ void ata_scsi_dev_rescan(struct work_struct *work) spin_unlock_irqrestore(ap->lock, flags); if (do_resume) { ret = scsi_resume_device(sdev); - if (ret == -EWOULDBLOCK) + if (ret == -EWOULDBLOCK) { + scsi_device_put(sdev); goto unlock_scan; + } dev->flags &= ~ATA_DFLAG_RESUMING; } ret = scsi_rescan_device(sdev); From b11890683380a36b8488229f818d5e76e8204587 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 19 Nov 2025 15:13:14 +0100 Subject: [PATCH 517/543] ata: libata-scsi: Fix system suspend for a security locked drive Commit cf3fc037623c ("ata: libata-scsi: Fix ata_to_sense_error() status handling") fixed ata_to_sense_error() to properly generate sense key ABORTED COMMAND (without any additional sense code), instead of the previous bogus sense key ILLEGAL REQUEST with the additional sense code UNALIGNED WRITE COMMAND, for a failed command. However, this broke suspend for Security locked drives (drives that have Security enabled, and have not been Security unlocked by boot firmware). The reason for this is that the SCSI disk driver, for the Synchronize Cache command only, treats any sense data with sense key ILLEGAL REQUEST as a successful command (regardless of ASC / ASCQ). 
After commit cf3fc037623c ("ata: libata-scsi: Fix ata_to_sense_error() status handling") the code that treats any sense data with sense key ILLEGAL REQUEST as a successful command is no longer applicable, so the command fails, which causes the system suspend to be aborted: sd 1:0:0:0: PM: dpm_run_callback(): scsi_bus_suspend returns -5 sd 1:0:0:0: PM: failed to suspend async: error -5 PM: Some devices failed to suspend, or early wake event detected To make suspend work once again, for a Security locked device only, return sense data LOGICAL UNIT ACCESS NOT AUTHORIZED, the actual sense data which a real SCSI device would have returned if locked. The SCSI disk driver treats this sense data as a successful command. Cc: stable@vger.kernel.org Reported-by: Ilia Baryshnikov Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220704 Fixes: cf3fc037623c ("ata: libata-scsi: Fix ata_to_sense_error() status handling") Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- drivers/ata/libata-scsi.c | 7 +++++++ include/linux/ata.h | 1 + 2 files changed, 8 insertions(+) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 3fb84f690644..434774e71fe6 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -992,6 +992,13 @@ static void ata_gen_ata_sense(struct ata_queued_cmd *qc) return; } + if (ata_id_is_locked(dev->id)) { + /* Security locked */ + /* LOGICAL UNIT ACCESS NOT AUTHORIZED */ + ata_scsi_set_sense(dev, cmd, DATA_PROTECT, 0x74, 0x71); + return; + } + if (!(qc->flags & ATA_QCFLAG_RTF_FILLED)) { ata_dev_dbg(dev, "Missing result TF: reporting aborted command\n"); diff --git a/include/linux/ata.h b/include/linux/ata.h index 792e10a09787..c9013e472aa3 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -566,6 +566,7 @@ struct ata_bmdma_prd { #define ata_id_has_ncq(id) ((id)[ATA_ID_SATA_CAPABILITY] & (1 << 8)) #define ata_id_queue_depth(id) 
(((id)[ATA_ID_QUEUE_DEPTH] & 0x1f) + 1) #define ata_id_removable(id) ((id)[ATA_ID_CONFIG] & (1 << 7)) +#define ata_id_is_locked(id) (((id)[ATA_ID_DLF] & 0x7) == 0x7) #define ata_id_has_atapi_AN(id) \ ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \ ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \ From 91842ed844a068a41a38f97a1ac5535b909279cd Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 19 Nov 2025 15:13:15 +0100 Subject: [PATCH 518/543] ata: libata-core: Set capacity to zero for a security locked drive For Security locked drives (drives that have Security enabled, and have not been Security unlocked by boot firmware), the automatic partition scanning will result in the user being spammed with errors such as: ata5.00: failed command: READ DMA ata5.00: cmd c8/00:08:00:00:00/00:00:00:00:00/e0 tag 7 dma 4096 in res 51/04:08:00:00:00/00:00:00:00:00/e0 Emask 0x1 (device error) ata5.00: status: { DRDY ERR } ata5.00: error: { ABRT } sd 4:0:0:0: [sda] tag#7 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s sd 4:0:0:0: [sda] tag#7 Sense Key : Aborted Command [current] sd 4:0:0:0: [sda] tag#7 Add. Sense: No additional sense information during boot, because most commands except for IDENTIFY will be aborted by a Security locked drive. For a Security locked drive, set capacity to zero, so that no automatic partition scanning will happen. If the user later unlocks the drive using e.g. hdparm, the close() by the user space application should trigger a revalidation of the drive. Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. 
Petersen Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 2a210719c4ce..f48fb63d7e85 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -3006,6 +3006,16 @@ int ata_dev_configure(struct ata_device *dev) } dev->n_sectors = ata_id_n_sectors(id); + if (ata_id_is_locked(id)) { + /* + * If Security locked, set capacity to zero to prevent + * any I/O, e.g. partition scanning, as any I/O to a + * locked drive will result in user visible errors. + */ + ata_dev_info(dev, + "Security locked, setting capacity to zero\n"); + dev->n_sectors = 0; + } /* get current R/W Multiple count setting */ if ((dev->id[47] >> 8) == 0x80 && (dev->id[59] & 0x100)) { From 7d277a7a58578dd62fd546ddaef459ec24ccae36 Mon Sep 17 00:00:00 2001 From: Andrey Vatoropin Date: Wed, 19 Nov 2025 10:51:12 +0000 Subject: [PATCH 519/543] be2net: pass wrb_params in case of OS2BMC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit be_insert_vlan_in_pkt() is called with the wrb_params argument being NULL at the be_send_pkt_to_bmc() call site. This may lead to dereferencing a NULL pointer when processing the workaround for a specific packet, as commit bc0c3405abbb ("be2net: fix a Tx stall bug caused by a specific ipv6 packet") states. The correct way would be to pass the wrb_params from be_xmit(). 
Fixes: 760c295e0e8d ("be2net: Support for OS2BMC.") Cc: stable@vger.kernel.org Signed-off-by: Andrey Vatoropin Link: https://patch.msgid.link/20251119105015.194501-1-a.vatoropin@crpt.ru Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/emulex/benet/be_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index cb004fd16252..5bb31c8fab39 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -1296,7 +1296,8 @@ static void be_xmit_flush(struct be_adapter *adapter, struct be_tx_obj *txo) (adapter->bmc_filt_mask & BMC_FILT_MULTICAST) static bool be_send_pkt_to_bmc(struct be_adapter *adapter, - struct sk_buff **skb) + struct sk_buff **skb, + struct be_wrb_params *wrb_params) { struct ethhdr *eh = (struct ethhdr *)(*skb)->data; bool os2bmc = false; @@ -1360,7 +1361,7 @@ static bool be_send_pkt_to_bmc(struct be_adapter *adapter, * to BMC, asic expects the vlan to be inline in the packet. */ if (os2bmc) - *skb = be_insert_vlan_in_pkt(adapter, *skb, NULL); + *skb = be_insert_vlan_in_pkt(adapter, *skb, wrb_params); return os2bmc; } @@ -1387,7 +1388,7 @@ static netdev_tx_t be_xmit(struct sk_buff *skb, struct net_device *netdev) /* if os2bmc is enabled and if the pkt is destined to bmc, * enqueue the pkt a 2nd time with mgmt bit set. */ - if (be_send_pkt_to_bmc(adapter, &skb)) { + if (be_send_pkt_to_bmc(adapter, &skb, &wrb_params)) { BE_WRB_F_SET(wrb_params.features, OS2BMC, 1); wrb_cnt = be_xmit_enqueue(adapter, txo, skb, &wrb_params); if (unlikely(!wrb_cnt)) From 002541ef650b742a198e4be363881439bb9d86b4 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 19 Nov 2025 15:02:59 +0100 Subject: [PATCH 520/543] vsock: Ignore signal/timeout on connect() if already established During connect(), acting on a signal/timeout by disconnecting an already established socket leads to several issues: 1. 
connect() invoking vsock_transport_cancel_pkt() -> virtio_transport_purge_skbs() may race with sendmsg() invoking virtio_transport_get_credit(). This results in a permanently elevated `vvs->bytes_unsent`. Which, in turn, confuses the SOCK_LINGER handling. 2. connect() resetting a connected socket's state may race with socket being placed in a sockmap. A disconnected socket remaining in a sockmap breaks sockmap's assumptions. And gives rise to WARNs. 3. connect() transitioning SS_CONNECTED -> SS_UNCONNECTED allows for a transport change/drop after TCP_ESTABLISHED. Which poses a problem for any simultaneous sendmsg() or connect() and may result in a use-after-free/null-ptr-deref. Do not disconnect socket on signal/timeout. Keep the logic for unconnected sockets: they don't linger, can't be placed in a sockmap, are rejected by sendmsg(). [1]: https://lore.kernel.org/netdev/e07fd95c-9a38-4eea-9638-133e38c2ec9b@rbox.co/ [2]: https://lore.kernel.org/netdev/20250317-vsock-trans-signal-race-v4-0-fc8837f3f1d4@rbox.co/ [3]: https://lore.kernel.org/netdev/60f1b7db-3099-4f6a-875e-af9f6ef194f6@rbox.co/ Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Signed-off-by: Michal Luczaj Reviewed-by: Stefano Garzarella Link: https://patch.msgid.link/20251119-vsock-interrupted-connect-v2-1-70734cf1233f@rbox.co Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 76763247a377..a9ca9c3b87b3 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1661,18 +1661,40 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, timeout = schedule_timeout(timeout); lock_sock(sk); - if (signal_pending(current)) { - err = sock_intr_errno(timeout); - sk->sk_state = sk->sk_state == TCP_ESTABLISHED ? 
TCP_CLOSING : TCP_CLOSE; - sock->state = SS_UNCONNECTED; - vsock_transport_cancel_pkt(vsk); - vsock_remove_connected(vsk); - goto out_wait; - } else if ((sk->sk_state != TCP_ESTABLISHED) && (timeout == 0)) { - err = -ETIMEDOUT; + /* Connection established. Whatever happens to socket once we + * release it, that's not connect()'s concern. No need to go + * into signal and timeout handling. Call it a day. + * + * Note that allowing to "reset" an already established socket + * here is racy and insecure. + */ + if (sk->sk_state == TCP_ESTABLISHED) + break; + + /* If connection was _not_ established and a signal/timeout came + * to be, we want the socket's state reset. User space may want + * to retry. + * + * sk_state != TCP_ESTABLISHED implies that socket is not on + * vsock_connected_table. We keep the binding and the transport + * assigned. + */ + if (signal_pending(current) || timeout == 0) { + err = timeout == 0 ? -ETIMEDOUT : sock_intr_errno(timeout); + + /* Listener might have already responded with + * VIRTIO_VSOCK_OP_RESPONSE. Its handling expects our + * sk_state == TCP_SYN_SENT, which hereby we break. + * In such case VIRTIO_VSOCK_OP_RST will follow. + */ sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; + + /* Try to cancel VIRTIO_VSOCK_OP_REQUEST skb sent out by + * transport->connect(). + */ vsock_transport_cancel_pkt(vsk); + goto out_wait; } From 7b5ab04f035f829ed6008e4685501ec00b3e73c9 Mon Sep 17 00:00:00 2001 From: Malaya Kumar Rout Date: Thu, 20 Nov 2025 20:32:13 +0530 Subject: [PATCH 521/543] timekeeping: Fix resource leak in tk_aux_sysfs_init() error paths tk_aux_sysfs_init() returns immediately on error during the auxiliary clock initialization loop without cleaning up previously allocated kobjects and sysfs groups. If kobject_create_and_add() or sysfs_create_group() fails during loop iteration, the parent kobjects (tko and auxo) and any previously created child kobjects are leaked. 
Fix this by adding proper error handling with goto labels to ensure all allocated resources are cleaned up on failure. kobject_put() on the parent kobjects will handle cleanup of their children. Fixes: 7b95663a3d96 ("timekeeping: Provide interface to control auxiliary clocks") Signed-off-by: Malaya Kumar Rout Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251120150213.246777-1-mrout@redhat.com --- kernel/time/timekeeping.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3a4d3b2e3f74..08e0943b54da 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -3060,29 +3060,32 @@ static const struct attribute_group aux_clock_enable_attr_group = { static int __init tk_aux_sysfs_init(void) { struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj); + int ret = -ENOMEM; if (!tko) - return -ENOMEM; + return ret; auxo = kobject_create_and_add("aux_clocks", tko); - if (!auxo) { - kobject_put(tko); - return -ENOMEM; - } + if (!auxo) + goto err_clean; for (int i = 0; i < MAX_AUX_CLOCKS; i++) { char id[2] = { [0] = '0' + i, }; struct kobject *clk = kobject_create_and_add(id, auxo); if (!clk) - return -ENOMEM; - - int ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); + goto err_clean; + ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); if (ret) - return ret; + goto err_clean; } return 0; + +err_clean: + kobject_put(auxo); + kobject_put(tko); + return ret; } late_initcall(tk_aux_sysfs_init); From 46447367a52965e9d35f112f5b26fc8ff8ec443d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 20 Nov 2025 11:40:15 -0700 Subject: [PATCH 522/543] io_uring/cmd_net: fix wrong argument types for skb_queue_splice() If timestamp retrieving needs to be retried and the local list of SKBs already has entries, then it's spliced back into the socket queue. 
However, the arguments for the splice helper are transposed, causing exactly the wrong direction of splicing into the on-stack list. Fix that up. Cc: stable@vger.kernel.org Reported-by: Google Big Sleep Fixes: 9e4ed359b8ef ("io_uring/netcmd: add tx timestamping cmd support") Signed-off-by: Jens Axboe --- io_uring/cmd_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c index 27a09aa4c9d0..3b75931bd569 100644 --- a/io_uring/cmd_net.c +++ b/io_uring/cmd_net.c @@ -127,7 +127,7 @@ static int io_uring_cmd_timestamp(struct socket *sock, if (!unlikely(skb_queue_empty(&list))) { scoped_guard(spinlock_irqsave, &q->lock) - skb_queue_splice(q, &list); + skb_queue_splice(&list, q); } return -EAGAIN; } From 7b6216baae751369195fa3c83d434d23bcda406a Mon Sep 17 00:00:00 2001 From: Saket Kumar Bhaskar Date: Wed, 19 Nov 2025 16:07:22 +0530 Subject: [PATCH 523/543] sched_ext: Fix scx_enable() crash on helper kthread creation failure A crash was observed when the sched_ext selftests runner was terminated with Ctrl+\ while test 15 was running: NIP [c00000000028fa58] scx_enable.constprop.0+0x358/0x12b0 LR [c00000000028fa2c] scx_enable.constprop.0+0x32c/0x12b0 Call Trace: scx_enable.constprop.0+0x32c/0x12b0 (unreliable) bpf_struct_ops_link_create+0x18c/0x22c __sys_bpf+0x23f8/0x3044 sys_bpf+0x2c/0x6c system_call_exception+0x124/0x320 system_call_vectored_common+0x15c/0x2ec kthread_run_worker() returns an ERR_PTR() on failure rather than NULL, but the current code in scx_alloc_and_add_sched() only checks for a NULL helper. In case of failure on SIGQUIT, the error is not handled in scx_alloc_and_add_sched() and scx_enable() ends up dereferencing an error pointer. Error handling is fixed in scx_alloc_and_add_sched() to propagate PTR_ERR() into ret, so that scx_enable() jumps to the existing error path, avoiding random dereference on failure. 
Fixes: bff3b5aec1b7 ("sched_ext: Move disable machinery into scx_sched") Cc: stable@vger.kernel.org # v6.16+ Reported-and-tested-by: Samir Mulani Signed-off-by: Saket Kumar Bhaskar Reviewed-by: Emil Tsalapatis Reviewed-by: Andrea Righi Reviewed-by: Vishal Chourasia Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7aae1d0ce37e..979484dab2d3 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4479,8 +4479,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) goto err_free_gdsqs; sch->helper = kthread_run_worker(0, "sched_ext_helper"); - if (!sch->helper) + if (IS_ERR(sch->helper)) { + ret = PTR_ERR(sch->helper); goto err_free_pcpu; + } + sched_set_fifo(sch->helper->task); atomic_set(&sch->exit_kind, SCX_EXIT_NONE); From 75f72fe289a7f76204a728668edcf20e4a2a6097 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Thu, 13 Nov 2025 15:23:13 -0500 Subject: [PATCH 524/543] selinux: rename task_security_struct to cred_security_struct Before Linux had cred structures, the SELinux task_security_struct was per-task and although the structure was switched to being per-cred long ago, the name was never updated. This change renames it to cred_security_struct to avoid confusion and pave the way for the introduction of an actual per-task security structure for SELinux. No functional change. 
Cc: stable@vger.kernel.org Signed-off-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/hooks.c | 68 +++++++++++++++---------------- security/selinux/include/objsec.h | 8 ++-- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index dfc22da42f30..0890e7ee84c9 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -210,7 +210,7 @@ static int selinux_lsm_notifier_avc_callback(u32 event) */ static void cred_init_security(void) { - struct task_security_struct *tsec; + struct cred_security_struct *tsec; /* NOTE: the lsm framework zeros out the buffer on allocation */ @@ -223,7 +223,7 @@ static void cred_init_security(void) */ static inline u32 cred_sid(const struct cred *cred) { - const struct task_security_struct *tsec; + const struct cred_security_struct *tsec; tsec = selinux_cred(cred); return tsec->sid; @@ -437,7 +437,7 @@ static int may_context_mount_sb_relabel(u32 sid, struct superblock_security_struct *sbsec, const struct cred *cred) { - const struct task_security_struct *tsec = selinux_cred(cred); + const struct cred_security_struct *tsec = selinux_cred(cred); int rc; rc = avc_has_perm(tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, @@ -454,7 +454,7 @@ static int may_context_mount_inode_relabel(u32 sid, struct superblock_security_struct *sbsec, const struct cred *cred) { - const struct task_security_struct *tsec = selinux_cred(cred); + const struct cred_security_struct *tsec = selinux_cred(cred); int rc; rc = avc_has_perm(tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELFROM, NULL); @@ -1788,7 +1788,7 @@ static int file_has_perm(const struct cred *cred, * Determine the label for an inode that might be unioned. 
*/ static int -selinux_determine_inode_label(const struct task_security_struct *tsec, +selinux_determine_inode_label(const struct cred_security_struct *tsec, struct inode *dir, const struct qstr *name, u16 tclass, u32 *_new_isid) @@ -1817,7 +1817,7 @@ static int may_create(struct inode *dir, struct dentry *dentry, u16 tclass) { - const struct task_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *tsec = selinux_cred(current_cred()); struct inode_security_struct *dsec; struct superblock_security_struct *sbsec; u32 sid, newsid; @@ -2251,8 +2251,8 @@ static u32 ptrace_parent_sid(void) } static int check_nnp_nosuid(const struct linux_binprm *bprm, - const struct task_security_struct *old_tsec, - const struct task_security_struct *new_tsec) + const struct cred_security_struct *old_tsec, + const struct cred_security_struct *new_tsec) { int nnp = (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS); int nosuid = !mnt_may_suid(bprm->file->f_path.mnt); @@ -2305,8 +2305,8 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm, static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) { - const struct task_security_struct *old_tsec; - struct task_security_struct *new_tsec; + const struct cred_security_struct *old_tsec; + struct cred_security_struct *new_tsec; struct inode_security_struct *isec; struct common_audit_data ad; struct inode *inode = file_inode(bprm->file); @@ -2483,7 +2483,7 @@ static inline void flush_unauthorized_files(const struct cred *cred, */ static void selinux_bprm_committing_creds(const struct linux_binprm *bprm) { - struct task_security_struct *new_tsec; + struct cred_security_struct *new_tsec; struct rlimit *rlim, *initrlim; int rc, i; @@ -2529,7 +2529,7 @@ static void selinux_bprm_committing_creds(const struct linux_binprm *bprm) */ static void selinux_bprm_committed_creds(const struct linux_binprm *bprm) { - const struct task_security_struct *tsec = selinux_cred(current_cred()); + const struct 
cred_security_struct *tsec = selinux_cred(current_cred()); u32 osid, sid; int rc; @@ -2911,7 +2911,7 @@ static int selinux_dentry_create_files_as(struct dentry *dentry, int mode, { u32 newsid; int rc; - struct task_security_struct *tsec; + struct cred_security_struct *tsec; rc = selinux_determine_inode_label(selinux_cred(old), d_inode(dentry->d_parent), name, @@ -2929,7 +2929,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, struct xattr *xattrs, int *xattr_count) { - const struct task_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *tsec = selinux_cred(current_cred()); struct superblock_security_struct *sbsec; struct xattr *xattr = lsm_get_xattr_slot(xattrs, xattr_count); u32 newsid, clen; @@ -3110,7 +3110,7 @@ static noinline int audit_inode_permission(struct inode *inode, * Clear the task's AVD cache in @tsec and reset it to the current policy's * and task's info. */ -static inline void task_avdcache_reset(struct task_security_struct *tsec) +static inline void task_avdcache_reset(struct cred_security_struct *tsec) { memset(&tsec->avdcache.dir, 0, sizeof(tsec->avdcache.dir)); tsec->avdcache.sid = tsec->sid; @@ -3127,7 +3127,7 @@ static inline void task_avdcache_reset(struct task_security_struct *tsec) * Search @tsec for a AVD cache entry that matches @isec and return it to the * caller via @avdc. Returns 0 if a match is found, negative values otherwise. */ -static inline int task_avdcache_search(struct task_security_struct *tsec, +static inline int task_avdcache_search(struct cred_security_struct *tsec, struct inode_security_struct *isec, struct avdc_entry **avdc) { @@ -3167,7 +3167,7 @@ static inline int task_avdcache_search(struct task_security_struct *tsec, * Update the AVD cache in @tsec with the @avdc and @audited info associated * with @isec. 
*/ -static inline void task_avdcache_update(struct task_security_struct *tsec, +static inline void task_avdcache_update(struct cred_security_struct *tsec, struct inode_security_struct *isec, struct av_decision *avd, u32 audited) @@ -3201,7 +3201,7 @@ static int selinux_inode_permission(struct inode *inode, int requested) { int mask; u32 perms; - struct task_security_struct *tsec; + struct cred_security_struct *tsec; struct inode_security_struct *isec; struct avdc_entry *avdc; int rc, rc2; @@ -3283,7 +3283,7 @@ static int selinux_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, static int selinux_inode_getattr(const struct path *path) { - struct task_security_struct *tsec; + struct cred_security_struct *tsec; tsec = selinux_cred(current_cred()); @@ -3659,7 +3659,7 @@ static void selinux_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop) static int selinux_inode_copy_up(struct dentry *src, struct cred **new) { struct lsm_prop prop; - struct task_security_struct *tsec; + struct cred_security_struct *tsec; struct cred *new_creds = *new; if (new_creds == NULL) { @@ -3697,7 +3697,7 @@ static int selinux_inode_copy_up_xattr(struct dentry *dentry, const char *name) static int selinux_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn) { - const struct task_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *tsec = selinux_cred(current_cred()); u32 parent_sid, newsid, clen; int rc; char *context; @@ -4161,8 +4161,8 @@ static int selinux_task_alloc(struct task_struct *task, static int selinux_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { - const struct task_security_struct *old_tsec = selinux_cred(old); - struct task_security_struct *tsec = selinux_cred(new); + const struct cred_security_struct *old_tsec = selinux_cred(old); + struct cred_security_struct *tsec = selinux_cred(new); *tsec = *old_tsec; return 0; @@ -4173,8 +4173,8 @@ static int selinux_cred_prepare(struct 
cred *new, const struct cred *old, */ static void selinux_cred_transfer(struct cred *new, const struct cred *old) { - const struct task_security_struct *old_tsec = selinux_cred(old); - struct task_security_struct *tsec = selinux_cred(new); + const struct cred_security_struct *old_tsec = selinux_cred(old); + struct cred_security_struct *tsec = selinux_cred(new); *tsec = *old_tsec; } @@ -4195,7 +4195,7 @@ static void selinux_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop) */ static int selinux_kernel_act_as(struct cred *new, u32 secid) { - struct task_security_struct *tsec = selinux_cred(new); + struct cred_security_struct *tsec = selinux_cred(new); u32 sid = current_sid(); int ret; @@ -4219,7 +4219,7 @@ static int selinux_kernel_act_as(struct cred *new, u32 secid) static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode) { struct inode_security_struct *isec = inode_security(inode); - struct task_security_struct *tsec = selinux_cred(new); + struct cred_security_struct *tsec = selinux_cred(new); u32 sid = current_sid(); int ret; @@ -4744,7 +4744,7 @@ static int selinux_conn_sid(u32 sk_sid, u32 skb_sid, u32 *conn_sid) /* socket security operations */ -static int socket_sockcreate_sid(const struct task_security_struct *tsec, +static int socket_sockcreate_sid(const struct cred_security_struct *tsec, u16 secclass, u32 *socksid) { if (tsec->sockcreate_sid > SECSID_NULL) { @@ -4797,7 +4797,7 @@ static int sock_has_perm(struct sock *sk, u32 perms) static int selinux_socket_create(int family, int type, int protocol, int kern) { - const struct task_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *tsec = selinux_cred(current_cred()); u32 newsid; u16 secclass; int rc; @@ -4816,7 +4816,7 @@ static int selinux_socket_create(int family, int type, static int selinux_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { - const struct task_security_struct *tsec = 
selinux_cred(current_cred()); + const struct cred_security_struct *tsec = selinux_cred(current_cred()); struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(sock)); struct sk_security_struct *sksec; u16 sclass = socket_type_to_security_class(family, type, protocol); @@ -6526,7 +6526,7 @@ static void selinux_d_instantiate(struct dentry *dentry, struct inode *inode) static int selinux_lsm_getattr(unsigned int attr, struct task_struct *p, char **value) { - const struct task_security_struct *tsec; + const struct cred_security_struct *tsec; int error; u32 sid; u32 len; @@ -6581,7 +6581,7 @@ static int selinux_lsm_getattr(unsigned int attr, struct task_struct *p, static int selinux_lsm_setattr(u64 attr, void *value, size_t size) { - struct task_security_struct *tsec; + struct cred_security_struct *tsec; struct cred *new; u32 mysid = current_sid(), sid = 0, ptsid; int error; @@ -6876,7 +6876,7 @@ static int selinux_inode_getsecctx(struct inode *inode, struct lsm_context *cp) static int selinux_key_alloc(struct key *k, const struct cred *cred, unsigned long flags) { - const struct task_security_struct *tsec; + const struct cred_security_struct *tsec; struct key_security_struct *ksec = selinux_key(k); tsec = selinux_cred(cred); @@ -7137,7 +7137,7 @@ static int selinux_bpf_token_create(struct bpf_token *token, union bpf_attr *att #endif struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = { - .lbs_cred = sizeof(struct task_security_struct), + .lbs_cred = sizeof(struct cred_security_struct), .lbs_file = sizeof(struct file_security_struct), .lbs_inode = sizeof(struct inode_security_struct), .lbs_ipc = sizeof(struct ipc_security_struct), diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 2d5139c6d45b..e71ce352bc97 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -37,7 +37,7 @@ struct avdc_entry { bool permissive; /* AVC permissive flag */ }; -struct task_security_struct { 
+struct cred_security_struct { u32 osid; /* SID prior to last execve */ u32 sid; /* current SID */ u32 exec_sid; /* exec SID */ @@ -54,7 +54,7 @@ struct task_security_struct { } avdcache; } __randomize_layout; -static inline bool task_avdcache_permnoaudit(struct task_security_struct *tsec) +static inline bool task_avdcache_permnoaudit(struct cred_security_struct *tsec) { return (tsec->avdcache.permissive_neveraudit && tsec->sid == tsec->avdcache.sid && @@ -172,7 +172,7 @@ struct perf_event_security_struct { }; extern struct lsm_blob_sizes selinux_blob_sizes; -static inline struct task_security_struct *selinux_cred(const struct cred *cred) +static inline struct cred_security_struct *selinux_cred(const struct cred *cred) { return cred->security + selinux_blob_sizes.lbs_cred; } @@ -207,7 +207,7 @@ selinux_ipc(const struct kern_ipc_perm *ipc) */ static inline u32 current_sid(void) { - const struct task_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *tsec = selinux_cred(current_cred()); return tsec->sid; } From dde3a5d0f4dce1d1a6095e6b8eeb59b75d28fb3b Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Thu, 13 Nov 2025 15:23:14 -0500 Subject: [PATCH 525/543] selinux: move avdcache to per-task security struct The avdcache is meant to be per-task; move it to a new task_security_struct that is duplicated per-task. 
Cc: stable@vger.kernel.org Fixes: 5d7ddc59b3d89b724a5aa8f30d0db94ff8d2d93f ("selinux: reduce path walk overhead") Signed-off-by: Stephen Smalley [PM: line length fixes] Signed-off-by: Paul Moore --- security/selinux/hooks.c | 31 ++++++++++++++++++------------- security/selinux/include/objsec.h | 14 ++++++++++++-- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 0890e7ee84c9..0ac4b05eb56c 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -215,7 +215,7 @@ static void cred_init_security(void) /* NOTE: the lsm framework zeros out the buffer on allocation */ tsec = selinux_cred(unrcu_pointer(current->real_cred)); - tsec->osid = tsec->sid = tsec->avdcache.sid = SECINITSID_KERNEL; + tsec->osid = tsec->sid = SECINITSID_KERNEL; } /* @@ -3110,10 +3110,10 @@ static noinline int audit_inode_permission(struct inode *inode, * Clear the task's AVD cache in @tsec and reset it to the current policy's * and task's info. */ -static inline void task_avdcache_reset(struct cred_security_struct *tsec) +static inline void task_avdcache_reset(struct task_security_struct *tsec) { memset(&tsec->avdcache.dir, 0, sizeof(tsec->avdcache.dir)); - tsec->avdcache.sid = tsec->sid; + tsec->avdcache.sid = current_sid(); tsec->avdcache.seqno = avc_policy_seqno(); tsec->avdcache.dir_spot = TSEC_AVDC_DIR_SIZE - 1; } @@ -3127,7 +3127,7 @@ static inline void task_avdcache_reset(struct cred_security_struct *tsec) * Search @tsec for a AVD cache entry that matches @isec and return it to the * caller via @avdc. Returns 0 if a match is found, negative values otherwise. 
*/ -static inline int task_avdcache_search(struct cred_security_struct *tsec, +static inline int task_avdcache_search(struct task_security_struct *tsec, struct inode_security_struct *isec, struct avdc_entry **avdc) { @@ -3137,7 +3137,7 @@ static inline int task_avdcache_search(struct cred_security_struct *tsec, if (isec->sclass != SECCLASS_DIR) return -ENOENT; - if (unlikely(tsec->sid != tsec->avdcache.sid || + if (unlikely(current_sid() != tsec->avdcache.sid || tsec->avdcache.seqno != avc_policy_seqno())) { task_avdcache_reset(tsec); return -ENOENT; @@ -3167,7 +3167,7 @@ static inline int task_avdcache_search(struct cred_security_struct *tsec, * Update the AVD cache in @tsec with the @avdc and @audited info associated * with @isec. */ -static inline void task_avdcache_update(struct cred_security_struct *tsec, +static inline void task_avdcache_update(struct task_security_struct *tsec, struct inode_security_struct *isec, struct av_decision *avd, u32 audited) @@ -3201,7 +3201,8 @@ static int selinux_inode_permission(struct inode *inode, int requested) { int mask; u32 perms; - struct cred_security_struct *tsec; + u32 sid = current_sid(); + struct task_security_struct *tsec; struct inode_security_struct *isec; struct avdc_entry *avdc; int rc, rc2; @@ -3213,8 +3214,8 @@ static int selinux_inode_permission(struct inode *inode, int requested) if (!mask) return 0; - tsec = selinux_cred(current_cred()); - if (task_avdcache_permnoaudit(tsec)) + tsec = selinux_task(current); + if (task_avdcache_permnoaudit(tsec, sid)) return 0; isec = inode_security_rcu(inode, requested & MAY_NOT_BLOCK); @@ -3234,7 +3235,7 @@ static int selinux_inode_permission(struct inode *inode, int requested) struct av_decision avd; /* Cache miss. */ - rc = avc_has_perm_noaudit(tsec->sid, isec->sid, isec->sclass, + rc = avc_has_perm_noaudit(sid, isec->sid, isec->sclass, perms, 0, &avd); audited = avc_audit_required(perms, &avd, rc, (requested & MAY_ACCESS) ? 
FILE__AUDIT_ACCESS : 0, @@ -3283,11 +3284,11 @@ static int selinux_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, static int selinux_inode_getattr(const struct path *path) { - struct cred_security_struct *tsec; + struct task_security_struct *tsec; - tsec = selinux_cred(current_cred()); + tsec = selinux_task(current); - if (task_avdcache_permnoaudit(tsec)) + if (task_avdcache_permnoaudit(tsec, current_sid())) return 0; return path_has_perm(current_cred(), path, FILE__GETATTR); @@ -4151,7 +4152,10 @@ static int selinux_task_alloc(struct task_struct *task, u64 clone_flags) { u32 sid = current_sid(); + struct task_security_struct *old_tsec = selinux_task(current); + struct task_security_struct *new_tsec = selinux_task(task); + *new_tsec = *old_tsec; return avc_has_perm(sid, sid, SECCLASS_PROCESS, PROCESS__FORK, NULL); } @@ -7138,6 +7142,7 @@ static int selinux_bpf_token_create(struct bpf_token *token, union bpf_attr *att struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = { .lbs_cred = sizeof(struct cred_security_struct), + .lbs_task = sizeof(struct task_security_struct), .lbs_file = sizeof(struct file_security_struct), .lbs_inode = sizeof(struct inode_security_struct), .lbs_ipc = sizeof(struct ipc_security_struct), diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index e71ce352bc97..00804562c2c3 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -44,6 +44,9 @@ struct cred_security_struct { u32 create_sid; /* fscreate SID */ u32 keycreate_sid; /* keycreate SID */ u32 sockcreate_sid; /* fscreate SID */ +} __randomize_layout; + +struct task_security_struct { #define TSEC_AVDC_DIR_SIZE (1 << 2) struct { u32 sid; /* current SID for cached entries */ @@ -54,10 +57,11 @@ struct cred_security_struct { } avdcache; } __randomize_layout; -static inline bool task_avdcache_permnoaudit(struct cred_security_struct *tsec) +static inline bool task_avdcache_permnoaudit(struct 
task_security_struct *tsec, + u32 sid) { return (tsec->avdcache.permissive_neveraudit && - tsec->sid == tsec->avdcache.sid && + sid == tsec->avdcache.sid && tsec->avdcache.seqno == avc_policy_seqno()); } @@ -177,6 +181,12 @@ static inline struct cred_security_struct *selinux_cred(const struct cred *cred) return cred->security + selinux_blob_sizes.lbs_cred; } +static inline struct task_security_struct * +selinux_task(const struct task_struct *task) +{ + return task->security + selinux_blob_sizes.lbs_task; +} + static inline struct file_security_struct *selinux_file(const struct file *file) { return file->f_security + selinux_blob_sizes.lbs_file; From 3ded250b97c3ae94a642bc2e710a95700e72dfb0 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 18 Nov 2025 17:27:58 -0500 Subject: [PATCH 526/543] selinux: rename the cred_security_struct variables to "crsec" Along with the renaming from task_security_struct to cred_security_struct, rename the local variables to "crsec" from "tsec". This both fits with existing conventions and helps distinguish between task and cred related variables. No functional changes. 
Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/hooks.c | 230 +++++++++++++++--------------- security/selinux/include/objsec.h | 4 +- 2 files changed, 117 insertions(+), 117 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 0ac4b05eb56c..e713291db873 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -210,12 +210,12 @@ static int selinux_lsm_notifier_avc_callback(u32 event) */ static void cred_init_security(void) { - struct cred_security_struct *tsec; + struct cred_security_struct *crsec; /* NOTE: the lsm framework zeros out the buffer on allocation */ - tsec = selinux_cred(unrcu_pointer(current->real_cred)); - tsec->osid = tsec->sid = SECINITSID_KERNEL; + crsec = selinux_cred(unrcu_pointer(current->real_cred)); + crsec->osid = crsec->sid = SECINITSID_KERNEL; } /* @@ -223,10 +223,10 @@ static void cred_init_security(void) */ static inline u32 cred_sid(const struct cred *cred) { - const struct cred_security_struct *tsec; + const struct cred_security_struct *crsec; - tsec = selinux_cred(cred); - return tsec->sid; + crsec = selinux_cred(cred); + return crsec->sid; } static void __ad_net_init(struct common_audit_data *ad, @@ -437,15 +437,15 @@ static int may_context_mount_sb_relabel(u32 sid, struct superblock_security_struct *sbsec, const struct cred *cred) { - const struct cred_security_struct *tsec = selinux_cred(cred); + const struct cred_security_struct *crsec = selinux_cred(cred); int rc; - rc = avc_has_perm(tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, + rc = avc_has_perm(crsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELFROM, NULL); if (rc) return rc; - rc = avc_has_perm(tsec->sid, sid, SECCLASS_FILESYSTEM, + rc = avc_has_perm(crsec->sid, sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELTO, NULL); return rc; } @@ -454,9 +454,9 @@ static int may_context_mount_inode_relabel(u32 sid, struct superblock_security_struct *sbsec, const struct cred *cred) { - const struct 
cred_security_struct *tsec = selinux_cred(cred); + const struct cred_security_struct *crsec = selinux_cred(cred); int rc; - rc = avc_has_perm(tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, + rc = avc_has_perm(crsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELFROM, NULL); if (rc) return rc; @@ -1788,7 +1788,7 @@ static int file_has_perm(const struct cred *cred, * Determine the label for an inode that might be unioned. */ static int -selinux_determine_inode_label(const struct cred_security_struct *tsec, +selinux_determine_inode_label(const struct cred_security_struct *crsec, struct inode *dir, const struct qstr *name, u16 tclass, u32 *_new_isid) @@ -1800,11 +1800,11 @@ selinux_determine_inode_label(const struct cred_security_struct *tsec, (sbsec->behavior == SECURITY_FS_USE_MNTPOINT)) { *_new_isid = sbsec->mntpoint_sid; } else if ((sbsec->flags & SBLABEL_MNT) && - tsec->create_sid) { - *_new_isid = tsec->create_sid; + crsec->create_sid) { + *_new_isid = crsec->create_sid; } else { const struct inode_security_struct *dsec = inode_security(dir); - return security_transition_sid(tsec->sid, + return security_transition_sid(crsec->sid, dsec->sid, tclass, name, _new_isid); } @@ -1817,7 +1817,7 @@ static int may_create(struct inode *dir, struct dentry *dentry, u16 tclass) { - const struct cred_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *crsec = selinux_cred(current_cred()); struct inode_security_struct *dsec; struct superblock_security_struct *sbsec; u32 sid, newsid; @@ -1827,7 +1827,7 @@ static int may_create(struct inode *dir, dsec = inode_security(dir); sbsec = selinux_superblock(dir->i_sb); - sid = tsec->sid; + sid = crsec->sid; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; @@ -1838,7 +1838,7 @@ static int may_create(struct inode *dir, if (rc) return rc; - rc = selinux_determine_inode_label(tsec, dir, &dentry->d_name, tclass, + rc = selinux_determine_inode_label(crsec, dir, &dentry->d_name, tclass, 
&newsid); if (rc) return rc; @@ -2251,8 +2251,8 @@ static u32 ptrace_parent_sid(void) } static int check_nnp_nosuid(const struct linux_binprm *bprm, - const struct cred_security_struct *old_tsec, - const struct cred_security_struct *new_tsec) + const struct cred_security_struct *old_crsec, + const struct cred_security_struct *new_crsec) { int nnp = (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS); int nosuid = !mnt_may_suid(bprm->file->f_path.mnt); @@ -2262,7 +2262,7 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm, if (!nnp && !nosuid) return 0; /* neither NNP nor nosuid */ - if (new_tsec->sid == old_tsec->sid) + if (new_crsec->sid == old_crsec->sid) return 0; /* No change in credentials */ /* @@ -2277,7 +2277,7 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm, av |= PROCESS2__NNP_TRANSITION; if (nosuid) av |= PROCESS2__NOSUID_TRANSITION; - rc = avc_has_perm(old_tsec->sid, new_tsec->sid, + rc = avc_has_perm(old_crsec->sid, new_crsec->sid, SECCLASS_PROCESS2, av, NULL); if (!rc) return 0; @@ -2288,8 +2288,8 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm, * i.e. SIDs that are guaranteed to only be allowed a subset * of the permissions of the current SID. 
*/ - rc = security_bounded_transition(old_tsec->sid, - new_tsec->sid); + rc = security_bounded_transition(old_crsec->sid, + new_crsec->sid); if (!rc) return 0; @@ -2305,8 +2305,8 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm, static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) { - const struct cred_security_struct *old_tsec; - struct cred_security_struct *new_tsec; + const struct cred_security_struct *old_crsec; + struct cred_security_struct *new_crsec; struct inode_security_struct *isec; struct common_audit_data ad; struct inode *inode = file_inode(bprm->file); @@ -2315,18 +2315,18 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) /* SELinux context only depends on initial program or script and not * the script interpreter */ - old_tsec = selinux_cred(current_cred()); - new_tsec = selinux_cred(bprm->cred); + old_crsec = selinux_cred(current_cred()); + new_crsec = selinux_cred(bprm->cred); isec = inode_security(inode); /* Default to the current task SID. */ - new_tsec->sid = old_tsec->sid; - new_tsec->osid = old_tsec->sid; + new_crsec->sid = old_crsec->sid; + new_crsec->osid = old_crsec->sid; /* Reset fs, key, and sock SIDs on execve. */ - new_tsec->create_sid = 0; - new_tsec->keycreate_sid = 0; - new_tsec->sockcreate_sid = 0; + new_crsec->create_sid = 0; + new_crsec->keycreate_sid = 0; + new_crsec->sockcreate_sid = 0; /* * Before policy is loaded, label any task outside kernel space @@ -2335,26 +2335,26 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) * (if the policy chooses to set SECINITSID_INIT != SECINITSID_KERNEL). */ if (!selinux_initialized()) { - new_tsec->sid = SECINITSID_INIT; + new_crsec->sid = SECINITSID_INIT; /* also clear the exec_sid just in case */ - new_tsec->exec_sid = 0; + new_crsec->exec_sid = 0; return 0; } - if (old_tsec->exec_sid) { - new_tsec->sid = old_tsec->exec_sid; + if (old_crsec->exec_sid) { + new_crsec->sid = old_crsec->exec_sid; /* Reset exec SID on execve. 
*/ - new_tsec->exec_sid = 0; + new_crsec->exec_sid = 0; /* Fail on NNP or nosuid if not an allowed transition. */ - rc = check_nnp_nosuid(bprm, old_tsec, new_tsec); + rc = check_nnp_nosuid(bprm, old_crsec, new_crsec); if (rc) return rc; } else { /* Check for a default transition on this program. */ - rc = security_transition_sid(old_tsec->sid, + rc = security_transition_sid(old_crsec->sid, isec->sid, SECCLASS_PROCESS, NULL, - &new_tsec->sid); + &new_crsec->sid); if (rc) return rc; @@ -2362,34 +2362,34 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) * Fallback to old SID on NNP or nosuid if not an allowed * transition. */ - rc = check_nnp_nosuid(bprm, old_tsec, new_tsec); + rc = check_nnp_nosuid(bprm, old_crsec, new_crsec); if (rc) - new_tsec->sid = old_tsec->sid; + new_crsec->sid = old_crsec->sid; } ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = bprm->file; - if (new_tsec->sid == old_tsec->sid) { - rc = avc_has_perm(old_tsec->sid, isec->sid, + if (new_crsec->sid == old_crsec->sid) { + rc = avc_has_perm(old_crsec->sid, isec->sid, SECCLASS_FILE, FILE__EXECUTE_NO_TRANS, &ad); if (rc) return rc; } else { /* Check permissions for the transition. 
*/ - rc = avc_has_perm(old_tsec->sid, new_tsec->sid, + rc = avc_has_perm(old_crsec->sid, new_crsec->sid, SECCLASS_PROCESS, PROCESS__TRANSITION, &ad); if (rc) return rc; - rc = avc_has_perm(new_tsec->sid, isec->sid, + rc = avc_has_perm(new_crsec->sid, isec->sid, SECCLASS_FILE, FILE__ENTRYPOINT, &ad); if (rc) return rc; /* Check for shared state */ if (bprm->unsafe & LSM_UNSAFE_SHARE) { - rc = avc_has_perm(old_tsec->sid, new_tsec->sid, + rc = avc_has_perm(old_crsec->sid, new_crsec->sid, SECCLASS_PROCESS, PROCESS__SHARE, NULL); if (rc) @@ -2401,7 +2401,7 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) if (bprm->unsafe & LSM_UNSAFE_PTRACE) { u32 ptsid = ptrace_parent_sid(); if (ptsid != 0) { - rc = avc_has_perm(ptsid, new_tsec->sid, + rc = avc_has_perm(ptsid, new_crsec->sid, SECCLASS_PROCESS, PROCESS__PTRACE, NULL); if (rc) @@ -2415,7 +2415,7 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) /* Enable secure mode for SIDs transitions unless the noatsecure permission is granted between the two SIDs, i.e. ahp returns 0. */ - rc = avc_has_perm(old_tsec->sid, new_tsec->sid, + rc = avc_has_perm(old_crsec->sid, new_crsec->sid, SECCLASS_PROCESS, PROCESS__NOATSECURE, NULL); bprm->secureexec |= !!rc; @@ -2483,12 +2483,12 @@ static inline void flush_unauthorized_files(const struct cred *cred, */ static void selinux_bprm_committing_creds(const struct linux_binprm *bprm) { - struct cred_security_struct *new_tsec; + struct cred_security_struct *new_crsec; struct rlimit *rlim, *initrlim; int rc, i; - new_tsec = selinux_cred(bprm->cred); - if (new_tsec->sid == new_tsec->osid) + new_crsec = selinux_cred(bprm->cred); + if (new_crsec->sid == new_crsec->osid) return; /* Close files for which the new task SID is not authorized. */ @@ -2507,7 +2507,7 @@ static void selinux_bprm_committing_creds(const struct linux_binprm *bprm) * higher than the default soft limit for cases where the default is * lower than the hard limit, e.g. 
RLIMIT_CORE or RLIMIT_STACK. */ - rc = avc_has_perm(new_tsec->osid, new_tsec->sid, SECCLASS_PROCESS, + rc = avc_has_perm(new_crsec->osid, new_crsec->sid, SECCLASS_PROCESS, PROCESS__RLIMITINH, NULL); if (rc) { /* protect against do_prlimit() */ @@ -2529,12 +2529,12 @@ static void selinux_bprm_committing_creds(const struct linux_binprm *bprm) */ static void selinux_bprm_committed_creds(const struct linux_binprm *bprm) { - const struct cred_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *crsec = selinux_cred(current_cred()); u32 osid, sid; int rc; - osid = tsec->osid; - sid = tsec->sid; + osid = crsec->osid; + sid = crsec->sid; if (sid == osid) return; @@ -2911,7 +2911,7 @@ static int selinux_dentry_create_files_as(struct dentry *dentry, int mode, { u32 newsid; int rc; - struct cred_security_struct *tsec; + struct cred_security_struct *crsec; rc = selinux_determine_inode_label(selinux_cred(old), d_inode(dentry->d_parent), name, @@ -2920,8 +2920,8 @@ static int selinux_dentry_create_files_as(struct dentry *dentry, int mode, if (rc) return rc; - tsec = selinux_cred(new); - tsec->create_sid = newsid; + crsec = selinux_cred(new); + crsec->create_sid = newsid; return 0; } @@ -2929,7 +2929,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, struct xattr *xattrs, int *xattr_count) { - const struct cred_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *crsec = selinux_cred(current_cred()); struct superblock_security_struct *sbsec; struct xattr *xattr = lsm_get_xattr_slot(xattrs, xattr_count); u32 newsid, clen; @@ -2939,9 +2939,9 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, sbsec = selinux_superblock(dir->i_sb); - newsid = tsec->create_sid; + newsid = crsec->create_sid; newsclass = inode_mode_to_security_class(inode->i_mode); - rc = selinux_determine_inode_label(tsec, dir, qstr, newsclass, &newsid); + 
rc = selinux_determine_inode_label(crsec, dir, qstr, newsclass, &newsid); if (rc) return rc; @@ -3660,7 +3660,7 @@ static void selinux_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop) static int selinux_inode_copy_up(struct dentry *src, struct cred **new) { struct lsm_prop prop; - struct cred_security_struct *tsec; + struct cred_security_struct *crsec; struct cred *new_creds = *new; if (new_creds == NULL) { @@ -3669,10 +3669,10 @@ static int selinux_inode_copy_up(struct dentry *src, struct cred **new) return -ENOMEM; } - tsec = selinux_cred(new_creds); + crsec = selinux_cred(new_creds); /* Get label from overlay inode and set it in create_sid */ selinux_inode_getlsmprop(d_inode(src), &prop); - tsec->create_sid = prop.selinux.secid; + crsec->create_sid = prop.selinux.secid; *new = new_creds; return 0; } @@ -3698,7 +3698,7 @@ static int selinux_inode_copy_up_xattr(struct dentry *dentry, const char *name) static int selinux_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn) { - const struct cred_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *crsec = selinux_cred(current_cred()); u32 parent_sid, newsid, clen; int rc; char *context; @@ -3726,8 +3726,8 @@ static int selinux_kernfs_init_security(struct kernfs_node *kn_dir, if (rc) return rc; - if (tsec->create_sid) { - newsid = tsec->create_sid; + if (crsec->create_sid) { + newsid = crsec->create_sid; } else { u16 secclass = inode_mode_to_security_class(kn->mode); const char *kn_name; @@ -3738,7 +3738,7 @@ static int selinux_kernfs_init_security(struct kernfs_node *kn_dir, q.name = kn_name; q.hash_len = hashlen_string(kn_dir, kn_name); - rc = security_transition_sid(tsec->sid, + rc = security_transition_sid(crsec->sid, parent_sid, secclass, &q, &newsid); if (rc) @@ -4165,10 +4165,10 @@ static int selinux_task_alloc(struct task_struct *task, static int selinux_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { - const struct 
cred_security_struct *old_tsec = selinux_cred(old); - struct cred_security_struct *tsec = selinux_cred(new); + const struct cred_security_struct *old_crsec = selinux_cred(old); + struct cred_security_struct *crsec = selinux_cred(new); - *tsec = *old_tsec; + *crsec = *old_crsec; return 0; } @@ -4177,10 +4177,10 @@ static int selinux_cred_prepare(struct cred *new, const struct cred *old, */ static void selinux_cred_transfer(struct cred *new, const struct cred *old) { - const struct cred_security_struct *old_tsec = selinux_cred(old); - struct cred_security_struct *tsec = selinux_cred(new); + const struct cred_security_struct *old_crsec = selinux_cred(old); + struct cred_security_struct *crsec = selinux_cred(new); - *tsec = *old_tsec; + *crsec = *old_crsec; } static void selinux_cred_getsecid(const struct cred *c, u32 *secid) @@ -4199,7 +4199,7 @@ static void selinux_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop) */ static int selinux_kernel_act_as(struct cred *new, u32 secid) { - struct cred_security_struct *tsec = selinux_cred(new); + struct cred_security_struct *crsec = selinux_cred(new); u32 sid = current_sid(); int ret; @@ -4208,10 +4208,10 @@ static int selinux_kernel_act_as(struct cred *new, u32 secid) KERNEL_SERVICE__USE_AS_OVERRIDE, NULL); if (ret == 0) { - tsec->sid = secid; - tsec->create_sid = 0; - tsec->keycreate_sid = 0; - tsec->sockcreate_sid = 0; + crsec->sid = secid; + crsec->create_sid = 0; + crsec->keycreate_sid = 0; + crsec->sockcreate_sid = 0; } return ret; } @@ -4223,7 +4223,7 @@ static int selinux_kernel_act_as(struct cred *new, u32 secid) static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode) { struct inode_security_struct *isec = inode_security(inode); - struct cred_security_struct *tsec = selinux_cred(new); + struct cred_security_struct *crsec = selinux_cred(new); u32 sid = current_sid(); int ret; @@ -4233,7 +4233,7 @@ static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode) 
NULL); if (ret == 0) - tsec->create_sid = isec->sid; + crsec->create_sid = isec->sid; return ret; } @@ -4748,15 +4748,15 @@ static int selinux_conn_sid(u32 sk_sid, u32 skb_sid, u32 *conn_sid) /* socket security operations */ -static int socket_sockcreate_sid(const struct cred_security_struct *tsec, +static int socket_sockcreate_sid(const struct cred_security_struct *crsec, u16 secclass, u32 *socksid) { - if (tsec->sockcreate_sid > SECSID_NULL) { - *socksid = tsec->sockcreate_sid; + if (crsec->sockcreate_sid > SECSID_NULL) { + *socksid = crsec->sockcreate_sid; return 0; } - return security_transition_sid(tsec->sid, tsec->sid, + return security_transition_sid(crsec->sid, crsec->sid, secclass, NULL, socksid); } @@ -4801,7 +4801,7 @@ static int sock_has_perm(struct sock *sk, u32 perms) static int selinux_socket_create(int family, int type, int protocol, int kern) { - const struct cred_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *crsec = selinux_cred(current_cred()); u32 newsid; u16 secclass; int rc; @@ -4810,17 +4810,17 @@ static int selinux_socket_create(int family, int type, return 0; secclass = socket_type_to_security_class(family, type, protocol); - rc = socket_sockcreate_sid(tsec, secclass, &newsid); + rc = socket_sockcreate_sid(crsec, secclass, &newsid); if (rc) return rc; - return avc_has_perm(tsec->sid, newsid, secclass, SOCKET__CREATE, NULL); + return avc_has_perm(crsec->sid, newsid, secclass, SOCKET__CREATE, NULL); } static int selinux_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { - const struct cred_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *crsec = selinux_cred(current_cred()); struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(sock)); struct sk_security_struct *sksec; u16 sclass = socket_type_to_security_class(family, type, protocol); @@ -4828,7 +4828,7 @@ static int selinux_socket_post_create(struct 
socket *sock, int family, int err = 0; if (!kern) { - err = socket_sockcreate_sid(tsec, sclass, &sid); + err = socket_sockcreate_sid(crsec, sclass, &sid); if (err) return err; } @@ -6530,37 +6530,37 @@ static void selinux_d_instantiate(struct dentry *dentry, struct inode *inode) static int selinux_lsm_getattr(unsigned int attr, struct task_struct *p, char **value) { - const struct cred_security_struct *tsec; + const struct cred_security_struct *crsec; int error; u32 sid; u32 len; rcu_read_lock(); - tsec = selinux_cred(__task_cred(p)); + crsec = selinux_cred(__task_cred(p)); if (p != current) { - error = avc_has_perm(current_sid(), tsec->sid, + error = avc_has_perm(current_sid(), crsec->sid, SECCLASS_PROCESS, PROCESS__GETATTR, NULL); if (error) goto err_unlock; } switch (attr) { case LSM_ATTR_CURRENT: - sid = tsec->sid; + sid = crsec->sid; break; case LSM_ATTR_PREV: - sid = tsec->osid; + sid = crsec->osid; break; case LSM_ATTR_EXEC: - sid = tsec->exec_sid; + sid = crsec->exec_sid; break; case LSM_ATTR_FSCREATE: - sid = tsec->create_sid; + sid = crsec->create_sid; break; case LSM_ATTR_KEYCREATE: - sid = tsec->keycreate_sid; + sid = crsec->keycreate_sid; break; case LSM_ATTR_SOCKCREATE: - sid = tsec->sockcreate_sid; + sid = crsec->sockcreate_sid; break; default: error = -EOPNOTSUPP; @@ -6585,7 +6585,7 @@ static int selinux_lsm_getattr(unsigned int attr, struct task_struct *p, static int selinux_lsm_setattr(u64 attr, void *value, size_t size) { - struct cred_security_struct *tsec; + struct cred_security_struct *crsec; struct cred *new; u32 mysid = current_sid(), sid = 0, ptsid; int error; @@ -6671,11 +6671,11 @@ static int selinux_lsm_setattr(u64 attr, void *value, size_t size) operation. See selinux_bprm_creds_for_exec for the execve checks and may_create for the file creation checks. The operation will then fail if the context is not permitted. 
*/ - tsec = selinux_cred(new); + crsec = selinux_cred(new); if (attr == LSM_ATTR_EXEC) { - tsec->exec_sid = sid; + crsec->exec_sid = sid; } else if (attr == LSM_ATTR_FSCREATE) { - tsec->create_sid = sid; + crsec->create_sid = sid; } else if (attr == LSM_ATTR_KEYCREATE) { if (sid) { error = avc_has_perm(mysid, sid, @@ -6683,22 +6683,22 @@ static int selinux_lsm_setattr(u64 attr, void *value, size_t size) if (error) goto abort_change; } - tsec->keycreate_sid = sid; + crsec->keycreate_sid = sid; } else if (attr == LSM_ATTR_SOCKCREATE) { - tsec->sockcreate_sid = sid; + crsec->sockcreate_sid = sid; } else if (attr == LSM_ATTR_CURRENT) { error = -EINVAL; if (sid == 0) goto abort_change; if (!current_is_single_threaded()) { - error = security_bounded_transition(tsec->sid, sid); + error = security_bounded_transition(crsec->sid, sid); if (error) goto abort_change; } /* Check permissions for the transition. */ - error = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS, + error = avc_has_perm(crsec->sid, sid, SECCLASS_PROCESS, PROCESS__DYNTRANSITION, NULL); if (error) goto abort_change; @@ -6713,7 +6713,7 @@ static int selinux_lsm_setattr(u64 attr, void *value, size_t size) goto abort_change; } - tsec->sid = sid; + crsec->sid = sid; } else { error = -EINVAL; goto abort_change; @@ -6880,14 +6880,14 @@ static int selinux_inode_getsecctx(struct inode *inode, struct lsm_context *cp) static int selinux_key_alloc(struct key *k, const struct cred *cred, unsigned long flags) { - const struct cred_security_struct *tsec; + const struct cred_security_struct *crsec; struct key_security_struct *ksec = selinux_key(k); - tsec = selinux_cred(cred); - if (tsec->keycreate_sid) - ksec->sid = tsec->keycreate_sid; + crsec = selinux_cred(cred); + if (crsec->keycreate_sid) + ksec->sid = crsec->keycreate_sid; else - ksec->sid = tsec->sid; + ksec->sid = crsec->sid; return 0; } diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 00804562c2c3..8fc3de5234ac 100644 --- 
a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -217,9 +217,9 @@ selinux_ipc(const struct kern_ipc_perm *ipc) */ static inline u32 current_sid(void) { - const struct cred_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *crsec = selinux_cred(current_cred()); - return tsec->sid; + return crsec->sid; } static inline struct superblock_security_struct * From 9f048fa487409e364cf866c957cf0b0d782ca5a3 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Thu, 13 Nov 2025 05:21:10 +0000 Subject: [PATCH 527/543] MIPS: mm: Prevent a TLB shutdown on initial uniquification Depending on the particular CPU implementation a TLB shutdown may occur if multiple matching entries are detected upon the execution of a TLBP or the TLBWI/TLBWR instructions. Given that we don't know what entries we have been handed we need to be very careful with the initial TLB setup and avoid all these instructions. Therefore read all the TLB entries one by one with the TLBR instruction, bypassing the content addressing logic, and truncate any large pages in place so as to avoid a case in the second step where an incoming entry for a large page at a lower address overlaps with a replacement entry chosen at another index. Then preinitialize the TLB using addresses outside our usual unique range and avoiding clashes with any entries received, before making the usual call to local_flush_tlb_all(). This fixes (at least) R4x00 cores if TLBP hits multiple matching TLB entries (SGI IP22 PROM for example sets up all TLBs to the same virtual address). Signed-off-by: Maciej W. 
Rozycki Fixes: 35ad7e181541 ("MIPS: mm: tlb-r4k: Uniquify TLB entries on init") Cc: stable@vger.kernel.org Reviewed-by: Jiaxun Yang Tested-by: Jiaxun Yang # Boston I6400, M5150 sim Signed-off-by: Thomas Bogendoerfer --- arch/mips/mm/tlb-r4k.c | 102 ++++++++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 38 deletions(-) diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c index 347126dc010d..3facf7cc6c7d 100644 --- a/arch/mips/mm/tlb-r4k.c +++ b/arch/mips/mm/tlb-r4k.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -508,54 +509,78 @@ static int __init set_ntlb(char *str) __setup("ntlb=", set_ntlb); -/* Initialise all TLB entries with unique values */ + +/* Comparison function for EntryHi VPN fields. */ +static int r4k_vpn_cmp(const void *a, const void *b) +{ + long v = *(unsigned long *)a - *(unsigned long *)b; + int s = sizeof(long) > sizeof(int) ? sizeof(long) * 8 - 1: 0; + return s ? (v != 0) | v >> s : v; +} + +/* + * Initialise all TLB entries with unique values that do not clash with + * what we have been handed over and what we'll be using ourselves. + */ static void r4k_tlb_uniquify(void) { - int entry = num_wired_entries(); + unsigned long tlb_vpns[1 << MIPS_CONF1_TLBS_SIZE]; + int tlbsize = current_cpu_data.tlbsize; + int start = num_wired_entries(); + unsigned long vpn_mask; + int cnt, ent, idx, i; + + vpn_mask = GENMASK(cpu_vmbits - 1, 13); + vpn_mask |= IS_ENABLED(CONFIG_64BIT) ? 3ULL << 62 : 1 << 31; htw_stop(); + + for (i = start, cnt = 0; i < tlbsize; i++, cnt++) { + unsigned long vpn; + + write_c0_index(i); + mtc0_tlbr_hazard(); + tlb_read(); + tlb_read_hazard(); + vpn = read_c0_entryhi(); + vpn &= vpn_mask & PAGE_MASK; + tlb_vpns[cnt] = vpn; + + /* Prevent any large pages from overlapping regular ones. 
*/ + write_c0_pagemask(read_c0_pagemask() & PM_DEFAULT_MASK); + mtc0_tlbw_hazard(); + tlb_write_indexed(); + tlbw_use_hazard(); + } + + sort(tlb_vpns, cnt, sizeof(tlb_vpns[0]), r4k_vpn_cmp, NULL); + + write_c0_pagemask(PM_DEFAULT_MASK); write_c0_entrylo0(0); write_c0_entrylo1(0); - while (entry < current_cpu_data.tlbsize) { - unsigned long asid_mask = cpu_asid_mask(&current_cpu_data); - unsigned long asid = 0; - int idx; + idx = 0; + ent = tlbsize; + for (i = start; i < tlbsize; i++) + while (1) { + unsigned long entryhi, vpn; - /* Skip wired MMID to make ginvt_mmid work */ - if (cpu_has_mmid) - asid = MMID_KERNEL_WIRED + 1; + entryhi = UNIQUE_ENTRYHI(ent); + vpn = entryhi & vpn_mask & PAGE_MASK; - /* Check for match before using UNIQUE_ENTRYHI */ - do { - if (cpu_has_mmid) { - write_c0_memorymapid(asid); - write_c0_entryhi(UNIQUE_ENTRYHI(entry)); - } else { - write_c0_entryhi(UNIQUE_ENTRYHI(entry) | asid); - } - mtc0_tlbw_hazard(); - tlb_probe(); - tlb_probe_hazard(); - idx = read_c0_index(); - /* No match or match is on current entry */ - if (idx < 0 || idx == entry) + if (idx >= cnt || vpn < tlb_vpns[idx]) { + write_c0_entryhi(entryhi); + write_c0_index(i); + mtc0_tlbw_hazard(); + tlb_write_indexed(); + ent++; break; - /* - * If we hit a match, we need to try again with - * a different ASID. - */ - asid++; - } while (asid < asid_mask); - - if (idx >= 0 && idx != entry) - panic("Unable to uniquify TLB entry %d", idx); - - write_c0_index(entry); - mtc0_tlbw_hazard(); - tlb_write_indexed(); - entry++; - } + } else if (vpn == tlb_vpns[idx]) { + ent++; + } else { + idx++; + } + } tlbw_use_hazard(); htw_start(); @@ -602,6 +627,7 @@ static void r4k_tlb_configure(void) /* From this point on the ARC firmware is dead. */ r4k_tlb_uniquify(); + local_flush_tlb_all(); /* Did I tell you that ARC SUCKS? 
*/ } From 14b46ba92bf547508b4a49370c99aba76cb53b53 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Thu, 20 Nov 2025 13:10:29 +0100 Subject: [PATCH 528/543] MIPS: kernel: Fix random segmentation faults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 69896119dc9d ("MIPS: vdso: Switch to generic storage implementation") switches to a generic vdso storage, which increases the number of data pages from 1 to 4. But there is only one page reserved, which causes segmentation faults depending where the VDSO area is randomized to. To fix this use the same size of reservation and allocation of the VDSO data pages. Fixes: 69896119dc9d ("MIPS: vdso: Switch to generic storage implementation") Reviewed-by: Thomas Weißschuh Reviewed-by: Huacai Chen Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 29191fa1801e..a3101f2268c6 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -692,7 +692,7 @@ unsigned long mips_stack_top(void) /* Space for the VDSO, data page & GIC user page */ if (current->thread.abi) { top -= PAGE_ALIGN(current->thread.abi->vdso->size); - top -= PAGE_SIZE; + top -= VDSO_NR_PAGES * PAGE_SIZE; top -= mips_gic_present() ? PAGE_SIZE : 0; /* Space to randomize the VDSO base */ From a48f822908982353c3256e35a089e9e7d0d61580 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 21 Nov 2025 09:29:02 -0800 Subject: [PATCH 529/543] samples: work around glibc redefining some of our defines wrong Apparently as of version 2.42, glibc headers define AT_RENAME_NOREPLACE and some of the other flags for renameat2() and friends in <stdio.h>. 
Which would all be fine, except for inexplicable reasons glibc decided to define them _differently_ from the kernel definitions, which then makes some of our sample code that includes both kernel headers and user space headers unhappy, because the compiler will (correctly) complain about redefining things. Now, mixing kernel headers and user space headers is always a somewhat iffy proposition due to namespacing issues, but it's kind of inevitable in our sample and selftest code. And this is just glibc being stupid. Those defines come from the kernel, glibc is exposing the kernel interfaces, and glibc shouldn't make up some random new expressions for these values. It's not like glibc headers changed the actual result values, but they arbitrarily just decided to use a different expression to describe those values. The kernel just does #define AT_RENAME_NOREPLACE 0x0001 while glibc does # define RENAME_NOREPLACE (1 << 0) # define AT_RENAME_NOREPLACE RENAME_NOREPLACE instead. Same value in the end, but very different macro definition. For absolutely no reason. This has since been fixed in the glibc development tree, so eventually we'll end up with the canonical expressions and no clashes. But in the meantime the broken headers are in the glibc-2.42 release and have made it out into distributions. Do a minimal work-around to make the samples build cleanly by just undefining the affected macros in between the user space header include and the kernel header includes. 
Signed-off-by: Linus Torvalds --- samples/vfs/test-statx.c | 6 ++++++ samples/watch_queue/watch_test.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/samples/vfs/test-statx.c b/samples/vfs/test-statx.c index 49c7a46cee07..424a6fa15723 100644 --- a/samples/vfs/test-statx.c +++ b/samples/vfs/test-statx.c @@ -19,6 +19,12 @@ #include #include #include + +// Work around glibc header silliness +#undef AT_RENAME_NOREPLACE +#undef AT_RENAME_EXCHANGE +#undef AT_RENAME_WHITEOUT + #include #include #define statx foo diff --git a/samples/watch_queue/watch_test.c b/samples/watch_queue/watch_test.c index 8c6cb57d5cfc..24cf7d7a1972 100644 --- a/samples/watch_queue/watch_test.c +++ b/samples/watch_queue/watch_test.c @@ -16,6 +16,12 @@ #include #include #include + +// Work around glibc header silliness +#undef AT_RENAME_NOREPLACE +#undef AT_RENAME_EXCHANGE +#undef AT_RENAME_WHITEOUT + #include #include #include From 141fbbecec0e71fa6b35d08c7d3dba2f9853a4ee Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 20 Nov 2025 19:34:31 -0800 Subject: [PATCH 530/543] lib/crypto: tests: Fix KMSAN warning in test_sha256_finup_2x() Fully initialize *ctx, including the buf field which sha256_init() doesn't initialize, to avoid a KMSAN warning when comparing *ctx to orig_ctx. This KMSAN warning slipped in while KMSAN was not working reliably due to a stackdepot bug, which has now been fixed. 
Fixes: 6733968be7cb ("lib/crypto: tests: Add tests and benchmark for sha256_finup_2x()") Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251121033431.34406-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/tests/sha256_kunit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/crypto/tests/sha256_kunit.c b/lib/crypto/tests/sha256_kunit.c index dcedfca06df6..5dccdee79693 100644 --- a/lib/crypto/tests/sha256_kunit.c +++ b/lib/crypto/tests/sha256_kunit.c @@ -68,6 +68,7 @@ static void test_sha256_finup_2x(struct kunit *test) rand_bytes(data1_buf, max_data_len); rand_bytes(data2_buf, max_data_len); rand_bytes(salt, sizeof(salt)); + memset(ctx, 0, sizeof(*ctx)); for (size_t i = 0; i < 500; i++) { size_t salt_len = rand_length(sizeof(salt)); From 20739af07383e6eb1ec59dcd70b72ebfa9ac362c Mon Sep 17 00:00:00 2001 From: Yipeng Zou Date: Sat, 22 Nov 2025 09:39:42 +0000 Subject: [PATCH 531/543] timers: Fix NULL function pointer race in timer_shutdown_sync() There is a race condition between timer_shutdown_sync() and timer expiration that can lead to hitting a WARN_ON in expire_timers(). The issue occurs when timer_shutdown_sync() clears the timer function to NULL while the timer is still running on another CPU. The race scenario looks like this: CPU0 CPU1 lock_timer_base() expire_timers() base->running_timer = timer; unlock_timer_base() [call_timer_fn enter] mod_timer() ... timer_shutdown_sync() lock_timer_base() // For now, will not detach the timer but only clear its function to NULL if (base->running_timer != timer) ret = detach_if_pending(timer, base, true); if (shutdown) timer->function = NULL; unlock_timer_base() [call_timer_fn exit] lock_timer_base() base->running_timer = NULL; unlock_timer_base() ... // Now timer is pending while its function set to NULL. // next timer trigger expire_timers() WARN_ON_ONCE(!fn) // hit ... 
lock_timer_base() // Now timer will detach if (base->running_timer != timer) ret = detach_if_pending(timer, base, true); if (shutdown) timer->function = NULL; unlock_timer_base() The problem is that timer_shutdown_sync() clears the timer function regardless of whether the timer is currently running. This can leave a pending timer with a NULL function pointer, which triggers the WARN_ON_ONCE(!fn) check in expire_timers(). Fix this by only clearing the timer function when actually detaching the timer. If the timer is running, leave the function pointer intact, which is safe because the timer will be properly detached when it finishes running. Fixes: 0cc04e80458a ("timers: Add shutdown mechanism to the internal functions") Signed-off-by: Yipeng Zou Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20251122093942.301559-1-zouyipeng@huawei.com --- kernel/time/timer.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 553fa469d7cc..d5ebb1d927ea 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1458,10 +1458,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) base = lock_timer_base(timer, &flags); - if (base->running_timer != timer) + if (base->running_timer != timer) { ret = detach_if_pending(timer, base, true); - if (shutdown) - timer->function = NULL; + if (shutdown) + timer->function = NULL; + } raw_spin_unlock_irqrestore(&base->lock, flags); From ac3fd01e4c1efce8f2c054cdeb2ddd2fc0fb150d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 23 Nov 2025 14:53:16 -0800 Subject: [PATCH 532/543] Linux 6.18-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d763c2c75cdb..d208066bcbb6 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 18 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Baby Opossum Posse # *DOCUMENTATION* 
From aead5ae91e4cbadac817d15737eca3b531237448 Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Wed, 19 Nov 2025 18:14:22 +0200 Subject: [PATCH 533/543] spi: rzv2h-rspi: make resets optional The Renesas RZ/T2H (R9A09G077) and RZ/N2H (R9A09G087) SoCs don't have reset lines for the SPI peripheral, make them optional to prepare for adding support for them. Signed-off-by: Cosmin Tanislav Link: https://patch.msgid.link/20251119161434.595677-2-cosmin-gabriel.tanislav.xa@renesas.com Signed-off-by: Mark Brown --- drivers/spi/spi-rzv2h-rspi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-rzv2h-rspi.c b/drivers/spi/spi-rzv2h-rspi.c index dcc431ba60a9..09b9362e9b1f 100644 --- a/drivers/spi/spi-rzv2h-rspi.c +++ b/drivers/spi/spi-rzv2h-rspi.c @@ -384,8 +384,8 @@ static int rzv2h_rspi_probe(struct platform_device *pdev) rspi->resets[0].id = "presetn"; rspi->resets[1].id = "tresetn"; - ret = devm_reset_control_bulk_get_exclusive(dev, RSPI_RESET_NUM, - rspi->resets); + ret = devm_reset_control_bulk_get_optional_exclusive(dev, RSPI_RESET_NUM, + rspi->resets); if (ret) return dev_err_probe(dev, ret, "cannot get resets\n"); From 8e89ee6cd2b928a8431bef61e8b851ce5df1ecb0 Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Wed, 19 Nov 2025 18:14:23 +0200 Subject: [PATCH 534/543] spi: rzv2h-rspi: make FIFO size chip-specific The Renesas RZ/T2H (R9A09G077) and RZ/N2H (R9A09G087) SoCs have a different FIFO size compared to RZ/V2H. Add a chip-specific structure, and set the FIFO size inside it, to prepare for adding support for them. 
Signed-off-by: Cosmin Tanislav Link: https://patch.msgid.link/20251119161434.595677-3-cosmin-gabriel.tanislav.xa@renesas.com Signed-off-by: Mark Brown --- drivers/spi/spi-rzv2h-rspi.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/spi/spi-rzv2h-rspi.c b/drivers/spi/spi-rzv2h-rspi.c index 09b9362e9b1f..7a7a576c17dd 100644 --- a/drivers/spi/spi-rzv2h-rspi.c +++ b/drivers/spi/spi-rzv2h-rspi.c @@ -58,7 +58,6 @@ /* Register SPDCR2 */ #define RSPI_SPDCR2_TTRG GENMASK(11, 8) #define RSPI_SPDCR2_RTRG GENMASK(3, 0) -#define RSPI_FIFO_SIZE 16 /* Register SPSR */ #define RSPI_SPSR_SPRF BIT(15) @@ -69,9 +68,14 @@ #define RSPI_RESET_NUM 2 #define RSPI_CLK_NUM 3 +struct rzv2h_rspi_info { + unsigned int fifo_size; +}; + struct rzv2h_rspi_priv { struct reset_control_bulk_data resets[RSPI_RESET_NUM]; struct spi_controller *controller; + const struct rzv2h_rspi_info *info; void __iomem *base; struct clk *tclk; wait_queue_head_t wait; @@ -305,7 +309,7 @@ static int rzv2h_rspi_prepare_message(struct spi_controller *ctlr, writeb(0, rspi->base + RSPI_SSLP); /* Setup FIFO thresholds */ - conf16 = FIELD_PREP(RSPI_SPDCR2_TTRG, RSPI_FIFO_SIZE - 1); + conf16 = FIELD_PREP(RSPI_SPDCR2_TTRG, rspi->info->fifo_size - 1); conf16 |= FIELD_PREP(RSPI_SPDCR2_RTRG, 0); writew(conf16, rspi->base + RSPI_SPDCR2); @@ -362,6 +366,8 @@ static int rzv2h_rspi_probe(struct platform_device *pdev) rspi->controller = controller; + rspi->info = device_get_match_data(dev); + rspi->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(rspi->base)) return PTR_ERR(rspi->base); @@ -445,8 +451,12 @@ static void rzv2h_rspi_remove(struct platform_device *pdev) reset_control_bulk_assert(RSPI_RESET_NUM, rspi->resets); } +static const struct rzv2h_rspi_info rzv2h_info = { + .fifo_size = 16, +}; + static const struct of_device_id rzv2h_rspi_match[] = { - { .compatible = "renesas,r9a09g057-rspi" }, + { .compatible = "renesas,r9a09g057-rspi", &rzv2h_info }, { /* sentinel */ } }; 
MODULE_DEVICE_TABLE(of, rzv2h_rspi_match); From ebd7d6ae0dc7d65e21460c928519f40ccf95f3b9 Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Wed, 19 Nov 2025 18:14:24 +0200 Subject: [PATCH 535/543] spi: rzv2h-rspi: make clocks chip-specific The Renesas RZ/T2H (R9A09G077) and RZ/N2H (R9A09G087) SoCs have different clocks compared to RZ/V2H. Set the number of clocks and the name of the transfer clock in the chip-specific structure to prepare for adding support for them. Signed-off-by: Cosmin Tanislav Link: https://patch.msgid.link/20251119161434.595677-4-cosmin-gabriel.tanislav.xa@renesas.com Signed-off-by: Mark Brown --- drivers/spi/spi-rzv2h-rspi.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-rzv2h-rspi.c b/drivers/spi/spi-rzv2h-rspi.c index 7a7a576c17dd..a1f17ec8727b 100644 --- a/drivers/spi/spi-rzv2h-rspi.c +++ b/drivers/spi/spi-rzv2h-rspi.c @@ -66,10 +66,11 @@ #define RSPI_SPSRC_CLEAR 0xfd80 #define RSPI_RESET_NUM 2 -#define RSPI_CLK_NUM 3 struct rzv2h_rspi_info { + const char *tclk_name; unsigned int fifo_size; + unsigned int num_clks; }; struct rzv2h_rspi_priv { @@ -373,11 +374,11 @@ static int rzv2h_rspi_probe(struct platform_device *pdev) return PTR_ERR(rspi->base); ret = devm_clk_bulk_get_all_enabled(dev, &clks); - if (ret != RSPI_CLK_NUM) + if (ret != rspi->info->num_clks) return dev_err_probe(dev, ret >= 0 ? 
-EINVAL : ret, "cannot get clocks\n"); - for (i = 0; i < RSPI_CLK_NUM; i++) { - if (!strcmp(clks[i].id, "tclk")) { + for (i = 0; i < rspi->info->num_clks; i++) { + if (!strcmp(clks[i].id, rspi->info->tclk_name)) { rspi->tclk = clks[i].clk; break; } @@ -452,7 +453,9 @@ static void rzv2h_rspi_remove(struct platform_device *pdev) } static const struct rzv2h_rspi_info rzv2h_info = { + .tclk_name = "tclk", .fifo_size = 16, + .num_clks = 3, }; static const struct of_device_id rzv2h_rspi_match[] = { From 1b7ce968ab2579702ea9dbc2fb599e540bbd8c88 Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Wed, 19 Nov 2025 18:14:25 +0200 Subject: [PATCH 536/543] spi: rzv2h-rspi: move register writes out of rzv2h_rspi_setup_clock() In preparation for caching the last requested transfer frequency, move register writes outside of rzv2h_rspi_setup_clock(). The transfer list is iterated to determine the speed of the transfer and the bits per word. The speed of the transfer is used to compute SPR and BRDV inside rzv2h_rspi_setup_clock(). BRDV and SPB are stored in the SPCMD register. Move the transfer iteration earlier, move the SPR and BRDV writing out of rzv2h_rspi_setup_clock(), consolidate writing BRDV and SPB into the initial write to the SPCMD register. 
Signed-off-by: Cosmin Tanislav Link: https://patch.msgid.link/20251119161434.595677-5-cosmin-gabriel.tanislav.xa@renesas.com Signed-off-by: Mark Brown --- drivers/spi/spi-rzv2h-rspi.c | 45 ++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/drivers/spi/spi-rzv2h-rspi.c b/drivers/spi/spi-rzv2h-rspi.c index a1f17ec8727b..f02f25b98ec6 100644 --- a/drivers/spi/spi-rzv2h-rspi.c +++ b/drivers/spi/spi-rzv2h-rspi.c @@ -83,6 +83,8 @@ struct rzv2h_rspi_priv { unsigned int bytes_per_word; u32 freq; u16 status; + u8 spr; + u8 brdv; }; #define RZV2H_RSPI_TX(func, type) \ @@ -263,8 +265,8 @@ static u32 rzv2h_rspi_setup_clock(struct rzv2h_rspi_priv *rspi, u32 hz) return 0; clock_found: - rzv2h_rspi_reg_rmw(rspi, RSPI_SPCMD, RSPI_SPCMD_BRDV, brdv); - writeb(spr, rspi->base + RSPI_SPBR); + rspi->spr = spr; + rspi->brdv = brdv; return rzv2h_rspi_calc_bitrate(tclk_rate, spr, brdv); } @@ -283,6 +285,25 @@ static int rzv2h_rspi_prepare_message(struct spi_controller *ctlr, /* Make sure SPCR.SPE is 0 before amending the configuration */ rzv2h_rspi_spe_disable(rspi); + list_for_each_entry(xfer, &message->transfers, transfer_list) { + if (!xfer->speed_hz) + continue; + + speed_hz = min(xfer->speed_hz, speed_hz); + bits_per_word = xfer->bits_per_word; + } + + if (speed_hz == U32_MAX) + return -EINVAL; + + rspi->bytes_per_word = roundup_pow_of_two(BITS_TO_BYTES(bits_per_word)); + + rspi->freq = rzv2h_rspi_setup_clock(rspi, speed_hz); + if (!rspi->freq) + return -EINVAL; + + writeb(rspi->spr, rspi->base + RSPI_SPBR); + /* Configure the device to work in "host" mode */ conf32 = RSPI_SPCR_MSTR; @@ -301,6 +322,8 @@ static int rzv2h_rspi_prepare_message(struct spi_controller *ctlr, conf32 = FIELD_PREP(RSPI_SPCMD_CPOL, !!(spi->mode & SPI_CPOL)); conf32 |= FIELD_PREP(RSPI_SPCMD_CPHA, !!(spi->mode & SPI_CPHA)); conf32 |= FIELD_PREP(RSPI_SPCMD_LSBF, !!(spi->mode & SPI_LSB_FIRST)); + conf32 |= FIELD_PREP(RSPI_SPCMD_SPB, bits_per_word - 1); + conf32 |= 
FIELD_PREP(RSPI_SPCMD_BRDV, rspi->brdv); conf32 |= FIELD_PREP(RSPI_SPCMD_SSLKP, 1); conf32 |= FIELD_PREP(RSPI_SPCMD_SSLA, spi_get_chipselect(spi, 0)); writel(conf32, rspi->base + RSPI_SPCMD); @@ -316,24 +339,6 @@ static int rzv2h_rspi_prepare_message(struct spi_controller *ctlr, rzv2h_rspi_clear_fifos(rspi); - list_for_each_entry(xfer, &message->transfers, transfer_list) { - if (!xfer->speed_hz) - continue; - - speed_hz = min(xfer->speed_hz, speed_hz); - bits_per_word = xfer->bits_per_word; - } - - if (speed_hz == U32_MAX) - return -EINVAL; - - rspi->bytes_per_word = roundup_pow_of_two(BITS_TO_BYTES(bits_per_word)); - rzv2h_rspi_reg_rmw(rspi, RSPI_SPCMD, RSPI_SPCMD_SPB, bits_per_word - 1); - - rspi->freq = rzv2h_rspi_setup_clock(rspi, speed_hz); - if (!rspi->freq) - return -EINVAL; - rzv2h_rspi_spe_enable(rspi); return 0; From 88782493204512fcf4e020e2385bca3e3c5bd4c0 Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Wed, 19 Nov 2025 18:14:26 +0200 Subject: [PATCH 537/543] spi: rzv2h-rspi: avoid recomputing transfer frequency Renesas RZ/T2H (R9A09G077) and RZ/N2H (R9A09G087) SoCs have a more complicated algorithm for calculating the optimal SPI transfer frequency compared to RZ/V2H, as the clock from which the SPI frequency is generated supports multiple dividers. Cache the requested transfer frequency and skip calling rzv2h_rspi_setup_clock() if it matches the last used one to prepare for adding support for variable clock frequency handling. 
Signed-off-by: Cosmin Tanislav Link: https://patch.msgid.link/20251119161434.595677-6-cosmin-gabriel.tanislav.xa@renesas.com Signed-off-by: Mark Brown --- drivers/spi/spi-rzv2h-rspi.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/spi/spi-rzv2h-rspi.c b/drivers/spi/spi-rzv2h-rspi.c index f02f25b98ec6..d7719f3c7b13 100644 --- a/drivers/spi/spi-rzv2h-rspi.c +++ b/drivers/spi/spi-rzv2h-rspi.c @@ -81,6 +81,7 @@ struct rzv2h_rspi_priv { struct clk *tclk; wait_queue_head_t wait; unsigned int bytes_per_word; + u32 last_speed_hz; u32 freq; u16 status; u8 spr; @@ -298,9 +299,13 @@ static int rzv2h_rspi_prepare_message(struct spi_controller *ctlr, rspi->bytes_per_word = roundup_pow_of_two(BITS_TO_BYTES(bits_per_word)); - rspi->freq = rzv2h_rspi_setup_clock(rspi, speed_hz); - if (!rspi->freq) - return -EINVAL; + if (speed_hz != rspi->last_speed_hz) { + rspi->freq = rzv2h_rspi_setup_clock(rspi, speed_hz); + if (!rspi->freq) + return -EINVAL; + + rspi->last_speed_hz = speed_hz; + } writeb(rspi->spr, rspi->base + RSPI_SPBR); From 77d931584dd38916b66c65320c80a65cbef4b122 Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Wed, 19 Nov 2025 18:14:27 +0200 Subject: [PATCH 538/543] spi: rzv2h-rspi: make transfer clock rate finding chip-specific The Renesas RZ/T2H (R9A09G077) and RZ/N2H (R9A09G087) SoCs have a more complicated clocking setup for the SPI transfer clock than RZ/V2H, as the clock from which it is generated supports multiple dividers. To prepare for adding support for these SoCs, split out the logic for finding the SPR and BRDV for a fixed clock into rzv2h_rspi_find_rate_fixed(), and add and use a .find_tclk_rate() callback into the chip-specific structure. 
Signed-off-by: Cosmin Tanislav Link: https://patch.msgid.link/20251119161434.595677-7-cosmin-gabriel.tanislav.xa@renesas.com Signed-off-by: Mark Brown --- drivers/spi/spi-rzv2h-rspi.c | 62 ++++++++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/drivers/spi/spi-rzv2h-rspi.c b/drivers/spi/spi-rzv2h-rspi.c index d7719f3c7b13..f59bcadf5e38 100644 --- a/drivers/spi/spi-rzv2h-rspi.c +++ b/drivers/spi/spi-rzv2h-rspi.c @@ -67,7 +67,18 @@ #define RSPI_RESET_NUM 2 +struct rzv2h_rspi_best_clock { + struct clk *clk; + unsigned long clk_rate; + unsigned long error; + u32 actual_hz; + u8 brdv; + u8 spr; +}; + struct rzv2h_rspi_info { + void (*find_tclk_rate)(struct clk *clk, u32 hz, u8 spr_min, u8 spr_max, + struct rzv2h_rspi_best_clock *best_clk); const char *tclk_name; unsigned int fifo_size; unsigned int num_clks; @@ -240,9 +251,13 @@ static inline u32 rzv2h_rspi_calc_bitrate(unsigned long tclk_rate, u8 spr, return DIV_ROUND_UP(tclk_rate, (2 * (spr + 1) * (1 << brdv))); } -static u32 rzv2h_rspi_setup_clock(struct rzv2h_rspi_priv *rspi, u32 hz) +static void rzv2h_rspi_find_rate_fixed(struct clk *clk, u32 hz, + u8 spr_min, u8 spr_max, + struct rzv2h_rspi_best_clock *best) { - unsigned long tclk_rate; + unsigned long clk_rate; + unsigned long error; + u32 actual_hz; int spr; u8 brdv; @@ -255,21 +270,49 @@ static u32 rzv2h_rspi_setup_clock(struct rzv2h_rspi_priv *rspi, u32 hz) * * n = SPR - is RSPI_SPBR.SPR (from 0 to 255) * * N = BRDV - is RSPI_SPCMD.BRDV (from 0 to 3) */ - tclk_rate = clk_get_rate(rspi->tclk); + clk_rate = clk_get_rate(clk); for (brdv = RSPI_SPCMD_BRDV_MIN; brdv <= RSPI_SPCMD_BRDV_MAX; brdv++) { - spr = DIV_ROUND_UP(tclk_rate, hz * (1 << (brdv + 1))); + spr = DIV_ROUND_UP(clk_rate, hz * (1 << (brdv + 1))); spr--; - if (spr >= RSPI_SPBR_SPR_MIN && spr <= RSPI_SPBR_SPR_MAX) + if (spr >= spr_min && spr <= spr_max) goto clock_found; } - return 0; + return; clock_found: - rspi->spr = spr; - rspi->brdv = brdv; + actual_hz = 
rzv2h_rspi_calc_bitrate(clk_rate, spr, brdv); + error = abs((long)hz - (long)actual_hz); - return rzv2h_rspi_calc_bitrate(tclk_rate, spr, brdv); + if (error >= best->error) + return; + + *best = (struct rzv2h_rspi_best_clock) { + .clk = clk, + .clk_rate = clk_rate, + .error = error, + .actual_hz = actual_hz, + .brdv = brdv, + .spr = spr, + }; +} + +static u32 rzv2h_rspi_setup_clock(struct rzv2h_rspi_priv *rspi, u32 hz) +{ + struct rzv2h_rspi_best_clock best_clock = { + .error = ULONG_MAX, + }; + + rspi->info->find_tclk_rate(rspi->tclk, hz, RSPI_SPBR_SPR_MIN, + RSPI_SPBR_SPR_MAX, &best_clock); + + if (!best_clock.clk_rate) + return -EINVAL; + + rspi->spr = best_clock.spr; + rspi->brdv = best_clock.brdv; + + return best_clock.actual_hz; } static int rzv2h_rspi_prepare_message(struct spi_controller *ctlr, @@ -463,6 +506,7 @@ static void rzv2h_rspi_remove(struct platform_device *pdev) } static const struct rzv2h_rspi_info rzv2h_info = { + .find_tclk_rate = rzv2h_rspi_find_rate_fixed, .tclk_name = "tclk", .fifo_size = 16, .num_clks = 3, From 1ce3e8adc7d0038e59a7c9f5c9e5f399ba0db5d6 Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Wed, 19 Nov 2025 18:14:28 +0200 Subject: [PATCH 539/543] spi: rzv2h-rspi: add support for using PCLK for transfer clock The Renesas RZ/T2H (R9A09G077) and RZ/N2H (R9A09G087) SoCs support generating the SPI transfer clock from PCLK, with the quirk that SPR 0 is not supported, causing the highest achievable SPI transfer frequency to be 31.25MHz. Add support for generating the SPI transfer clock from PCLK. Renesas RZ/V2H (R9A09G057) also has the BPEN bit used to enable this option in the datasheet, but it is not explicitly documented and there's no details about its limitations as there are on RZ/T2H. 
Signed-off-by: Cosmin Tanislav Link: https://patch.msgid.link/20251119161434.595677-8-cosmin-gabriel.tanislav.xa@renesas.com Signed-off-by: Mark Brown --- drivers/spi/spi-rzv2h-rspi.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-rzv2h-rspi.c b/drivers/spi/spi-rzv2h-rspi.c index f59bcadf5e38..e9d8ee919261 100644 --- a/drivers/spi/spi-rzv2h-rspi.c +++ b/drivers/spi/spi-rzv2h-rspi.c @@ -34,6 +34,7 @@ #define RSPI_SPFCR 0x6c /* Register SPCR */ +#define RSPI_SPCR_BPEN BIT(31) #define RSPI_SPCR_MSTR BIT(30) #define RSPI_SPCR_SPRIE BIT(17) #define RSPI_SPCR_SCKASE BIT(12) @@ -41,6 +42,7 @@ /* Register SPBR */ #define RSPI_SPBR_SPR_MIN 0 +#define RSPI_SPBR_SPR_PCLK_MIN 1 #define RSPI_SPBR_SPR_MAX 255 /* Register SPCMD */ @@ -79,6 +81,8 @@ struct rzv2h_rspi_best_clock { struct rzv2h_rspi_info { void (*find_tclk_rate)(struct clk *clk, u32 hz, u8 spr_min, u8 spr_max, struct rzv2h_rspi_best_clock *best_clk); + void (*find_pclk_rate)(struct clk *clk, u32 hz, u8 spr_low, u8 spr_high, + struct rzv2h_rspi_best_clock *best_clk); const char *tclk_name; unsigned int fifo_size; unsigned int num_clks; @@ -90,6 +94,7 @@ struct rzv2h_rspi_priv { const struct rzv2h_rspi_info *info; void __iomem *base; struct clk *tclk; + struct clk *pclk; wait_queue_head_t wait; unsigned int bytes_per_word; u32 last_speed_hz; @@ -97,6 +102,7 @@ struct rzv2h_rspi_priv { u16 status; u8 spr; u8 brdv; + bool use_pclk; }; #define RZV2H_RSPI_TX(func, type) \ @@ -306,9 +312,18 @@ static u32 rzv2h_rspi_setup_clock(struct rzv2h_rspi_priv *rspi, u32 hz) rspi->info->find_tclk_rate(rspi->tclk, hz, RSPI_SPBR_SPR_MIN, RSPI_SPBR_SPR_MAX, &best_clock); + /* + * T2H and N2H can also use PCLK as a source, which is 125MHz, but not + * when both SPR and BRDV are 0. 
+ */ + if (best_clock.error && rspi->info->find_pclk_rate) + rspi->info->find_pclk_rate(rspi->pclk, hz, RSPI_SPBR_SPR_PCLK_MIN, + RSPI_SPBR_SPR_MAX, &best_clock); + if (!best_clock.clk_rate) return -EINVAL; + rspi->use_pclk = best_clock.clk == rspi->pclk; rspi->spr = best_clock.spr; rspi->brdv = best_clock.brdv; @@ -361,6 +376,9 @@ static int rzv2h_rspi_prepare_message(struct spi_controller *ctlr, /* SPI receive buffer full interrupt enable */ conf32 |= RSPI_SPCR_SPRIE; + /* Bypass synchronization circuit */ + conf32 |= FIELD_PREP(RSPI_SPCR_BPEN, rspi->use_pclk); + writel(conf32, rspi->base + RSPI_SPCR); /* Use SPCMD0 only */ @@ -433,7 +451,9 @@ static int rzv2h_rspi_probe(struct platform_device *pdev) for (i = 0; i < rspi->info->num_clks; i++) { if (!strcmp(clks[i].id, rspi->info->tclk_name)) { rspi->tclk = clks[i].clk; - break; + } else if (rspi->info->find_pclk_rate && + !strcmp(clks[i].id, "pclk")) { + rspi->pclk = clks[i].clk; } } From 9c9bf4fdc5e5d09d5f4280ed2c582df6e1f837d9 Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Wed, 19 Nov 2025 18:14:29 +0200 Subject: [PATCH 540/543] spi: rzv2h-rspi: add support for variable transfer clock The Renesas RZ/T2H (R9A09G077) and RZ/N2H (R9A09G087) SoCs have a more complicated clocking setup for the SPI transfer clock than RZ/V2H, as the clock from which it is generated supports multiple dividers. To prepare for adding support for these SoCs, do the following changes. Use the minimum frequency of SPI clock to calculate the SPI controller's min_speed_hz, and the maximum frequency to calculate max_speed_hz. Apply the clock rate found by the .find_tclk_rate() to the found clock. 
Signed-off-by: Cosmin Tanislav Link: https://patch.msgid.link/20251119161434.595677-9-cosmin-gabriel.tanislav.xa@renesas.com Signed-off-by: Mark Brown --- drivers/spi/spi-rzv2h-rspi.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/spi/spi-rzv2h-rspi.c b/drivers/spi/spi-rzv2h-rspi.c index e9d8ee919261..be45269e8853 100644 --- a/drivers/spi/spi-rzv2h-rspi.c +++ b/drivers/spi/spi-rzv2h-rspi.c @@ -308,6 +308,7 @@ static u32 rzv2h_rspi_setup_clock(struct rzv2h_rspi_priv *rspi, u32 hz) struct rzv2h_rspi_best_clock best_clock = { .error = ULONG_MAX, }; + int ret; rspi->info->find_tclk_rate(rspi->tclk, hz, RSPI_SPBR_SPR_MIN, RSPI_SPBR_SPR_MAX, &best_clock); @@ -323,6 +324,10 @@ static u32 rzv2h_rspi_setup_clock(struct rzv2h_rspi_priv *rspi, u32 hz) if (!best_clock.clk_rate) return -EINVAL; + ret = clk_set_rate(best_clock.clk, best_clock.clk_rate); + if (ret) + return 0; + rspi->use_pclk = best_clock.clk == rspi->pclk; rspi->spr = best_clock.spr; rspi->brdv = best_clock.brdv; @@ -426,8 +431,8 @@ static int rzv2h_rspi_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct rzv2h_rspi_priv *rspi; struct clk_bulk_data *clks; - unsigned long tclk_rate; int irq_rx, ret, i; + long tclk_rate; controller = devm_spi_alloc_host(dev, sizeof(*rspi)); if (!controller) @@ -460,8 +465,6 @@ static int rzv2h_rspi_probe(struct platform_device *pdev) if (!rspi->tclk) return dev_err_probe(dev, -EINVAL, "Failed to get tclk\n"); - tclk_rate = clk_get_rate(rspi->tclk); - rspi->resets[0].id = "presetn"; rspi->resets[1].id = "tresetn"; ret = devm_reset_control_bulk_get_optional_exclusive(dev, RSPI_RESET_NUM, @@ -493,9 +496,23 @@ static int rzv2h_rspi_probe(struct platform_device *pdev) controller->unprepare_message = rzv2h_rspi_unprepare_message; controller->num_chipselect = 4; controller->transfer_one = rzv2h_rspi_transfer_one; + + tclk_rate = clk_round_rate(rspi->tclk, 0); + if (tclk_rate < 0) { + ret = tclk_rate; + goto 
quit_resets; + } + controller->min_speed_hz = rzv2h_rspi_calc_bitrate(tclk_rate, RSPI_SPBR_SPR_MAX, RSPI_SPCMD_BRDV_MAX); + + tclk_rate = clk_round_rate(rspi->tclk, ULONG_MAX); + if (tclk_rate < 0) { + ret = tclk_rate; + goto quit_resets; + } + controller->max_speed_hz = rzv2h_rspi_calc_bitrate(tclk_rate, RSPI_SPBR_SPR_MIN, RSPI_SPCMD_BRDV_MIN); From bc4f0b1e39035b9bb3d5d9692074702110f5e2b1 Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Wed, 19 Nov 2025 18:14:30 +0200 Subject: [PATCH 541/543] spi: rzv2h-rspi: add support for loopback mode Add support for loopback mode for debugging purposes, allowing us to test the SPI controller at the maximum SPI transfer clock without being limited by external wiring. Signed-off-by: Cosmin Tanislav Link: https://patch.msgid.link/20251119161434.595677-10-cosmin-gabriel.tanislav.xa@renesas.com Signed-off-by: Mark Brown --- drivers/spi/spi-rzv2h-rspi.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-rzv2h-rspi.c b/drivers/spi/spi-rzv2h-rspi.c index be45269e8853..da110efba971 100644 --- a/drivers/spi/spi-rzv2h-rspi.c +++ b/drivers/spi/spi-rzv2h-rspi.c @@ -24,6 +24,7 @@ /* Registers */ #define RSPI_SPDR 0x00 #define RSPI_SPCR 0x08 +#define RSPI_SPPCR 0x0e #define RSPI_SSLP 0x10 #define RSPI_SPBR 0x11 #define RSPI_SPSCR 0x13 @@ -40,6 +41,9 @@ #define RSPI_SPCR_SCKASE BIT(12) #define RSPI_SPCR_SPE BIT(0) +/* Register SPPCR */ +#define RSPI_SPPCR_SPLP2 BIT(1) + /* Register SPBR */ #define RSPI_SPBR_SPR_MIN 0 #define RSPI_SPBR_SPR_PCLK_MIN 1 @@ -345,6 +349,7 @@ static int rzv2h_rspi_prepare_message(struct spi_controller *ctlr, u8 bits_per_word; u32 conf32; u16 conf16; + u8 conf8; /* Make sure SPCR.SPE is 0 before amending the configuration */ rzv2h_rspi_spe_disable(rspi); @@ -389,6 +394,10 @@ static int rzv2h_rspi_prepare_message(struct spi_controller *ctlr, /* Use SPCMD0 only */ writeb(0x0, rspi->base + RSPI_SPSCR); + /* Setup loopback */ + conf8 = FIELD_PREP(RSPI_SPPCR_SPLP2, 
!!(spi->mode & SPI_LOOP)); + writeb(conf8, rspi->base + RSPI_SPPCR); + /* Setup mode */ conf32 = FIELD_PREP(RSPI_SPCMD_CPOL, !!(spi->mode & SPI_CPOL)); conf32 |= FIELD_PREP(RSPI_SPCMD_CPHA, !!(spi->mode & SPI_CPHA)); @@ -490,7 +499,7 @@ static int rzv2h_rspi_probe(struct platform_device *pdev) } controller->mode_bits = SPI_CPHA | SPI_CPOL | SPI_CS_HIGH | - SPI_LSB_FIRST; + SPI_LSB_FIRST | SPI_LOOP; controller->bits_per_word_mask = SPI_BPW_RANGE_MASK(4, 32); controller->prepare_message = rzv2h_rspi_prepare_message; controller->unprepare_message = rzv2h_rspi_unprepare_message; From e93d7b2d8b349f659fa9456048ee86e10eb422f9 Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Wed, 19 Nov 2025 18:14:31 +0200 Subject: [PATCH 542/543] spi: dt-bindings: renesas,rzv2h-rspi: document RZ/T2H and RZ/N2H The Renesas RZ/T2H (R9A09G077) and RZ/N2H (R9A09G087) SoCs have four SPI peripherals. Compared to the previously supported RZ/V2H, these SoCs have a smaller FIFO, no resets, and only two clocks: PCLKSPIn and PCLK. PCLKSPIn, being the clock from which the SPI transfer clock is generated, is the equivalent of the TCLK from V2H. Document them, and use RZ/T2H as a fallback for RZ/N2H as the SPIs are entirely compatible. 
Signed-off-by: Cosmin Tanislav Acked-by: Conor Dooley Link: https://patch.msgid.link/20251119161434.595677-11-cosmin-gabriel.tanislav.xa@renesas.com Signed-off-by: Mark Brown --- .../bindings/spi/renesas,rzv2h-rspi.yaml | 62 ++++++++++++++++--- 1 file changed, 52 insertions(+), 10 deletions(-) diff --git a/Documentation/devicetree/bindings/spi/renesas,rzv2h-rspi.yaml b/Documentation/devicetree/bindings/spi/renesas,rzv2h-rspi.yaml index ab27fefc3c3a..4331df3e3d47 100644 --- a/Documentation/devicetree/bindings/spi/renesas,rzv2h-rspi.yaml +++ b/Documentation/devicetree/bindings/spi/renesas,rzv2h-rspi.yaml @@ -9,12 +9,15 @@ title: Renesas RZ/V2H(P) Renesas Serial Peripheral Interface (RSPI) maintainers: - Fabrizio Castro -allOf: - - $ref: spi-controller.yaml# - properties: compatible: - const: renesas,r9a09g057-rspi # RZ/V2H(P) + oneOf: + - enum: + - renesas,r9a09g057-rspi # RZ/V2H(P) + - renesas,r9a09g077-rspi # RZ/T2H + - items: + - const: renesas,r9a09g087-rspi # RZ/N2H + - const: renesas,r9a09g077-rspi # RZ/T2H reg: maxItems: 1 @@ -36,13 +39,12 @@ properties: - const: tx clocks: + minItems: 2 maxItems: 3 clock-names: - items: - - const: pclk - - const: pclk_sfr - - const: tclk + minItems: 2 + maxItems: 3 resets: maxItems: 2 @@ -62,12 +64,52 @@ required: - interrupt-names - clocks - clock-names - - resets - - reset-names - power-domains - '#address-cells' - '#size-cells' +allOf: + - $ref: spi-controller.yaml# + - if: + properties: + compatible: + contains: + enum: + - renesas,r9a09g057-rspi + then: + properties: + clocks: + minItems: 3 + + clock-names: + items: + - const: pclk + - const: pclk_sfr + - const: tclk + + required: + - resets + - reset-names + + - if: + properties: + compatible: + contains: + enum: + - renesas,r9a09g077-rspi + then: + properties: + clocks: + maxItems: 2 + + clock-names: + items: + - const: pclk + - const: pclkspi + + resets: false + reset-names: false + unevaluatedProperties: false examples: From 0cc8cd824b9fb7fb087a2ec6b0c80d812cc4fde7 
Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Wed, 19 Nov 2025 18:14:32 +0200 Subject: [PATCH 543/543] spi: rzv2h-rspi: add support for RZ/T2H and RZ/N2H Compared to the previously supported RZ/V2H, the Renesas RZ/T2H (R9A09G077) and RZ/N2H (R9A09G087) SoCs have a smaller FIFO, no resets, and only two clocks: PCLKSPIn and PCLK. PCLKSPIn, being the clock from which the SPI transfer clock is generated, is the equivalent of the TCLK clock from RZ/V2H. They also support generating the SPI transfer clock from PCLK. PCLKSPIn supports multiple dividers, generating multiple possible frequencies from its parent. To handle this, do the following changes. Use the minimum frequency of SPI clock to calculate the SPI controller's min_speed_hz, and the maximum frequency to calculate max_speed_hz. Add a new function, rzv2h_rspi_find_rate_variable(), which is used for the .find_tclk_rate() callback, and which supports handling clocks with a variable rate, with the following overall logic. Iterate through all possible BRDV values. For each BRDV, calculate two different SPRs, one for the clock's minimum frequency, and one for the maximum, and iterate through each SPR between them. If the minimum SPR is higher than the upper SPR limit, the minimum rate is too high to achieve the requested SPI frequency, skip to the next BRDV. For each SPR, calculate a rate and let the clock framework round it to the closest supported rate of the clock. The rate and SPR that generate a transfer frequency closest to the requested SPI transfer frequency will be picked.
Signed-off-by: Cosmin Tanislav Link: https://patch.msgid.link/20251119161434.595677-12-cosmin-gabriel.tanislav.xa@renesas.com Signed-off-by: Mark Brown --- drivers/spi/spi-rzv2h-rspi.c | 108 +++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/drivers/spi/spi-rzv2h-rspi.c b/drivers/spi/spi-rzv2h-rspi.c index da110efba971..1db7e4e5d64e 100644 --- a/drivers/spi/spi-rzv2h-rspi.c +++ b/drivers/spi/spi-rzv2h-rspi.c @@ -261,6 +261,105 @@ static inline u32 rzv2h_rspi_calc_bitrate(unsigned long tclk_rate, u8 spr, return DIV_ROUND_UP(tclk_rate, (2 * (spr + 1) * (1 << brdv))); } +static void rzv2h_rspi_find_rate_variable(struct clk *clk, u32 hz, + u8 spr_min, u8 spr_max, + struct rzv2h_rspi_best_clock *best) +{ + long clk_rate, clk_min_rate, clk_max_rate; + int min_rate_spr, max_rate_spr; + unsigned long error; + u32 actual_hz; + u8 brdv; + int spr; + + /* + * On T2H / N2H, the source for the SPI clock is PCLKSPIn, which is a + * 1/32, 1/30, 1/25 or 1/24 divider of PLL4, which is 2400MHz, + * resulting in either 75MHz, 80MHz, 96MHz or 100MHz. + */ + clk_min_rate = clk_round_rate(clk, 0); + if (clk_min_rate < 0) + return; + + clk_max_rate = clk_round_rate(clk, ULONG_MAX); + if (clk_max_rate < 0) + return; + + /* + * From the manual: + * Bit rate = f(PCLKSPIn) / (2 * (n + 1) * 2^N) + * + * If we adapt it to the current context, we get the following: + * hz = rate / ((spr + 1) * (1 << (brdv + 1))) + * + * This can be written in multiple forms depending on what we want to + * determine. + * + * To find the rate, having hz, spr and brdv: + * rate = hz * (spr + 1) * (1 << (brdv + 1)) + * + * To find the spr, having rate, hz, and brdv: + * spr = rate / (hz * (1 << (brdv + 1))) - 1 + */ + + for (brdv = RSPI_SPCMD_BRDV_MIN; brdv <= RSPI_SPCMD_BRDV_MAX; brdv++) { + /* Calculate the divisor needed to find the SPR from a rate.
*/ + u32 rate_div = hz * (1 << (brdv + 1)); + + /* + * If the SPR for the minimum rate is greater than the maximum + * allowed value skip this BRDV. The divisor increases with each + * BRDV iteration, so the following BRDV might result in a + * minimum SPR that is in the valid range. + */ + min_rate_spr = DIV_ROUND_CLOSEST(clk_min_rate, rate_div) - 1; + if (min_rate_spr > spr_max) + continue; + + /* + * If the SPR for the maximum rate is less than the minimum + * allowed value, exit. The divisor only increases with each + * BRDV iteration, so the following BRDV cannot result in a + * maximum SPR that is in the valid range. + */ + max_rate_spr = DIV_ROUND_CLOSEST(clk_max_rate, rate_div) - 1; + if (max_rate_spr < spr_min) + break; + + if (min_rate_spr < spr_min) + min_rate_spr = spr_min; + + if (max_rate_spr > spr_max) + max_rate_spr = spr_max; + + for (spr = min_rate_spr; spr <= max_rate_spr; spr++) { + clk_rate = (spr + 1) * rate_div; + + clk_rate = clk_round_rate(clk, clk_rate); + if (clk_rate <= 0) + continue; + + actual_hz = rzv2h_rspi_calc_bitrate(clk_rate, spr, brdv); + error = abs((long)hz - (long)actual_hz); + + if (error >= best->error) + continue; + + *best = (struct rzv2h_rspi_best_clock) { + .clk = clk, + .clk_rate = clk_rate, + .error = error, + .actual_hz = actual_hz, + .brdv = brdv, + .spr = spr, + }; + + if (!error) + return; + } + } +} + static void rzv2h_rspi_find_rate_fixed(struct clk *clk, u32 hz, u8 spr_min, u8 spr_max, struct rzv2h_rspi_best_clock *best) @@ -558,8 +657,17 @@ static const struct rzv2h_rspi_info rzv2h_info = { .num_clks = 3, }; +static const struct rzv2h_rspi_info rzt2h_info = { + .find_tclk_rate = rzv2h_rspi_find_rate_variable, + .find_pclk_rate = rzv2h_rspi_find_rate_fixed, + .tclk_name = "pclkspi", + .fifo_size = 4, + .num_clks = 2, +}; + static const struct of_device_id rzv2h_rspi_match[] = { { .compatible = "renesas,r9a09g057-rspi", &rzv2h_info }, + { .compatible = "renesas,r9a09g077-rspi", &rzt2h_info }, { /* sentinel */ } 
}; MODULE_DEVICE_TABLE(of, rzv2h_rspi_match);