From 8a3514d348de87a9d5e2ac00fbac4faae0b97996 Mon Sep 17 00:00:00 2001 From: Sanjeev Yadav Date: Fri, 23 May 2025 13:14:01 -0700 Subject: [PATCH 001/242] scsi: core: ufs: Fix a hang in the error handler ufshcd_err_handling_prepare() calls ufshcd_rpm_get_sync(). The latter function can only succeed if UFSHCD_EH_IN_PROGRESS is not set because resuming involves submitting a SCSI command and ufshcd_queuecommand() returns SCSI_MLQUEUE_HOST_BUSY if UFSHCD_EH_IN_PROGRESS is set. Fix this hang by setting UFSHCD_EH_IN_PROGRESS after ufshcd_rpm_get_sync() has been called instead of before. Backtrace: __switch_to+0x174/0x338 __schedule+0x600/0x9e4 schedule+0x7c/0xe8 schedule_timeout+0xa4/0x1c8 io_schedule_timeout+0x48/0x70 wait_for_common_io+0xa8/0x160 //waiting on START_STOP wait_for_completion_io_timeout+0x10/0x20 blk_execute_rq+0xe4/0x1e4 scsi_execute_cmd+0x108/0x244 ufshcd_set_dev_pwr_mode+0xe8/0x250 __ufshcd_wl_resume+0x94/0x354 ufshcd_wl_runtime_resume+0x3c/0x174 scsi_runtime_resume+0x64/0xa4 rpm_resume+0x15c/0xa1c __pm_runtime_resume+0x4c/0x90 // Runtime resume ongoing ufshcd_err_handler+0x1a0/0xd08 process_one_work+0x174/0x808 worker_thread+0x15c/0x490 kthread+0xf4/0x1ec ret_from_fork+0x10/0x20 Signed-off-by: Sanjeev Yadav [ bvanassche: rewrote patch description ] Fixes: 62694735ca95 ("[SCSI] ufs: Add runtime PM support for UFS host controller driver") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20250523201409.1676055-1-bvanassche@acm.org Reviewed-by: Peter Wang Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index a7513f256057..a1d7c8660c1b 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -6623,9 +6623,14 @@ static void ufshcd_err_handler(struct work_struct *work) up(&hba->host_sem); return; } + spin_unlock_irqrestore(hba->host->host_lock, flags); + + ufshcd_err_handling_prepare(hba); + + spin_lock_irqsave(hba->host->host_lock, flags); ufshcd_set_eh_in_progress(hba); spin_unlock_irqrestore(hba->host->host_lock, flags); - ufshcd_err_handling_prepare(hba); + /* Complete requests that have door-bell cleared by h/w */ ufshcd_complete_requests(hba, false); spin_lock_irqsave(hba->host->host_lock, flags); From b55eb6eb2a7427428c59b293a0900131fc849595 Mon Sep 17 00:00:00 2001 From: Mike Yuan Date: Wed, 4 Jun 2025 15:03:42 +0000 Subject: [PATCH 002/242] pidfs: never refuse ppid == 0 in PIDFD_GET_INFO In systemd we spotted an issue after switching to ioctl(PIDFD_GET_INFO) for obtaining pid number the pidfd refers to, that for processes with a parent from outer pidns PIDFD_GET_INFO unexpectedly yields -ESRCH [1]. It turned out that there's an arbitrary check blocking this, which is not really sensible given getppid() happily returns 0 for such processes. Just drop the spurious check and userspace ought to handle ppid == 0 properly everywhere. [1] https://github.com/systemd/systemd/issues/37715 Fixes: cdda1f26e74b ("pidfd: add ioctl to retrieve pid info") Signed-off-by: Mike Yuan Link: https://lore.kernel.org/20250604150238.42664-1-me@yhndnzj.com Cc: Christian Brauner Cc: Luca Boccassi Cc: stable@vger.kernel.org Signed-off-by: Christian Brauner --- fs/pidfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index c1f0a067be40..69919be1c9d8 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -366,7 +366,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) kinfo.pid = task_pid_vnr(task); kinfo.mask |= PIDFD_INFO_PID; - if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1)) + if (kinfo.pid == 0 || kinfo.tgid == 0) return -ESRCH; copy_out: From 714d02b41939d2720379e11ef25227aec4e5bec9 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 5 Jun 2025 12:15:30 +0200 Subject: [PATCH 003/242] ovl: fix regression caused by lookup helpers API changes The lookup helpers API was changed by merge of vfs-6.16-rc1.async.dir to pass a non-const qstr pointer argument to lookup_one*() helpers. All of the callers of this API were changed to pass a pointer to temp copy of qstr, except overlays that was passing a const pointer to dentry->d_name that was changed to pass a non-const copy instead when doing a lookup in lower layer which is not the fs of said dentry. This wrong use of the API caused a regression in fstest overlay/012. Fix the regression by making a non-const copy of dentry->d_name prior to calling the lookup API, but the API should be fixed to not allow this class of bugs. Cc: NeilBrown Fixes: 5741909697a3 ("VFS: improve interface for lookup_one functions") Fixes: 390e34bc1490 ("VFS: change lookup_one_common and lookup_noperm_common to take a qstr") Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/20250605101530.2336320-1-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/overlayfs/namei.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index bf722daf19a9..f75c49ceb460 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -1371,7 +1371,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, bool ovl_lower_positive(struct dentry *dentry) { struct ovl_entry *poe = OVL_E(dentry->d_parent); - struct qstr *name = &dentry->d_name; + const struct qstr *name = &dentry->d_name; const struct cred *old_cred; unsigned int i; bool positive = false; @@ -1394,9 +1394,15 @@ bool ovl_lower_positive(struct dentry *dentry) struct dentry *this; struct ovl_path *parentpath = &ovl_lowerstack(poe)[i]; + /* + * We need to make a non-const copy of dentry->d_name, + * because lookup_one_positive_unlocked() will hash name + * with parentpath base, which is on another (lower fs). + */ this = lookup_one_positive_unlocked( mnt_idmap(parentpath->layer->mnt), - name, parentpath->dentry); + &QSTR_LEN(name->name, name->len), + parentpath->dentry); if (IS_ERR(this)) { switch (PTR_ERR(this)) { case -ENOENT: From 6678791ee3da0b78c28fe7d77814097f53cbb8df Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 3 Jun 2025 08:08:21 +0100 Subject: [PATCH 004/242] KVM: arm64: Add assignment-specific sysreg accessor Assigning a value to a system register doesn't do what it is supposed to be doing if that register is one that has RESx bits. The main problem is that we use __vcpu_sys_reg(), which can be used both as a lvalue and rvalue. When used as a lvalue, the bit masking occurs *before* the new value is assigned, meaning that we (1) do pointless work on the old cvalue, and (2) potentially assign an invalid value as we fail to apply the masks to it. Fix this by providing a new __vcpu_assign_sys_reg() that does what it says on the tin, and sanitises the *new* value instead of the old one. This comes with a significant amount of churn. Reviewed-by: Miguel Luis Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250603070824.1192795-2-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 11 ++++++ arch/arm64/kvm/arch_timer.c | 16 ++++----- arch/arm64/kvm/hyp/exception.c | 4 +-- arch/arm64/kvm/hyp/include/hyp/switch.h | 4 +-- arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 6 ++-- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 4 +-- arch/arm64/kvm/hyp/vhe/switch.c | 4 +-- arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 42 +++++++++++----------- arch/arm64/kvm/pmu-emul.c | 14 ++++---- arch/arm64/kvm/sys_regs.c | 38 ++++++++++---------- arch/arm64/kvm/sys_regs.h | 4 +-- arch/arm64/kvm/vgic/vgic-v3-nested.c | 10 +++--- 12 files changed, 85 insertions(+), 72 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index d941abc6b5ee..3b84ad91116b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1107,6 +1107,17 @@ static inline u64 *___ctxt_sys_reg(const struct kvm_cpu_context *ctxt, int r) #define ctxt_sys_reg(c,r) (*__ctxt_sys_reg(c,r)) u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *, enum vcpu_sysreg, u64); + +#define __vcpu_assign_sys_reg(v, r, val) \ + do { \ + const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt; \ + u64 __v = (val); \ + if (vcpu_has_nv((v)) && (r) >= __SANITISED_REG_START__) \ + __v = kvm_vcpu_apply_reg_masks((v), (r), __v); \ + \ + ctxt_sys_reg(ctxt, (r)) = __v; \ + } while (0) + #define __vcpu_sys_reg(v,r) \ (*({ \ const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt; \ diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 5133dcbfe9f7..5a67a6d4d95b 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -108,16 +108,16 @@ static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl) switch(arch_timer_ctx_index(ctxt)) { case TIMER_VTIMER: - __vcpu_sys_reg(vcpu, CNTV_CTL_EL0) = ctl; + __vcpu_assign_sys_reg(vcpu, CNTV_CTL_EL0, ctl); break; case TIMER_PTIMER: - __vcpu_sys_reg(vcpu, CNTP_CTL_EL0) = ctl; + __vcpu_assign_sys_reg(vcpu, CNTP_CTL_EL0, ctl); break; case TIMER_HVTIMER: - __vcpu_sys_reg(vcpu, CNTHV_CTL_EL2) = ctl; + __vcpu_assign_sys_reg(vcpu, CNTHV_CTL_EL2, ctl); break; case TIMER_HPTIMER: - __vcpu_sys_reg(vcpu, CNTHP_CTL_EL2) = ctl; + __vcpu_assign_sys_reg(vcpu, CNTHP_CTL_EL2, ctl); break; default: WARN_ON(1); @@ -130,16 +130,16 @@ static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval) switch(arch_timer_ctx_index(ctxt)) { case TIMER_VTIMER: - __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0) = cval; + __vcpu_assign_sys_reg(vcpu, CNTV_CVAL_EL0, cval); break; case TIMER_PTIMER: - __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = cval; + __vcpu_assign_sys_reg(vcpu, CNTP_CVAL_EL0, cval); break; case TIMER_HVTIMER: - __vcpu_sys_reg(vcpu, CNTHV_CVAL_EL2) = cval; + __vcpu_assign_sys_reg(vcpu, CNTHV_CVAL_EL2, cval); break; case TIMER_HPTIMER: - __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2) = cval; + __vcpu_assign_sys_reg(vcpu, CNTHP_CVAL_EL2, cval); break; default: WARN_ON(1); diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index 424a5107cddb..6a2a899a344e 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -37,7 +37,7 @@ static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) if (unlikely(vcpu_has_nv(vcpu))) vcpu_write_sys_reg(vcpu, val, reg); else if (!__vcpu_write_sys_reg_to_cpu(val, reg)) - __vcpu_sys_reg(vcpu, reg) = val; + __vcpu_assign_sys_reg(vcpu, reg, val); } static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode, @@ -51,7 +51,7 @@ static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode, } else if (has_vhe()) { write_sysreg_el1(val, SYS_SPSR); } else { - __vcpu_sys_reg(vcpu, SPSR_EL1) = val; + __vcpu_assign_sys_reg(vcpu, SPSR_EL1, val); } } diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index eef310cdbdbd..aa5b561b9218 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -45,7 +45,7 @@ static inline void __fpsimd_save_fpexc32(struct kvm_vcpu *vcpu) if (!vcpu_el1_is_32bit(vcpu)) return; - __vcpu_sys_reg(vcpu, FPEXC32_EL2) = read_sysreg(fpexc32_el2); + __vcpu_assign_sys_reg(vcpu, FPEXC32_EL2, read_sysreg(fpexc32_el2)); } static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) @@ -457,7 +457,7 @@ static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu) */ if (vcpu_has_sve(vcpu)) { zcr_el1 = read_sysreg_el1(SYS_ZCR); - __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr_el1; + __vcpu_assign_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu), zcr_el1); /* * The guest's state is always saved using the guest's max VL. diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index b9cff893bbe0..4d0dbea4c56f 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -307,11 +307,11 @@ static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu) vcpu->arch.ctxt.spsr_irq = read_sysreg(spsr_irq); vcpu->arch.ctxt.spsr_fiq = read_sysreg(spsr_fiq); - __vcpu_sys_reg(vcpu, DACR32_EL2) = read_sysreg(dacr32_el2); - __vcpu_sys_reg(vcpu, IFSR32_EL2) = read_sysreg(ifsr32_el2); + __vcpu_assign_sys_reg(vcpu, DACR32_EL2, read_sysreg(dacr32_el2)); + __vcpu_assign_sys_reg(vcpu, IFSR32_EL2, read_sysreg(ifsr32_el2)); if (has_vhe() || kvm_debug_regs_in_use(vcpu)) - __vcpu_sys_reg(vcpu, DBGVCR32_EL2) = read_sysreg(dbgvcr32_el2); + __vcpu_assign_sys_reg(vcpu, DBGVCR32_EL2, read_sysreg(dbgvcr32_el2)); } static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 8e8848de4d47..e9198e56e784 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -26,7 +26,7 @@ void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); static void __hyp_sve_save_guest(struct kvm_vcpu *vcpu) { - __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR); + __vcpu_assign_sys_reg(vcpu, ZCR_EL1, read_sysreg_el1(SYS_ZCR)); /* * On saving/restoring guest sve state, always use the maximum VL for * the guest. The layout of the data when saving the sve state depends @@ -79,7 +79,7 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) has_fpmr = kvm_has_fpmr(kern_hyp_va(vcpu->kvm)); if (has_fpmr) - __vcpu_sys_reg(vcpu, FPMR) = read_sysreg_s(SYS_FPMR); + __vcpu_assign_sys_reg(vcpu, FPMR, read_sysreg_s(SYS_FPMR)); if (system_supports_sve()) __hyp_sve_restore_host(); diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index c9b330dc2066..09df2b42bc1b 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -223,9 +223,9 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) */ val = read_sysreg_el0(SYS_CNTP_CVAL); if (map.direct_ptimer == vcpu_ptimer(vcpu)) - __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = val; + __vcpu_assign_sys_reg(vcpu, CNTP_CVAL_EL0, val); if (map.direct_ptimer == vcpu_hptimer(vcpu)) - __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2) = val; + __vcpu_assign_sys_reg(vcpu, CNTHP_CVAL_EL2, val); offset = read_sysreg_s(SYS_CNTPOFF_EL2); diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 3814b0b2c937..34c7bf7fe9de 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -18,17 +18,17 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) { /* These registers are common with EL1 */ - __vcpu_sys_reg(vcpu, PAR_EL1) = read_sysreg(par_el1); - __vcpu_sys_reg(vcpu, TPIDR_EL1) = read_sysreg(tpidr_el1); + __vcpu_assign_sys_reg(vcpu, PAR_EL1, read_sysreg(par_el1)); + __vcpu_assign_sys_reg(vcpu, TPIDR_EL1, read_sysreg(tpidr_el1)); - __vcpu_sys_reg(vcpu, ESR_EL2) = read_sysreg_el1(SYS_ESR); - __vcpu_sys_reg(vcpu, AFSR0_EL2) = read_sysreg_el1(SYS_AFSR0); - __vcpu_sys_reg(vcpu, AFSR1_EL2) = read_sysreg_el1(SYS_AFSR1); - __vcpu_sys_reg(vcpu, FAR_EL2) = read_sysreg_el1(SYS_FAR); - __vcpu_sys_reg(vcpu, MAIR_EL2) = read_sysreg_el1(SYS_MAIR); - __vcpu_sys_reg(vcpu, VBAR_EL2) = read_sysreg_el1(SYS_VBAR); - __vcpu_sys_reg(vcpu, CONTEXTIDR_EL2) = read_sysreg_el1(SYS_CONTEXTIDR); - __vcpu_sys_reg(vcpu, AMAIR_EL2) = read_sysreg_el1(SYS_AMAIR); + __vcpu_assign_sys_reg(vcpu, ESR_EL2, read_sysreg_el1(SYS_ESR)); + __vcpu_assign_sys_reg(vcpu, AFSR0_EL2, read_sysreg_el1(SYS_AFSR0)); + __vcpu_assign_sys_reg(vcpu, AFSR1_EL2, read_sysreg_el1(SYS_AFSR1)); + __vcpu_assign_sys_reg(vcpu, FAR_EL2, read_sysreg_el1(SYS_FAR)); + __vcpu_assign_sys_reg(vcpu, MAIR_EL2, read_sysreg_el1(SYS_MAIR)); + __vcpu_assign_sys_reg(vcpu, VBAR_EL2, read_sysreg_el1(SYS_VBAR)); + __vcpu_assign_sys_reg(vcpu, CONTEXTIDR_EL2, read_sysreg_el1(SYS_CONTEXTIDR)); + __vcpu_assign_sys_reg(vcpu, AMAIR_EL2, read_sysreg_el1(SYS_AMAIR)); /* * In VHE mode those registers are compatible between EL1 and EL2, @@ -46,21 +46,21 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) * are always trapped, ensuring that the in-memory * copy is always up-to-date. A small blessing... */ - __vcpu_sys_reg(vcpu, SCTLR_EL2) = read_sysreg_el1(SYS_SCTLR); - __vcpu_sys_reg(vcpu, TTBR0_EL2) = read_sysreg_el1(SYS_TTBR0); - __vcpu_sys_reg(vcpu, TTBR1_EL2) = read_sysreg_el1(SYS_TTBR1); - __vcpu_sys_reg(vcpu, TCR_EL2) = read_sysreg_el1(SYS_TCR); + __vcpu_assign_sys_reg(vcpu, SCTLR_EL2, read_sysreg_el1(SYS_SCTLR)); + __vcpu_assign_sys_reg(vcpu, TTBR0_EL2, read_sysreg_el1(SYS_TTBR0)); + __vcpu_assign_sys_reg(vcpu, TTBR1_EL2, read_sysreg_el1(SYS_TTBR1)); + __vcpu_assign_sys_reg(vcpu, TCR_EL2, read_sysreg_el1(SYS_TCR)); if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { - __vcpu_sys_reg(vcpu, TCR2_EL2) = read_sysreg_el1(SYS_TCR2); + __vcpu_assign_sys_reg(vcpu, TCR2_EL2, read_sysreg_el1(SYS_TCR2)); if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { - __vcpu_sys_reg(vcpu, PIRE0_EL2) = read_sysreg_el1(SYS_PIRE0); - __vcpu_sys_reg(vcpu, PIR_EL2) = read_sysreg_el1(SYS_PIR); + __vcpu_assign_sys_reg(vcpu, PIRE0_EL2, read_sysreg_el1(SYS_PIRE0)); + __vcpu_assign_sys_reg(vcpu, PIR_EL2, read_sysreg_el1(SYS_PIR)); } if (ctxt_has_s1poe(&vcpu->arch.ctxt)) - __vcpu_sys_reg(vcpu, POR_EL2) = read_sysreg_el1(SYS_POR); + __vcpu_assign_sys_reg(vcpu, POR_EL2, read_sysreg_el1(SYS_POR)); } /* @@ -74,9 +74,9 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) __vcpu_sys_reg(vcpu, CNTHCTL_EL2) |= val; } - __vcpu_sys_reg(vcpu, SP_EL2) = read_sysreg(sp_el1); - __vcpu_sys_reg(vcpu, ELR_EL2) = read_sysreg_el1(SYS_ELR); - __vcpu_sys_reg(vcpu, SPSR_EL2) = read_sysreg_el1(SYS_SPSR); + __vcpu_assign_sys_reg(vcpu, SP_EL2, read_sysreg(sp_el1)); + __vcpu_assign_sys_reg(vcpu, ELR_EL2, read_sysreg_el1(SYS_ELR)); + __vcpu_assign_sys_reg(vcpu, SPSR_EL2, read_sysreg_el1(SYS_SPSR)); } static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 25c29107f13f..4f0ae8073f78 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -178,7 +178,7 @@ static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force) val |= lower_32_bits(val); } - __vcpu_sys_reg(vcpu, reg) = val; + __vcpu_assign_sys_reg(vcpu, reg, val); /* Recreate the perf event to reflect the updated sample_period */ kvm_pmu_create_perf_event(pmc); @@ -204,7 +204,7 @@ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) { kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, select_idx)); - __vcpu_sys_reg(vcpu, counter_index_to_reg(select_idx)) = val; + __vcpu_assign_sys_reg(vcpu, counter_index_to_reg(select_idx), val); kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); } @@ -239,7 +239,7 @@ static void kvm_pmu_stop_counter(struct kvm_pmc *pmc) reg = counter_index_to_reg(pmc->idx); - __vcpu_sys_reg(vcpu, reg) = val; + __vcpu_assign_sys_reg(vcpu, reg, val); kvm_pmu_release_perf_event(pmc); } @@ -503,7 +503,7 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, reg = __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) + 1; if (!kvm_pmc_is_64bit(pmc)) reg = lower_32_bits(reg); - __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) = reg; + __vcpu_assign_sys_reg(vcpu, counter_index_to_reg(i), reg); /* No overflow? move on */ if (kvm_pmc_has_64bit_overflow(pmc) ? reg : lower_32_bits(reg)) @@ -602,7 +602,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); /* The reset bits don't indicate any state, and shouldn't be saved. */ - __vcpu_sys_reg(vcpu, PMCR_EL0) = val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P); + __vcpu_assign_sys_reg(vcpu, PMCR_EL0, (val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P))); if (val & ARMV8_PMU_PMCR_C) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); @@ -779,7 +779,7 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, u64 reg; reg = counter_index_to_evtreg(pmc->idx); - __vcpu_sys_reg(vcpu, reg) = data & kvm_pmu_evtyper_mask(vcpu->kvm); + __vcpu_assign_sys_reg(vcpu, reg, (data & kvm_pmu_evtyper_mask(vcpu->kvm))); kvm_pmu_create_perf_event(pmc); } @@ -1038,7 +1038,7 @@ static void kvm_arm_set_nr_counters(struct kvm *kvm, unsigned int nr) u64 val = __vcpu_sys_reg(vcpu, MDCR_EL2); val &= ~MDCR_EL2_HPMN; val |= FIELD_PREP(MDCR_EL2_HPMN, kvm->arch.nr_pmu_counters); - __vcpu_sys_reg(vcpu, MDCR_EL2) = val; + __vcpu_assign_sys_reg(vcpu, MDCR_EL2, val); } } } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 707c651aff03..93d0ca7ed936 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -228,7 +228,7 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) * to reverse-translate virtual EL2 system registers for a * non-VHE guest hypervisor. */ - __vcpu_sys_reg(vcpu, reg) = val; + __vcpu_assign_sys_reg(vcpu, reg, val); switch (reg) { case CNTHCTL_EL2: @@ -263,7 +263,7 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) return; memory_write: - __vcpu_sys_reg(vcpu, reg) = val; + __vcpu_assign_sys_reg(vcpu, reg, val); } /* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */ @@ -605,7 +605,7 @@ static int set_oslsr_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, if ((val ^ rd->val) & ~OSLSR_EL1_OSLK) return -EINVAL; - __vcpu_sys_reg(vcpu, rd->reg) = val; + __vcpu_assign_sys_reg(vcpu, rd->reg, val); return 0; } @@ -835,7 +835,7 @@ static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) * The value of PMCR.N field is included when the * vCPU register is read via kvm_vcpu_read_pmcr(). */ - __vcpu_sys_reg(vcpu, r->reg) = pmcr; + __vcpu_assign_sys_reg(vcpu, r->reg, pmcr); return __vcpu_sys_reg(vcpu, r->reg); } @@ -907,7 +907,7 @@ static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return false; if (p->is_write) - __vcpu_sys_reg(vcpu, PMSELR_EL0) = p->regval; + __vcpu_assign_sys_reg(vcpu, PMSELR_EL0, p->regval); else /* return PMSELR.SEL field */ p->regval = __vcpu_sys_reg(vcpu, PMSELR_EL0) @@ -1076,7 +1076,7 @@ static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 va { u64 mask = kvm_pmu_accessible_counter_mask(vcpu); - __vcpu_sys_reg(vcpu, r->reg) = val & mask; + __vcpu_assign_sys_reg(vcpu, r->reg, val & mask); kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); return 0; @@ -1185,8 +1185,8 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (!vcpu_mode_priv(vcpu)) return undef_access(vcpu, p, r); - __vcpu_sys_reg(vcpu, PMUSERENR_EL0) = - p->regval & ARMV8_PMU_USERENR_MASK; + __vcpu_assign_sys_reg(vcpu, PMUSERENR_EL0, + (p->regval & ARMV8_PMU_USERENR_MASK)); } else { p->regval = __vcpu_sys_reg(vcpu, PMUSERENR_EL0) & ARMV8_PMU_USERENR_MASK; @@ -1237,7 +1237,7 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, if (!kvm_supports_32bit_el0()) val |= ARMV8_PMU_PMCR_LC; - __vcpu_sys_reg(vcpu, r->reg) = val; + __vcpu_assign_sys_reg(vcpu, r->reg, val); kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); return 0; @@ -2207,7 +2207,7 @@ static u64 reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) if (kvm_has_mte(vcpu->kvm)) clidr |= 2ULL << CLIDR_TTYPE_SHIFT(loc); - __vcpu_sys_reg(vcpu, r->reg) = clidr; + __vcpu_assign_sys_reg(vcpu, r->reg, clidr); return __vcpu_sys_reg(vcpu, r->reg); } @@ -2221,7 +2221,7 @@ static int set_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, if ((val & CLIDR_EL1_RES0) || (!(ctr_el0 & CTR_EL0_IDC) && idc)) return -EINVAL; - __vcpu_sys_reg(vcpu, rd->reg) = val; + __vcpu_assign_sys_reg(vcpu, rd->reg, val); return 0; } @@ -2398,7 +2398,7 @@ static bool access_sp_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { if (p->is_write) - __vcpu_sys_reg(vcpu, SP_EL1) = p->regval; + __vcpu_assign_sys_reg(vcpu, SP_EL1, p->regval); else p->regval = __vcpu_sys_reg(vcpu, SP_EL1); @@ -2422,7 +2422,7 @@ static bool access_spsr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { if (p->is_write) - __vcpu_sys_reg(vcpu, SPSR_EL1) = p->regval; + __vcpu_assign_sys_reg(vcpu, SPSR_EL1, p->regval); else p->regval = __vcpu_sys_reg(vcpu, SPSR_EL1); @@ -2434,7 +2434,7 @@ static bool access_cntkctl_el12(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { if (p->is_write) - __vcpu_sys_reg(vcpu, CNTKCTL_EL1) = p->regval; + __vcpu_assign_sys_reg(vcpu, CNTKCTL_EL1, p->regval); else p->regval = __vcpu_sys_reg(vcpu, CNTKCTL_EL1); @@ -2448,7 +2448,9 @@ static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1)) val |= HCR_E2H; - return __vcpu_sys_reg(vcpu, r->reg) = val; + __vcpu_assign_sys_reg(vcpu, r->reg, val); + + return __vcpu_sys_reg(vcpu, r->reg); } static unsigned int __el2_visibility(const struct kvm_vcpu *vcpu, @@ -2619,7 +2621,7 @@ static bool access_mdcr(struct kvm_vcpu *vcpu, u64_replace_bits(val, hpmn, MDCR_EL2_HPMN); } - __vcpu_sys_reg(vcpu, MDCR_EL2) = val; + __vcpu_assign_sys_reg(vcpu, MDCR_EL2, val); /* * Request a reload of the PMU to enable/disable the counters @@ -2748,7 +2750,7 @@ static int set_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, static u64 reset_mdcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - __vcpu_sys_reg(vcpu, r->reg) = vcpu->kvm->arch.nr_pmu_counters; + __vcpu_assign_sys_reg(vcpu, r->reg, vcpu->kvm->arch.nr_pmu_counters); return vcpu->kvm->arch.nr_pmu_counters; } @@ -5006,7 +5008,7 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, if (r->set_user) { ret = (r->set_user)(vcpu, r, val); } else { - __vcpu_sys_reg(vcpu, r->reg) = val; + __vcpu_assign_sys_reg(vcpu, r->reg, val); ret = 0; } diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index cc6338d38766..ef97d9fc67cc 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -137,7 +137,7 @@ static inline u64 reset_unknown(struct kvm_vcpu *vcpu, { BUG_ON(!r->reg); BUG_ON(r->reg >= NR_SYS_REGS); - __vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL; + __vcpu_assign_sys_reg(vcpu, r->reg, 0x1de7ec7edbadc0deULL); return __vcpu_sys_reg(vcpu, r->reg); } @@ -145,7 +145,7 @@ static inline u64 reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { BUG_ON(!r->reg); BUG_ON(r->reg >= NR_SYS_REGS); - __vcpu_sys_reg(vcpu, r->reg) = r->val; + __vcpu_assign_sys_reg(vcpu, r->reg, r->val); return __vcpu_sys_reg(vcpu, r->reg); } diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index 4f6954c30674..d22a8ad7bcc5 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -356,12 +356,12 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu) val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); val &= ~ICH_HCR_EL2_EOIcount_MASK; val |= (s_cpu_if->vgic_hcr & ICH_HCR_EL2_EOIcount_MASK); - __vcpu_sys_reg(vcpu, ICH_HCR_EL2) = val; - __vcpu_sys_reg(vcpu, ICH_VMCR_EL2) = s_cpu_if->vgic_vmcr; + __vcpu_assign_sys_reg(vcpu, ICH_HCR_EL2, val); + __vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, s_cpu_if->vgic_vmcr); for (i = 0; i < 4; i++) { - __vcpu_sys_reg(vcpu, ICH_AP0RN(i)) = s_cpu_if->vgic_ap0r[i]; - __vcpu_sys_reg(vcpu, ICH_AP1RN(i)) = s_cpu_if->vgic_ap1r[i]; + __vcpu_assign_sys_reg(vcpu, ICH_AP0RN(i), s_cpu_if->vgic_ap0r[i]); + __vcpu_assign_sys_reg(vcpu, ICH_AP1RN(i), s_cpu_if->vgic_ap1r[i]); } for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { @@ -370,7 +370,7 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu) val &= ~ICH_LR_STATE; val |= s_cpu_if->vgic_lr[i] & ICH_LR_STATE; - __vcpu_sys_reg(vcpu, ICH_LRN(i)) = val; + __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val); s_cpu_if->vgic_lr[i] = 0; } From 8800b7c4bbede3cd40831726d3f98e8080baf4df Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 3 Jun 2025 08:08:22 +0100 Subject: [PATCH 005/242] KVM: arm64: Add RMW specific sysreg accessor In a number of cases, we perform a Read-Modify-Write operation on a system register, meaning that we would apply the RESx masks twice. Instead, provide a new accessor that performs this RMW operation, allowing the masks to be applied exactly once per operation. Reviewed-by: Miguel Luis Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250603070824.1192795-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 11 +++++++++++ arch/arm64/kvm/debug.c | 4 ++-- arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 4 ++-- arch/arm64/kvm/nested.c | 2 +- arch/arm64/kvm/pmu-emul.c | 10 +++++----- arch/arm64/kvm/sys_regs.c | 22 +++++++++++----------- 6 files changed, 32 insertions(+), 21 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3b84ad91116b..8b8c649f225b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1118,6 +1118,17 @@ u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *, enum vcpu_sysreg, u64); ctxt_sys_reg(ctxt, (r)) = __v; \ } while (0) +#define __vcpu_rmw_sys_reg(v, r, op, val) \ + do { \ + const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt; \ + u64 __v = ctxt_sys_reg(ctxt, (r)); \ + __v op (val); \ + if (vcpu_has_nv((v)) && (r) >= __SANITISED_REG_START__) \ + __v = kvm_vcpu_apply_reg_masks((v), (r), __v); \ + \ + ctxt_sys_reg(ctxt, (r)) = __v; \ + } while (0) + #define __vcpu_sys_reg(v,r) \ (*({ \ const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt; \ diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 0e4c805e7e89..1a7dab333f55 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -216,9 +216,9 @@ void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu) void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val) { if (val & OSLAR_EL1_OSLK) - __vcpu_sys_reg(vcpu, OSLSR_EL1) |= OSLSR_EL1_OSLK; + __vcpu_rmw_sys_reg(vcpu, OSLSR_EL1, |=, OSLSR_EL1_OSLK); else - __vcpu_sys_reg(vcpu, OSLSR_EL1) &= ~OSLSR_EL1_OSLK; + __vcpu_rmw_sys_reg(vcpu, OSLSR_EL1, &=, ~OSLSR_EL1_OSLK); preempt_disable(); kvm_arch_vcpu_put(vcpu); diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 34c7bf7fe9de..73e4bc7fde9e 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -70,8 +70,8 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) */ val = read_sysreg_el1(SYS_CNTKCTL); val &= CNTKCTL_VALID_BITS; - __vcpu_sys_reg(vcpu, CNTHCTL_EL2) &= ~CNTKCTL_VALID_BITS; - __vcpu_sys_reg(vcpu, CNTHCTL_EL2) |= val; + __vcpu_rmw_sys_reg(vcpu, CNTHCTL_EL2, &=, ~CNTKCTL_VALID_BITS); + __vcpu_rmw_sys_reg(vcpu, CNTHCTL_EL2, |=, val); } __vcpu_assign_sys_reg(vcpu, SP_EL2, read_sysreg(sp_el1)); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 4a53e4147fb0..5b191f4dc566 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1757,7 +1757,7 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) out: for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++) - (void)__vcpu_sys_reg(vcpu, sr); + __vcpu_rmw_sys_reg(vcpu, sr, |=, 0); return 0; } diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 4f0ae8073f78..b03dbda7f1ab 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -510,7 +510,7 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, continue; /* Mark overflow */ - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, BIT(i)); if (kvm_pmu_counter_can_chain(pmc)) kvm_pmu_counter_increment(vcpu, BIT(i + 1), @@ -556,7 +556,7 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, perf_event->attr.sample_period = period; perf_event->hw.sample_period = period; - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx); + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, BIT(idx)); if (kvm_pmu_counter_can_chain(pmc)) kvm_pmu_counter_increment(vcpu, BIT(idx + 1), @@ -914,9 +914,9 @@ void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) { u64 mask = kvm_pmu_implemented_counter_mask(vcpu); - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= mask; - __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= mask; - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= mask; + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, &=, mask); + __vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, &=, mask); + __vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, &=, mask); kvm_pmu_reprogram_counter_mask(vcpu, mask); } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 93d0ca7ed936..e89d345e42d0 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -791,7 +791,7 @@ static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) mask |= GENMASK(n - 1, 0); reset_unknown(vcpu, r); - __vcpu_sys_reg(vcpu, r->reg) &= mask; + __vcpu_rmw_sys_reg(vcpu, r->reg, &=, mask); return __vcpu_sys_reg(vcpu, r->reg); } @@ -799,7 +799,7 @@ static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) static u64 reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); - __vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0); + __vcpu_rmw_sys_reg(vcpu, r->reg, &=, GENMASK(31, 0)); return __vcpu_sys_reg(vcpu, r->reg); } @@ -811,7 +811,7 @@ static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) return 0; reset_unknown(vcpu, r); - __vcpu_sys_reg(vcpu, r->reg) &= kvm_pmu_evtyper_mask(vcpu->kvm); + __vcpu_rmw_sys_reg(vcpu, r->reg, &=, kvm_pmu_evtyper_mask(vcpu->kvm)); return __vcpu_sys_reg(vcpu, r->reg); } @@ -819,7 +819,7 @@ static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); - __vcpu_sys_reg(vcpu, r->reg) &= PMSELR_EL0_SEL_MASK; + __vcpu_rmw_sys_reg(vcpu, r->reg, &=, PMSELR_EL0_SEL_MASK); return __vcpu_sys_reg(vcpu, r->reg); } @@ -1103,10 +1103,10 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, val = p->regval & mask; if (r->Op2 & 0x1) /* accessing PMCNTENSET_EL0 */ - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val; + __vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, |=, val); else /* accessing PMCNTENCLR_EL0 */ - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val; + __vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, &=, ~val); kvm_pmu_reprogram_counter_mask(vcpu, val); } else { @@ -1129,10 +1129,10 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (r->Op2 & 0x1) /* accessing PMINTENSET_EL1 */ - __vcpu_sys_reg(vcpu, PMINTENSET_EL1) |= val; + __vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, |=, val); else /* accessing PMINTENCLR_EL1 */ - __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val; + __vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, &=, ~val); } else { p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1); } @@ -1151,10 +1151,10 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (p->is_write) { if (r->CRm & 0x2) /* accessing PMOVSSET_EL0 */ - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= (p->regval & mask); + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, (p->regval & mask)); else /* accessing PMOVSCLR_EL0 */ - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask); + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, &=, ~(p->regval & mask)); } else { p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); } @@ -4786,7 +4786,7 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu) r->reset(vcpu, r); if (r->reg >= __SANITISED_REG_START__ && r->reg < NR_SYS_REGS) - (void)__vcpu_sys_reg(vcpu, r->reg); + __vcpu_rmw_sys_reg(vcpu, r->reg, |=, 0); } set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); From b61fae4ee120f337b8700dff43b2fd639f3bf6a5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 3 Jun 2025 08:08:23 +0100 Subject: [PATCH 006/242] KVM: arm64: Don't use __vcpu_sys_reg() to get the address of a sysreg We are about to prevent the use of __vcpu_sys_reg() as a lvalue, and getting the address of a rvalue is not a thing. Update the couple of places where we do this to use the __ctxt_sys_reg() accessor, which return the address of a register. Reviewed-by: Miguel Luis Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250603070824.1192795-4-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 2 +- arch/arm64/kvm/fpsimd.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 5a67a6d4d95b..c6b4fb4c8e08 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -1036,7 +1036,7 @@ void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) if (vcpu_has_nv(vcpu)) { struct arch_timer_offset *offs = &vcpu_vtimer(vcpu)->offset; - offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2); + offs->vcpu_offset = __ctxt_sys_reg(&vcpu->arch.ctxt, CNTVOFF_EL2); offs->vm_offset = &vcpu->kvm->arch.timer_data.poffset; } diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 7f6e43d25691..8f6c8f57c6b9 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -103,8 +103,8 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) fp_state.sve_state = vcpu->arch.sve_state; fp_state.sve_vl = vcpu->arch.sve_max_vl; fp_state.sme_state = NULL; - fp_state.svcr = &__vcpu_sys_reg(vcpu, SVCR); - fp_state.fpmr = &__vcpu_sys_reg(vcpu, FPMR); + fp_state.svcr = __ctxt_sys_reg(&vcpu->arch.ctxt, SVCR); + fp_state.fpmr = __ctxt_sys_reg(&vcpu->arch.ctxt, FPMR); fp_state.fp_type = &vcpu->arch.fp_type; if (vcpu_has_sve(vcpu)) From b5fa1f91e11fdf74ad4e2ac6dae246a57cbd2d95 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 3 Jun 2025 08:08:24 +0100 Subject: [PATCH 007/242] KVM: arm64: Make __vcpu_sys_reg() a pure rvalue operand Now that we don't have any use of __vcpu_sys_reg() as a lvalue, remove the in-place update, and directly return the sanitised value. Reviewed-by: Miguel Luis Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250603070824.1192795-5-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 8b8c649f225b..382b382d14da 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1130,13 +1130,13 @@ u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *, enum vcpu_sysreg, u64); } while (0) #define __vcpu_sys_reg(v,r) \ - (*({ \ + ({ \ const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt; \ - u64 *__r = __ctxt_sys_reg(ctxt, (r)); \ + u64 __v = ctxt_sys_reg(ctxt, (r)); \ if (vcpu_has_nv((v)) && (r) >= __SANITISED_REG_START__) \ - *__r = kvm_vcpu_apply_reg_masks((v), (r), *__r);\ - __r; \ - })) + __v = kvm_vcpu_apply_reg_masks((v), (r), __v); \ + __v; \ + }) u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg); void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg); From 9a9864fd09c7e52440c05e70609bf5ee8da425d0 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 5 Jun 2025 12:36:10 +0200 Subject: [PATCH 008/242] KVM: arm64: selftests: Fix help text for arch_timer_edge_cases Fix the help text for arch_timer_edge_cases to show the correct option for setting the wait time. Signed-off-by: Sebastian Ott Link: https://lore.kernel.org/r/20250605103613.14544-2-sebott@redhat.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c index a36a7e2db434..c4716e0c1438 100644 --- a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c +++ b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c @@ -986,7 +986,7 @@ static void test_print_help(char *name) pr_info("\t-b: Test both physical and virtual timers (default: true)\n"); pr_info("\t-l: Delta (in ms) used for long wait time test (default: %u)\n", LONG_WAIT_TEST_MS); - pr_info("\t-l: Delta (in ms) used for wait times (default: %u)\n", + pr_info("\t-w: Delta (in ms) used for wait times (default: %u)\n", WAIT_TEST_MS); pr_info("\t-p: Test physical timer (default: true)\n"); pr_info("\t-v: Test virtual timer (default: true)\n"); From 050632ae6571d0f148fa0d4b90b6409a12645461 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 5 Jun 2025 12:36:11 +0200 Subject: [PATCH 009/242] KVM: arm64: selftests: Fix thread migration in arch_timer_edge_cases arch_timer_edge_cases tries to migrate itself across host cpus. Before the first test, it migrates to cpu 0 by setting up an affinity mask with only bit 0 set. After that it looks for the next possible cpu in the current affinity mask which still has only bit 0 set. So there is no migration at all. Fix this by reading the default mask at start and use this to find the next cpu in each iteration. Signed-off-by: Sebastian Ott Link: https://lore.kernel.org/r/20250605103613.14544-3-sebott@redhat.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c index c4716e0c1438..a813b4c6c817 100644 --- a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c +++ b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c @@ -849,17 +849,17 @@ static void guest_code(enum arch_timer timer) GUEST_DONE(); } +static cpu_set_t default_cpuset; + static uint32_t next_pcpu(void) { uint32_t max = get_nprocs(); uint32_t cur = sched_getcpu(); uint32_t next = cur; - cpu_set_t cpuset; + cpu_set_t cpuset = default_cpuset; TEST_ASSERT(max > 1, "Need at least two physical cpus"); - sched_getaffinity(0, sizeof(cpuset), &cpuset); - do { next = (next + 1) % CPU_SETSIZE; } while (!CPU_ISSET(next, &cpuset)); @@ -1046,6 +1046,8 @@ int main(int argc, char *argv[]) if (!parse_args(argc, argv)) exit(KSFT_SKIP); + sched_getaffinity(0, sizeof(default_cpuset), &default_cpuset); + if (test_args.test_virtual) { test_vm_create(&vm, &vcpu, VIRTUAL); test_run(vm, vcpu); From 05ce38d489dbc96eb1dd700b287f949cb8cc0610 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 5 Jun 2025 12:36:12 +0200 Subject: [PATCH 010/242] KVM: arm64: selftests: Fix xVAL init in arch_timer_edge_cases arch_timer_edge_cases hits the following assertion in < 10% of the test runs: ==== Test Assertion Failure ==== arm64/arch_timer_edge_cases.c:490: timer_get_cntct(timer) >= DEF_CNT + (timer_get_cntfrq() * (uint64_t)(delta_2_ms) / 1000) pid=17110 tid=17110 errno=4 - Interrupted system call 1 0x0000000000404ec7: test_run at arch_timer_edge_cases.c:945 2 0x0000000000401fa3: main at arch_timer_edge_cases.c:1074 3 0x0000ffffa774b587: ?? ??:0 4 0x0000ffffa774b65f: ?? ??:0 5 0x000000000040206f: _start at ??:? timer_get_cntct(timer) >= DEF_CNT + msec_to_cycles(delta_2_ms) Enabling the timer without proper xval initialization in set_tval_irq() resulted in an early interrupt during timer reprogramming. Make sure to set the xval before setting the enable bit. Signed-off-by: Sebastian Ott Link: https://lore.kernel.org/r/20250605103613.14544-4-sebott@redhat.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c index a813b4c6c817..be8bbedc933b 100644 --- a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c +++ b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c @@ -191,8 +191,8 @@ static void set_tval_irq(enum arch_timer timer, uint64_t tval_cycles, { atomic_set(&shared_data.handled, 0); atomic_set(&shared_data.spurious, 0); - timer_set_ctl(timer, ctl); timer_set_tval(timer, tval_cycles); + timer_set_ctl(timer, ctl); } static void set_xval_irq(enum arch_timer timer, uint64_t xval, uint32_t ctl, From fad4cf944839da7f5c3376243aa353295c88f588 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 5 Jun 2025 12:36:13 +0200 Subject: [PATCH 011/242] KVM: arm64: selftests: Determine effective counter width in arch_timer_edge_cases arch_timer_edge_cases uses ~0 as the maximum counter value, however there's no architectural guarantee that this is valid. Figure out the effective counter width based on the effective frequency like it's done by the kernel. This also serves as a workaround for AC03_CPU_14 that led to the following assertion failure on ampere-one machines: ==== Test Assertion Failure ==== arm64/arch_timer_edge_cases.c:169: timer_condition == istatus pid=11236 tid=11236 errno=4 - Interrupted system call 1 0x0000000000404ce7: test_run at arch_timer_edge_cases.c:938 2 0x0000000000401ebb: main at arch_timer_edge_cases.c:1053 3 0x0000ffff9fa8625b: ?? ??:0 4 0x0000ffff9fa8633b: ?? ??:0 5 0x0000000000401fef: _start at ??:? 0x1 != 0x0 (timer_condition != istatus) Note that the following subtest only worked since the counter initialized with CVAL_MAX would instantly overflow (which is no longer the case): test_set_cnt_after_cval_no_irq(timer, 0, DEF_CNT, CVAL_MAX, sm); To fix this we could swap CVAL_MAX for 0 here but since that is already done by test_move_counters_behind_timers() let's remove that subtest. Link: https://lore.kernel.org/kvmarm/ac1de1d2-ef2b-d439-dc48-8615e121b07b@redhat.com Link: https://amperecomputing.com/assets/AmpereOne_Developer_ER_v0_80_20240823_28945022f4.pdf Signed-off-by: Sebastian Ott Link: https://lore.kernel.org/r/20250605103613.14544-5-sebott@redhat.com Signed-off-by: Marc Zyngier --- .../kvm/arm64/arch_timer_edge_cases.c | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c index be8bbedc933b..b4d22b3ab7cc 100644 --- a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c +++ b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c @@ -22,7 +22,8 @@ #include "gic.h" #include "vgic.h" -static const uint64_t CVAL_MAX = ~0ULL; +/* Depends on counter width. */ +static uint64_t CVAL_MAX; /* tval is a signed 32-bit int. */ static const int32_t TVAL_MAX = INT32_MAX; static const int32_t TVAL_MIN = INT32_MIN; @@ -30,8 +31,8 @@ static const int32_t TVAL_MIN = INT32_MIN; /* After how much time we say there is no IRQ. */ static const uint32_t TIMEOUT_NO_IRQ_US = 50000; -/* A nice counter value to use as the starting one for most tests. */ -static const uint64_t DEF_CNT = (CVAL_MAX / 2); +/* Counter value to use as the starting one for most tests. Set to CVAL_MAX/2 */ +static uint64_t DEF_CNT; /* Number of runs. */ static const uint32_t NR_TEST_ITERS_DEF = 5; @@ -732,12 +733,6 @@ static void test_move_counters_ahead_of_timers(enum arch_timer timer) test_set_cnt_after_tval(timer, 0, tval, (uint64_t) tval + 1, wm); } - - for (i = 0; i < ARRAY_SIZE(sleep_method); i++) { - sleep_method_t sm = sleep_method[i]; - - test_set_cnt_after_cval_no_irq(timer, 0, DEF_CNT, CVAL_MAX, sm); - } } /* @@ -975,6 +970,8 @@ static void test_vm_create(struct kvm_vm **vm, struct kvm_vcpu **vcpu, test_init_timer_irq(*vm, *vcpu); vgic_v3_setup(*vm, 1, 64); sync_global_to_guest(*vm, test_args); + sync_global_to_guest(*vm, CVAL_MAX); + sync_global_to_guest(*vm, DEF_CNT); } static void test_print_help(char *name) @@ -1035,6 +1032,17 @@ static bool parse_args(int argc, char *argv[]) return false; } +static void set_counter_defaults(void) +{ + const uint64_t MIN_ROLLOVER_SECS = 40ULL * 365 * 24 * 3600; + uint64_t freq = read_sysreg(CNTFRQ_EL0); + uint64_t width = ilog2(MIN_ROLLOVER_SECS * freq); + + width = clamp(width, 56, 64); + CVAL_MAX = GENMASK_ULL(width - 1, 0); + DEF_CNT = CVAL_MAX / 2; +} + int main(int argc, char *argv[]) { struct kvm_vcpu *vcpu; @@ -1047,6 +1055,7 @@ int main(int argc, char *argv[]) exit(KSFT_SKIP); sched_getaffinity(0, sizeof(default_cpuset), &default_cpuset); + set_counter_defaults(); if (test_args.test_virtual) { test_vm_create(&vm, &vcpu, VIRTUAL); From 800d0b9b6a8b1b354637b4194cc167ad1ce2bdd3 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Thu, 5 Jun 2025 12:51:16 -0400 Subject: [PATCH 012/242] fs/xattr.c: fix simple_xattr_list() commit 8b0ba61df5a1 ("fs/xattr.c: fix simple_xattr_list to always include security.* xattrs") failed to reset err after the call to security_inode_listsecurity(), which returns the length of the returned xattr name. This results in simple_xattr_list() incorrectly returning this length even if a POSIX acl is also set on the inode. Reported-by: Collin Funk Closes: https://lore.kernel.org/selinux/8734ceal7q.fsf@gmail.com/ Reported-by: Paul Eggert Closes: https://bugzilla.redhat.com/show_bug.cgi?id=2369561 Fixes: 8b0ba61df5a1 ("fs/xattr.c: fix simple_xattr_list to always include security.* xattrs") Signed-off-by: Stephen Smalley Link: https://lore.kernel.org/20250605165116.2063-1-stephen.smalley.work@gmail.com Signed-off-by: Christian Brauner --- fs/xattr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xattr.c b/fs/xattr.c index 8ec5b0204bfd..600ae97969cf 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1479,6 +1479,7 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, buffer += err; } remaining_size -= err; + err = 0; read_lock(&xattrs->lock); for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { From 11fcf368506d347088e613edf6cd2604d70c454f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 6 Jun 2025 10:23:57 +0200 Subject: [PATCH 013/242] uapi: bitops: use UAPI-safe variant of BITS_PER_LONG again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 1e7933a575ed ("uapi: Revert "bitops: avoid integer overflow in GENMASK(_ULL)"") did not take in account that the usage of BITS_PER_LONG in __GENMASK() was changed to __BITS_PER_LONG for UAPI-safety in commit 3c7a8e190bc5 ("uapi: introduce uapi-friendly macros for GENMASK"). BITS_PER_LONG can not be used in UAPI headers as it derives from the kernel configuration and not from the current compiler invocation. When building compat userspace code or a compat vDSO its value will be incorrect. Switch back to __BITS_PER_LONG. Fixes: 1e7933a575ed ("uapi: Revert "bitops: avoid integer overflow in GENMASK(_ULL)"") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Signed-off-by: Yury Norov [NVIDIA] --- include/uapi/linux/bits.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bits.h b/include/uapi/linux/bits.h index 682b406e1067..a04afef9efca 100644 --- a/include/uapi/linux/bits.h +++ b/include/uapi/linux/bits.h @@ -4,9 +4,9 @@ #ifndef _UAPI_LINUX_BITS_H #define _UAPI_LINUX_BITS_H -#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (BITS_PER_LONG - 1 - (h)))) +#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (__BITS_PER_LONG - 1 - (h)))) -#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h)))) +#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h)))) #define __GENMASK_U128(h, l) \ ((_BIT128((h)) << 1) - (_BIT128(l))) From c0f691388992c708436ab5f6e810865be6ddf5c6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 5 Jun 2025 17:04:11 +0200 Subject: [PATCH 014/242] intel_idle: Use subsys_initcall_sync() for initialization It is not necessary to wait until the device_initcall() stage with intel_idle initialization. All of its dependencies are met after all subsys_initcall()s have run, so subsys_initcall_sync() can be used for initializing it. It is also better to ensure that intel_idle will always initialize before the ACPI processor driver that uses module_init() for its initialization. Signed-off-by: Rafael J. Wysocki Tested-by: Artem Bityutskiy Link: https://patch.msgid.link/2994397.e9J7NaK4W3@rjwysocki.net --- drivers/idle/intel_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 8ccb483204fa..64ac4da08094 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -2518,7 +2518,7 @@ static int __init intel_idle_init(void) return retval; } -device_initcall(intel_idle_init); +subsys_initcall_sync(intel_idle_init); /* * We are not really modular, but we used to support that. Meaning we also From 4c529a4a7260776bb4abe264498857b4537aa70d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 7 Jun 2025 14:22:56 +0200 Subject: [PATCH 015/242] x86/smp: PM/hibernate: Split arch_resume_nosmt() Move the inner part of the arch_resume_nosmt() code into a separate function called arch_cpu_rescan_dead_smt_siblings(), so it can be used in other places where "dead" SMT siblings may need to be taken online and offline again in order to get into deep idle states. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Dave Hansen Tested-by: Artem Bityutskiy Link: https://patch.msgid.link/3361688.44csPzL39Z@rjwysocki.net [ rjw: Prevent build issues with CONFIG_SMP unset ] Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/smp.c | 24 ++++++++++++++++++++++++ arch/x86/power/hibernate.c | 17 +++++------------ include/linux/cpu.h | 3 +++ 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 18266cc3d98c..b014e6d229f9 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -299,3 +299,27 @@ struct smp_ops smp_ops = { .send_call_func_single_ipi = native_send_call_func_single_ipi, }; EXPORT_SYMBOL_GPL(smp_ops); + +int arch_cpu_rescan_dead_smt_siblings(void) +{ + enum cpuhp_smt_control old = cpu_smt_control; + int ret; + + /* + * If SMT has been disabled and SMT siblings are in HLT, bring them back + * online and offline them again so that they end up in MWAIT proper. + * + * Called with hotplug enabled. + */ + if (old != CPU_SMT_DISABLED && old != CPU_SMT_FORCE_DISABLED) + return 0; + + ret = cpuhp_smt_enable(); + if (ret) + return ret; + + ret = cpuhp_smt_disable(old); + + return ret; +} +EXPORT_SYMBOL_GPL(arch_cpu_rescan_dead_smt_siblings); diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index a7c23f2a58c9..a2294c1649f6 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -192,7 +192,8 @@ int relocate_restore_code(void) int arch_resume_nosmt(void) { - int ret = 0; + int ret; + /* * We reached this while coming out of hibernation. This means * that SMT siblings are sleeping in hlt, as mwait is not safe @@ -206,18 +207,10 @@ int arch_resume_nosmt(void) * Called with hotplug disabled. */ cpu_hotplug_enable(); - if (cpu_smt_control == CPU_SMT_DISABLED || - cpu_smt_control == CPU_SMT_FORCE_DISABLED) { - enum cpuhp_smt_control old = cpu_smt_control; - ret = cpuhp_smt_enable(); - if (ret) - goto out; - ret = cpuhp_smt_disable(old); - if (ret) - goto out; - } -out: + ret = arch_cpu_rescan_dead_smt_siblings(); + cpu_hotplug_disable(); + return ret; } diff --git a/include/linux/cpu.h b/include/linux/cpu.h index e6089abc28e2..96a3a0d6a60e 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -120,6 +120,7 @@ extern void cpu_maps_update_begin(void); extern void cpu_maps_update_done(void); int bringup_hibernate_cpu(unsigned int sleep_cpu); void bringup_nonboot_cpus(unsigned int max_cpus); +int arch_cpu_rescan_dead_smt_siblings(void); #else /* CONFIG_SMP */ #define cpuhp_tasks_frozen 0 @@ -134,6 +135,8 @@ static inline void cpu_maps_update_done(void) static inline int add_cpu(unsigned int cpu) { return 0;} +static inline int arch_cpu_rescan_dead_smt_siblings(void) { return 0; } + #endif /* CONFIG_SMP */ extern const struct bus_type cpu_subsys; From a430c11f401589a0f4f57fd398271a5d85142c7a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 5 Jun 2025 17:06:08 +0200 Subject: [PATCH 016/242] intel_idle: Rescan "dead" SMT siblings during initialization Make intel_idle_init() call arch_cpu_rescan_dead_smt_siblings() after successfully registering intel_idle as the cpuidle driver so as to allow the "dead" SMT siblings (if any) to go into deep idle states. This is necessary for the processor to be able to reach deep package C-states (like PC10) going forward which is requisite for reducing power sufficiently in suspend-to-idle, among other things. Signed-off-by: Rafael J. Wysocki Tested-by: Artem Bityutskiy Link: https://patch.msgid.link/10669885.nUPlyArG6x@rjwysocki.net --- drivers/idle/intel_idle.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 64ac4da08094..63565814c7e5 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -2507,6 +2507,8 @@ static int __init intel_idle_init(void) pr_debug("Local APIC timer is reliable in %s\n", boot_cpu_has(X86_FEATURE_ARAT) ? "all C-states" : "C1"); + arch_cpu_rescan_dead_smt_siblings(); + return 0; hp_setup_fail: From f694481b1d3177144fcac4242eb750cfcb9f7bd5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 5 Jun 2025 17:07:31 +0200 Subject: [PATCH 017/242] ACPI: processor: Rescan "dead" SMT siblings during initialization Make acpi_processor_driver_init() call arch_cpu_rescan_dead_smt_siblings(), via a new wrapper function called acpi_idle_rescan_dead_smt_siblings(), after successfully initializing the driver, to allow the "dead" SMT siblings to go into deep idle states, which is necessary for the processor to be able to reach deep package C-states (like PC10) going forward, so that power can be reduced sufficiently in suspend-to-idle, among other things. However, do it only if the ACPI idle driver is the current cpuidle driver (otherwise it is assumed that another cpuidle driver will take care of this) and avoid doing it on architectures other than x86. Signed-off-by: Rafael J. Wysocki Tested-by: Artem Bityutskiy Link: https://patch.msgid.link/2005721.PYKUYFuaPT@rjwysocki.net --- drivers/acpi/internal.h | 6 ++++++ drivers/acpi/processor_driver.c | 3 +++ drivers/acpi/processor_idle.c | 8 ++++++++ 3 files changed, 17 insertions(+) diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 00910ccd7eda..e2781864fdce 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -175,6 +175,12 @@ bool processor_physically_present(acpi_handle handle); static inline void acpi_early_processor_control_setup(void) {} #endif +#ifdef CONFIG_ACPI_PROCESSOR_CSTATE +void acpi_idle_rescan_dead_smt_siblings(void); +#else +static inline void acpi_idle_rescan_dead_smt_siblings(void) {} +#endif + /* -------------------------------------------------------------------------- Embedded Controller -------------------------------------------------------------------------- */ diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index 3b281bc1e73c..65e779be64ff 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -279,6 +279,9 @@ static int __init acpi_processor_driver_init(void) * after acpi_cppc_processor_probe() has been called for all online CPUs */ acpi_processor_init_invariance_cppc(); + + acpi_idle_rescan_dead_smt_siblings(); + return 0; err: driver_unregister(&acpi_processor_driver); diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index e2febca2ec13..2c2dc559e0f8 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -24,6 +24,8 @@ #include #include +#include "internal.h" + /* * Include the apic definitions for x86 to have the APIC timer related defines * available also for UP (on SMP it gets magically included via linux/smp.h). @@ -55,6 +57,12 @@ struct cpuidle_driver acpi_idle_driver = { }; #ifdef CONFIG_ACPI_PROCESSOR_CSTATE +void acpi_idle_rescan_dead_smt_siblings(void) +{ + if (cpuidle_get_driver() == &acpi_idle_driver) + arch_cpu_rescan_dead_smt_siblings(); +} + static DEFINE_PER_CPU(struct acpi_processor_cx * [CPUIDLE_STATE_MAX], acpi_cstate); From a18d098f2aab82abde1d59c56aef9beca01c892b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 5 Jun 2025 17:09:35 +0200 Subject: [PATCH 018/242] Reapply "x86/smp: Eliminate mwait_play_dead_cpuid_hint()" Revert commit 70523f335734 ("Revert "x86/smp: Eliminate mwait_play_dead_cpuid_hint()"") to reapply the changes from commit 96040f7273e2 ("x86/smp: Eliminate mwait_play_dead_cpuid_hint()") reverted by it. Previously, these changes caused idle power to rise on systems booting with "nosmt" in the kernel command line because they effectively caused "dead" SMT siblings to remain in idle state C1 after executing the HLT instruction, which prevented the processor from reaching package idle states deeper than PC2 going forward. Now, the "dead" SMT siblings are rescanned after initializing a proper cpuidle driver for the processor (either intel_idle or ACPI idle), at which point they are able to enter a sufficiently deep idle state in native_play_dead() via cpuidle, so the code changes in question can be reapplied. Signed-off-by: Rafael J. Wysocki Acked-by: Dave Hansen Tested-by: Artem Bityutskiy Link: https://patch.msgid.link/7813065.EvYhyI6sBW@rjwysocki.net --- arch/x86/kernel/smpboot.c | 54 +++++---------------------------------- 1 file changed, 7 insertions(+), 47 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fc78c2325fd2..58ede3fa6a75 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1244,6 +1244,10 @@ void play_dead_common(void) local_irq_disable(); } +/* + * We need to flush the caches before going to sleep, lest we have + * dirty data in our caches when we come back up. + */ void __noreturn mwait_play_dead(unsigned int eax_hint) { struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); @@ -1289,50 +1293,6 @@ void __noreturn mwait_play_dead(unsigned int eax_hint) } } -/* - * We need to flush the caches before going to sleep, lest we have - * dirty data in our caches when we come back up. - */ -static inline void mwait_play_dead_cpuid_hint(void) -{ - unsigned int eax, ebx, ecx, edx; - unsigned int highest_cstate = 0; - unsigned int highest_subcstate = 0; - int i; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - return; - if (!this_cpu_has(X86_FEATURE_MWAIT)) - return; - if (!this_cpu_has(X86_FEATURE_CLFLUSH)) - return; - - eax = CPUID_LEAF_MWAIT; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - - /* - * eax will be 0 if EDX enumeration is not valid. - * Initialized below to cstate, sub_cstate value when EDX is valid. - */ - if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { - eax = 0; - } else { - edx >>= MWAIT_SUBSTATE_SIZE; - for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { - if (edx & MWAIT_SUBSTATE_MASK) { - highest_cstate = i; - highest_subcstate = edx & MWAIT_SUBSTATE_MASK; - } - } - eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | - (highest_subcstate - 1); - } - - mwait_play_dead(eax); -} - /* * Kick all "offline" CPUs out of mwait on kexec(). See comment in * mwait_play_dead(). @@ -1383,9 +1343,9 @@ void native_play_dead(void) play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); - mwait_play_dead_cpuid_hint(); - if (cpuidle_play_dead()) - hlt_play_dead(); + /* Below returns only on error. */ + cpuidle_play_dead(); + hlt_play_dead(); } #else /* ... !CONFIG_HOTPLUG_CPU */ From 06118ae36855b7d3d22688298e74a766ccf0cb7a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 27 May 2025 08:44:14 +0300 Subject: [PATCH 019/242] regulator: max20086: Fix refcount leak in max20086_parse_regulators_dt() There is a missing call to of_node_put() if devm_kcalloc() fails. Fix this by changing the code to use cleanup.h magic to drop the refcount. Fixes: 6b0cd72757c6 ("regulator: max20086: fix invalid memory access") Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/aDVRLqgJWMxYU03G@stanley.mountain Reviewed-by: Laurent Pinchart Signed-off-by: Mark Brown --- drivers/regulator/max20086-regulator.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/regulator/max20086-regulator.c b/drivers/regulator/max20086-regulator.c index b4fe76e33ff2..fcdd2d0317a5 100644 --- a/drivers/regulator/max20086-regulator.c +++ b/drivers/regulator/max20086-regulator.c @@ -5,6 +5,7 @@ // Copyright (C) 2022 Laurent Pinchart // Copyright (C) 2018 Avnet, Inc. +#include #include #include #include @@ -133,11 +134,11 @@ static int max20086_regulators_register(struct max20086 *chip) static int max20086_parse_regulators_dt(struct max20086 *chip, bool *boot_on) { struct of_regulator_match *matches; - struct device_node *node; unsigned int i; int ret; - node = of_get_child_by_name(chip->dev->of_node, "regulators"); + struct device_node *node __free(device_node) = + of_get_child_by_name(chip->dev->of_node, "regulators"); if (!node) { dev_err(chip->dev, "regulators node not found\n"); return -ENODEV; @@ -153,7 +154,6 @@ static int max20086_parse_regulators_dt(struct max20086 *chip, bool *boot_on) ret = of_regulator_match(chip->dev, node, matches, chip->info->num_outputs); - of_node_put(node); if (ret < 0) { dev_err(chip->dev, "Failed to match regulators\n"); return -EINVAL; From a5bf5272295d3f058adeee025d2a0b6625f8ba7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Pi=C3=A9dallu?= Date: Fri, 6 Jun 2025 15:37:24 +0200 Subject: [PATCH 020/242] spi: omap2-mcspi: Disable multi mode when CS should be kept asserted after message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the last transfer of a SPI message has the cs_change flag, the CS is kept asserted after the message. Multi-mode can't respect this as CS is deasserted by the hardware at the end of the message. Disable multi-mode when not applicable to the current message. Fixes: d153ff4056cb ("spi: omap2-mcspi: Add support for MULTI-mode") Signed-off-by: Félix Piédallu Link: https://patch.msgid.link/20250606-cs_change_fix-v1-1-27191a98a2e5@non.se.com Signed-off-by: Mark Brown --- drivers/spi/spi-omap2-mcspi.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/spi/spi-omap2-mcspi.c b/drivers/spi/spi-omap2-mcspi.c index 29c616e2c408..e834fb42fd4b 100644 --- a/drivers/spi/spi-omap2-mcspi.c +++ b/drivers/spi/spi-omap2-mcspi.c @@ -1287,9 +1287,15 @@ static int omap2_mcspi_prepare_message(struct spi_controller *ctlr, mcspi->use_multi_mode = false; } - /* Check if transfer asks to change the CS status after the transfer */ - if (!tr->cs_change) - mcspi->use_multi_mode = false; + if (list_is_last(&tr->transfer_list, &msg->transfers)) { + /* Check if transfer asks to keep the CS status after the whole message */ + if (tr->cs_change) + mcspi->use_multi_mode = false; + } else { + /* Check if transfer asks to change the CS status after the transfer */ + if (!tr->cs_change) + mcspi->use_multi_mode = false; + } /* * If at least one message is not compatible, switch back to single mode From 10c24e0d2f7cd2bc8a847cf750f01301ce67dbc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Pi=C3=A9dallu?= Date: Fri, 6 Jun 2025 15:37:25 +0200 Subject: [PATCH 021/242] spi: omap2-mcspi: Disable multi-mode when the previous message kept CS asserted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the last transfer of a SPI message has the cs_change flag, the CS is kept asserted after the message. The next message can't use multi-mode because the CS will be briefly deasserted before the first transfer. Remove the early exit of the list_for_each_entry because the last transfer actually needs to be always checked. Fixes: d153ff4056cb ("spi: omap2-mcspi: Add support for MULTI-mode") Signed-off-by: Félix Piédallu Link: https://patch.msgid.link/20250606-cs_change_fix-v1-2-27191a98a2e5@non.se.com Signed-off-by: Mark Brown --- drivers/spi/spi-omap2-mcspi.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/spi/spi-omap2-mcspi.c b/drivers/spi/spi-omap2-mcspi.c index e834fb42fd4b..70bb74b3bd9c 100644 --- a/drivers/spi/spi-omap2-mcspi.c +++ b/drivers/spi/spi-omap2-mcspi.c @@ -134,6 +134,7 @@ struct omap2_mcspi { size_t max_xfer_len; u32 ref_clk_hz; bool use_multi_mode; + bool last_msg_kept_cs; }; struct omap2_mcspi_cs { @@ -1269,6 +1270,10 @@ static int omap2_mcspi_prepare_message(struct spi_controller *ctlr, * multi-mode is applicable. */ mcspi->use_multi_mode = true; + + if (mcspi->last_msg_kept_cs) + mcspi->use_multi_mode = false; + list_for_each_entry(tr, &msg->transfers, transfer_list) { if (!tr->bits_per_word) bits_per_word = msg->spi->bits_per_word; @@ -1289,22 +1294,17 @@ static int omap2_mcspi_prepare_message(struct spi_controller *ctlr, if (list_is_last(&tr->transfer_list, &msg->transfers)) { /* Check if transfer asks to keep the CS status after the whole message */ - if (tr->cs_change) + if (tr->cs_change) { mcspi->use_multi_mode = false; + mcspi->last_msg_kept_cs = true; + } else { + mcspi->last_msg_kept_cs = false; + } } else { /* Check if transfer asks to change the CS status after the transfer */ if (!tr->cs_change) mcspi->use_multi_mode = false; } - - /* - * If at least one message is not compatible, switch back to single mode - * - * The bits_per_word of certain transfer can be different, but it will have no - * impact on the signal itself. - */ - if (!mcspi->use_multi_mode) - break; } omap2_mcspi_set_mode(ctlr); From 1dd630088332b5b310f3dd7eaacae9f3c4465651 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sun, 8 Jun 2025 22:29:39 +0800 Subject: [PATCH 022/242] spi: loongson: Fix build warnings about export.h After commit a934a57a42f64a4 ("scripts/misc-check: check missing #include when W=1") and 7d95680d64ac8e836c ("scripts/misc-check: check unnecessary #include when W=1"), we get some build warnings with W=1: drivers/spi/spi-loongson-core.c: warning: EXPORT_SYMBOL() is used, but #include is missing So fix these build warnings for SPI/Loongson. Signed-off-by: Huacai Chen Link: https://patch.msgid.link/20250608142939.172108-1-chenhuacai@loongson.cn Signed-off-by: Mark Brown --- drivers/spi/spi-loongson-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/spi/spi-loongson-core.c b/drivers/spi/spi-loongson-core.c index 4fec226456d1..b46f072a0387 100644 --- a/drivers/spi/spi-loongson-core.c +++ b/drivers/spi/spi-loongson-core.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include From 2b74aea6d078ade810a3b0f7d1bcfcba2eaad416 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 6 Jun 2025 12:04:14 +0300 Subject: [PATCH 023/242] spi: spi-pci1xxxx: Fix error code in probe Return the error code if pci_alloc_irq_vectors() fails. Don't return success. Fixes: b4608e944177 ("spi: spi-pci1xxxx: Fix Probe failure with Dual SPI instance with INTx interrupts") Signed-off-by: Dan Carpenter Reviewed-by: Thangaraj Samynathan Link: https://patch.msgid.link/aEKvDrUxD19GWi0u@stanley.mountain Signed-off-by: Mark Brown --- drivers/spi/spi-pci1xxxx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-pci1xxxx.c b/drivers/spi/spi-pci1xxxx.c index c6489e90b8b9..9112d8a1a0c8 100644 --- a/drivers/spi/spi-pci1xxxx.c +++ b/drivers/spi/spi-pci1xxxx.c @@ -765,7 +765,7 @@ static int pci1xxxx_spi_probe(struct pci_dev *pdev, const struct pci_device_id * PCI_IRQ_ALL_TYPES); if (num_vector < 0) { dev_err(&pdev->dev, "Error allocating MSI vectors\n"); - return ret; + return num_vector; } init_completion(&spi_sub_ptr->spi_xfer_done); From 5808c34216954cd832bd4b8bc52dfa287049122b Mon Sep 17 00:00:00 2001 From: Rong Zhang Date: Mon, 26 May 2025 04:18:07 +0800 Subject: [PATCH 024/242] platform/x86: ideapad-laptop: use usleep_range() for EC polling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It was reported that ideapad-laptop sometimes causes some recent (since 2024) Lenovo ThinkBook models shut down when: - suspending/resuming - closing/opening the lid - (dis)connecting a charger - reading/writing some sysfs properties, e.g., fan_mode, touchpad - pressing down some Fn keys, e.g., Brightness Up/Down (Fn+F5/F6) - (seldom) loading the kmod The issue has existed since the launch day of such models, and there have been some out-of-tree workarounds (see Link:) for the issue. One disables some functionalities, while another one simply shortens IDEAPAD_EC_TIMEOUT. The disabled functionalities have read_ec_data() in their call chains, which calls schedule() between each poll. It turns out that these models suffer from the indeterminacy of schedule() because of their low tolerance for being polled too frequently. Sometimes schedule() returns too soon due to the lack of ready tasks, causing the margin between two polls to be too short. In this case, the command is somehow aborted, and too many subsequent polls (they poll for "nothing!") may eventually break the state machine in the EC, resulting in a hard shutdown. This explains why shortening IDEAPAD_EC_TIMEOUT works around the issue - it reduces the total number of polls sent to the EC. Even when it doesn't lead to a shutdown, frequent polls may also disturb the ongoing operation and notably delay (+ 10-20ms) the availability of EC response. This phenomenon is unlikely to be exclusive to the models mentioned above, so dropping the schedule() manner should also slightly improve the responsiveness of various models. Fix these issues by migrating to usleep_range(150, 300). The interval is chosen to add some margin to the minimal 50us and considering EC responses are usually available after 150-2500us based on my test. It should be enough to fix these issues on all models subject to the EC bug without introducing latency on other models. Tested on ThinkBook 14 G7+ ASP and solved both issues. No regression was introduced in the test on a model without the EC bug (ThinkBook X IMH, thanks Eric). Link: https://github.com/ty2/ideapad-laptop-tb2024g6plus/commit/6c5db18c9e8109873c2c90a7d2d7f552148f7ad4 Link: https://github.com/ferstar/ideapad-laptop-tb/commit/42d1e68e5009529d31bd23f978f636f79c023e80 Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218771 Fixes: 6a09f21dd1e2 ("ideapad: add ACPI helpers") Cc: stable@vger.kernel.org Tested-by: Felix Yan Tested-by: Eric Long Tested-by: Jianfei Zhang Tested-by: Mingcong Bai Tested-by: Minh Le Tested-by: Sicheng Zhu Signed-off-by: Rong Zhang Link: https://lore.kernel.org/r/20250525201833.37939-1-i@rong.moe Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/ideapad-laptop.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c index ede483573fe0..b5e4da6a6779 100644 --- a/drivers/platform/x86/ideapad-laptop.c +++ b/drivers/platform/x86/ideapad-laptop.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -267,6 +268,20 @@ static void ideapad_shared_exit(struct ideapad_private *priv) */ #define IDEAPAD_EC_TIMEOUT 200 /* in ms */ +/* + * Some models (e.g., ThinkBook since 2024) have a low tolerance for being + * polled too frequently. Doing so may break the state machine in the EC, + * resulting in a hard shutdown. + * + * It is also observed that frequent polls may disturb the ongoing operation + * and notably delay the availability of EC response. + * + * These values are used as the delay before the first poll and the interval + * between subsequent polls to solve the above issues. + */ +#define IDEAPAD_EC_POLL_MIN_US 150 +#define IDEAPAD_EC_POLL_MAX_US 300 + static int eval_int(acpi_handle handle, const char *name, unsigned long *res) { unsigned long long result; @@ -383,7 +398,7 @@ static int read_ec_data(acpi_handle handle, unsigned long cmd, unsigned long *da end_jiffies = jiffies + msecs_to_jiffies(IDEAPAD_EC_TIMEOUT) + 1; while (time_before(jiffies, end_jiffies)) { - schedule(); + usleep_range(IDEAPAD_EC_POLL_MIN_US, IDEAPAD_EC_POLL_MAX_US); err = eval_vpcr(handle, 1, &val); if (err) @@ -414,7 +429,7 @@ static int write_ec_cmd(acpi_handle handle, unsigned long cmd, unsigned long dat end_jiffies = jiffies + msecs_to_jiffies(IDEAPAD_EC_TIMEOUT) + 1; while (time_before(jiffies, end_jiffies)) { - schedule(); + usleep_range(IDEAPAD_EC_POLL_MIN_US, IDEAPAD_EC_POLL_MAX_US); err = eval_vpcr(handle, 1, &val); if (err) From 685f88c72a0c4d12d3bd2ff50286938f14486f85 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 6 Jun 2025 13:53:00 -0700 Subject: [PATCH 025/242] platform/x86/intel-uncore-freq: Fail module load when plat_info is NULL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address a Smatch static checker warning regarding an unchecked dereference in the function call: set_cdie_id(i, cluster_info, plat_info) when plat_info is NULL. Instead of addressing this one case, in general if plat_info is NULL then it can cause other issues. For example in a two package system it will give warning for duplicate sysfs entry as package ID will be always zero for both packages when creating string for attribute group name. plat_info is derived from TPMI ID TPMI_BUS_INFO, which is integral to the core TPMI design. Therefore, it should not be NULL on a production platform. Consequently, the module should fail to load if plat_info is NULL. Reported-by: Dan Carpenter Closes: https://lore.kernel.org/platform-driver-x86/aEKvGCLd1qmX04Tc@stanley.mountain/T/#u Fixes: 8a54e2253e4c ("platform/x86/intel-uncore-freq: Uncore frequency control via TPMI") Signed-off-by: Srinivas Pandruvada Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250606205300.2384494-1-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- .../x86/intel/uncore-frequency/uncore-frequency-tpmi.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c index 1c7b2f2716ca..44d9948ed224 100644 --- a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c +++ b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c @@ -511,10 +511,13 @@ static int uncore_probe(struct auxiliary_device *auxdev, const struct auxiliary_ /* Get the package ID from the TPMI core */ plat_info = tpmi_get_platform_data(auxdev); - if (plat_info) - pkg = plat_info->package_id; - else + if (unlikely(!plat_info)) { dev_info(&auxdev->dev, "Platform information is NULL\n"); + ret = -ENODEV; + goto err_rem_common; + } + + pkg = plat_info->package_id; for (i = 0; i < num_resources; ++i) { struct tpmi_uncore_power_domain_info *pd_info; From afbdc4bbb3a6418778cf969b0795bbaa8237cdd3 Mon Sep 17 00:00:00 2001 From: Joshua Grisham Date: Fri, 6 Jun 2025 15:09:08 +0200 Subject: [PATCH 026/242] platform/x86: samsung-galaxybook: Add SAM0426 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add device ID SAM0426 (Notebook 9 Pro and similar devices) as reported and tested by GitHub user "diego-karsa" [1]. [1]: https://github.com/joshuagrisham/samsung-galaxybook-extras/issues/69 Signed-off-by: Joshua Grisham Reviewed-by: Armin Wolf Link: https://lore.kernel.org/r/20250606130909.207047-1-josh@joshuagrisham.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/samsung-galaxybook.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/samsung-galaxybook.c b/drivers/platform/x86/samsung-galaxybook.c index 5878a351993e..3c13e13d4885 100644 --- a/drivers/platform/x86/samsung-galaxybook.c +++ b/drivers/platform/x86/samsung-galaxybook.c @@ -1403,6 +1403,7 @@ static int galaxybook_probe(struct platform_device *pdev) } static const struct acpi_device_id galaxybook_device_ids[] = { + { "SAM0426" }, { "SAM0427" }, { "SAM0428" }, { "SAM0429" }, From 1d0a61940e22e165e6acc4a9c6fb26edbe69112e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 6 Jun 2025 12:04:32 +0300 Subject: [PATCH 027/242] platform/x86/intel: power-domains: Fix error code in tpmi_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Return -ENOMEM instead of success if kcalloc() fails. Fixes: e37be5d85c60 ("platform/x86/intel: power-domains: Add interface to get Linux die ID") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/aEKvIGCt6d8Gcx4S@stanley.mountain Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/tpmi_power_domains.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel/tpmi_power_domains.c b/drivers/platform/x86/intel/tpmi_power_domains.c index 0c5c88eb7baf..9d8247bb9cfa 100644 --- a/drivers/platform/x86/intel/tpmi_power_domains.c +++ b/drivers/platform/x86/intel/tpmi_power_domains.c @@ -228,8 +228,10 @@ static int __init tpmi_init(void) domain_die_map = kcalloc(size_mul(topology_max_packages(), MAX_POWER_DOMAINS), sizeof(*domain_die_map), GFP_KERNEL); - if (!domain_die_map) + if (!domain_die_map) { + ret = -ENOMEM; goto free_domain_mask; + } ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "platform/x86/tpmi_power_domains:online", From 4dbd11796f3a8eb95647507befc41995458a4023 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 3 Jun 2025 08:24:08 -0500 Subject: [PATCH 028/242] platform/x86/amd: pmc: Clear metrics table at start of cycle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The area of memory that contains the metrics table may contain garbage when the cycle starts. This normally doesn't matter because the cycle itself will populate it with valid data, however commit 9f5595d5f03fd ("platform/x86/amd: pmc: Require at least 2.5 seconds between HW sleep cycles") started to use it during the check() phase. Depending upon what garbage is in the table it's possible that the system will wait 2.5 seconds for even the first cycle, which will be visible to a user. To prevent this from happening explicitly clear the table when logging is started. Fixes: 9f5595d5f03fd ("platform/x86/amd: pmc: Require at least 2.5 seconds between HW sleep cycles") Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20250603132412.3555302-1-superm1@kernel.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmc/pmc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/amd/pmc/pmc.c b/drivers/platform/x86/amd/pmc/pmc.c index 37c7a57afee5..0b9b23eb7c2c 100644 --- a/drivers/platform/x86/amd/pmc/pmc.c +++ b/drivers/platform/x86/amd/pmc/pmc.c @@ -157,6 +157,8 @@ static int amd_pmc_setup_smu_logging(struct amd_pmc_dev *dev) return -ENOMEM; } + memset_io(dev->smu_virt_addr, 0, sizeof(struct smu_metrics)); + /* Start the logging */ amd_pmc_send_cmd(dev, 0, NULL, SMU_MSG_LOG_RESET, false); amd_pmc_send_cmd(dev, 0, NULL, SMU_MSG_LOG_START, false); From f8afb12a2d7503de6558c23cacd7acbf6e9fe678 Mon Sep 17 00:00:00 2001 From: Jake Hillion Date: Thu, 5 Jun 2025 19:09:26 +0100 Subject: [PATCH 029/242] x86/platform/amd: move final timeout check to after final sleep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __hsmp_send_message sleeps between result read attempts and has a timeout of 100ms. Under extreme load it's possible for these sleeps to take a long time, exceeding the 100ms. In this case the current code does not check the register and fails with ETIMEDOUT. Refactor the loop to ensure there is at least one read of the register after a sleep of any duration. This removes instances of ETIMEDOUT with a single caller, even with a misbehaving scheduler. Tested on AMD Bergamo machines. Suggested-by: Blaise Sanouillet Reviewed-by: Suma Hegde Tested-by: Suma Hegde Signed-off-by: Jake Hillion Link: https://lore.kernel.org/r/20250605-amd-hsmp-v2-1-a811bc3dd74a@hillion.co.uk Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/hsmp/hsmp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/amd/hsmp/hsmp.c b/drivers/platform/x86/amd/hsmp/hsmp.c index 538b36b97095..fa8bd5839dd2 100644 --- a/drivers/platform/x86/amd/hsmp/hsmp.c +++ b/drivers/platform/x86/amd/hsmp/hsmp.c @@ -97,7 +97,7 @@ static int __hsmp_send_message(struct hsmp_socket *sock, struct hsmp_message *ms short_sleep = jiffies + msecs_to_jiffies(HSMP_SHORT_SLEEP); timeout = jiffies + msecs_to_jiffies(HSMP_MSG_TIMEOUT); - while (time_before(jiffies, timeout)) { + while (true) { ret = sock->amd_hsmp_rdwr(sock, mbinfo->msg_resp_off, &mbox_status, HSMP_RD); if (ret) { dev_err(sock->dev, "Error %d reading mailbox status\n", ret); @@ -106,6 +106,10 @@ static int __hsmp_send_message(struct hsmp_socket *sock, struct hsmp_message *ms if (mbox_status != HSMP_STATUS_NOT_READY) break; + + if (!time_before(jiffies, timeout)) + break; + if (time_before(jiffies, short_sleep)) usleep_range(50, 100); else From 784e48a82976ee0b645788750343cd1b28a372f3 Mon Sep 17 00:00:00 2001 From: Jake Hillion Date: Thu, 5 Jun 2025 19:09:27 +0100 Subject: [PATCH 030/242] x86/platform/amd: replace down_timeout() with down_interruptible() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently hsmp_send_message() uses down_timeout() with a 100ms timeout to take the semaphore. However __hsmp_send_message(), the content of the critical section, has a sleep in it. On systems with significantly delayed scheduling behaviour this may take over 100ms. Convert this method to down_interruptible(). Leave the error handling the same as the documentation currently is not specific about what error is returned. Previous behaviour: a caller who competes with another caller stuck in the critical section due to scheduler delays would receive -ETIME. New behaviour: a caller who competes with another caller stuck in the critical section due to scheduler delays will complete successfully. Reviewed-by: Suma Hegde Tested-by: Suma Hegde Signed-off-by: Jake Hillion Link: https://lore.kernel.org/r/20250605-amd-hsmp-v2-2-a811bc3dd74a@hillion.co.uk Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/hsmp/hsmp.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/platform/x86/amd/hsmp/hsmp.c b/drivers/platform/x86/amd/hsmp/hsmp.c index fa8bd5839dd2..885e2f8136fd 100644 --- a/drivers/platform/x86/amd/hsmp/hsmp.c +++ b/drivers/platform/x86/amd/hsmp/hsmp.c @@ -214,13 +214,7 @@ int hsmp_send_message(struct hsmp_message *msg) return -ENODEV; sock = &hsmp_pdev.sock[msg->sock_ind]; - /* - * The time taken by smu operation to complete is between - * 10us to 1ms. Sometime it may take more time. - * In SMP system timeout of 100 millisecs should - * be enough for the previous thread to finish the operation - */ - ret = down_timeout(&sock->hsmp_sem, msecs_to_jiffies(HSMP_MSG_TIMEOUT)); + ret = down_interruptible(&sock->hsmp_sem); if (ret < 0) return ret; From e51a086117ed857ea455c9ea774dbfb82f53e517 Mon Sep 17 00:00:00 2001 From: Andres Urian Florez Date: Sun, 8 Jun 2025 18:04:21 -0500 Subject: [PATCH 031/242] spi: offload: check offload ops existence before disabling the trigger Add a safe guard in spi_offload_trigger to check the existence of offload->ops before invoking the trigger_disable callback Signed-off-by: Andres Urian Florez Link: https://patch.msgid.link/20250608230422.325360-1-andres.emb.sys@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-offload.c b/drivers/spi/spi-offload.c index e674097bf3be..d336f4d228d5 100644 --- a/drivers/spi/spi-offload.c +++ b/drivers/spi/spi-offload.c @@ -297,7 +297,7 @@ int spi_offload_trigger_enable(struct spi_offload *offload, if (trigger->ops->enable) { ret = trigger->ops->enable(trigger, config); if (ret) { - if (offload->ops->trigger_disable) + if (offload->ops && offload->ops->trigger_disable) offload->ops->trigger_disable(offload); return ret; } From 779a0c9e06a91d0007f2a4f4071e3b9a8102b15e Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Wed, 4 Jun 2025 07:32:17 -0700 Subject: [PATCH 032/242] accel/amdxdna: Fix incorrect PSP firmware size The incorrect PSP firmware size is used for initializing. It may cause error for newer version firmware. Fixes: 8c9ff1b181ba ("accel/amdxdna: Add a new driver for AMD AI Engine") Acked-by: Alex Deucher Signed-off-by: Lizhi Hou Link: https://lore.kernel.org/r/20250604143217.1386272-1-lizhi.hou@amd.com --- drivers/accel/amdxdna/aie2_psp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/accel/amdxdna/aie2_psp.c b/drivers/accel/amdxdna/aie2_psp.c index dc3a072ce3b6..f28a060a8810 100644 --- a/drivers/accel/amdxdna/aie2_psp.c +++ b/drivers/accel/amdxdna/aie2_psp.c @@ -126,8 +126,8 @@ struct psp_device *aie2m_psp_create(struct drm_device *ddev, struct psp_config * psp->ddev = ddev; memcpy(psp->psp_regs, conf->psp_regs, sizeof(psp->psp_regs)); - psp->fw_buf_sz = ALIGN(conf->fw_size, PSP_FW_ALIGN) + PSP_FW_ALIGN; - psp->fw_buffer = drmm_kmalloc(ddev, psp->fw_buf_sz, GFP_KERNEL); + psp->fw_buf_sz = ALIGN(conf->fw_size, PSP_FW_ALIGN); + psp->fw_buffer = drmm_kmalloc(ddev, psp->fw_buf_sz + PSP_FW_ALIGN, GFP_KERNEL); if (!psp->fw_buffer) { drm_err(ddev, "no memory for fw buffer"); return NULL; From d6fb4f01736a1d18cc981eb04fa2907a7121fc27 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 21 May 2025 11:01:02 +0200 Subject: [PATCH 033/242] drm/xe/svm: Fix regression disallowing 64K SVM migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When changing the condition from >= SZ_64K, it was changed to <= SZ_64K. This disallows migration of 64K, which is the exact minimum allowed. Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/5057 Fixes: 794f5493f518 ("drm/xe: Strict migration policy for atomic SVM faults") Cc: stable@vger.kernel.org Cc: Matthew Brost Cc: Himal Prasad Ghimiray Reviewed-by: Himal Prasad Ghimiray Signed-off-by: Maarten Lankhorst Link: https://lore.kernel.org/r/20250521090102.2965100-1-dev@lankhorst.se (cherry picked from commit 531bef26d189b28bf0d694878c0e064b30990b6c) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c index 6345896585de..f0b167b3fb6a 100644 --- a/drivers/gpu/drm/xe/xe_svm.c +++ b/drivers/gpu/drm/xe/xe_svm.c @@ -764,7 +764,7 @@ static bool xe_svm_range_needs_migrate_to_vram(struct xe_svm_range *range, return false; } - if (range_size <= SZ_64K && !supports_4K_migration(vm->xe)) { + if (range_size < SZ_64K && !supports_4K_migration(vm->xe)) { drm_dbg(&vm->xe->drm, "Platform doesn't support SZ_4K range migration\n"); return false; } From cf2c3eceb757e3f28e6f1034f9bc178e1535f5cc Mon Sep 17 00:00:00 2001 From: Patrice Chotard Date: Mon, 9 Jun 2025 17:05:04 +0200 Subject: [PATCH 034/242] spi: stm32-ospi: Make usage of reset_control_acquire/release() API As ospi reset is consumed by both OMM and OSPI drivers, use the reset acquire/release mechanism which ensure exclusive reset usage. This avoid to call reset_control_get/put() in OMM driver each time we need to reset OSPI children and guarantee the reset line stays deasserted. During resume, OMM driver takes temporarily control of reset. Fixes: 79b8a705e26c ("spi: stm32: Add OSPI driver") Signed-off-by: Patrice Chotard Reviewed-by: Philipp Zabel Link: https://patch.msgid.link/20250609-b4-upstream_ospi_reset_update-v6-1-5b602b567e8a@foss.st.com Signed-off-by: Mark Brown --- drivers/spi/spi-stm32-ospi.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/spi/spi-stm32-ospi.c b/drivers/spi/spi-stm32-ospi.c index 7c1fa55fbc47..db6b1cfc970f 100644 --- a/drivers/spi/spi-stm32-ospi.c +++ b/drivers/spi/spi-stm32-ospi.c @@ -804,7 +804,7 @@ static int stm32_ospi_get_resources(struct platform_device *pdev) return ret; } - ospi->rstc = devm_reset_control_array_get_exclusive(dev); + ospi->rstc = devm_reset_control_array_get_exclusive_released(dev); if (IS_ERR(ospi->rstc)) return dev_err_probe(dev, PTR_ERR(ospi->rstc), "Can't get reset\n"); @@ -936,11 +936,13 @@ static int stm32_ospi_probe(struct platform_device *pdev) if (ret < 0) goto err_pm_enable; - if (ospi->rstc) { - reset_control_assert(ospi->rstc); - udelay(2); - reset_control_deassert(ospi->rstc); - } + ret = reset_control_acquire(ospi->rstc); + if (ret) + return dev_err_probe(dev, ret, "Can not acquire reset %d\n", ret); + + reset_control_assert(ospi->rstc); + udelay(2); + reset_control_deassert(ospi->rstc); ret = spi_register_controller(ctrl); if (ret) { @@ -987,6 +989,8 @@ static void stm32_ospi_remove(struct platform_device *pdev) if (ospi->dma_chrx) dma_release_channel(ospi->dma_chrx); + reset_control_release(ospi->rstc); + pm_runtime_put_sync_suspend(ospi->dev); pm_runtime_force_suspend(ospi->dev); } @@ -997,6 +1001,8 @@ static int __maybe_unused stm32_ospi_suspend(struct device *dev) pinctrl_pm_select_sleep_state(dev); + reset_control_release(ospi->rstc); + return pm_runtime_force_suspend(ospi->dev); } @@ -1016,6 +1022,12 @@ static int __maybe_unused stm32_ospi_resume(struct device *dev) if (ret < 0) return ret; + ret = reset_control_acquire(ospi->rstc); + if (ret) { + dev_err(dev, "Can not acquire reset\n"); + return ret; + } + writel_relaxed(ospi->cr_reg, regs_base + OSPI_CR); writel_relaxed(ospi->dcr_reg, regs_base + OSPI_DCR1); pm_runtime_mark_last_busy(ospi->dev); From e34dbbc85d64af59176fe59fad7b4122f4330fe2 Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Mon, 9 Jun 2025 01:40:53 -0700 Subject: [PATCH 035/242] x86/fred/signal: Prevent immediate repeat of single step trap on return from SIGTRAP handler Clear the software event flag in the augmented SS to prevent immediate repeat of single step trap on return from SIGTRAP handler if the trap flag (TF) is set without an external debugger attached. Following is a typical single-stepping flow for a user process: 1) The user process is prepared for single-stepping by setting RFLAGS.TF = 1. 2) When any instruction in user space completes, a #DB is triggered. 3) The kernel handles the #DB and returns to user space, invoking the SIGTRAP handler with RFLAGS.TF = 0. 4) After the SIGTRAP handler finishes, the user process performs a sigreturn syscall, restoring the original state, including RFLAGS.TF = 1. 5) Goto step 2. According to the FRED specification: A) Bit 17 in the augmented SS is designated as the software event flag, which is set to 1 for FRED event delivery of SYSCALL, SYSENTER, or INT n. B) If bit 17 of the augmented SS is 1 and ERETU would result in RFLAGS.TF = 1, a single-step trap will be pending upon completion of ERETU. In step 4) above, the software event flag is set upon the sigreturn syscall, and its corresponding ERETU would restore RFLAGS.TF = 1. This combination causes a pending single-step trap upon completion of ERETU. Therefore, another #DB is triggered before any user space instruction is executed, which leads to an infinite loop in which the SIGTRAP handler keeps being invoked on the same user space IP. Fixes: 14619d912b65 ("x86/fred: FRED entry/exit and dispatch code") Suggested-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li (Intel) Signed-off-by: Dave Hansen Tested-by: Sohil Mehta Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20250609084054.2083189-2-xin%40zytor.com --- arch/x86/include/asm/sighandling.h | 22 ++++++++++++++++++++++ arch/x86/kernel/signal_32.c | 4 ++++ arch/x86/kernel/signal_64.c | 4 ++++ 3 files changed, 30 insertions(+) diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index e770c4fc47f4..8727c7e21dd1 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -24,4 +24,26 @@ int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); +/* + * To prevent immediate repeat of single step trap on return from SIGTRAP + * handler if the trap flag (TF) is set without an external debugger attached, + * clear the software event flag in the augmented SS, ensuring no single-step + * trap is pending upon ERETU completion. + * + * Note, this function should be called in sigreturn() before the original + * state is restored to make sure the TF is read from the entry frame. + */ +static __always_inline void prevent_single_step_upon_eretu(struct pt_regs *regs) +{ + /* + * If the trap flag (TF) is set, i.e., the sigreturn() SYSCALL instruction + * is being single-stepped, do not clear the software event flag in the + * augmented SS, thus a debugger won't skip over the following instruction. + */ +#ifdef CONFIG_X86_FRED + if (!(regs->flags & X86_EFLAGS_TF)) + regs->fred_ss.swevent = 0; +#endif +} + #endif /* _ASM_X86_SIGHANDLING_H */ diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 98123ff10506..42bbc42bd350 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -152,6 +152,8 @@ SYSCALL32_DEFINE0(sigreturn) struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); sigset_t set; + prevent_single_step_upon_eretu(regs); + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], &frame->sc.oldmask) @@ -175,6 +177,8 @@ SYSCALL32_DEFINE0(rt_sigreturn) struct rt_sigframe_ia32 __user *frame; sigset_t set; + prevent_single_step_upon_eretu(regs); + frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); if (!access_ok(frame, sizeof(*frame))) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index ee9453891901..d483b585c6c6 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -250,6 +250,8 @@ SYSCALL_DEFINE0(rt_sigreturn) sigset_t set; unsigned long uc_flags; + prevent_single_step_upon_eretu(regs); + frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(frame, sizeof(*frame))) goto badframe; @@ -366,6 +368,8 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) sigset_t set; unsigned long uc_flags; + prevent_single_step_upon_eretu(regs); + frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); if (!access_ok(frame, sizeof(*frame))) From f287822688eeb44ae1cf6ac45701d965efc33218 Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Mon, 9 Jun 2025 01:40:54 -0700 Subject: [PATCH 036/242] selftests/x86: Add a test to detect infinite SIGTRAP handler loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When FRED is enabled, if the Trap Flag (TF) is set without an external debugger attached, it can lead to an infinite loop in the SIGTRAP handler. To avoid this, the software event flag in the augmented SS must be cleared, ensuring that no single-step trap remains pending when ERETU completes. This test checks for that specific scenario—verifying whether the kernel correctly prevents an infinite SIGTRAP loop in this edge case when FRED is enabled. The test should _always_ pass with IDT event delivery, thus no need to disable the test even when FRED is not enabled. Signed-off-by: Xin Li (Intel) Signed-off-by: Dave Hansen Tested-by: Sohil Mehta Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20250609084054.2083189-3-xin%40zytor.com --- tools/testing/selftests/x86/Makefile | 2 +- tools/testing/selftests/x86/sigtrap_loop.c | 101 +++++++++++++++++++++ 2 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/x86/sigtrap_loop.c diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index f703fcfe9f7c..83148875a12c 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -12,7 +12,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh "$(CC)" trivial_program.c -no-pie) TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \ check_initial_reg_state sigreturn iopl ioperm \ - test_vsyscall mov_ss_trap \ + test_vsyscall mov_ss_trap sigtrap_loop \ syscall_arg_fault fsgsbase_restore sigaltstack TARGETS_C_BOTHBITS += nx_stack TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ diff --git a/tools/testing/selftests/x86/sigtrap_loop.c b/tools/testing/selftests/x86/sigtrap_loop.c new file mode 100644 index 000000000000..9d065479e89f --- /dev/null +++ b/tools/testing/selftests/x86/sigtrap_loop.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Intel Corporation + */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include + +#ifdef __x86_64__ +# define REG_IP REG_RIP +#else +# define REG_IP REG_EIP +#endif + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) +{ + struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); + + return; +} + +static void sigtrap(int sig, siginfo_t *info, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t *)ctx_void; + static unsigned int loop_count_on_same_ip; + static unsigned long last_trap_ip; + + if (last_trap_ip == ctx->uc_mcontext.gregs[REG_IP]) { + printf("\tTrapped at %016lx\n", last_trap_ip); + + /* + * If the same IP is hit more than 10 times in a row, it is + * _considered_ an infinite loop. + */ + if (++loop_count_on_same_ip > 10) { + printf("[FAIL]\tDetected SIGTRAP infinite loop\n"); + exit(1); + } + + return; + } + + loop_count_on_same_ip = 0; + last_trap_ip = ctx->uc_mcontext.gregs[REG_IP]; + printf("\tTrapped at %016lx\n", last_trap_ip); +} + +int main(int argc, char *argv[]) +{ + sethandler(SIGTRAP, sigtrap, 0); + + /* + * Set the Trap Flag (TF) to single-step the test code, therefore to + * trigger a SIGTRAP signal after each instruction until the TF is + * cleared. + * + * Because the arithmetic flags are not significant here, the TF is + * set by pushing 0x302 onto the stack and then popping it into the + * flags register. + * + * Four instructions in the following asm code are executed with the + * TF set, thus the SIGTRAP handler is expected to run four times. + */ + printf("[RUN]\tSIGTRAP infinite loop detection\n"); + asm volatile( +#ifdef __x86_64__ + /* + * Avoid clobbering the redzone + * + * Equivalent to "sub $128, %rsp", however -128 can be encoded + * in a single byte immediate while 128 uses 4 bytes. + */ + "add $-128, %rsp\n\t" +#endif + "push $0x302\n\t" + "popf\n\t" + "nop\n\t" + "nop\n\t" + "push $0x202\n\t" + "popf\n\t" +#ifdef __x86_64__ + "sub $-128, %rsp\n\t" +#endif + ); + + printf("[OK]\tNo SIGTRAP infinite loop detected\n"); + return 0; +} From e044b8a9545cd8265c7110c179aeec2624c16455 Mon Sep 17 00:00:00 2001 From: "Francesco Poli (wintermute)" Date: Wed, 21 May 2025 23:14:39 +0200 Subject: [PATCH 037/242] cpupower: split unitdir from libdir in Makefile Improve the installation procedure for the systemd service unit 'cpupower.service', to be more flexible. Some distros install libraries to /usr/lib64/, but systemd service units have to be installed to /usr/lib/systemd/system: as a consequence, the installation procedure should not assume that systemd service units can be installed to ${libdir}/systemd/system ... Define a dedicated variable ("unitdir") in the Makefile. Link: https://lore.kernel.org/linux-pm/260b6d79-ab61-43b7-a0eb-813e257bc028@leemhuis.info/T/#m0601940ab439d5cbd288819d2af190ce59e810e6 Fixes: 9c70b779ad91 ("cpupower: add a systemd service to run cpupower") Link: https://lore.kernel.org/r/20250521211656.65646-1-invernomuto@paranoici.org Signed-off-by: Francesco Poli (wintermute) Tested-by: Thorsten Leemhuis Signed-off-by: Shuah Khan --- tools/power/cpupower/Makefile | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index be8dfac14076..c43db1c41205 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -73,6 +73,7 @@ sbindir ?= /usr/sbin mandir ?= /usr/man libdir ?= /usr/lib libexecdir ?= /usr/libexec +unitdir ?= /usr/lib/systemd/system includedir ?= /usr/include localedir ?= /usr/share/locale docdir ?= /usr/share/doc/packages/cpupower @@ -309,9 +310,9 @@ install-tools: $(OUTPUT)cpupower $(INSTALL_DATA) cpupower-service.conf '$(DESTDIR)${confdir}' $(INSTALL) -d $(DESTDIR)${libexecdir} $(INSTALL_PROGRAM) cpupower.sh '$(DESTDIR)${libexecdir}/cpupower' - $(INSTALL) -d $(DESTDIR)${libdir}/systemd/system - sed 's|___CDIR___|${confdir}|; s|___LDIR___|${libexecdir}|' cpupower.service.in > '$(DESTDIR)${libdir}/systemd/system/cpupower.service' - $(SETPERM_DATA) '$(DESTDIR)${libdir}/systemd/system/cpupower.service' + $(INSTALL) -d $(DESTDIR)${unitdir} + sed 's|___CDIR___|${confdir}|; s|___LDIR___|${libexecdir}|' cpupower.service.in > '$(DESTDIR)${unitdir}/cpupower.service' + $(SETPERM_DATA) '$(DESTDIR)${unitdir}/cpupower.service' install-man: $(INSTALL_DATA) -D man/cpupower.1 $(DESTDIR)${mandir}/man1/cpupower.1 @@ -348,7 +349,7 @@ uninstall: - rm -f $(DESTDIR)${bindir}/utils/cpupower - rm -f $(DESTDIR)${confdir}cpupower-service.conf - rm -f $(DESTDIR)${libexecdir}/cpupower - - rm -f $(DESTDIR)${libdir}/systemd/system/cpupower.service + - rm -f $(DESTDIR)${unitdir}/cpupower.service - rm -f $(DESTDIR)${mandir}/man1/cpupower.1 - rm -f $(DESTDIR)${mandir}/man1/cpupower-frequency-set.1 - rm -f $(DESTDIR)${mandir}/man1/cpupower-frequency-info.1 From ad0f54842cd23127070d8c310dd06d8f7add025c Mon Sep 17 00:00:00 2001 From: Ankit Chauhan Date: Wed, 28 May 2025 16:36:04 +0530 Subject: [PATCH 038/242] scsi: mvsas: Fix typos in per-phy comments and SAS cmd port registers Spelling fixes: Deocder --> Decoder Memroy --> Memory This is a non-functional change aimed at improving code clarity. Signed-off-by: Ankit Chauhan Link: https://lore.kernel.org/r/20250528110604.59528-1-ankitchauhan2065@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/mvsas/mv_defs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/mvsas/mv_defs.h b/drivers/scsi/mvsas/mv_defs.h index 8ef174cd4d37..3e4124177b2a 100644 --- a/drivers/scsi/mvsas/mv_defs.h +++ b/drivers/scsi/mvsas/mv_defs.h @@ -215,7 +215,7 @@ enum hw_register_bits { /* MVS_Px_INT_STAT, MVS_Px_INT_MASK (per-phy events) */ PHYEV_DEC_ERR = (1U << 24), /* Phy Decoding Error */ - PHYEV_DCDR_ERR = (1U << 23), /* STP Deocder Error */ + PHYEV_DCDR_ERR = (1U << 23), /* STP Decoder Error */ PHYEV_CRC_ERR = (1U << 22), /* STP CRC Error */ PHYEV_UNASSOC_FIS = (1U << 19), /* unassociated FIS rx'd */ PHYEV_AN = (1U << 18), /* SATA async notification */ @@ -347,7 +347,7 @@ enum sas_cmd_port_registers { CMD_SATA_PORT_MEM_CTL0 = 0x158, /* SATA Port Memory Control 0 */ CMD_SATA_PORT_MEM_CTL1 = 0x15c, /* SATA Port Memory Control 1 */ CMD_XOR_MEM_BIST_CTL = 0x160, /* XOR Memory BIST Control */ - CMD_XOR_MEM_BIST_STAT = 0x164, /* XOR Memroy BIST Status */ + CMD_XOR_MEM_BIST_STAT = 0x164, /* XOR Memory BIST Status */ CMD_DMA_MEM_BIST_CTL = 0x168, /* DMA Memory BIST Control */ CMD_DMA_MEM_BIST_STAT = 0x16c, /* DMA Memory BIST Status */ CMD_PORT_MEM_BIST_CTL = 0x170, /* Port Memory BIST Control */ From 9b17621366d210ffee83262a8754086ebbde5e55 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Fri, 30 May 2025 12:29:35 -0700 Subject: [PATCH 039/242] scsi: iscsi: Fix incorrect error path labels for flashnode operations Correct the error handling goto labels used when host lookup fails in various flashnode-related event handlers: - iscsi_new_flashnode() - iscsi_del_flashnode() - iscsi_login_flashnode() - iscsi_logout_flashnode() - iscsi_logout_flashnode_sid() scsi_host_put() is not required when shost is NULL, so jumping to the correct label avoids unnecessary operations. These functions previously jumped to the wrong goto label (put_host), which did not match the intended cleanup logic. Use the correct exit labels (exit_new_fnode, exit_del_fnode, etc.) to ensure proper error handling. Also remove the unused put_host label under iscsi_new_flashnode() as it is no longer needed. No functional changes beyond accurate error path correction. Fixes: c6a4bb2ef596 ("[SCSI] scsi_transport_iscsi: Add flash node mgmt support") Signed-off-by: Alok Tiwari Link: https://lore.kernel.org/r/20250530193012.3312911-1-alok.a.tiwari@oracle.com Reviewed-by: Mike Christie Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_transport_iscsi.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 0b8c91bf793f..c75a806496d6 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -3499,7 +3499,7 @@ static int iscsi_new_flashnode(struct iscsi_transport *transport, pr_err("%s could not find host no %u\n", __func__, ev->u.new_flashnode.host_no); err = -ENODEV; - goto put_host; + goto exit_new_fnode; } index = transport->new_flashnode(shost, data, len); @@ -3509,7 +3509,6 @@ static int iscsi_new_flashnode(struct iscsi_transport *transport, else err = -EIO; -put_host: scsi_host_put(shost); exit_new_fnode: @@ -3534,7 +3533,7 @@ static int iscsi_del_flashnode(struct iscsi_transport *transport, pr_err("%s could not find host no %u\n", __func__, ev->u.del_flashnode.host_no); err = -ENODEV; - goto put_host; + goto exit_del_fnode; } idx = ev->u.del_flashnode.flashnode_idx; @@ -3576,7 +3575,7 @@ static int iscsi_login_flashnode(struct iscsi_transport *transport, pr_err("%s could not find host no %u\n", __func__, ev->u.login_flashnode.host_no); err = -ENODEV; - goto put_host; + goto exit_login_fnode; } idx = ev->u.login_flashnode.flashnode_idx; @@ -3628,7 +3627,7 @@ static int iscsi_logout_flashnode(struct iscsi_transport *transport, pr_err("%s could not find host no %u\n", __func__, ev->u.logout_flashnode.host_no); err = -ENODEV; - goto put_host; + goto exit_logout_fnode; } idx = ev->u.logout_flashnode.flashnode_idx; @@ -3678,7 +3677,7 @@ static int iscsi_logout_flashnode_sid(struct iscsi_transport *transport, pr_err("%s could not find host no %u\n", __func__, ev->u.logout_flashnode.host_no); err = -ENODEV; - goto put_host; + goto exit_logout_sid; } session = iscsi_session_lookup(ev->u.logout_flashnode_sid.sid); From 9697ca0d53e3db357be26d2414276143c4a2cd49 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Tue, 3 Jun 2025 20:21:56 +0200 Subject: [PATCH 040/242] scsi: s390: zfcp: Ensure synchronous unit_add Improve the usability of the unit_add sysfs attribute by ensuring that the associated FCP LUN scan processing is completed synchronously. This enables configuration tooling to consistently determine the end of the scan process to allow for serialization of follow-on actions. While the scan process associated with unit_add typically completes synchronously, it is deferred to an asynchronous background process if unit_add is used before initial remote port scanning has completed. This occurs when unit_add is used immediately after setting the associated FCP device online. To ensure synchronous unit_add processing, wait for remote port scanning to complete before initiating the FCP LUN scan. Cc: stable@vger.kernel.org Reviewed-by: M Nikhil Reviewed-by: Nihar Panda Signed-off-by: Peter Oberparleiter Signed-off-by: Nihar Panda Link: https://lore.kernel.org/r/20250603182252.2287285-2-niharp@linux.ibm.com Signed-off-by: Martin K. Petersen --- drivers/s390/scsi/zfcp_sysfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/s390/scsi/zfcp_sysfs.c b/drivers/s390/scsi/zfcp_sysfs.c index 41e36af35488..90a84ae98b97 100644 --- a/drivers/s390/scsi/zfcp_sysfs.c +++ b/drivers/s390/scsi/zfcp_sysfs.c @@ -449,6 +449,8 @@ static ssize_t zfcp_sysfs_unit_add_store(struct device *dev, if (kstrtoull(buf, 0, (unsigned long long *) &fcp_lun)) return -EINVAL; + flush_work(&port->rport_work); + retval = zfcp_unit_add(port, fcp_lun); if (retval) return retval; From 93310053663ba647e402ef67e4bb18ec06ff8dc4 Mon Sep 17 00:00:00 2001 From: Philipp Kerling Date: Sun, 8 Jun 2025 20:59:00 +0200 Subject: [PATCH 041/242] smb: client: disable path remapping with POSIX extensions If SMB 3.1.1 POSIX Extensions are available and negotiated, the client should be able to use all characters and not remap anything. Currently, the user has to explicitly request this behavior by specifying the "nomapposix" mount option. Link: https://lore.kernel.org/4195bb677b33d680e77549890a4f4dd3b474ceaf.camel@rx2.rx-server.de Signed-off-by: Philipp Kerling Reviewed-by: Namjae Jeon Signed-off-by: Steve French --- Documentation/admin-guide/cifs/usage.rst | 2 ++ fs/smb/client/connect.c | 10 ++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/cifs/usage.rst b/Documentation/admin-guide/cifs/usage.rst index c09674a75a9e..d989ae5778ba 100644 --- a/Documentation/admin-guide/cifs/usage.rst +++ b/Documentation/admin-guide/cifs/usage.rst @@ -270,6 +270,8 @@ configured for Unix Extensions (and the client has not disabled illegal Windows/NTFS/SMB characters to a remap range (this mount parameter is the default for SMB3). This remap (``mapposix``) range is also compatible with Mac (and "Services for Mac" on some older Windows). +When POSIX Extensions for SMB 3.1.1 are negotiated, remapping is automatically +disabled. CIFS VFS Mount Options ====================== diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 28bc33496623..c4fb80b37738 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -3718,9 +3718,15 @@ int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx) goto out; } - /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */ - if (tcon->posix_extensions) + /* + * if new SMB3.11 POSIX extensions are supported, do not change anything in the + * path (i.e., do not remap / and \ and do not map any special characters) + */ + if (tcon->posix_extensions) { cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS; + cifs_sb->mnt_cifs_flags &= ~(CIFS_MOUNT_MAP_SFM_CHR | + CIFS_MOUNT_MAP_SPECIAL_CHR); + } #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY /* tell server which Unix caps we support */ From b2f966568faaad326de97481096d0f3dc0971c43 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Fri, 6 Jun 2025 13:57:39 -0700 Subject: [PATCH 042/242] scsi: storvsc: Increase the timeouts to storvsc_timeout Currently storvsc_timeout is only used in storvsc_sdev_configure(), and 5s and 10s are used elsewhere. It turns out that rarely the 5s is not enough on Azure, so let's use storvsc_timeout everywhere. In case a timeout happens and storvsc_channel_init() returns an error, close the VMBus channel so that any host-to-guest messages in the channel's ringbuffer, which might come late, can be safely ignored. Add a "const" to storvsc_timeout. Cc: stable@kernel.org Signed-off-by: Dexuan Cui Link: https://lore.kernel.org/r/1749243459-10419-1-git-send-email-decui@microsoft.com Reviewed-by: Long Li Signed-off-by: Martin K. Petersen --- drivers/scsi/storvsc_drv.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 2e6b2412d2c9..d9e59204a9c3 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -362,7 +362,7 @@ MODULE_PARM_DESC(ring_avail_percent_lowater, /* * Timeout in seconds for all devices managed by this driver. */ -static int storvsc_timeout = 180; +static const int storvsc_timeout = 180; #if IS_ENABLED(CONFIG_SCSI_FC_ATTRS) static struct scsi_transport_template *fc_transport_template; @@ -768,7 +768,7 @@ static void handle_multichannel_storage(struct hv_device *device, int max_chns) return; } - t = wait_for_completion_timeout(&request->wait_event, 10*HZ); + t = wait_for_completion_timeout(&request->wait_event, storvsc_timeout * HZ); if (t == 0) { dev_err(dev, "Failed to create sub-channel: timed out\n"); return; @@ -833,7 +833,7 @@ static int storvsc_execute_vstor_op(struct hv_device *device, if (ret != 0) return ret; - t = wait_for_completion_timeout(&request->wait_event, 5*HZ); + t = wait_for_completion_timeout(&request->wait_event, storvsc_timeout * HZ); if (t == 0) return -ETIMEDOUT; @@ -1350,6 +1350,8 @@ static int storvsc_connect_to_vsp(struct hv_device *device, u32 ring_size, return ret; ret = storvsc_channel_init(device, is_fc); + if (ret) + vmbus_close(device->channel); return ret; } @@ -1668,7 +1670,7 @@ static int storvsc_host_reset_handler(struct scsi_cmnd *scmnd) if (ret != 0) return FAILED; - t = wait_for_completion_timeout(&request->wait_event, 5*HZ); + t = wait_for_completion_timeout(&request->wait_event, storvsc_timeout * HZ); if (t == 0) return TIMEOUT_ERROR; From 5c3ba81923e02adae354ec8afd006f93289b4a3c Mon Sep 17 00:00:00 2001 From: Rajashekhar M A Date: Fri, 6 Jun 2025 15:59:24 +0200 Subject: [PATCH 043/242] scsi: error: alua: I/O errors for ALUA state transitions When a host is configured with a few LUNs and I/O is running, injecting FC faults repeatedly leads to path recovery problems. The LUNs have 4 paths each and 3 of them come back active after say an FC fault which makes 2 of the paths go down, instead of all 4. This happens after several iterations of continuous FC faults. Reason here is that we're returning an I/O error whenever we're encountering sense code 06/04/0a (LOGICAL UNIT NOT ACCESSIBLE, ASYMMETRIC ACCESS STATE TRANSITION) instead of retrying. Signed-off-by: Rajashekhar M A Signed-off-by: Hannes Reinecke Link: https://lore.kernel.org/r/20250606135924.27397-1-hare@kernel.org Reviewed-by: Lee Duncan Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_error.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 376b8897ab90..746ff6a1f309 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -665,7 +665,8 @@ enum scsi_disposition scsi_check_sense(struct scsi_cmnd *scmd) * if the device is in the process of becoming ready, we * should retry. */ - if ((sshdr.asc == 0x04) && (sshdr.ascq == 0x01)) + if ((sshdr.asc == 0x04) && + (sshdr.ascq == 0x01 || sshdr.ascq == 0x0a)) return NEEDS_RETRY; /* * if the device is not started, we need to wake From d9db3a941270d92bbd1a6a6b54a10324484f2f2d Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 21 May 2025 19:34:55 -0500 Subject: [PATCH 044/242] platform/x86/amd: pmf: Use device managed allocations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If setting up smart PC fails for any reason then this can lead to a double free when unloading amd-pmf. This is because dev->buf was freed but never set to NULL and is again freed in amd_pmf_remove(). To avoid subtle allocation bugs in failures leading to a double free change all allocations into device managed allocations. Fixes: 5b1122fc4995f ("platform/x86/amd/pmf: fix cleanup in amd_pmf_init_smart_pc()") Link: https://lore.kernel.org/r/20250512211154.2510397-2-superm1@kernel.org Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20250522003457.1516679-2-superm1@kernel.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/core.c | 3 +- drivers/platform/x86/amd/pmf/tee-if.c | 56 ++++++++++----------------- 2 files changed, 22 insertions(+), 37 deletions(-) diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index 76910601cac8..ef988605c4da 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -280,7 +280,7 @@ int amd_pmf_set_dram_addr(struct amd_pmf_dev *dev, bool alloc_buffer) dev_err(dev->dev, "Invalid CPU id: 0x%x", dev->cpu_id); } - dev->buf = kzalloc(dev->mtable_size, GFP_KERNEL); + dev->buf = devm_kzalloc(dev->dev, dev->mtable_size, GFP_KERNEL); if (!dev->buf) return -ENOMEM; } @@ -493,7 +493,6 @@ static void amd_pmf_remove(struct platform_device *pdev) mutex_destroy(&dev->lock); mutex_destroy(&dev->update_mutex); mutex_destroy(&dev->cb_mutex); - kfree(dev->buf); } static const struct attribute_group *amd_pmf_driver_groups[] = { diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c index d3bd12ad036a..027e992b7147 100644 --- a/drivers/platform/x86/amd/pmf/tee-if.c +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -358,30 +358,28 @@ static ssize_t amd_pmf_get_pb_data(struct file *filp, const char __user *buf, return -EINVAL; /* re-alloc to the new buffer length of the policy binary */ - new_policy_buf = memdup_user(buf, length); - if (IS_ERR(new_policy_buf)) - return PTR_ERR(new_policy_buf); + new_policy_buf = devm_kzalloc(dev->dev, length, GFP_KERNEL); + if (!new_policy_buf) + return -ENOMEM; - kfree(dev->policy_buf); + if (copy_from_user(new_policy_buf, buf, length)) { + devm_kfree(dev->dev, new_policy_buf); + return -EFAULT; + } + + devm_kfree(dev->dev, dev->policy_buf); dev->policy_buf = new_policy_buf; dev->policy_sz = length; - if (!amd_pmf_pb_valid(dev)) { - ret = -EINVAL; - goto cleanup; - } + if (!amd_pmf_pb_valid(dev)) + return -EINVAL; amd_pmf_hex_dump_pb(dev); ret = amd_pmf_start_policy_engine(dev); if (ret < 0) - goto cleanup; + return ret; return length; - -cleanup: - kfree(dev->policy_buf); - dev->policy_buf = NULL; - return ret; } static const struct file_operations pb_fops = { @@ -532,13 +530,13 @@ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) dev->policy_base = devm_ioremap_resource(dev->dev, dev->res); if (IS_ERR(dev->policy_base)) { ret = PTR_ERR(dev->policy_base); - goto err_free_dram_buf; + goto err_cancel_work; } - dev->policy_buf = kzalloc(dev->policy_sz, GFP_KERNEL); + dev->policy_buf = devm_kzalloc(dev->dev, dev->policy_sz, GFP_KERNEL); if (!dev->policy_buf) { ret = -ENOMEM; - goto err_free_dram_buf; + goto err_cancel_work; } memcpy_fromio(dev->policy_buf, dev->policy_base, dev->policy_sz); @@ -546,21 +544,21 @@ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) if (!amd_pmf_pb_valid(dev)) { dev_info(dev->dev, "No Smart PC policy present\n"); ret = -EINVAL; - goto err_free_policy; + goto err_cancel_work; } amd_pmf_hex_dump_pb(dev); - dev->prev_data = kzalloc(sizeof(*dev->prev_data), GFP_KERNEL); + dev->prev_data = devm_kzalloc(dev->dev, sizeof(*dev->prev_data), GFP_KERNEL); if (!dev->prev_data) { ret = -ENOMEM; - goto err_free_policy; + goto err_cancel_work; } for (i = 0; i < ARRAY_SIZE(amd_pmf_ta_uuid); i++) { ret = amd_pmf_tee_init(dev, &amd_pmf_ta_uuid[i]); if (ret) - goto err_free_prev_data; + goto err_cancel_work; ret = amd_pmf_start_policy_engine(dev); switch (ret) { @@ -575,7 +573,7 @@ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) default: ret = -EINVAL; amd_pmf_tee_deinit(dev); - goto err_free_prev_data; + goto err_cancel_work; } if (status) @@ -584,7 +582,7 @@ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) if (!status && !pb_side_load) { ret = -EINVAL; - goto err_free_prev_data; + goto err_cancel_work; } if (pb_side_load) @@ -600,12 +598,6 @@ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) if (pb_side_load && dev->esbin) amd_pmf_remove_pb(dev); amd_pmf_tee_deinit(dev); -err_free_prev_data: - kfree(dev->prev_data); -err_free_policy: - kfree(dev->policy_buf); -err_free_dram_buf: - kfree(dev->buf); err_cancel_work: cancel_delayed_work_sync(&dev->pb_work); @@ -621,11 +613,5 @@ void amd_pmf_deinit_smart_pc(struct amd_pmf_dev *dev) amd_pmf_remove_pb(dev); cancel_delayed_work_sync(&dev->pb_work); - kfree(dev->prev_data); - dev->prev_data = NULL; - kfree(dev->policy_buf); - dev->policy_buf = NULL; - kfree(dev->buf); - dev->buf = NULL; amd_pmf_tee_deinit(dev); } From 93103d56650d7a38ed37ba4041578310f82776ae Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 21 May 2025 19:34:56 -0500 Subject: [PATCH 045/242] platform/x86/amd: pmf: Prevent amd_pmf_tee_deinit() from running twice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If any of the tee init fails, pass up the errors and clear the tee_ctx pointer. This will prevent cleaning up multiple times. Fixes: ac052d8c08f9d ("platform/x86/amd/pmf: Add PMF TEE interface") Suggested-by: Dan Carpenter Link: https://lore.kernel.org/r/20250512211154.2510397-3-superm1@kernel.org Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20250522003457.1516679-3-superm1@kernel.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/tee-if.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c index 027e992b7147..76efce48a45c 100644 --- a/drivers/platform/x86/amd/pmf/tee-if.c +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -420,12 +420,12 @@ static int amd_pmf_ta_open_session(struct tee_context *ctx, u32 *id, const uuid_ rc = tee_client_open_session(ctx, &sess_arg, NULL); if (rc < 0 || sess_arg.ret != 0) { pr_err("Failed to open TEE session err:%#x, rc:%d\n", sess_arg.ret, rc); - return rc; + return rc ?: -EINVAL; } *id = sess_arg.session; - return rc; + return 0; } static int amd_pmf_register_input_device(struct amd_pmf_dev *dev) @@ -460,7 +460,9 @@ static int amd_pmf_tee_init(struct amd_pmf_dev *dev, const uuid_t *uuid) dev->tee_ctx = tee_client_open_context(NULL, amd_pmf_amdtee_ta_match, NULL, NULL); if (IS_ERR(dev->tee_ctx)) { dev_err(dev->dev, "Failed to open TEE context\n"); - return PTR_ERR(dev->tee_ctx); + ret = PTR_ERR(dev->tee_ctx); + dev->tee_ctx = NULL; + return ret; } ret = amd_pmf_ta_open_session(dev->tee_ctx, &dev->session_id, uuid); @@ -500,9 +502,12 @@ static int amd_pmf_tee_init(struct amd_pmf_dev *dev, const uuid_t *uuid) static void amd_pmf_tee_deinit(struct amd_pmf_dev *dev) { + if (!dev->tee_ctx) + return; tee_shm_free(dev->fw_shm_pool); tee_client_close_session(dev->tee_ctx, dev->session_id); tee_client_close_context(dev->tee_ctx); + dev->tee_ctx = NULL; } int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) From d7186dfd41924ef01ef490be5b82ab63cc417b31 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 21 May 2025 19:34:57 -0500 Subject: [PATCH 046/242] platform/x86/amd: pmf: Simplify error flow in amd_pmf_init_smart_pc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 5b1122fc4995f ("platform/x86/amd/pmf: fix cleanup in amd_pmf_init_smart_pc()") adjusted the error handling flow to use a ladder but this isn't actually needed because work is only scheduled in amd_pmf_start_policy_engine() and with device managed cleanups pointers for allocations don't need to be freed. Adjust the error flow to a single call to amd_pmf_deinit_smart_pc() for the cases that need to clean up. Cc: Dan Carpenter Link: https://lore.kernel.org/r/20250512211154.2510397-4-superm1@kernel.org Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20250522003457.1516679-4-superm1@kernel.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/tee-if.c | 55 ++++++++------------------- 1 file changed, 16 insertions(+), 39 deletions(-) diff --git a/drivers/platform/x86/amd/pmf/tee-if.c b/drivers/platform/x86/amd/pmf/tee-if.c index 76efce48a45c..4f626ebcb619 100644 --- a/drivers/platform/x86/amd/pmf/tee-if.c +++ b/drivers/platform/x86/amd/pmf/tee-if.c @@ -530,64 +530,45 @@ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) ret = amd_pmf_set_dram_addr(dev, true); if (ret) - goto err_cancel_work; + return ret; dev->policy_base = devm_ioremap_resource(dev->dev, dev->res); - if (IS_ERR(dev->policy_base)) { - ret = PTR_ERR(dev->policy_base); - goto err_cancel_work; - } + if (IS_ERR(dev->policy_base)) + return PTR_ERR(dev->policy_base); dev->policy_buf = devm_kzalloc(dev->dev, dev->policy_sz, GFP_KERNEL); - if (!dev->policy_buf) { - ret = -ENOMEM; - goto err_cancel_work; - } + if (!dev->policy_buf) + return -ENOMEM; memcpy_fromio(dev->policy_buf, dev->policy_base, dev->policy_sz); if (!amd_pmf_pb_valid(dev)) { dev_info(dev->dev, "No Smart PC policy present\n"); - ret = -EINVAL; - goto err_cancel_work; + return -EINVAL; } amd_pmf_hex_dump_pb(dev); dev->prev_data = devm_kzalloc(dev->dev, sizeof(*dev->prev_data), GFP_KERNEL); - if (!dev->prev_data) { - ret = -ENOMEM; - goto err_cancel_work; - } + if (!dev->prev_data) + return -ENOMEM; for (i = 0; i < ARRAY_SIZE(amd_pmf_ta_uuid); i++) { ret = amd_pmf_tee_init(dev, &amd_pmf_ta_uuid[i]); if (ret) - goto err_cancel_work; + return ret; ret = amd_pmf_start_policy_engine(dev); - switch (ret) { - case TA_PMF_TYPE_SUCCESS: - status = true; - break; - case TA_ERROR_CRYPTO_INVALID_PARAM: - case TA_ERROR_CRYPTO_BIN_TOO_LARGE: - amd_pmf_tee_deinit(dev); - status = false; - break; - default: - ret = -EINVAL; - amd_pmf_tee_deinit(dev); - goto err_cancel_work; - } - + dev_dbg(dev->dev, "start policy engine ret: %d\n", ret); + status = ret == TA_PMF_TYPE_SUCCESS; if (status) break; + amd_pmf_tee_deinit(dev); } if (!status && !pb_side_load) { ret = -EINVAL; - goto err_cancel_work; + goto err; } if (pb_side_load) @@ -595,16 +576,12 @@ int amd_pmf_init_smart_pc(struct amd_pmf_dev *dev) ret = amd_pmf_register_input_device(dev); if (ret) - goto err_pmf_remove_pb; + goto err; return 0; -err_pmf_remove_pb: - if (pb_side_load && dev->esbin) - amd_pmf_remove_pb(dev); - amd_pmf_tee_deinit(dev); -err_cancel_work: - cancel_delayed_work_sync(&dev->pb_work); +err: + amd_pmf_deinit_smart_pc(dev); return ret; } From a2f32c7467e843727f20cae0e9b1545e1504a977 Mon Sep 17 00:00:00 2001 From: Stuart Hayes Date: Mon, 9 Jun 2025 13:46:55 -0500 Subject: [PATCH 047/242] platform/x86: dell_rbu: Fix lock context warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a sparse lock context warning. Signed-off-by: Stuart Hayes Link: https://lore.kernel.org/r/20250609184659.7210-2-stuart.w.hayes@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/dell_rbu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/dell/dell_rbu.c b/drivers/platform/x86/dell/dell_rbu.c index e30ca325938c..7b019fb72e86 100644 --- a/drivers/platform/x86/dell/dell_rbu.c +++ b/drivers/platform/x86/dell/dell_rbu.c @@ -91,7 +91,7 @@ static void init_packet_head(void) rbu_data.imagesize = 0; } -static int create_packet(void *data, size_t length) +static int create_packet(void *data, size_t length) __must_hold(&rbu_data.lock) { struct packet_data *newpacket; int ordernum = 0; From 61ce04601e0d8265ec6d2ffa6df5a7e1bce64854 Mon Sep 17 00:00:00 2001 From: Stuart Hayes Date: Mon, 9 Jun 2025 13:46:56 -0500 Subject: [PATCH 048/242] platform/x86: dell_rbu: Fix list usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass the correct list head to list_for_each_entry*() when looping through the packet list. Without this patch, reading the packet data via sysfs will show the data incorrectly (because it starts at the wrong packet), and clearing the packet list will result in a NULL pointer dereference. Fixes: d19f359fbdc6 ("platform/x86: dell_rbu: don't open code list_for_each_entry*()") Signed-off-by: Stuart Hayes Link: https://lore.kernel.org/r/20250609184659.7210-3-stuart.w.hayes@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/dell_rbu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/dell/dell_rbu.c b/drivers/platform/x86/dell/dell_rbu.c index 7b019fb72e86..722979b19e0e 100644 --- a/drivers/platform/x86/dell/dell_rbu.c +++ b/drivers/platform/x86/dell/dell_rbu.c @@ -292,7 +292,7 @@ static int packet_read_list(char *data, size_t * pread_length) remaining_bytes = *pread_length; bytes_read = rbu_data.packet_read_count; - list_for_each_entry(newpacket, (&packet_data_head.list)->next, list) { + list_for_each_entry(newpacket, &packet_data_head.list, list) { bytes_copied = do_packet_read(pdest, newpacket, remaining_bytes, bytes_read, &temp_count); remaining_bytes -= bytes_copied; @@ -315,7 +315,7 @@ static void packet_empty_list(void) { struct packet_data *newpacket, *tmp; - list_for_each_entry_safe(newpacket, tmp, (&packet_data_head.list)->next, list) { + list_for_each_entry_safe(newpacket, tmp, &packet_data_head.list, list) { list_del(&newpacket->list); /* From f4b0fa38d5fefe9aed6ed831f3bd3538c168ee19 Mon Sep 17 00:00:00 2001 From: Stuart Hayes Date: Mon, 9 Jun 2025 13:46:58 -0500 Subject: [PATCH 049/242] platform/x86: dell_rbu: Stop overwriting data buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dell_rbu driver will use memset() to clear the data held by each packet when it is no longer needed (when the driver is unloaded, the packet size is changed, etc). The amount of memory that is cleared (before this patch) is the normal packet size. However, the last packet in the list may be smaller. Fix this to only clear the memory actually used by each packet, to prevent it from writing past the end of data buffer. Because the packet data buffers are allocated with __get_free_pages() (in page-sized increments), this bug could only result in a buffer being overwritten when a packet size larger than one page is used. The only user of the dell_rbu module should be the Dell BIOS update program, which uses a packet size of 4096, so no issues should be seen without the patch, it just blocks the possiblity. Fixes: 6c54c28e69f2 ("[PATCH] dell_rbu: new Dell BIOS update driver") Signed-off-by: Stuart Hayes Link: https://lore.kernel.org/r/20250609184659.7210-5-stuart.w.hayes@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/dell_rbu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/dell/dell_rbu.c b/drivers/platform/x86/dell/dell_rbu.c index 722979b19e0e..eb3534ff8dad 100644 --- a/drivers/platform/x86/dell/dell_rbu.c +++ b/drivers/platform/x86/dell/dell_rbu.c @@ -322,7 +322,7 @@ static void packet_empty_list(void) * zero out the RBU packet memory before freeing * to make sure there are no stale RBU packets left in memory */ - memset(newpacket->data, 0, rbu_data.packetsize); + memset(newpacket->data, 0, newpacket->length); set_memory_wb((unsigned long)newpacket->data, 1 << newpacket->ordernum); free_pages((unsigned long) newpacket->data, From ea7af94548159c57747aa8e86408a19192a1e1fe Mon Sep 17 00:00:00 2001 From: Stuart Hayes Date: Mon, 9 Jun 2025 13:46:59 -0500 Subject: [PATCH 050/242] platform/x86: dell_rbu: Bump version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bump the module version. Signed-off-by: Stuart Hayes Link: https://lore.kernel.org/r/20250609184659.7210-6-stuart.w.hayes@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/dell_rbu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/dell/dell_rbu.c b/drivers/platform/x86/dell/dell_rbu.c index eb3534ff8dad..9dd9f2cb074f 100644 --- a/drivers/platform/x86/dell/dell_rbu.c +++ b/drivers/platform/x86/dell/dell_rbu.c @@ -45,7 +45,7 @@ MODULE_AUTHOR("Abhay Salunke "); MODULE_DESCRIPTION("Driver for updating BIOS image on DELL systems"); MODULE_LICENSE("GPL"); -MODULE_VERSION("3.2"); +MODULE_VERSION("3.3"); #define BIOS_SCAN_LIMIT 0xffffffff #define MAX_IMAGE_LENGTH 16 From 3fbf25ecf8b7b524b4774b427657d30a24e696ef Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 9 Jun 2025 16:35:57 +0200 Subject: [PATCH 051/242] MAINTAINERS: .mailmap: Update Hans de Goede's email address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I'm moving all my kernel work over to using my kernel.org email address. Update .mailmap and MAINTAINER entries still using hdegoede@redhat.com. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20250609143558.42941-2-hansg@kernel.org Signed-off-by: Ilpo Järvinen --- .mailmap | 1 + MAINTAINERS | 72 ++++++++++++++++++++++++++--------------------------- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/.mailmap b/.mailmap index c8a21d72241f..645294560b19 100644 --- a/.mailmap +++ b/.mailmap @@ -282,6 +282,7 @@ Gustavo Padovan Gustavo Padovan Hamza Mahfooz Hanjun Guo +Hans de Goede Hans Verkuil Hans Verkuil Harry Yoo <42.hyeyoo@gmail.com> diff --git a/MAINTAINERS b/MAINTAINERS index a92290fffa16..e8f3dc93a569 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -207,7 +207,7 @@ X: arch/*/include/uapi/ X: include/uapi/ ABIT UGURU 1,2 HARDWARE MONITOR DRIVER -M: Hans de Goede +M: Hans de Goede L: linux-hwmon@vger.kernel.org S: Maintained F: drivers/hwmon/abituguru.c @@ -371,7 +371,7 @@ S: Maintained F: drivers/platform/x86/quickstart.c ACPI SERIAL MULTI INSTANTIATE DRIVER -M: Hans de Goede +M: Hans de Goede L: platform-driver-x86@vger.kernel.org S: Maintained F: drivers/platform/x86/serial-multi-instantiate.c @@ -3551,7 +3551,7 @@ F: arch/arm64/boot/Makefile F: scripts/make_fit.py ARM64 PLATFORM DRIVERS -M: Hans de Goede +M: Hans de Goede M: Ilpo Järvinen R: Bryan O'Donoghue L: platform-driver-x86@vger.kernel.org @@ -3712,7 +3712,7 @@ F: drivers/platform/x86/asus*.c F: drivers/platform/x86/eeepc*.c ASUS TF103C DOCK DRIVER -M: Hans de Goede +M: Hans de Goede L: platform-driver-x86@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git @@ -5613,14 +5613,14 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/peter.chen/usb.git F: drivers/usb/chipidea/ CHIPONE ICN8318 I2C TOUCHSCREEN DRIVER -M: Hans de Goede +M: Hans de Goede L: linux-input@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/input/touchscreen/chipone,icn8318.yaml F: drivers/input/touchscreen/chipone_icn8318.c CHIPONE ICN8505 I2C TOUCHSCREEN DRIVER -M: Hans de Goede +M: Hans de Goede L: linux-input@vger.kernel.org S: Maintained F: drivers/input/touchscreen/chipone_icn8505.c @@ -6917,7 +6917,7 @@ F: include/dt-bindings/pmu/exynos_ppmu.h F: include/linux/devfreq-event.h DEVICE RESOURCE MANAGEMENT HELPERS -M: Hans de Goede +M: Hans de Goede R: Matti Vaittinen S: Maintained F: include/linux/devm-helpers.h @@ -7516,7 +7516,7 @@ F: drivers/gpu/drm/gud/ F: include/drm/gud.h DRM DRIVER FOR GRAIN MEDIA GM12U320 PROJECTORS -M: Hans de Goede +M: Hans de Goede S: Maintained T: git https://gitlab.freedesktop.org/drm/misc/kernel.git F: drivers/gpu/drm/tiny/gm12u320.c @@ -7916,7 +7916,7 @@ F: drivers/gpu/drm/ci/xfails/vkms* F: drivers/gpu/drm/vkms/ DRM DRIVER FOR VIRTUALBOX VIRTUAL GPU -M: Hans de Goede +M: Hans de Goede L: dri-devel@lists.freedesktop.org S: Maintained T: git https://gitlab.freedesktop.org/drm/misc/kernel.git @@ -8317,7 +8317,7 @@ F: drivers/gpu/drm/panel/ F: include/drm/drm_panel.h DRM PRIVACY-SCREEN CLASS -M: Hans de Goede +M: Hans de Goede L: dri-devel@lists.freedesktop.org S: Maintained T: git https://gitlab.freedesktop.org/drm/misc/kernel.git @@ -10221,7 +10221,7 @@ S: Maintained F: Documentation/devicetree/bindings/connector/gocontroll,moduline-module-slot.yaml GOODIX TOUCHSCREEN -M: Hans de Goede +M: Hans de Goede L: linux-input@vger.kernel.org S: Maintained F: drivers/input/touchscreen/goodix* @@ -10260,7 +10260,7 @@ F: include/dt-bindings/clock/google,gs101.h K: [gG]oogle.?[tT]ensor GPD POCKET FAN DRIVER -M: Hans de Goede +M: Hans de Goede L: platform-driver-x86@vger.kernel.org S: Maintained F: drivers/platform/x86/gpd-pocket-fan.c @@ -11421,7 +11421,7 @@ F: drivers/i2c/busses/i2c-via.c F: drivers/i2c/busses/i2c-viapro.c I2C/SMBUS INTEL CHT WHISKEY COVE PMIC DRIVER -M: Hans de Goede +M: Hans de Goede L: linux-i2c@vger.kernel.org S: Maintained F: drivers/i2c/busses/i2c-cht-wc.c @@ -12011,13 +12011,13 @@ S: Supported F: sound/soc/intel/ INTEL ATOMISP2 DUMMY / POWER-MANAGEMENT DRIVER -M: Hans de Goede +M: Hans de Goede L: platform-driver-x86@vger.kernel.org S: Maintained F: drivers/platform/x86/intel/atomisp2/pm.c INTEL ATOMISP2 LED DRIVER -M: Hans de Goede +M: Hans de Goede L: platform-driver-x86@vger.kernel.org S: Maintained F: drivers/platform/x86/intel/atomisp2/led.c @@ -13678,7 +13678,7 @@ S: Maintained F: drivers/platform/x86/lenovo-wmi-hotkey-utilities.c LETSKETCH HID TABLET DRIVER -M: Hans de Goede +M: Hans de Goede L: linux-input@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git @@ -13728,7 +13728,7 @@ F: drivers/ata/sata_gemini.c F: drivers/ata/sata_gemini.h LIBATA SATA AHCI PLATFORM devices support -M: Hans de Goede +M: Hans de Goede L: linux-ide@vger.kernel.org S: Maintained F: drivers/ata/ahci_platform.c @@ -14098,7 +14098,7 @@ F: Documentation/admin-guide/ldm.rst F: block/partitions/ldm.* LOGITECH HID GAMING KEYBOARDS -M: Hans de Goede +M: Hans de Goede L: linux-input@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git @@ -14780,7 +14780,7 @@ F: Documentation/devicetree/bindings/power/supply/maxim,max17040.yaml F: drivers/power/supply/max17040_battery.c MAXIM MAX17042 FAMILY FUEL GAUGE DRIVERS -R: Hans de Goede +R: Hans de Goede R: Krzysztof Kozlowski R: Marek Szyprowski R: Sebastian Krzyszkowiak @@ -15582,7 +15582,7 @@ Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlxfw/ MELLANOX HARDWARE PLATFORM SUPPORT -M: Hans de Goede +M: Hans de Goede M: Ilpo Järvinen M: Vadim Pasternak L: platform-driver-x86@vger.kernel.org @@ -16538,7 +16538,7 @@ S: Maintained F: drivers/platform/surface/surface_gpe.c MICROSOFT SURFACE HARDWARE PLATFORM SUPPORT -M: Hans de Goede +M: Hans de Goede M: Ilpo Järvinen M: Maximilian Luz L: platform-driver-x86@vger.kernel.org @@ -17706,7 +17706,7 @@ F: tools/include/nolibc/ F: tools/testing/selftests/nolibc/ NOVATEK NVT-TS I2C TOUCHSCREEN DRIVER -M: Hans de Goede +M: Hans de Goede L: linux-input@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/input/touchscreen/novatek,nvt-ts.yaml @@ -22708,7 +22708,7 @@ K: fu[57]40 K: [^@]sifive SILEAD TOUCHSCREEN DRIVER -M: Hans de Goede +M: Hans de Goede L: linux-input@vger.kernel.org L: platform-driver-x86@vger.kernel.org S: Maintained @@ -22741,7 +22741,7 @@ F: Documentation/devicetree/bindings/i3c/silvaco,i3c-master.yaml F: drivers/i3c/master/svc-i3c-master.c SIMPLEFB FB DRIVER -M: Hans de Goede +M: Hans de Goede L: linux-fbdev@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/display/simple-framebuffer.yaml @@ -22870,7 +22870,7 @@ F: Documentation/hwmon/emc2103.rst F: drivers/hwmon/emc2103.c SMSC SCH5627 HARDWARE MONITOR DRIVER -M: Hans de Goede +M: Hans de Goede L: linux-hwmon@vger.kernel.org S: Supported F: Documentation/hwmon/sch5627.rst @@ -23525,7 +23525,7 @@ S: Supported F: Documentation/process/stable-kernel-rules.rst STAGING - ATOMISP DRIVER -M: Hans de Goede +M: Hans de Goede M: Mauro Carvalho Chehab R: Sakari Ailus L: linux-media@vger.kernel.org @@ -23822,7 +23822,7 @@ F: arch/m68k/sun3*/ F: drivers/net/ethernet/i825xx/sun3* SUN4I LOW RES ADC ATTACHED TABLET KEYS DRIVER -M: Hans de Goede +M: Hans de Goede L: linux-input@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/input/allwinner,sun4i-a10-lradc-keys.yaml @@ -25590,7 +25590,7 @@ F: Documentation/hid/hiddev.rst F: drivers/hid/usbhid/ USB INTEL XHCI ROLE MUX DRIVER -M: Hans de Goede +M: Hans de Goede L: linux-usb@vger.kernel.org S: Maintained F: drivers/usb/roles/intel-xhci-usb-role-switch.c @@ -25781,7 +25781,7 @@ F: Documentation/firmware-guide/acpi/intel-pmc-mux.rst F: drivers/usb/typec/mux/intel_pmc_mux.c USB TYPEC PI3USB30532 MUX DRIVER -M: Hans de Goede +M: Hans de Goede L: linux-usb@vger.kernel.org S: Maintained F: drivers/usb/typec/mux/pi3usb30532.c @@ -25810,7 +25810,7 @@ F: drivers/usb/host/uhci* USB VIDEO CLASS M: Laurent Pinchart -M: Hans de Goede +M: Hans de Goede L: linux-media@vger.kernel.org S: Maintained W: http://www.ideasonboard.org/uvc/ @@ -26341,7 +26341,7 @@ F: include/uapi/linux/virtio_snd.h F: sound/virtio/* VIRTUAL BOX GUEST DEVICE DRIVER -M: Hans de Goede +M: Hans de Goede M: Arnd Bergmann M: Greg Kroah-Hartman S: Maintained @@ -26350,7 +26350,7 @@ F: include/linux/vbox_utils.h F: include/uapi/linux/vbox*.h VIRTUAL BOX SHARED FOLDER VFS DRIVER -M: Hans de Goede +M: Hans de Goede L: linux-fsdevel@vger.kernel.org S: Maintained F: fs/vboxsf/* @@ -26604,7 +26604,7 @@ F: drivers/mmc/host/wbsd.* WACOM PROTOCOL 4 SERIAL TABLETS M: Julian Squires -M: Hans de Goede +M: Hans de Goede L: linux-input@vger.kernel.org S: Maintained F: drivers/input/tablet/wacom_serial4.c @@ -26771,7 +26771,7 @@ F: include/linux/wwan.h F: include/uapi/linux/wwan.h X-POWERS AXP288 PMIC DRIVERS -M: Hans de Goede +M: Hans de Goede S: Maintained F: drivers/acpi/pmic/intel_pmic_xpower.c N: axp288 @@ -26863,14 +26863,14 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/mm F: arch/x86/mm/ X86 PLATFORM ANDROID TABLETS DSDT FIXUP DRIVER -M: Hans de Goede +M: Hans de Goede L: platform-driver-x86@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git F: drivers/platform/x86/x86-android-tablets/ X86 PLATFORM DRIVERS -M: Hans de Goede +M: Hans de Goede M: Ilpo Järvinen L: platform-driver-x86@vger.kernel.org S: Maintained From c0317ad44f45b3c1f0ff46a4e28d14c7bccdedf4 Mon Sep 17 00:00:00 2001 From: Gabriel Dalimonte Date: Sun, 1 Jun 2025 12:45:36 -0400 Subject: [PATCH 052/242] drm/vc4: fix infinite EPROBE_DEFER loop `vc4_hdmi_audio_init` calls `devm_snd_dmaengine_pcm_register` which may return EPROBE_DEFER. Calling `drm_connector_hdmi_audio_init` adds a child device. The driver model docs[1] state that adding a child device prior to returning EPROBE_DEFER may result in an infinite loop. [1] https://www.kernel.org/doc/html/v6.14/driver-api/driver-model/driver.html Fixes: 9640f1437a88 ("drm/vc4: hdmi: switch to using generic HDMI Codec infrastructure") Signed-off-by: Gabriel Dalimonte Link: https://lore.kernel.org/r/20250601-vc4-audio-inf-probe-v2-1-9ad43c7b6147@gmail.com Signed-off-by: Maxime Ripard --- drivers/gpu/drm/vc4/vc4_hdmi.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c index a29a6ef266f9..163d092bd973 100644 --- a/drivers/gpu/drm/vc4/vc4_hdmi.c +++ b/drivers/gpu/drm/vc4/vc4_hdmi.c @@ -560,12 +560,6 @@ static int vc4_hdmi_connector_init(struct drm_device *dev, if (ret) return ret; - ret = drm_connector_hdmi_audio_init(connector, dev->dev, - &vc4_hdmi_audio_funcs, - 8, false, -1); - if (ret) - return ret; - drm_connector_helper_add(connector, &vc4_hdmi_connector_helper_funcs); /* @@ -2291,6 +2285,12 @@ static int vc4_hdmi_audio_init(struct vc4_hdmi *vc4_hdmi) return ret; } + ret = drm_connector_hdmi_audio_init(&vc4_hdmi->connector, dev, + &vc4_hdmi_audio_funcs, 8, false, + -1); + if (ret) + return ret; + dai_link->cpus = &vc4_hdmi->audio.cpu; dai_link->codecs = &vc4_hdmi->audio.codec; dai_link->platforms = &vc4_hdmi->audio.platform; From 4823a58093c6dfa20df62b5c18da613621b9716e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 10 Jun 2025 16:38:56 +0530 Subject: [PATCH 053/242] cpufreq: Convert `/// SAFETY` lines to `# Safety` sections Replace `/// SAFETY` comments in doc comments with proper `# Safety` sections, as per rustdoc conventions. Also mark the C FFI callbacks as `unsafe` to correctly reflect their safety requirements. Reported-by: Miguel Ojeda Closes: https://github.com/Rust-for-Linux/linux/issues/1169 Signed-off-by: Viresh Kumar --- rust/kernel/cpufreq.rs | 146 ++++++++++++++++++++++++++++++----------- 1 file changed, 109 insertions(+), 37 deletions(-) diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs index b0a9c6182aec..9b995f18aac6 100644 --- a/rust/kernel/cpufreq.rs +++ b/rust/kernel/cpufreq.rs @@ -1055,8 +1055,11 @@ pub fn new_foreign_owned(dev: &Device) -> Result { impl Registration { /// Driver's `init` callback. /// - /// SAFETY: Called from C. Inputs must be valid pointers. - extern "C" fn init_callback(ptr: *mut bindings::cpufreq_policy) -> kernel::ffi::c_int { + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. + unsafe extern "C" fn init_callback(ptr: *mut bindings::cpufreq_policy) -> kernel::ffi::c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1070,8 +1073,11 @@ extern "C" fn init_callback(ptr: *mut bindings::cpufreq_policy) -> kernel::ffi:: /// Driver's `exit` callback. /// - /// SAFETY: Called from C. Inputs must be valid pointers. - extern "C" fn exit_callback(ptr: *mut bindings::cpufreq_policy) { + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. + unsafe extern "C" fn exit_callback(ptr: *mut bindings::cpufreq_policy) { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. let policy = unsafe { Policy::from_raw_mut(ptr) }; @@ -1082,8 +1088,11 @@ extern "C" fn exit_callback(ptr: *mut bindings::cpufreq_policy) { /// Driver's `online` callback. /// - /// SAFETY: Called from C. Inputs must be valid pointers. - extern "C" fn online_callback(ptr: *mut bindings::cpufreq_policy) -> kernel::ffi::c_int { + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. + unsafe extern "C" fn online_callback(ptr: *mut bindings::cpufreq_policy) -> kernel::ffi::c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1094,8 +1103,13 @@ extern "C" fn online_callback(ptr: *mut bindings::cpufreq_policy) -> kernel::ffi /// Driver's `offline` callback. /// - /// SAFETY: Called from C. Inputs must be valid pointers. - extern "C" fn offline_callback(ptr: *mut bindings::cpufreq_policy) -> kernel::ffi::c_int { + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. + unsafe extern "C" fn offline_callback( + ptr: *mut bindings::cpufreq_policy, + ) -> kernel::ffi::c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1106,8 +1120,13 @@ extern "C" fn offline_callback(ptr: *mut bindings::cpufreq_policy) -> kernel::ff /// Driver's `suspend` callback. /// - /// SAFETY: Called from C. Inputs must be valid pointers. - extern "C" fn suspend_callback(ptr: *mut bindings::cpufreq_policy) -> kernel::ffi::c_int { + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. + unsafe extern "C" fn suspend_callback( + ptr: *mut bindings::cpufreq_policy, + ) -> kernel::ffi::c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1118,8 +1137,11 @@ extern "C" fn suspend_callback(ptr: *mut bindings::cpufreq_policy) -> kernel::ff /// Driver's `resume` callback. /// - /// SAFETY: Called from C. Inputs must be valid pointers. - extern "C" fn resume_callback(ptr: *mut bindings::cpufreq_policy) -> kernel::ffi::c_int { + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. + unsafe extern "C" fn resume_callback(ptr: *mut bindings::cpufreq_policy) -> kernel::ffi::c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1130,8 +1152,11 @@ extern "C" fn resume_callback(ptr: *mut bindings::cpufreq_policy) -> kernel::ffi /// Driver's `ready` callback. /// - /// SAFETY: Called from C. Inputs must be valid pointers. - extern "C" fn ready_callback(ptr: *mut bindings::cpufreq_policy) { + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. + unsafe extern "C" fn ready_callback(ptr: *mut bindings::cpufreq_policy) { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. let policy = unsafe { Policy::from_raw_mut(ptr) }; @@ -1140,8 +1165,13 @@ extern "C" fn ready_callback(ptr: *mut bindings::cpufreq_policy) { /// Driver's `verify` callback. /// - /// SAFETY: Called from C. Inputs must be valid pointers. - extern "C" fn verify_callback(ptr: *mut bindings::cpufreq_policy_data) -> kernel::ffi::c_int { + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. + unsafe extern "C" fn verify_callback( + ptr: *mut bindings::cpufreq_policy_data, + ) -> kernel::ffi::c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1152,8 +1182,13 @@ extern "C" fn verify_callback(ptr: *mut bindings::cpufreq_policy_data) -> kernel /// Driver's `setpolicy` callback. /// - /// SAFETY: Called from C. Inputs must be valid pointers. - extern "C" fn setpolicy_callback(ptr: *mut bindings::cpufreq_policy) -> kernel::ffi::c_int { + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. + unsafe extern "C" fn setpolicy_callback( + ptr: *mut bindings::cpufreq_policy, + ) -> kernel::ffi::c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1164,8 +1199,11 @@ extern "C" fn setpolicy_callback(ptr: *mut bindings::cpufreq_policy) -> kernel:: /// Driver's `target` callback. /// - /// SAFETY: Called from C. Inputs must be valid pointers. - extern "C" fn target_callback( + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. + unsafe extern "C" fn target_callback( ptr: *mut bindings::cpufreq_policy, target_freq: u32, relation: u32, @@ -1180,8 +1218,11 @@ extern "C" fn target_callback( /// Driver's `target_index` callback. /// - /// SAFETY: Called from C. Inputs must be valid pointers. - extern "C" fn target_index_callback( + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. + unsafe extern "C" fn target_index_callback( ptr: *mut bindings::cpufreq_policy, index: u32, ) -> kernel::ffi::c_int { @@ -1200,8 +1241,11 @@ extern "C" fn target_index_callback( /// Driver's `fast_switch` callback. /// - /// SAFETY: Called from C. Inputs must be valid pointers. - extern "C" fn fast_switch_callback( + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. + unsafe extern "C" fn fast_switch_callback( ptr: *mut bindings::cpufreq_policy, target_freq: u32, ) -> kernel::ffi::c_uint { @@ -1212,7 +1256,11 @@ extern "C" fn fast_switch_callback( } /// Driver's `adjust_perf` callback. - extern "C" fn adjust_perf_callback( + /// + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + unsafe extern "C" fn adjust_perf_callback( cpu: u32, min_perf: usize, target_perf: usize, @@ -1225,8 +1273,11 @@ extern "C" fn adjust_perf_callback( /// Driver's `get_intermediate` callback. /// - /// SAFETY: Called from C. Inputs must be valid pointers. - extern "C" fn get_intermediate_callback( + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. + unsafe extern "C" fn get_intermediate_callback( ptr: *mut bindings::cpufreq_policy, index: u32, ) -> kernel::ffi::c_uint { @@ -1243,8 +1294,11 @@ extern "C" fn get_intermediate_callback( /// Driver's `target_intermediate` callback. /// - /// SAFETY: Called from C. Inputs must be valid pointers. - extern "C" fn target_intermediate_callback( + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. + unsafe extern "C" fn target_intermediate_callback( ptr: *mut bindings::cpufreq_policy, index: u32, ) -> kernel::ffi::c_int { @@ -1262,12 +1316,21 @@ extern "C" fn target_intermediate_callback( } /// Driver's `get` callback. - extern "C" fn get_callback(cpu: u32) -> kernel::ffi::c_uint { + /// + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + unsafe extern "C" fn get_callback(cpu: u32) -> kernel::ffi::c_uint { PolicyCpu::from_cpu(cpu).map_or(0, |mut policy| T::get(&mut policy).map_or(0, |f| f)) } /// Driver's `update_limit` callback. - extern "C" fn update_limits_callback(ptr: *mut bindings::cpufreq_policy) { + /// + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. + unsafe extern "C" fn update_limits_callback(ptr: *mut bindings::cpufreq_policy) { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. let policy = unsafe { Policy::from_raw_mut(ptr) }; @@ -1276,8 +1339,11 @@ extern "C" fn update_limits_callback(ptr: *mut bindings::cpufreq_policy) { /// Driver's `bios_limit` callback. /// - /// SAFETY: Called from C. Inputs must be valid pointers. - extern "C" fn bios_limit_callback(cpu: i32, limit: *mut u32) -> kernel::ffi::c_int { + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. + unsafe extern "C" fn bios_limit_callback(cpu: i32, limit: *mut u32) -> kernel::ffi::c_int { from_result(|| { let mut policy = PolicyCpu::from_cpu(cpu as u32)?; @@ -1288,8 +1354,11 @@ extern "C" fn bios_limit_callback(cpu: i32, limit: *mut u32) -> kernel::ffi::c_i /// Driver's `set_boost` callback. /// - /// SAFETY: Called from C. Inputs must be valid pointers. - extern "C" fn set_boost_callback( + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. + unsafe extern "C" fn set_boost_callback( ptr: *mut bindings::cpufreq_policy, state: i32, ) -> kernel::ffi::c_int { @@ -1303,8 +1372,11 @@ extern "C" fn set_boost_callback( /// Driver's `register_em` callback. /// - /// SAFETY: Called from C. Inputs must be valid pointers. - extern "C" fn register_em_callback(ptr: *mut bindings::cpufreq_policy) { + /// # Safety + /// + /// - This function may only be called from the cpufreq C infrastructure. + /// - The pointer arguments must be valid pointers. + unsafe extern "C" fn register_em_callback(ptr: *mut bindings::cpufreq_policy) { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. let policy = unsafe { Policy::from_raw_mut(ptr) }; From d29fc02caad7f94b62d56ee1b01c954f9c961ba7 Mon Sep 17 00:00:00 2001 From: Tasos Sahanidis Date: Mon, 19 May 2025 11:49:45 +0300 Subject: [PATCH 054/242] ata: pata_via: Force PIO for ATAPI devices on VT6415/VT6330 The controller has a hardware bug that can hard hang the system when doing ATAPI DMAs without any trace of what happened. Depending on the device attached, it can also prevent the system from booting. In this case, the system hangs when reading the ATIP from optical media with cdrecord -vvv -atip on an _NEC DVD_RW ND-4571A 1-01 and an Optiarc DVD RW AD-7200A 1.06 attached to an ASRock 990FX Extreme 4, running at UDMA/33. The issue can be reproduced by running the same command with a cygwin build of cdrecord on WinXP, although it requires more attempts to cause it. The hang in that case is also resolved by forcing PIO. It doesn't appear that VIA has produced any drivers for that OS, thus no known workaround exists. HDDs attached to the controller do not suffer from any DMA issues. Cc: stable@vger.kernel.org Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/916677 Signed-off-by: Tasos Sahanidis Link: https://lore.kernel.org/r/20250519085508.1398701-1-tasos@tasossah.com Signed-off-by: Niklas Cassel --- drivers/ata/pata_via.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c index 696b99720dcb..d82728a01832 100644 --- a/drivers/ata/pata_via.c +++ b/drivers/ata/pata_via.c @@ -368,7 +368,8 @@ static unsigned int via_mode_filter(struct ata_device *dev, unsigned int mask) } if (dev->class == ATA_DEV_ATAPI && - dmi_check_system(no_atapi_dma_dmi_table)) { + (dmi_check_system(no_atapi_dma_dmi_table) || + config->id == PCI_DEVICE_ID_VIA_6415)) { ata_dev_warn(dev, "controller locks up on ATAPI DMA, forcing PIO\n"); mask &= ATA_MASK_PIO; } From 33877220b8641b4cde474a4229ea92c0e3637883 Mon Sep 17 00:00:00 2001 From: Tasos Sahanidis Date: Mon, 19 May 2025 11:56:55 +0300 Subject: [PATCH 055/242] ata: libata-acpi: Do not assume 40 wire cable if no devices are enabled On at least an ASRock 990FX Extreme 4 with a VIA VT6330, the devices have not yet been enabled by the first time ata_acpi_cbl_80wire() is called. This means that the ata_for_each_dev loop is never entered, and a 40 wire cable is assumed. The VIA controller on this board does not report the cable in the PCI config space, thus having to fall back to ACPI even though no SATA bridge is present. The _GTM values are correctly reported by the firmware through ACPI, which has already set up faster transfer modes, but due to the above the controller is forced down to a maximum of UDMA/33. Resolve this by modifying ata_acpi_cbl_80wire() to directly return the cable type. First, an unknown cable is assumed which preserves the mode set by the firmware, and then on subsequent calls when the devices have been enabled, an 80 wire cable is correctly detected. Since the function now directly returns the cable type, it is renamed to ata_acpi_cbl_pata_type(). Signed-off-by: Tasos Sahanidis Link: https://lore.kernel.org/r/20250519085945.1399466-1-tasos@tasossah.com Signed-off-by: Niklas Cassel --- drivers/ata/libata-acpi.c | 24 ++++++++++++++++-------- drivers/ata/pata_via.c | 6 ++---- include/linux/libata.h | 7 +++---- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c index b7f0bf795521..f2140fc06ba0 100644 --- a/drivers/ata/libata-acpi.c +++ b/drivers/ata/libata-acpi.c @@ -514,15 +514,19 @@ unsigned int ata_acpi_gtm_xfermask(struct ata_device *dev, EXPORT_SYMBOL_GPL(ata_acpi_gtm_xfermask); /** - * ata_acpi_cbl_80wire - Check for 80 wire cable + * ata_acpi_cbl_pata_type - Return PATA cable type * @ap: Port to check - * @gtm: GTM data to use * - * Return 1 if the @gtm indicates the BIOS selected an 80wire mode. + * Return ATA_CBL_PATA* according to the transfer mode selected by BIOS */ -int ata_acpi_cbl_80wire(struct ata_port *ap, const struct ata_acpi_gtm *gtm) +int ata_acpi_cbl_pata_type(struct ata_port *ap) { struct ata_device *dev; + int ret = ATA_CBL_PATA_UNK; + const struct ata_acpi_gtm *gtm = ata_acpi_init_gtm(ap); + + if (!gtm) + return ATA_CBL_PATA40; ata_for_each_dev(dev, &ap->link, ENABLED) { unsigned int xfer_mask, udma_mask; @@ -530,13 +534,17 @@ int ata_acpi_cbl_80wire(struct ata_port *ap, const struct ata_acpi_gtm *gtm) xfer_mask = ata_acpi_gtm_xfermask(dev, gtm); ata_unpack_xfermask(xfer_mask, NULL, NULL, &udma_mask); - if (udma_mask & ~ATA_UDMA_MASK_40C) - return 1; + ret = ATA_CBL_PATA40; + + if (udma_mask & ~ATA_UDMA_MASK_40C) { + ret = ATA_CBL_PATA80; + break; + } } - return 0; + return ret; } -EXPORT_SYMBOL_GPL(ata_acpi_cbl_80wire); +EXPORT_SYMBOL_GPL(ata_acpi_cbl_pata_type); static void ata_acpi_gtf_to_tf(struct ata_device *dev, const struct ata_acpi_gtf *gtf, diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c index d82728a01832..bb80e7800dcb 100644 --- a/drivers/ata/pata_via.c +++ b/drivers/ata/pata_via.c @@ -201,11 +201,9 @@ static int via_cable_detect(struct ata_port *ap) { two drives */ if (ata66 & (0x10100000 >> (16 * ap->port_no))) return ATA_CBL_PATA80; + /* Check with ACPI so we can spot BIOS reported SATA bridges */ - if (ata_acpi_init_gtm(ap) && - ata_acpi_cbl_80wire(ap, ata_acpi_init_gtm(ap))) - return ATA_CBL_PATA80; - return ATA_CBL_PATA40; + return ata_acpi_cbl_pata_type(ap); } static int via_pre_reset(struct ata_link *link, unsigned long deadline) diff --git a/include/linux/libata.h b/include/linux/libata.h index 31be45fd47a6..1e5aec839041 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1352,7 +1352,7 @@ int ata_acpi_stm(struct ata_port *ap, const struct ata_acpi_gtm *stm); int ata_acpi_gtm(struct ata_port *ap, struct ata_acpi_gtm *stm); unsigned int ata_acpi_gtm_xfermask(struct ata_device *dev, const struct ata_acpi_gtm *gtm); -int ata_acpi_cbl_80wire(struct ata_port *ap, const struct ata_acpi_gtm *gtm); +int ata_acpi_cbl_pata_type(struct ata_port *ap); #else static inline const struct ata_acpi_gtm *ata_acpi_init_gtm(struct ata_port *ap) { @@ -1377,10 +1377,9 @@ static inline unsigned int ata_acpi_gtm_xfermask(struct ata_device *dev, return 0; } -static inline int ata_acpi_cbl_80wire(struct ata_port *ap, - const struct ata_acpi_gtm *gtm) +static inline int ata_acpi_cbl_pata_type(struct ata_port *ap) { - return 0; + return ATA_CBL_PATA40; } #endif From d17e61ab63fb7747b340d6a66bf1408cd5c6562b Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Fri, 6 Jun 2025 22:37:29 +0200 Subject: [PATCH 056/242] drm/meson: fix debug log statement when setting the HDMI clocks The "phy" and "vclk" frequency labels were swapped, making it more difficult to debug driver errors. Swap the label order to make them match with the actual frequencies printed to correct this. Fixes: e5fab2ec9ca4 ("drm/meson: vclk: add support for YUV420 setup") Signed-off-by: Martin Blumenstingl Reviewed-by: Neil Armstrong Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20250606203729.3311592-1-martin.blumenstingl@googlemail.com --- drivers/gpu/drm/meson/meson_encoder_hdmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/meson/meson_encoder_hdmi.c b/drivers/gpu/drm/meson/meson_encoder_hdmi.c index 47136bbbe8c6..ab08d690d882 100644 --- a/drivers/gpu/drm/meson/meson_encoder_hdmi.c +++ b/drivers/gpu/drm/meson/meson_encoder_hdmi.c @@ -109,7 +109,7 @@ static void meson_encoder_hdmi_set_vclk(struct meson_encoder_hdmi *encoder_hdmi, venc_freq /= 2; dev_dbg(priv->dev, - "vclk:%lluHz phy=%lluHz venc=%lluHz hdmi=%lluHz enci=%d\n", + "phy:%lluHz vclk=%lluHz venc=%lluHz hdmi=%lluHz enci=%d\n", phy_freq, vclk_freq, venc_freq, hdmi_freq, priv->venc.hdmi_use_enci); From faf2f8382088e8c74bd6eeb236c8c9190e61615e Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sat, 7 Jun 2025 00:10:31 +0200 Subject: [PATCH 057/242] drm/meson: use vclk_freq instead of pixel_freq in debug print meson_vclk_vic_supported_freq() has a debug print which includes the pixel freq. However, within the whole function the pixel freq is irrelevant, other than checking the end of the params array. Switch to printing the vclk_freq which is being compared / matched against the inputs to the function to avoid confusion when analyzing error reports from users. Fixes: e5fab2ec9ca4 ("drm/meson: vclk: add support for YUV420 setup") Signed-off-by: Martin Blumenstingl Reviewed-by: Neil Armstrong Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20250606221031.3419353-1-martin.blumenstingl@googlemail.com --- drivers/gpu/drm/meson/meson_vclk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/meson/meson_vclk.c b/drivers/gpu/drm/meson/meson_vclk.c index 3325580d885d..c4123bb958e4 100644 --- a/drivers/gpu/drm/meson/meson_vclk.c +++ b/drivers/gpu/drm/meson/meson_vclk.c @@ -790,9 +790,9 @@ meson_vclk_vic_supported_freq(struct meson_drm *priv, } for (i = 0 ; params[i].pixel_freq ; ++i) { - DRM_DEBUG_DRIVER("i = %d pixel_freq = %lluHz alt = %lluHz\n", - i, params[i].pixel_freq, - PIXEL_FREQ_1000_1001(params[i].pixel_freq)); + DRM_DEBUG_DRIVER("i = %d vclk_freq = %lluHz alt = %lluHz\n", + i, params[i].vclk_freq, + PIXEL_FREQ_1000_1001(params[i].vclk_freq)); DRM_DEBUG_DRIVER("i = %d phy_freq = %lluHz alt = %lluHz\n", i, params[i].phy_freq, PHY_FREQ_1000_1001(params[i].phy_freq)); From 0cee6c4d3518b2e757aedae78771f17149f57653 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Mon, 9 Jun 2025 22:27:51 +0200 Subject: [PATCH 058/242] drm/meson: fix more rounding issues with 59.94Hz modes Commit 1017560164b6 ("drm/meson: use unsigned long long / Hz for frequency types") attempts to resolve video playback using 59.94Hz. using YUV420 by changing the clock calculation to use Hz instead of kHz (thus yielding more precision). The basic calculation itself is correct, however the comparisions in meson_vclk_vic_supported_freq() and meson_vclk_setup() don't work anymore for 59.94Hz modes (using the freq * 1000 / 1001 logic). For example, drm/edid specifies a 593407kHz clock for 3840x2160@59.94Hz. With the mentioend commit we convert this to Hz. Then meson_vclk tries to find a matchig "params" entry (as the clock setup code currently only supports specific frequencies) by taking the venc_freq from the params and calculating the "alt frequency" (used for the 59.94Hz modes) from it, which is: (594000000Hz * 1000) / 1001 = 593406593Hz Similar calculation is applied to the phy_freq (TMDS clock), which is 10 times the pixel clock. Implement a new meson_vclk_freqs_are_matching_param() function whose purpose is to compare if the requested and calculated frequencies. They may not match exactly (for the reasons mentioned above). Allow the clocks to deviate slightly to make the 59.94Hz modes again. Fixes: 1017560164b6 ("drm/meson: use unsigned long long / Hz for frequency types") Reported-by: Christian Hewitt Signed-off-by: Martin Blumenstingl Reviewed-by: Neil Armstrong Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20250609202751.962208-1-martin.blumenstingl@googlemail.com --- drivers/gpu/drm/meson/meson_vclk.c | 55 ++++++++++++++++++------------ 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/meson/meson_vclk.c b/drivers/gpu/drm/meson/meson_vclk.c index c4123bb958e4..dfe0c28a0f05 100644 --- a/drivers/gpu/drm/meson/meson_vclk.c +++ b/drivers/gpu/drm/meson/meson_vclk.c @@ -110,10 +110,7 @@ #define HDMI_PLL_LOCK BIT(31) #define HDMI_PLL_LOCK_G12A (3 << 30) -#define PIXEL_FREQ_1000_1001(_freq) \ - DIV_ROUND_CLOSEST_ULL((_freq) * 1000ULL, 1001ULL) -#define PHY_FREQ_1000_1001(_freq) \ - (PIXEL_FREQ_1000_1001(DIV_ROUND_DOWN_ULL(_freq, 10ULL)) * 10) +#define FREQ_1000_1001(_freq) DIV_ROUND_CLOSEST_ULL((_freq) * 1000ULL, 1001ULL) /* VID PLL Dividers */ enum { @@ -772,6 +769,36 @@ static void meson_hdmi_pll_generic_set(struct meson_drm *priv, pll_freq); } +static bool meson_vclk_freqs_are_matching_param(unsigned int idx, + unsigned long long phy_freq, + unsigned long long vclk_freq) +{ + DRM_DEBUG_DRIVER("i = %d vclk_freq = %lluHz alt = %lluHz\n", + idx, params[idx].vclk_freq, + FREQ_1000_1001(params[idx].vclk_freq)); + DRM_DEBUG_DRIVER("i = %d phy_freq = %lluHz alt = %lluHz\n", + idx, params[idx].phy_freq, + FREQ_1000_1001(params[idx].phy_freq)); + + /* Match strict frequency */ + if (phy_freq == params[idx].phy_freq && + vclk_freq == params[idx].vclk_freq) + return true; + + /* Match 1000/1001 variant: vclk deviation has to be less than 1kHz + * (drm EDID is defined in 1kHz steps, so everything smaller must be + * rounding error) and the PHY freq deviation has to be less than + * 10kHz (as the TMDS clock is 10 times the pixel clock, so anything + * smaller must be rounding error as well). + */ + if (abs(vclk_freq - FREQ_1000_1001(params[idx].vclk_freq)) < 1000 && + abs(phy_freq - FREQ_1000_1001(params[idx].phy_freq)) < 10000) + return true; + + /* no match */ + return false; +} + enum drm_mode_status meson_vclk_vic_supported_freq(struct meson_drm *priv, unsigned long long phy_freq, @@ -790,19 +817,7 @@ meson_vclk_vic_supported_freq(struct meson_drm *priv, } for (i = 0 ; params[i].pixel_freq ; ++i) { - DRM_DEBUG_DRIVER("i = %d vclk_freq = %lluHz alt = %lluHz\n", - i, params[i].vclk_freq, - PIXEL_FREQ_1000_1001(params[i].vclk_freq)); - DRM_DEBUG_DRIVER("i = %d phy_freq = %lluHz alt = %lluHz\n", - i, params[i].phy_freq, - PHY_FREQ_1000_1001(params[i].phy_freq)); - /* Match strict frequency */ - if (phy_freq == params[i].phy_freq && - vclk_freq == params[i].vclk_freq) - return MODE_OK; - /* Match 1000/1001 variant */ - if (phy_freq == PHY_FREQ_1000_1001(params[i].phy_freq) && - vclk_freq == PIXEL_FREQ_1000_1001(params[i].vclk_freq)) + if (meson_vclk_freqs_are_matching_param(i, phy_freq, vclk_freq)) return MODE_OK; } @@ -1075,10 +1090,8 @@ void meson_vclk_setup(struct meson_drm *priv, unsigned int target, } for (freq = 0 ; params[freq].pixel_freq ; ++freq) { - if ((phy_freq == params[freq].phy_freq || - phy_freq == PHY_FREQ_1000_1001(params[freq].phy_freq)) && - (vclk_freq == params[freq].vclk_freq || - vclk_freq == PIXEL_FREQ_1000_1001(params[freq].vclk_freq))) { + if (meson_vclk_freqs_are_matching_param(freq, phy_freq, + vclk_freq)) { if (vclk_freq != params[freq].vclk_freq) vic_alternate_clock = true; else From fe5b391fc56f77cf3c22a9dd4f0ce20db0e3533f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 6 Jun 2025 11:01:11 +0200 Subject: [PATCH 059/242] ata: pata_cs5536: fix build on 32-bit UML On 32-bit ARCH=um, CONFIG_X86_32 is still defined, so it doesn't indicate building on real X86 machines. There's no MSR on UML though, so add a check for CONFIG_X86. Reported-by: Arnd Bergmann Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20250606090110.15784-2-johannes@sipsolutions.net Signed-off-by: Niklas Cassel --- drivers/ata/pata_cs5536.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/pata_cs5536.c b/drivers/ata/pata_cs5536.c index b811efd2cc34..73e81e160c91 100644 --- a/drivers/ata/pata_cs5536.c +++ b/drivers/ata/pata_cs5536.c @@ -27,7 +27,7 @@ #include #include -#ifdef CONFIG_X86_32 +#if defined(CONFIG_X86) && defined(CONFIG_X86_32) #include static int use_msr; module_param_named(msr, use_msr, int, 0644); From 8a157d8a00e815cab4432653cb50c9cedbbb4931 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Jun 2025 09:33:48 -0400 Subject: [PATCH 060/242] tracing: Do not free "head" on error path of filter_free_subsystem_filters() The variable "head" is allocated and initialized as a list before allocating the first "item" for the list. If the allocation of "item" fails, it frees "head" and then jumps to the label "free_now" which will process head and free it. This will cause a UAF of "head", and it doesn't need to free it before jumping to the "free_now" label as that code will free it. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250610093348.33c5643a@gandalf.local.home Fixes: a9d0aab5eb33 ("tracing: Fix regression of filter waiting a long time on RCU synchronization") Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202506070424.lCiNreTI-lkp@intel.com/ Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_filter.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index ea8b364b6818..08141f105c95 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1437,10 +1437,8 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, INIT_LIST_HEAD(&head->list); item = kmalloc(sizeof(*item), GFP_KERNEL); - if (!item) { - kfree(head); + if (!item) goto free_now; - } item->filter = filter; list_add_tail(&item->list, &head->list); From ac0b8b327a5677dc6fecdf353d808161525b1ff0 Mon Sep 17 00:00:00 2001 From: Penglei Jiang Date: Tue, 10 Jun 2025 10:18:01 -0700 Subject: [PATCH 061/242] io_uring: fix use-after-free of sq->thread in __io_uring_show_fdinfo() syzbot reports: BUG: KASAN: slab-use-after-free in getrusage+0x1109/0x1a60 Read of size 8 at addr ffff88810de2d2c8 by task a.out/304 CPU: 0 UID: 0 PID: 304 Comm: a.out Not tainted 6.16.0-rc1 #1 PREEMPT(voluntary) Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 Call Trace: dump_stack_lvl+0x53/0x70 print_report+0xd0/0x670 ? __pfx__raw_spin_lock_irqsave+0x10/0x10 ? getrusage+0x1109/0x1a60 kasan_report+0xce/0x100 ? getrusage+0x1109/0x1a60 getrusage+0x1109/0x1a60 ? __pfx_getrusage+0x10/0x10 __io_uring_show_fdinfo+0x9fe/0x1790 ? ksys_read+0xf7/0x1c0 ? do_syscall_64+0xa4/0x260 ? vsnprintf+0x591/0x1100 ? __pfx___io_uring_show_fdinfo+0x10/0x10 ? __pfx_vsnprintf+0x10/0x10 ? mutex_trylock+0xcf/0x130 ? __pfx_mutex_trylock+0x10/0x10 ? __pfx_show_fd_locks+0x10/0x10 ? io_uring_show_fdinfo+0x57/0x80 io_uring_show_fdinfo+0x57/0x80 seq_show+0x38c/0x690 seq_read_iter+0x3f7/0x1180 ? inode_set_ctime_current+0x160/0x4b0 seq_read+0x271/0x3e0 ? __pfx_seq_read+0x10/0x10 ? __pfx__raw_spin_lock+0x10/0x10 ? __mark_inode_dirty+0x402/0x810 ? selinux_file_permission+0x368/0x500 ? file_update_time+0x10f/0x160 vfs_read+0x177/0xa40 ? __pfx___handle_mm_fault+0x10/0x10 ? __pfx_vfs_read+0x10/0x10 ? mutex_lock+0x81/0xe0 ? __pfx_mutex_lock+0x10/0x10 ? fdget_pos+0x24d/0x4b0 ksys_read+0xf7/0x1c0 ? __pfx_ksys_read+0x10/0x10 ? do_user_addr_fault+0x43b/0x9c0 do_syscall_64+0xa4/0x260 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f0f74170fc9 Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 8 RSP: 002b:00007fffece049e8 EFLAGS: 00000206 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f0f74170fc9 RDX: 0000000000001000 RSI: 00007fffece049f0 RDI: 0000000000000004 RBP: 00007fffece05ad0 R08: 0000000000000000 R09: 00007fffece04d90 R10: 0000000000000000 R11: 0000000000000206 R12: 00005651720a1100 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Allocated by task 298: kasan_save_stack+0x33/0x60 kasan_save_track+0x14/0x30 __kasan_slab_alloc+0x6e/0x70 kmem_cache_alloc_node_noprof+0xe8/0x330 copy_process+0x376/0x5e00 create_io_thread+0xab/0xf0 io_sq_offload_create+0x9ed/0xf20 io_uring_setup+0x12b0/0x1cc0 do_syscall_64+0xa4/0x260 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 22: kasan_save_stack+0x33/0x60 kasan_save_track+0x14/0x30 kasan_save_free_info+0x3b/0x60 __kasan_slab_free+0x37/0x50 kmem_cache_free+0xc4/0x360 rcu_core+0x5ff/0x19f0 handle_softirqs+0x18c/0x530 run_ksoftirqd+0x20/0x30 smpboot_thread_fn+0x287/0x6c0 kthread+0x30d/0x630 ret_from_fork+0xef/0x1a0 ret_from_fork_asm+0x1a/0x30 Last potentially related work creation: kasan_save_stack+0x33/0x60 kasan_record_aux_stack+0x8c/0xa0 __call_rcu_common.constprop.0+0x68/0x940 __schedule+0xff2/0x2930 __cond_resched+0x4c/0x80 mutex_lock+0x5c/0xe0 io_uring_del_tctx_node+0xe1/0x2b0 io_uring_clean_tctx+0xb7/0x160 io_uring_cancel_generic+0x34e/0x760 do_exit+0x240/0x2350 do_group_exit+0xab/0x220 __x64_sys_exit_group+0x39/0x40 x64_sys_call+0x1243/0x1840 do_syscall_64+0xa4/0x260 entry_SYSCALL_64_after_hwframe+0x77/0x7f The buggy address belongs to the object at ffff88810de2cb00 which belongs to the cache task_struct of size 3712 The buggy address is located 1992 bytes inside of freed 3712-byte region [ffff88810de2cb00, ffff88810de2d980) which is caused by the task_struct pointed to by sq->thread being released while it is being used in the function __io_uring_show_fdinfo(). Holding ctx->uring_lock does not prevent ehre relase or exit of sq->thread. Fix this by assigning and looking up ->thread under RCU, and grabbing a reference to the task_struct. This ensures that it cannot get released while fdinfo is using it. Reported-by: syzbot+531502bbbe51d2f769f4@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/682b06a5.a70a0220.3849cf.00b3.GAE@google.com Fixes: 3fcb9d17206e ("io_uring/sqpoll: statistics of the true utilization of sq threads") Signed-off-by: Penglei Jiang Link: https://lore.kernel.org/r/20250610171801.70960-1-superman.xpt@gmail.com [axboe: massage commit message] Signed-off-by: Jens Axboe --- io_uring/fdinfo.c | 12 ++++++++++-- io_uring/sqpoll.c | 9 ++++----- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index e9355276ab5d..9798d6fb4ec7 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -141,18 +141,26 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) if (ctx->flags & IORING_SETUP_SQPOLL) { struct io_sq_data *sq = ctx->sq_data; + struct task_struct *tsk; + rcu_read_lock(); + tsk = rcu_dereference(sq->thread); /* * sq->thread might be NULL if we raced with the sqpoll * thread termination. */ - if (sq->thread) { + if (tsk) { + get_task_struct(tsk); + rcu_read_unlock(); + getrusage(tsk, RUSAGE_SELF, &sq_usage); + put_task_struct(tsk); sq_pid = sq->task_pid; sq_cpu = sq->sq_cpu; - getrusage(sq->thread, RUSAGE_SELF, &sq_usage); sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000 + sq_usage.ru_stime.tv_usec); sq_work_time = sq->work_time; + } else { + rcu_read_unlock(); } } diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 03c699493b5a..0625a421626f 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -270,7 +270,8 @@ static int io_sq_thread(void *data) /* offload context creation failed, just exit */ if (!current->io_uring) { mutex_lock(&sqd->lock); - sqd->thread = NULL; + rcu_assign_pointer(sqd->thread, NULL); + put_task_struct(current); mutex_unlock(&sqd->lock); goto err_out; } @@ -379,7 +380,8 @@ static int io_sq_thread(void *data) io_sq_tw(&retry_list, UINT_MAX); io_uring_cancel_generic(true, sqd); - sqd->thread = NULL; + rcu_assign_pointer(sqd->thread, NULL); + put_task_struct(current); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); io_run_task_work(); @@ -495,9 +497,6 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, ret = -EINVAL; goto err; } - - if (task_to_put) - put_task_struct(task_to_put); return 0; err_sqpoll: complete(&ctx->sq_data->exited); From c393befa14ab26596fb86d702566d648832dae06 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 6 Jun 2025 20:32:26 -0700 Subject: [PATCH 062/242] driver core: faux: Suppress bind attributes faux_device_create() is almost a suitable candidate to replace platform_driver_probe() if not for the fact that faux_device_create() supports dynamic attach/detach of the driver. Drop the bind attributes with the expectation that simple faux devices can always assume that the device is permanently bound at create, and only unbound at 'destroy'. The acpi-einj driver depends on static bind. Fixes: 6cb9441bfe8d ("ACPI: APEI: EINJ: Transition to the faux device interface") Signed-off-by: Dan Williams Reviewed-by: Jonathan Cameron Acked-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20250607033228.1475625-2-dan.j.williams@intel.com Signed-off-by: Rafael J. Wysocki --- drivers/base/faux.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/base/faux.c b/drivers/base/faux.c index 9054d346bd7f..934da77ca48b 100644 --- a/drivers/base/faux.c +++ b/drivers/base/faux.c @@ -86,6 +86,7 @@ static struct device_driver faux_driver = { .name = "faux_driver", .bus = &faux_bus_type, .probe_type = PROBE_FORCE_SYNCHRONOUS, + .suppress_bind_attrs = true, }; static void faux_device_release(struct device *dev) From ff53a6e247285687df1a71d8ee5c457939792c13 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 6 Jun 2025 20:32:27 -0700 Subject: [PATCH 063/242] driver core: faux: Quiet probe failures The acpi-einj conversion to faux_device_create() leads to a noisy error message when the error injection facility is disabled. Quiet the error as CXL error injection via ACPI expects the module to stay loaded even if the error injection facility is disabled. This situation arose because CXL knows proper kernel named objects to trigger errors against, but acpi-einj knows how to perform the error injection. The injection mechanism is shared with non-CXL use cases. The result is CXL now has a module dependency on einj-core.ko, and init/probe failures are handled at runtime. Fixes: 6cb9441bfe8d ("ACPI: APEI: EINJ: Transition to the faux device interface") Signed-off-by: Dan Williams Reviewed-by: Jonathan Cameron Acked-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20250607033228.1475625-3-dan.j.williams@intel.com Signed-off-by: Rafael J. Wysocki --- drivers/base/faux.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/faux.c b/drivers/base/faux.c index 934da77ca48b..f5fbda0a9a44 100644 --- a/drivers/base/faux.c +++ b/drivers/base/faux.c @@ -170,7 +170,7 @@ struct faux_device *faux_device_create_with_groups(const char *name, * successful is almost impossible to determine by the caller. */ if (!dev->driver) { - dev_err(dev, "probe did not succeed, tearing down the device\n"); + dev_dbg(dev, "probe did not succeed, tearing down the device\n"); faux_device_destroy(faux_dev); faux_dev = NULL; } From 162457f5853ce3348e7956666916f5e5e31be51f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 6 Jun 2025 20:32:28 -0700 Subject: [PATCH 064/242] ACPI: APEI: EINJ: Do not fail einj_init() on faux_device_create() failure CXL has a symbol dependency on einj_core.ko, so if einj_init() fails then cxl_core.ko fails to load. Prior to the faux_device_create() conversion, einj_probe() failures were tracked by the einj_initialized flag without failing einj_init(). Revert to that behavior and always succeed einj_init() given there is no way, and no pressing need, to discern faux device-create vs device-probe failures. This situation arose because CXL knows proper kernel named objects to trigger errors against, but acpi-einj knows how to perform the error injection. The injection mechanism is shared with non-CXL use cases. The result is CXL now has a module dependency on einj-core.ko, and init/probe failures are handled at runtime. Fixes: 6cb9441bfe8d ("ACPI: APEI: EINJ: Transition to the faux device interface") Signed-off-by: Dan Williams Reviewed-by: Ben Cheatham Acked-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20250607033228.1475625-4-dan.j.williams@intel.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/einj-core.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/apei/einj-core.c b/drivers/acpi/apei/einj-core.c index fea11a35eea3..9b041415a9d0 100644 --- a/drivers/acpi/apei/einj-core.c +++ b/drivers/acpi/apei/einj-core.c @@ -883,19 +883,16 @@ static int __init einj_init(void) } einj_dev = faux_device_create("acpi-einj", NULL, &einj_device_ops); - if (!einj_dev) - return -ENODEV; - einj_initialized = true; + if (einj_dev) + einj_initialized = true; return 0; } static void __exit einj_exit(void) { - if (einj_initialized) - faux_device_destroy(einj_dev); - + faux_device_destroy(einj_dev); } module_init(einj_init); From 5b2d595efbfc9c46823bdb9ef11e1f9fa46adf9d Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 6 Jun 2025 11:05:05 +0900 Subject: [PATCH 065/242] rust: time: Fix compile error in impl_has_hr_timer macro Fix a compile error in the `impl_has_hr_timer!` macro as follows: error[E0599]: no method named cast_mut found for raw pointer *mut Foo in the current scope The `container_of!` macro already returns a mutable pointer when used in a `*mut T` context so the `.cast_mut()` method is not available. [ We missed this one because there is no caller yet and it is a macro. - Miguel ] Fixes: 74d6a606c2b3 ("rust: retain pointer mut-ness in `container_of!`") Signed-off-by: FUJITA Tomonori Reviewed-by: Benno Lossin Acked-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250606020505.3186533-1-fujita.tomonori@gmail.com Signed-off-by: Miguel Ojeda --- rust/kernel/time/hrtimer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/time/hrtimer.rs b/rust/kernel/time/hrtimer.rs index 9df3dcd2fa39..36e1290cd079 100644 --- a/rust/kernel/time/hrtimer.rs +++ b/rust/kernel/time/hrtimer.rs @@ -517,7 +517,7 @@ unsafe fn timer_container_of( ) -> *mut Self { // SAFETY: As per the safety requirement of this function, `ptr` // is pointing inside a `$timer_type`. - unsafe { ::kernel::container_of!(ptr, $timer_type, $field).cast_mut() } + unsafe { ::kernel::container_of!(ptr, $timer_type, $field) } } } } From 2f76d269073bdb2971b253ef87d1f96f1a94c50e Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 9 Jun 2025 08:42:06 +0200 Subject: [PATCH 066/242] ACPI: PAD: Update arguments of mwait_idle_with_hints() Commit a17b37a3f416 ("x86/idle: Change arguments of mwait_idle_with_hints() to u32") changed the type of arguments of mwait_idle_with_hints() from unsigned long to u32. Change the type of variables in the call to mwait_idle_with_hints() to unsigned int to follow the change. Signed-off-by: Uros Bizjak Link: https://patch.msgid.link/20250609064235.49146-1-ubizjak@gmail.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_pad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index 6f8bbe1247a5..c9a0bcaba2e4 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -33,7 +33,7 @@ static DEFINE_MUTEX(isolated_cpus_lock); static DEFINE_MUTEX(round_robin_lock); -static unsigned long power_saving_mwait_eax; +static unsigned int power_saving_mwait_eax; static unsigned char tsc_detected_unstable; static unsigned char tsc_marked_unstable; From 15eece6c5b05e5f9db0711978c3e3b7f1a2cfe12 Mon Sep 17 00:00:00 2001 From: Yunhui Cui Date: Wed, 4 Jun 2025 10:30:36 +0800 Subject: [PATCH 067/242] ACPI: CPPC: Fix NULL pointer dereference when nosmp is used With nosmp in cmdline, other CPUs are not brought up, leaving their cpc_desc_ptr NULL. CPU0's iteration via for_each_possible_cpu() dereferences these NULL pointers, causing panic. Panic backtrace: [ 0.401123] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000b8 ... [ 0.403255] [] cppc_allow_fast_switch+0x6a/0xd4 ... Kernel panic - not syncing: Attempted to kill init! Fixes: 3cc30dd00a58 ("cpufreq: CPPC: Enable fast_switch") Reported-by: Xu Lu Signed-off-by: Yunhui Cui Link: https://patch.msgid.link/20250604023036.99553-1-cuiyunhui@bytedance.com [ rjw: New subject ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/cppc_acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index a9ae2fd62863..6b649031808f 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -476,7 +476,7 @@ bool cppc_allow_fast_switch(void) struct cpc_desc *cpc_ptr; int cpu; - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { cpc_ptr = per_cpu(cpc_desc_ptr, cpu); desired_reg = &cpc_ptr->cpc_regs[DESIRED_PERF]; if (!CPC_IN_SYSTEM_MEMORY(desired_reg) && From 7a0d59f6a913a2bc7680c663b8cf1e45d1bdbf26 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Fri, 30 May 2025 01:53:10 +0200 Subject: [PATCH 068/242] ACPI: EC: Ignore ECDT tables with an invalid ID string On the MSI Modern 14 C5M the ECDT table contains invalid data: UID : 00000000 GPE Number : 00 /* Invalid, 03 would be correct */ Namepath : "" /* Invalid, "\_SB.PCI0.SBRG.EC" would * be correct */ This slows down the EC access as the wrong GPE event is used for communication. Additionally the ID string is invalid. Ignore such faulty ECDT tables by verifying that the ID string has a valid format. Tested-by: glpnk@proton.me Signed-off-by: Armin Wolf Link: https://patch.msgid.link/20250529235310.540530-1-W_Armin@gmx.de Signed-off-by: Rafael J. Wysocki --- drivers/acpi/ec.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 6f4203716b53..75c7db8b156a 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -23,8 +23,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -2031,6 +2033,21 @@ void __init acpi_ec_ecdt_probe(void) goto out; } + if (!strstarts(ecdt_ptr->id, "\\")) { + /* + * The ECDT table on some MSI notebooks contains invalid data, together + * with an empty ID string (""). + * + * Section 5.2.15 of the ACPI specification requires the ID string to be + * a "fully qualified reference to the (...) embedded controller device", + * so this string always has to start with a backslash. + * + * By verifying this we can avoid such faulty ECDT tables in a safe way. + */ + pr_err(FW_BUG "Ignoring ECDT due to invalid ID string \"%s\"\n", ecdt_ptr->id); + goto out; + } + ec = acpi_ec_alloc(); if (!ec) goto out; From c99ad987d3e9b550e9839d5df22de97d90462e5f Mon Sep 17 00:00:00 2001 From: Wentao Guan Date: Tue, 3 Jun 2025 20:20:59 +0800 Subject: [PATCH 069/242] ACPI: resource: Use IRQ override on MACHENIKE 16P Use ACPI IRQ override on MACHENIKE laptop to make the internal keyboard work. Add a new entry to the irq1_edge_low_force_override structure, similar to the existing ones. Link: https://bbs.deepin.org.cn/zh/post/287628 Signed-off-by: Wentao Guan Link: https://patch.msgid.link/20250603122059.1072790-1-guanwentao@uniontech.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/resource.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 7d59c6c9185f..b1ab192d7a08 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -666,6 +666,13 @@ static const struct dmi_system_id irq1_edge_low_force_override[] = { DMI_MATCH(DMI_BOARD_NAME, "GMxHGxx"), }, }, + { + /* MACHENIKE L16P/L16P */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "MACHENIKE"), + DMI_MATCH(DMI_BOARD_NAME, "L16P"), + }, + }, { /* * TongFang GM5HG0A in case of the SKIKK Vanaheim relabel the From 72840238e2bcb8fb24cb35d8d1d5a822c04e62a4 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 9 Jun 2025 08:35:01 +0200 Subject: [PATCH 070/242] intel_idle: Update arguments of mwait_idle_with_hints() Commit a17b37a3f416 ("x86/idle: Change arguments of mwait_idle_with_hints() to u32") changed the type of arguments of mwait_idle_with_hints() from unsigned long to u32. Change the type of variables in the call to mwait_idle_with_hints() to unsigned int to follow the change. Signed-off-by: Uros Bizjak Reviewed-by: Artem Bityutskiy Link: https://patch.msgid.link/20250609063528.48715-1-ubizjak@gmail.com Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 63565814c7e5..73747d20df85 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -152,8 +152,8 @@ static __always_inline int __intel_idle(struct cpuidle_device *dev, int index, bool irqoff) { struct cpuidle_state *state = &drv->states[index]; - unsigned long eax = flg2MWAIT(state->flags); - unsigned long ecx = 1*irqoff; /* break on interrupt flag */ + unsigned int eax = flg2MWAIT(state->flags); + unsigned int ecx = 1*irqoff; /* break on interrupt flag */ mwait_idle_with_hints(eax, ecx); @@ -226,9 +226,9 @@ static __cpuidle int intel_idle_xstate(struct cpuidle_device *dev, static __cpuidle int intel_idle_s2idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - unsigned long ecx = 1; /* break on interrupt flag */ struct cpuidle_state *state = &drv->states[index]; - unsigned long eax = flg2MWAIT(state->flags); + unsigned int eax = flg2MWAIT(state->flags); + unsigned int ecx = 1; /* break on interrupt flag */ if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) fpu_idle_fpregs(); From 0b3bc018e86afdc0cbfef61328c63d5c08f8b370 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Sat, 7 Jun 2025 01:07:37 +1200 Subject: [PATCH 071/242] x86/virt/tdx: Avoid indirect calls to TDX assembly functions Two 'static inline' TDX helper functions (sc_retry() and sc_retry_prerr()) take function pointer arguments which refer to assembly functions. Normally, the compiler inlines the TDX helper, realizes that the function pointer targets are completely static -- thus can be resolved at compile time -- and generates direct call instructions. But, other times (like when CONFIG_CC_OPTIMIZE_FOR_SIZE=y), the compiler declines to inline the helpers and will instead generate indirect call instructions. Indirect calls to assembly functions require special annotation (for various Control Flow Integrity mechanisms). But TDX assembly functions lack the special annotations and can only be called directly. Annotate both the helpers as '__always_inline' to prod the compiler into maintaining the direct calls. There is no guarantee here, but Peter has volunteered to report the compiler bug if this assumption ever breaks[1]. Fixes: 1e66a7e27539 ("x86/virt/tdx: Handle SEAMCALL no entropy error in common code") Fixes: df01f5ae07dd ("x86/virt/tdx: Add SEAMCALL error printing for module initialization") Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Cc: stable@vger.kernel.org Link: https://lore.kernel.org/lkml/20250605145914.GW39944@noisy.programming.kicks-ass.net/ [1] Link: https://lore.kernel.org/all/20250606130737.30713-1-kai.huang%40intel.com --- arch/x86/include/asm/tdx.h | 2 +- arch/x86/virt/vmx/tdx/tdx.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 8b19294600c4..7ddef3a69866 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -106,7 +106,7 @@ void tdx_init(void); typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args); -static inline u64 sc_retry(sc_func_t func, u64 fn, +static __always_inline u64 sc_retry(sc_func_t func, u64 fn, struct tdx_module_args *args) { int retry = RDRAND_RETRY_LOOPS; diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 2457d13c3f9e..c7a9a087ccaf 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -75,8 +75,9 @@ static inline void seamcall_err_ret(u64 fn, u64 err, args->r9, args->r10, args->r11); } -static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func, - u64 fn, struct tdx_module_args *args) +static __always_inline int sc_retry_prerr(sc_func_t func, + sc_err_func_t err_func, + u64 fn, struct tdx_module_args *args) { u64 sret = sc_retry(func, fn, args); From 40a98e702b528c631094f2e524d309faf33dc774 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 10 Jun 2025 12:16:00 -0700 Subject: [PATCH 072/242] crypto: hkdf - move to late_initcall The HKDF self-tests depend on the HMAC algorithms being registered. HMAC is now registered at module_init, which put it at the same level as HKDF. Move HKDF to late_initcall so that it runs afterwards. Fixes: ef93f1562803 ("Revert "crypto: run initcalls for generic implementations earlier"") Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/hkdf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/hkdf.c b/crypto/hkdf.c index f24c2a8d4df9..82d1b32ca6ce 100644 --- a/crypto/hkdf.c +++ b/crypto/hkdf.c @@ -566,7 +566,7 @@ static int __init crypto_hkdf_module_init(void) static void __exit crypto_hkdf_module_exit(void) {} -module_init(crypto_hkdf_module_init); +late_initcall(crypto_hkdf_module_init); module_exit(crypto_hkdf_module_exit); MODULE_LICENSE("GPL"); From b8d3291d3185b5c69ba36362ae1a370f06a33e5c Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 10 Jun 2025 12:40:38 -0700 Subject: [PATCH 073/242] drm/sitronix: st7571-i2c: Select VIDEOMODE_HELPERS This driver requires of_get_display_timing() from CONFIG_VIDEOMODE_HELPERS but does not select it. If no other driver selects it, there will be a failure from the linker if the driver is built in or modpost if it is a module. ERROR: modpost: "of_get_display_timing" [drivers/gpu/drm/sitronix/st7571-i2c.ko] undefined! Select CONFIG_VIDEOMODE_HELPERS to resolve the build failure. Fixes: 4b35f0f41ee2 ("drm/st7571-i2c: add support for Sitronix ST7571 LCD controller") Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20250610-drm-st7571-i2c-select-videomode-helpers-v1-1-d30b50ff6e64@kernel.org Signed-off-by: Maxime Ripard --- drivers/gpu/drm/sitronix/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/sitronix/Kconfig b/drivers/gpu/drm/sitronix/Kconfig index c069d0d41775..741d1bb4b83f 100644 --- a/drivers/gpu/drm/sitronix/Kconfig +++ b/drivers/gpu/drm/sitronix/Kconfig @@ -5,6 +5,7 @@ config DRM_ST7571_I2C select DRM_GEM_SHMEM_HELPER select DRM_KMS_HELPER select REGMAP_I2C + select VIDEOMODE_HELPERS help DRM driver for Sitronix ST7571 panels controlled over I2C. From 1dbf30fdb5e57fb2c39f17f35f2b544d5de34397 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 3 Jun 2025 14:14:41 +0300 Subject: [PATCH 074/242] x86/mm/pat: don't collapse pages without PSE set Collapsing pages to a leaf PMD or PUD should be done only if X86_FEATURE_PSE is available, which is not the case when running e.g. as a Xen PV guest. Fixes: 41d88484c71c ("x86/mm/pat: restore large ROX pages after fragmentation") Signed-off-by: Juergen Gross Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250528123557.12847-3-jgross@suse.com --- arch/x86/mm/pat/set_memory.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 46edc11726b7..8834c76f91c9 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1257,6 +1257,9 @@ static int collapse_pmd_page(pmd_t *pmd, unsigned long addr, pgprot_t pgprot; int i = 0; + if (!cpu_feature_enabled(X86_FEATURE_PSE)) + return 0; + addr &= PMD_MASK; pte = pte_offset_kernel(pmd, addr); first = *pte; From 47410d839fcda6890cb82828f874f97710982f24 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Tue, 3 Jun 2025 14:14:42 +0300 Subject: [PATCH 075/242] x86/Kconfig: only enable ROX cache in execmem when STRICT_MODULE_RWX is set Currently ROX cache in execmem is enabled regardless of STRICT_MODULE_RWX setting. This breaks an assumption that module memory is writable when STRICT_MODULE_RWX is disabled, for instance for kernel debuggin. Only enable ROX cache in execmem when STRICT_MODULE_RWX is set to restore the original behaviour of module text permissions. Fixes: 64f6a4e10c05 ("x86: re-enable EXECMEM_ROX support") Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250603111446.2609381-3-rppt@kernel.org --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 340e5468980e..71019b3b54ea 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -89,7 +89,7 @@ config X86 select ARCH_HAS_DMA_OPS if GART_IOMMU || XEN select ARCH_HAS_EARLY_DEBUG if KGDB select ARCH_HAS_ELF_RANDOMIZE - select ARCH_HAS_EXECMEM_ROX if X86_64 + select ARCH_HAS_EXECMEM_ROX if X86_64 && STRICT_MODULE_RWX select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL From 0b0cae7119a0ec9449d7261b5e672a5fed765068 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Tue, 3 Jun 2025 14:14:43 +0300 Subject: [PATCH 076/242] x86/its: move its_pages array to struct mod_arch_specific The of pages with ITS thunks allocated for modules are tracked by an array in 'struct module'. Since this is very architecture specific data structure, move it to 'struct mod_arch_specific'. No functional changes. Fixes: 872df34d7c51 ("x86/its: Use dynamic thunks for indirect branches") Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250603111446.2609381-4-rppt@kernel.org --- arch/x86/include/asm/module.h | 8 ++++++++ arch/x86/kernel/alternative.c | 19 ++++++++++--------- include/linux/module.h | 5 ----- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index e988bac0a4a1..3c2de4ce3b10 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -5,12 +5,20 @@ #include #include +struct its_array { +#ifdef CONFIG_MITIGATION_ITS + void **pages; + int num; +#endif +}; + struct mod_arch_specific { #ifdef CONFIG_UNWINDER_ORC unsigned int num_orcs; int *orc_unwind_ip; struct orc_entry *orc_unwind; #endif + struct its_array its_pages; }; #endif /* _ASM_X86_MODULE_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ecfe7b497cad..b50fe6ce4655 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -173,8 +173,8 @@ void its_fini_mod(struct module *mod) its_page = NULL; mutex_unlock(&text_mutex); - for (int i = 0; i < mod->its_num_pages; i++) { - void *page = mod->its_page_array[i]; + for (int i = 0; i < mod->arch.its_pages.num; i++) { + void *page = mod->arch.its_pages.pages[i]; execmem_restore_rox(page, PAGE_SIZE); } } @@ -184,11 +184,11 @@ void its_free_mod(struct module *mod) if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) return; - for (int i = 0; i < mod->its_num_pages; i++) { - void *page = mod->its_page_array[i]; + for (int i = 0; i < mod->arch.its_pages.num; i++) { + void *page = mod->arch.its_pages.pages[i]; execmem_free(page); } - kfree(mod->its_page_array); + kfree(mod->arch.its_pages.pages); } #endif /* CONFIG_MODULES */ @@ -201,14 +201,15 @@ static void *its_alloc(void) #ifdef CONFIG_MODULES if (its_mod) { - void *tmp = krealloc(its_mod->its_page_array, - (its_mod->its_num_pages+1) * sizeof(void *), + struct its_array *pages = &its_mod->arch.its_pages; + void *tmp = krealloc(pages->pages, + (pages->num+1) * sizeof(void *), GFP_KERNEL); if (!tmp) return NULL; - its_mod->its_page_array = tmp; - its_mod->its_page_array[its_mod->its_num_pages++] = page; + pages->pages = tmp; + pages->pages[pages->num++] = page; execmem_make_temp_rw(page, PAGE_SIZE); } diff --git a/include/linux/module.h b/include/linux/module.h index 92e1420fccdf..5faa1fb1f4b4 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -586,11 +586,6 @@ struct module { atomic_t refcnt; #endif -#ifdef CONFIG_MITIGATION_ITS - int its_num_pages; - void **its_page_array; -#endif - #ifdef CONFIG_CONSTRUCTORS /* Constructor functions. */ ctor_fn_t *ctors; From a82b26451de126a5ae130361081986bc459afe9b Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Tue, 3 Jun 2025 14:14:44 +0300 Subject: [PATCH 077/242] x86/its: explicitly manage permissions for ITS pages execmem_alloc() sets permissions differently depending on the kernel configuration, CPU support for PSE and whether a page is allocated before or after mark_rodata_ro(). Add tracking for pages allocated for ITS when patching the core kernel and make sure the permissions for ITS pages are explicitly managed for both kernel and module allocations. Fixes: 872df34d7c51 ("x86/its: Use dynamic thunks for indirect branches") Signed-off-by: Peter Zijlstra (Intel) Co-developed-by: Mike Rapoport (Microsoft) Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Nikolay Borisov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250603111446.2609381-5-rppt@kernel.org --- arch/x86/kernel/alternative.c | 74 ++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 22 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index b50fe6ce4655..6455f7f751b3 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -116,6 +116,24 @@ static struct module *its_mod; #endif static void *its_page; static unsigned int its_offset; +struct its_array its_pages; + +static void *__its_alloc(struct its_array *pages) +{ + void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE); + if (!page) + return NULL; + + void *tmp = krealloc(pages->pages, (pages->num+1) * sizeof(void *), + GFP_KERNEL); + if (!tmp) + return NULL; + + pages->pages = tmp; + pages->pages[pages->num++] = page; + + return no_free_ptr(page); +} /* Initialize a thunk with the "jmp *reg; int3" instructions. */ static void *its_init_thunk(void *thunk, int reg) @@ -151,6 +169,21 @@ static void *its_init_thunk(void *thunk, int reg) return thunk + offset; } +static void its_pages_protect(struct its_array *pages) +{ + for (int i = 0; i < pages->num; i++) { + void *page = pages->pages[i]; + execmem_restore_rox(page, PAGE_SIZE); + } +} + +static void its_fini_core(void) +{ + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) + its_pages_protect(&its_pages); + kfree(its_pages.pages); +} + #ifdef CONFIG_MODULES void its_init_mod(struct module *mod) { @@ -173,10 +206,8 @@ void its_fini_mod(struct module *mod) its_page = NULL; mutex_unlock(&text_mutex); - for (int i = 0; i < mod->arch.its_pages.num; i++) { - void *page = mod->arch.its_pages.pages[i]; - execmem_restore_rox(page, PAGE_SIZE); - } + if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + its_pages_protect(&mod->arch.its_pages); } void its_free_mod(struct module *mod) @@ -194,28 +225,23 @@ void its_free_mod(struct module *mod) static void *its_alloc(void) { - void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE); + struct its_array *pages = &its_pages; + void *page; +#ifdef CONFIG_MODULE + if (its_mod) + pages = &its_mod->arch.its_pages; +#endif + + page = __its_alloc(pages); if (!page) return NULL; -#ifdef CONFIG_MODULES - if (its_mod) { - struct its_array *pages = &its_mod->arch.its_pages; - void *tmp = krealloc(pages->pages, - (pages->num+1) * sizeof(void *), - GFP_KERNEL); - if (!tmp) - return NULL; + execmem_make_temp_rw(page, PAGE_SIZE); + if (pages == &its_pages) + set_memory_x((unsigned long)page, 1); - pages->pages = tmp; - pages->pages[pages->num++] = page; - - execmem_make_temp_rw(page, PAGE_SIZE); - } -#endif /* CONFIG_MODULES */ - - return no_free_ptr(page); + return page; } static void *its_allocate_thunk(int reg) @@ -269,7 +295,9 @@ u8 *its_static_thunk(int reg) return thunk; } -#endif +#else +static inline void its_fini_core(void) {} +#endif /* CONFIG_MITIGATION_ITS */ /* * Nomenclature for variable names to simplify and clarify this code and ease @@ -2339,6 +2367,8 @@ void __init alternative_instructions(void) apply_retpolines(__retpoline_sites, __retpoline_sites_end); apply_returns(__return_sites, __return_sites_end); + its_fini_core(); + /* * Adjust all CALL instructions to point to func()-10, including * those in .altinstr_replacement. From 7cd9a11dd0c3d1dd225795ed1b5b53132888e7b5 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Tue, 3 Jun 2025 14:14:45 +0300 Subject: [PATCH 078/242] Revert "mm/execmem: Unify early execmem_cache behaviour" The commit d6d1e3e6580c ("mm/execmem: Unify early execmem_cache behaviour") changed early behaviour of execemem ROX cache to allow its usage in early x86 code that allocates text pages when CONFIG_MITGATION_ITS is enabled. The permission management of the pages allocated from execmem for ITS mitigation is now completely contained in arch/x86/kernel/alternatives.c and therefore there is no need to special case early allocations in execmem. This reverts commit d6d1e3e6580ca35071ad474381f053cbf1fb6414. Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250603111446.2609381-6-rppt@kernel.org --- arch/x86/mm/init_32.c | 3 --- arch/x86/mm/init_64.c | 3 --- include/linux/execmem.h | 8 +------- mm/execmem.c | 40 +++------------------------------------- 4 files changed, 4 insertions(+), 50 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 607d6a2e66e2..8a34fff6ab2b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include @@ -749,8 +748,6 @@ void mark_rodata_ro(void) pr_info("Write protecting kernel text and read-only data: %luk\n", size >> 10); - execmem_cache_make_ro(); - kernel_set_to_readonly = 1; #ifdef CONFIG_CPA_DEBUG diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ee66fae9ebcc..fdb6cab524f0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include @@ -1392,8 +1391,6 @@ void mark_rodata_ro(void) (end - start) >> 10); set_memory_ro(start, (end - start) >> PAGE_SHIFT); - execmem_cache_make_ro(); - kernel_set_to_readonly = 1; /* diff --git a/include/linux/execmem.h b/include/linux/execmem.h index ca42d5e46ccc..3be35680a54f 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -54,7 +54,7 @@ enum execmem_range_flags { EXECMEM_ROX_CACHE = (1 << 1), }; -#if defined(CONFIG_ARCH_HAS_EXECMEM_ROX) && defined(CONFIG_EXECMEM) +#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX /** * execmem_fill_trapping_insns - set memory to contain instructions that * will trap @@ -94,15 +94,9 @@ int execmem_make_temp_rw(void *ptr, size_t size); * Return: 0 on success or negative error code on failure. */ int execmem_restore_rox(void *ptr, size_t size); - -/* - * Called from mark_readonly(), where the system transitions to ROX. - */ -void execmem_cache_make_ro(void); #else static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; } static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; } -static inline void execmem_cache_make_ro(void) { } #endif /** diff --git a/mm/execmem.c b/mm/execmem.c index 9720ac2dfa41..2b683e7d864d 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -254,34 +254,6 @@ static void *__execmem_cache_alloc(struct execmem_range *range, size_t size) return ptr; } -static bool execmem_cache_rox = false; - -void execmem_cache_make_ro(void) -{ - struct maple_tree *free_areas = &execmem_cache.free_areas; - struct maple_tree *busy_areas = &execmem_cache.busy_areas; - MA_STATE(mas_free, free_areas, 0, ULONG_MAX); - MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX); - struct mutex *mutex = &execmem_cache.mutex; - void *area; - - execmem_cache_rox = true; - - mutex_lock(mutex); - - mas_for_each(&mas_free, area, ULONG_MAX) { - unsigned long pages = mas_range_len(&mas_free) >> PAGE_SHIFT; - set_memory_ro(mas_free.index, pages); - } - - mas_for_each(&mas_busy, area, ULONG_MAX) { - unsigned long pages = mas_range_len(&mas_busy) >> PAGE_SHIFT; - set_memory_ro(mas_busy.index, pages); - } - - mutex_unlock(mutex); -} - static int execmem_cache_populate(struct execmem_range *range, size_t size) { unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; @@ -302,15 +274,9 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) /* fill memory with instructions that will trap */ execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); - if (execmem_cache_rox) { - err = set_memory_rox((unsigned long)p, vm->nr_pages); - if (err) - goto err_free_mem; - } else { - err = set_memory_x((unsigned long)p, vm->nr_pages); - if (err) - goto err_free_mem; - } + err = set_memory_rox((unsigned long)p, vm->nr_pages); + if (err) + goto err_free_mem; err = execmem_cache_add(p, alloc_size); if (err) From aa3f93cd75bf8195e434deefff853d70d723d6f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 5 Jun 2025 10:38:30 +0200 Subject: [PATCH 079/242] dma-buf: fix compare in WARN_ON_ONCE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Smatch pointed out this trivial typo: drivers/dma-buf/dma-buf.c:1123 dma_buf_map_attachment() warn: passing positive error code '16' to 'ERR_PTR' drivers/dma-buf/dma-buf.c 1113 dma_resv_assert_held(attach->dmabuf->resv); 1114 1115 if (dma_buf_pin_on_map(attach)) { 1116 ret = attach->dmabuf->ops->pin(attach); 1117 /* 1118 * Catch exporters making buffers inaccessible even when 1119 * attachments preventing that exist. 1120 */ 1121 WARN_ON_ONCE(ret == EBUSY); ^^^^^ This was probably intended to be -EBUSY? 1122 if (ret) --> 1123 return ERR_PTR(ret); ^^^ Otherwise we will eventually crash. 1124 } 1125 1126 sg_table = attach->dmabuf->ops->map_dma_buf(attach, direction); 1127 if (!sg_table) 1128 sg_table = ERR_PTR(-ENOMEM); 1129 if (IS_ERR(sg_table)) 1130 goto error_unpin; 1131 Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Link: https://lore.kernel.org/r/20250605085336.62156-1-christian.koenig@amd.com --- drivers/dma-buf/dma-buf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 890ecac04dac..2bcf9ceca997 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1118,7 +1118,7 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *attach, * Catch exporters making buffers inaccessible even when * attachments preventing that exist. */ - WARN_ON_ONCE(ret == EBUSY); + WARN_ON_ONCE(ret == -EBUSY); if (ret) return ERR_PTR(ret); } From 6bdd3a01fe4627ad7a562ba38eb759eba715b671 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Jun 2025 16:00:20 +0200 Subject: [PATCH 080/242] fs: add missing values to TRACE_IOCB_STRINGS Make sure all values are covered. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/20250610140020.2227932-1-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/fs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 96c7925a6551..d27c402f1162 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -399,7 +399,9 @@ struct readahead_control; { IOCB_WAITQ, "WAITQ" }, \ { IOCB_NOIO, "NOIO" }, \ { IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \ - { IOCB_DIO_CALLER_COMP, "CALLER_COMP" } + { IOCB_DIO_CALLER_COMP, "CALLER_COMP" }, \ + { IOCB_AIO_RW, "AIO_RW" }, \ + { IOCB_HAS_METADATA, "AIO_HAS_METADATA" } struct kiocb { struct file *ki_filp; From ad5a0351064c8f744416df3a49156d2c61af5bc0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 10 Jun 2025 11:04:04 +1000 Subject: [PATCH 081/242] VFS: change try_lookup_noperm() to skip revalidation The recent change from using d_hash_and_lookup() to using try_lookup_noperm() inadvertently introduce a d_revalidate() call when the lookup was successful. Steven French reports that this resulted in worse than halving of performance in some cases. Prior to the offending patch the only caller of try_lookup_noperm() was autofs which does not need the d_revalidate(). So it is safe to remove the d_revalidate() call providing we stop using try_lookup_noperm() to implement lookup_noperm(). The "try_" in the name is strongly suggestive that the caller isn't expecting much effort, so it seems reasonable to avoid the effort of d_revalidate(). Fixes: 06c567403ae5 ("Use try_lookup_noperm() instead of d_hash_and_lookup() outside of VFS") Reported-by: Steve French Link: https://lore.kernel.org/all/CAH2r5mu5SfBrdc2CFHwzft8=n9koPMk+Jzwpy-oUMx-wCRCesQ@mail.gmail.com/ Signed-off-by: NeilBrown Link: https://lore.kernel.org/174951744454.608730.18354002683881684261@noble.neil.brown.name Tested-by: Steve French Signed-off-by: Christian Brauner --- fs/namei.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 4bb889fc980b..f761cafaeaad 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2917,7 +2917,8 @@ static int lookup_one_common(struct mnt_idmap *idmap, * @base: base directory to lookup from * * Look up a dentry by name in the dcache, returning NULL if it does not - * currently exist. The function does not try to create a dentry. + * currently exist. The function does not try to create a dentry and if one + * is found it doesn't try to revalidate it. * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. It does no permission checking. @@ -2933,7 +2934,7 @@ struct dentry *try_lookup_noperm(struct qstr *name, struct dentry *base) if (err) return ERR_PTR(err); - return lookup_dcache(name, base, 0); + return d_lookup(base, name); } EXPORT_SYMBOL(try_lookup_noperm); @@ -3057,14 +3058,22 @@ EXPORT_SYMBOL(lookup_one_positive_unlocked); * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. It does no permission checking. * - * Unlike lookup_noperm, it should be called without the parent + * Unlike lookup_noperm(), it should be called without the parent * i_rwsem held, and will take the i_rwsem itself if necessary. + * + * Unlike try_lookup_noperm() it *does* revalidate the dentry if it already + * existed. */ struct dentry *lookup_noperm_unlocked(struct qstr *name, struct dentry *base) { struct dentry *ret; + int err; - ret = try_lookup_noperm(name, base); + err = lookup_noperm_common(name, base); + if (err) + return ERR_PTR(err); + + ret = lookup_dcache(name, base, 0); if (!ret) ret = lookup_slow(name, base, 0); return ret; From 3e5378779091c2f9a96d4404066e1923c92ceb8b Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Tue, 10 Jun 2025 15:54:14 +0200 Subject: [PATCH 082/242] ata: pata_macio: Fix PCI region leak pci_request_regions() became a managed devres functions if the PCI device was enabled with pcim_enable_device(), which is the case for pata_macio. The PCI subsystem recently removed this hybrid feature from pci_request_region(). When doing so, pata_macio was forgotten to be ported to use pcim_request_all_regions(). If that function is not used, pata_macio will fail on driver-reload because the PCI regions will remain blocked. Fix the region leak by replacing pci_request_regions() with its managed counterpart, pcim_request_all_regions(). Fixes: 51f6aec99cb0 ("PCI: Remove hybrid devres nature from request functions") Signed-off-by: Philipp Stanner Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20250610135413.35930-2-phasta@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/pata_macio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/pata_macio.c b/drivers/ata/pata_macio.c index fbf5f07ea357..f7a933eefe05 100644 --- a/drivers/ata/pata_macio.c +++ b/drivers/ata/pata_macio.c @@ -1298,7 +1298,7 @@ static int pata_macio_pci_attach(struct pci_dev *pdev, priv->dev = &pdev->dev; /* Get MMIO regions */ - if (pci_request_regions(pdev, "pata-macio")) { + if (pcim_request_all_regions(pdev, "pata-macio")) { dev_err(&pdev->dev, "Cannot obtain PCI resources\n"); return -EBUSY; From 6f29d393061c2c1fa30a72a0e9b01e15e09dae34 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 10 Jun 2025 13:07:59 +0200 Subject: [PATCH 083/242] ata: ahci: Use correct BIOS build date for ThinkPad W541 quirk Fix the TODO in ahci_broken_lpm() by using the proper BIOS build date. The proper BIOS build date was provided by Hans, see Link. Link: https://lore.kernel.org/linux-ide/6ea509c8-b38d-4941-8a29-c1117ff3dd5b@redhat.com/ Reviewed-by: Damien Le Moal Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20250610110757.1318959-6-cassel@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/ahci.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 163ac909bd06..e7c8357cbc54 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1438,13 +1438,7 @@ static bool ahci_broken_lpm(struct pci_dev *pdev) DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad W541"), }, - /* - * Note date based on release notes, 2.35 has been - * reported to be good, but I've been unable to get - * a hold of the reporter to get the DMI BIOS date. - * TODO: fix this. - */ - .driver_data = "20180310", /* 2.35 */ + .driver_data = "20180409", /* 2.35 */ }, { } /* terminate list */ }; From afe382843717d44b24ef5014d57dcbaab75a4052 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 7 May 2025 18:09:12 +0200 Subject: [PATCH 084/242] udmabuf: use sgtable-based scatterlist wrappers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use common wrappers operating directly on the struct sg_table objects to fix incorrect use of scatterlists sync calls. dma_sync_sg_for_*() functions have to be called with the number of elements originally passed to dma_map_sg_*() function, not the one returned in sgtable's nents. Fixes: 1ffe09590121 ("udmabuf: fix dma-buf cpu access") CC: stable@vger.kernel.org Signed-off-by: Marek Szyprowski Acked-by: Vivek Kasireddy Reviewed-by: Christian König Signed-off-by: Christian König Link: https://lore.kernel.org/r/20250507160913.2084079-3-m.szyprowski@samsung.com --- drivers/dma-buf/udmabuf.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index 7eee3eb47a8e..c9d0c68d2fcb 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -264,8 +264,7 @@ static int begin_cpu_udmabuf(struct dma_buf *buf, ubuf->sg = NULL; } } else { - dma_sync_sg_for_cpu(dev, ubuf->sg->sgl, ubuf->sg->nents, - direction); + dma_sync_sgtable_for_cpu(dev, ubuf->sg, direction); } return ret; @@ -280,7 +279,7 @@ static int end_cpu_udmabuf(struct dma_buf *buf, if (!ubuf->sg) return -EINVAL; - dma_sync_sg_for_device(dev, ubuf->sg->sgl, ubuf->sg->nents, direction); + dma_sync_sgtable_for_device(dev, ubuf->sg, direction); return 0; } From 4bb08cf974c54adc321b8505ce82720c2a15c451 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 11 Jun 2025 16:49:38 +0800 Subject: [PATCH 085/242] loop: move lo_set_size() out of queue freeze It isn't necessary to freeze queue for updating disk size given submit_bio() doesn't grab queue usage counter for checking eod. Also many driver won't freeze queue for calling set_capacity_and_notify(). Move lo_set_size() out of queue freeze for fixing many lockdep warning report. Link: https://lore.kernel.org/linux-block/67ea99e0.050a0220.3c3d88.0042.GAE@google.com/ Reported-by: syzbot+9dd7dbb1a4b915dee638@syzkaller.appspotmail.com Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250611084938.108829-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f8d136684109..500840e4a74e 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1248,12 +1248,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) lo->lo_flags &= ~LOOP_SET_STATUS_CLEARABLE_FLAGS; lo->lo_flags |= (info->lo_flags & LOOP_SET_STATUS_SETTABLE_FLAGS); - if (size_changed) { - loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit, - lo->lo_backing_file); - loop_set_size(lo, new_size); - } - /* update the direct I/O flag if lo_offset changed */ loop_update_dio(lo); @@ -1261,6 +1255,11 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) blk_mq_unfreeze_queue(lo->lo_queue, memflags); if (partscan) clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); + if (!err && size_changed) { + loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit, + lo->lo_backing_file); + loop_set_size(lo, new_size); + } out_unlock: mutex_unlock(&lo->lo_mutex); if (partscan) From ff20c516485efbeb5c32bcb6aa5a24f73774185b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 11 Jun 2025 16:56:32 +0800 Subject: [PATCH 086/242] ublk: document auto buffer registration(UBLK_F_AUTO_BUF_REG) Document recently merged feature auto buffer registration(UBLK_F_AUTO_BUF_REG). Cc: Caleb Sander Mateos Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250611085632.109719-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- Documentation/block/ublk.rst | 75 ++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index c368e1081b41..abec524a04ed 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -352,6 +352,81 @@ For reaching best IO performance, ublk server should align its segment parameter of `struct ublk_param_segment` with backend for avoiding unnecessary IO split, which usually hurts io_uring performance. +Auto Buffer Registration +------------------------ + +The ``UBLK_F_AUTO_BUF_REG`` feature automatically handles buffer registration +and unregistration for I/O requests, which simplifies the buffer management +process and reduces overhead in the ublk server implementation. + +This is another feature flag for using zero copy, and it is compatible with +``UBLK_F_SUPPORT_ZERO_COPY``. + +Feature Overview +~~~~~~~~~~~~~~~~ + +This feature automatically registers request buffers to the io_uring context +before delivering I/O commands to the ublk server and unregisters them when +completing I/O commands. This eliminates the need for manual buffer +registration/unregistration via ``UBLK_IO_REGISTER_IO_BUF`` and +``UBLK_IO_UNREGISTER_IO_BUF`` commands, then IO handling in ublk server +can avoid dependency on the two uring_cmd operations. + +IOs can't be issued concurrently to io_uring if there is any dependency +among these IOs. So this way not only simplifies ublk server implementation, +but also makes concurrent IO handling becomes possible by removing the +dependency on buffer registration & unregistration commands. + +Usage Requirements +~~~~~~~~~~~~~~~~~~ + +1. The ublk server must create a sparse buffer table on the same ``io_ring_ctx`` + used for ``UBLK_IO_FETCH_REQ`` and ``UBLK_IO_COMMIT_AND_FETCH_REQ``. If + uring_cmd is issued on a different ``io_ring_ctx``, manual buffer + unregistration is required. + +2. Buffer registration data must be passed via uring_cmd's ``sqe->addr`` with the + following structure:: + + struct ublk_auto_buf_reg { + __u16 index; /* Buffer index for registration */ + __u8 flags; /* Registration flags */ + __u8 reserved0; /* Reserved for future use */ + __u32 reserved1; /* Reserved for future use */ + }; + + ublk_auto_buf_reg_to_sqe_addr() is for converting the above structure into + ``sqe->addr``. + +3. All reserved fields in ``ublk_auto_buf_reg`` must be zeroed. + +4. Optional flags can be passed via ``ublk_auto_buf_reg.flags``. + +Fallback Behavior +~~~~~~~~~~~~~~~~~ + +If auto buffer registration fails: + +1. When ``UBLK_AUTO_BUF_REG_FALLBACK`` is enabled: + - The uring_cmd is completed + - ``UBLK_IO_F_NEED_REG_BUF`` is set in ``ublksrv_io_desc.op_flags`` + - The ublk server must manually deal with the failure, such as, register + the buffer manually, or using user copy feature for retrieving the data + for handling ublk IO + +2. If fallback is not enabled: + - The ublk I/O request fails silently + - The uring_cmd won't be completed + +Limitations +~~~~~~~~~~~ + +- Requires same ``io_ring_ctx`` for all operations +- May require manual buffer management in fallback cases +- io_ring_ctx buffer table has a max size of 16K, which may not be enough + in case that too many ublk devices are handled by this single io_ring_ctx + and each one has very large queue depth + References ========== From f705d33c2f0353039d03e5d6f18f70467d86080e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Jun 2025 09:59:15 +0900 Subject: [PATCH 087/242] block: Clear BIO_EMULATES_ZONE_APPEND flag on BIO completion When blk_zone_write_plug_bio_endio() is called for a regular write BIO used to emulate a zone append operation, that is, a BIO flagged with BIO_EMULATES_ZONE_APPEND, the BIO operation code is restored to the original REQ_OP_ZONE_APPEND but the BIO_EMULATES_ZONE_APPEND flag is not cleared. Clear it to fully return the BIO to its orginal definition. Fixes: 9b1ce7f0c6f8 ("block: Implement zone append emulation") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250611005915.89843-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 8f15d1aa6eb8..cee7a0774a9b 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1225,6 +1225,7 @@ void blk_zone_write_plug_bio_endio(struct bio *bio) if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) { bio->bi_opf &= ~REQ_OP_MASK; bio->bi_opf |= REQ_OP_ZONE_APPEND; + bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND); } /* From cf625013d8741c01407bbb4a60c111b61b9fa69d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jun 2025 06:44:16 +0200 Subject: [PATCH 088/242] block: don't use submit_bio_noacct_nocheck in blk_zone_wplug_bio_work Bios queued up in the zone write plug have already gone through all all preparation in the submit_bio path, including the freeze protection. Submitting them through submit_bio_noacct_nocheck duplicates the work and can can cause deadlocks when freezing a queue with pending bio write plugs. Go straight to ->submit_bio or blk_mq_submit_bio to bypass the superfluous extra freeze protection and checks. Fixes: 9b1ce7f0c6f8 ("block: Implement zone append emulation") Reported-by: Bart Van Assche Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Tested-by: Damien Le Moal Link: https://lore.kernel.org/r/20250611044416.2351850-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-zoned.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index cee7a0774a9b..351d659280e1 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1307,7 +1307,6 @@ static void blk_zone_wplug_bio_work(struct work_struct *work) spin_unlock_irqrestore(&zwplug->lock, flags); bdev = bio->bi_bdev; - submit_bio_noacct_nocheck(bio); /* * blk-mq devices will reuse the extra reference on the request queue @@ -1315,8 +1314,12 @@ static void blk_zone_wplug_bio_work(struct work_struct *work) * path for BIO-based devices will not do that. So drop this extra * reference here. */ - if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) + if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) { + bdev->bd_disk->fops->submit_bio(bio); blk_queue_exit(bdev->bd_disk->queue); + } else { + blk_mq_submit_bio(bio); + } put_zwplug: /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */ From 961296e89dc3800e6a3abc3f5d5bb4192cf31e98 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Jun 2025 08:48:46 -0600 Subject: [PATCH 089/242] block: use plug request list tail for one-shot backmerge attempt Previously, the block layer stored the requests in the plug list in LIFO order. For this reason, blk_attempt_plug_merge() would check just the head entry for a back merge attempt, and abort after that unless requests for multiple queues existed in the plug list. If more than one request is present in the plug list, this makes the one-shot back merging less useful than before, as it'll always fail to find a quick merge candidate. Use the tail entry for the one-shot merge attempt, which is the last added request in the list. If that fails, abort immediately unless there are multiple queues available. If multiple queues are available, then scan the list. Ideally the latter scan would be a backwards scan of the list, but as it currently stands, the plug list is singly linked and hence this isn't easily feasible. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/linux-block/20250611121626.7252-1-abuehaze@amazon.com/ Reported-by: Hazem Mohamed Abuelfotoh Fixes: e70c301faece ("block: don't reorder requests in blk_add_rq_to_plug") Signed-off-by: Jens Axboe --- block/blk-merge.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 3af1d284add5..70d704615be5 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -998,20 +998,20 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, if (!plug || rq_list_empty(&plug->mq_list)) return false; - rq_list_for_each(&plug->mq_list, rq) { - if (rq->q == q) { - if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == - BIO_MERGE_OK) - return true; - break; - } + rq = plug->mq_list.tail; + if (rq->q == q) + return blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == + BIO_MERGE_OK; + else if (!plug->multiple_queues) + return false; - /* - * Only keep iterating plug list for merges if we have multiple - * queues - */ - if (!plug->multiple_queues) - break; + rq_list_for_each(&plug->mq_list, rq) { + if (rq->q != q) + continue; + if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == + BIO_MERGE_OK) + return true; + break; } return false; } From 83f066fac3c231e58e9adf3b307e96fee172dfb3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 11 Jun 2025 16:09:45 +0300 Subject: [PATCH 090/242] spi: stm32-ospi: clean up on error in probe() If reset_control_acquire() fails, then we can't return directly. We need to do a little clean up first. Fixes: cf2c3eceb757 ("spi: stm32-ospi: Make usage of reset_control_acquire/release() API") Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/aEmAGTUzzKZlLe3K@stanley.mountain Signed-off-by: Mark Brown --- drivers/spi/spi-stm32-ospi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-stm32-ospi.c b/drivers/spi/spi-stm32-ospi.c index db6b1cfc970f..4ab7e86f4bd5 100644 --- a/drivers/spi/spi-stm32-ospi.c +++ b/drivers/spi/spi-stm32-ospi.c @@ -937,8 +937,10 @@ static int stm32_ospi_probe(struct platform_device *pdev) goto err_pm_enable; ret = reset_control_acquire(ospi->rstc); - if (ret) - return dev_err_probe(dev, ret, "Can not acquire reset %d\n", ret); + if (ret) { + dev_err_probe(dev, ret, "Can not acquire reset %d\n", ret); + goto err_pm_resume; + } reset_control_assert(ospi->rstc); udelay(2); From 179a8427fcbffe36ccfed5e138d7c9b6180caff9 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 12 May 2025 22:16:34 +0000 Subject: [PATCH 091/242] KVM: SEV: Disable SEV-SNP support on initialization failure During platform init, SNP initialization may fail for several reasons, such as firmware command failures and incompatible versions. However, the KVM capability may continue to advertise support for it. The platform may have SNP enabled but if SNP_INIT fails then SNP is not supported by KVM. During KVM module initialization query the SNP platform status to obtain the SNP initialization state and use it as an additional condition to determine support for SEV-SNP. Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Co-developed-by: Pratik R. Sampat Signed-off-by: Pratik R. Sampat Reviewed-by: Tom Lendacky Signed-off-by: Ashish Kalra Reviewed-by: Pankaj Gupta Reviewed-by: Pavan Kumar Paluri Message-ID: <20250512221634.12045-1-Ashish.Kalra@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 44 +++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 5a69b657dae9..459c3b791fd4 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2871,6 +2871,33 @@ void __init sev_set_cpu_caps(void) } } +static bool is_sev_snp_initialized(void) +{ + struct sev_user_data_snp_status *status; + struct sev_data_snp_addr buf; + bool initialized = false; + int ret, error = 0; + + status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO); + if (!status) + return false; + + buf.address = __psp_pa(status); + ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error); + if (ret) { + pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n", + ret, error, error); + goto out; + } + + initialized = !!status->state; + +out: + snp_free_firmware_page(status); + + return initialized; +} + void __init sev_hardware_setup(void) { unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; @@ -2975,6 +3002,14 @@ void __init sev_hardware_setup(void) sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP); out: + if (sev_enabled) { + init_args.probe = true; + if (sev_platform_init(&init_args)) + sev_supported = sev_es_supported = sev_snp_supported = false; + else if (sev_snp_supported) + sev_snp_supported = is_sev_snp_initialized(); + } + if (boot_cpu_has(X86_FEATURE_SEV)) pr_info("SEV %s (ASIDs %u - %u)\n", sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" : @@ -3001,15 +3036,6 @@ void __init sev_hardware_setup(void) sev_supported_vmsa_features = 0; if (sev_es_debug_swap_enabled) sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; - - if (!sev_enabled) - return; - - /* - * Do both SNP and SEV initialization at KVM module load. - */ - init_args.probe = true; - sev_platform_init(&init_args); } void sev_hardware_unsetup(void) From 62a65b32bddb0f242b106b8c464913f2f01c108d Mon Sep 17 00:00:00 2001 From: Drew Fustini Date: Fri, 6 Jun 2025 11:11:17 -0700 Subject: [PATCH 092/242] dt-bindings: pmem: Convert binding to YAML Convert the PMEM device tree binding from text to YAML. This will allow device trees with pmem-region nodes to pass dtbs_check. [iweiny: fix ups from Rob/Drew] Acked-by: Conor Dooley Acked-by: Oliver O'Halloran Signed-off-by: Drew Fustini Acked-by: Dan Williams Link: https://patch.msgid.link/20250606184405.359812-4-drew@pdp7.com Signed-off-by: Ira Weiny --- .../devicetree/bindings/pmem/pmem-region.txt | 65 ------------------- .../devicetree/bindings/pmem/pmem-region.yaml | 48 ++++++++++++++ MAINTAINERS | 2 +- 3 files changed, 49 insertions(+), 66 deletions(-) delete mode 100644 Documentation/devicetree/bindings/pmem/pmem-region.txt create mode 100644 Documentation/devicetree/bindings/pmem/pmem-region.yaml diff --git a/Documentation/devicetree/bindings/pmem/pmem-region.txt b/Documentation/devicetree/bindings/pmem/pmem-region.txt deleted file mode 100644 index cd79975e85ec..000000000000 --- a/Documentation/devicetree/bindings/pmem/pmem-region.txt +++ /dev/null @@ -1,65 +0,0 @@ -Device-tree bindings for persistent memory regions ------------------------------------------------------ - -Persistent memory refers to a class of memory devices that are: - - a) Usable as main system memory (i.e. cacheable), and - b) Retain their contents across power failure. - -Given b) it is best to think of persistent memory as a kind of memory mapped -storage device. To ensure data integrity the operating system needs to manage -persistent regions separately to the normal memory pool. To aid with that this -binding provides a standardised interface for discovering where persistent -memory regions exist inside the physical address space. - -Bindings for the region nodes: ------------------------------ - -Required properties: - - compatible = "pmem-region" - - - reg = ; - The reg property should specify an address range that is - translatable to a system physical address range. This address - range should be mappable as normal system memory would be - (i.e cacheable). - - If the reg property contains multiple address ranges - each address range will be treated as though it was specified - in a separate device node. Having multiple address ranges in a - node implies no special relationship between the two ranges. - -Optional properties: - - Any relevant NUMA associativity properties for the target platform. - - - volatile; This property indicates that this region is actually - backed by non-persistent memory. This lets the OS know that it - may skip the cache flushes required to ensure data is made - persistent after a write. - - If this property is absent then the OS must assume that the region - is backed by non-volatile memory. - -Examples: --------------------- - - /* - * This node specifies one 4KB region spanning from - * 0x5000 to 0x5fff that is backed by non-volatile memory. - */ - pmem@5000 { - compatible = "pmem-region"; - reg = <0x00005000 0x00001000>; - }; - - /* - * This node specifies two 4KB regions that are backed by - * volatile (normal) memory. - */ - pmem@6000 { - compatible = "pmem-region"; - reg = < 0x00006000 0x00001000 - 0x00008000 0x00001000 >; - volatile; - }; - diff --git a/Documentation/devicetree/bindings/pmem/pmem-region.yaml b/Documentation/devicetree/bindings/pmem/pmem-region.yaml new file mode 100644 index 000000000000..bd0f0c793f03 --- /dev/null +++ b/Documentation/devicetree/bindings/pmem/pmem-region.yaml @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pmem-region.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +maintainers: + - Oliver O'Halloran + +title: Persistent Memory Regions + +description: | + Persistent memory refers to a class of memory devices that are: + + a) Usable as main system memory (i.e. cacheable), and + b) Retain their contents across power failure. + + Given b) it is best to think of persistent memory as a kind of memory mapped + storage device. To ensure data integrity the operating system needs to manage + persistent regions separately to the normal memory pool. To aid with that this + binding provides a standardised interface for discovering where persistent + memory regions exist inside the physical address space. + +properties: + compatible: + const: pmem-region + + reg: + maxItems: 1 + + volatile: + description: + Indicates the region is volatile (non-persistent) and the OS can skip + cache flushes for writes + type: boolean + +required: + - compatible + - reg + +additionalProperties: false + +examples: + - | + pmem@5000 { + compatible = "pmem-region"; + reg = <0x00005000 0x00001000>; + }; diff --git a/MAINTAINERS b/MAINTAINERS index a92290fffa16..73ded8134ffd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13798,7 +13798,7 @@ M: Oliver O'Halloran L: nvdimm@lists.linux.dev S: Supported Q: https://patchwork.kernel.org/project/linux-nvdimm/list/ -F: Documentation/devicetree/bindings/pmem/pmem-region.txt +F: Documentation/devicetree/bindings/pmem/pmem-region.yaml F: drivers/nvdimm/of_pmem.c LIBNVDIMM: NON-VOLATILE MEMORY DEVICE SUBSYSTEM From 47096d301e39f96962ca1fd6c7b71bfa796c53db Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Jun 2025 11:05:16 -1000 Subject: [PATCH 093/242] sched_ext: Update mailing list entry in MAINTAINERS Use sched-ext@lists.linux.dev instead of LKML. Signed-off-by: Tejun Heo --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index a92290fffa16..8841f24d408b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22171,7 +22171,7 @@ R: Tejun Heo R: David Vernet R: Andrea Righi R: Changwoo Min -L: linux-kernel@vger.kernel.org +L: sched-ext@lists.linux.dev S: Maintained W: https://github.com/sched-ext/scx T: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git From ebf2e500e06f707654572bc7d8bc569a8caa51aa Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 9 Jun 2025 12:38:43 +0530 Subject: [PATCH 094/242] rust: cpu: Introduce CpuId abstraction This adds abstraction for representing a CPU identifier. Suggested-by: Boqun Feng Signed-off-by: Viresh Kumar Reviewed-by: Boqun Feng --- rust/kernel/cpu.rs | 110 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/rust/kernel/cpu.rs b/rust/kernel/cpu.rs index 10c5c3b25873..6a3aecb12468 100644 --- a/rust/kernel/cpu.rs +++ b/rust/kernel/cpu.rs @@ -6,6 +6,116 @@ use crate::{bindings, device::Device, error::Result, prelude::ENODEV}; +/// Returns the maximum number of possible CPUs in the current system configuration. +#[inline] +pub fn nr_cpu_ids() -> u32 { + #[cfg(any(NR_CPUS_1, CONFIG_FORCE_NR_CPUS))] + { + bindings::NR_CPUS + } + + #[cfg(not(any(NR_CPUS_1, CONFIG_FORCE_NR_CPUS)))] + // SAFETY: `nr_cpu_ids` is a valid global provided by the kernel. + unsafe { + bindings::nr_cpu_ids + } +} + +/// The CPU ID. +/// +/// Represents a CPU identifier as a wrapper around an [`u32`]. +/// +/// # Invariants +/// +/// The CPU ID lies within the range `[0, nr_cpu_ids())`. +/// +/// # Examples +/// +/// ``` +/// use kernel::cpu::CpuId; +/// +/// let cpu = 0; +/// +/// // SAFETY: 0 is always a valid CPU number. +/// let id = unsafe { CpuId::from_u32_unchecked(cpu) }; +/// +/// assert_eq!(id.as_u32(), cpu); +/// assert!(CpuId::from_i32(0).is_some()); +/// assert!(CpuId::from_i32(-1).is_none()); +/// ``` +#[derive(Copy, Clone, PartialEq, Eq, Debug)] +pub struct CpuId(u32); + +impl CpuId { + /// Creates a new [`CpuId`] from the given `id` without checking bounds. + /// + /// # Safety + /// + /// The caller must ensure that `id` is a valid CPU ID (i.e., `0 <= id < nr_cpu_ids()`). + #[inline] + pub unsafe fn from_i32_unchecked(id: i32) -> Self { + debug_assert!(id >= 0); + debug_assert!((id as u32) < nr_cpu_ids()); + + // INVARIANT: The function safety guarantees `id` is a valid CPU id. + Self(id as u32) + } + + /// Creates a new [`CpuId`] from the given `id`, checking that it is valid. + pub fn from_i32(id: i32) -> Option { + if id < 0 || id as u32 >= nr_cpu_ids() { + None + } else { + // INVARIANT: `id` has just been checked as a valid CPU ID. + Some(Self(id as u32)) + } + } + + /// Creates a new [`CpuId`] from the given `id` without checking bounds. + /// + /// # Safety + /// + /// The caller must ensure that `id` is a valid CPU ID (i.e., `0 <= id < nr_cpu_ids()`). + #[inline] + pub unsafe fn from_u32_unchecked(id: u32) -> Self { + debug_assert!(id < nr_cpu_ids()); + + // Ensure the `id` fits in an [`i32`] as it's also representable that way. + debug_assert!(id <= i32::MAX as u32); + + // INVARIANT: The function safety guarantees `id` is a valid CPU id. + Self(id) + } + + /// Creates a new [`CpuId`] from the given `id`, checking that it is valid. + pub fn from_u32(id: u32) -> Option { + if id >= nr_cpu_ids() { + None + } else { + // INVARIANT: `id` has just been checked as a valid CPU ID. + Some(Self(id)) + } + } + + /// Returns CPU number. + #[inline] + pub fn as_u32(&self) -> u32 { + self.0 + } +} + +impl From for u32 { + fn from(id: CpuId) -> Self { + id.as_u32() + } +} + +impl From for i32 { + fn from(id: CpuId) -> Self { + id.as_u32() as i32 + } +} + /// Creates a new instance of CPU's device. /// /// # Safety From 47fe65b105f29ef22f463c17ab8a4c0eeb741045 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 1 Jun 2025 20:22:17 -0400 Subject: [PATCH 095/242] bcachefs: Add missing restart handling to check_topology() The next patch will add logging of the specific error being corrected in repair paths to the journal; this means __bch2_fsck_err() can return transaction restarts in places that previously weren't expecting them. check_topology() is old code that doesn't use btree iterators for btree node locking - it'll have to be rewritten in the future to work online. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 95 ++++++++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 35 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 9ddcbe1bda78..e92cf3928c63 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -397,7 +397,11 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct continue; } - ret = btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan); + ret = lockrestart_do(trans, + btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan)); + if (ret < 0) + goto err; + if (ret == DID_FILL_FROM_SCAN) { new_pass = true; ret = 0; @@ -438,7 +442,8 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct if (!ret && !IS_ERR_OR_NULL(prev)) { BUG_ON(cur); - ret = btree_repair_node_end(trans, b, prev, pulled_from_scan); + ret = lockrestart_do(trans, + btree_repair_node_end(trans, b, prev, pulled_from_scan)); if (ret == DID_FILL_FROM_SCAN) { new_pass = true; ret = 0; @@ -519,6 +524,46 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct bch2_bkey_buf_exit(&prev_k, c); bch2_bkey_buf_exit(&cur_k, c); printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + +static int bch2_check_root(struct btree_trans *trans, enum btree_id i, + bool *reconstructed_root) +{ + struct bch_fs *c = trans->c; + struct btree_root *r = bch2_btree_id_root(c, i); + struct printbuf buf = PRINTBUF; + int ret = 0; + + bch2_btree_id_to_text(&buf, i); + + if (r->error) { + bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); + + r->alive = false; + r->error = 0; + + if (!bch2_btree_has_scanned_nodes(c, i)) { + __fsck_err(trans, + FSCK_CAN_FIX|(!btree_id_important(i) ? FSCK_AUTOFIX : 0), + btree_root_unreadable_and_scan_found_nothing, + "no nodes found for btree %s, continue?", buf.buf); + bch2_btree_root_alloc_fake_trans(trans, i, 0); + } else { + bch2_btree_root_alloc_fake_trans(trans, i, 1); + bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); + if (ret) + goto err; + } + + *reconstructed_root = true; + } +err: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); return ret; } @@ -526,42 +571,18 @@ int bch2_check_topology(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); struct bpos pulled_from_scan = POS_MIN; - struct printbuf buf = PRINTBUF; int ret = 0; bch2_trans_srcu_unlock(trans); for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { - struct btree_root *r = bch2_btree_id_root(c, i); bool reconstructed_root = false; +recover: + ret = lockrestart_do(trans, bch2_check_root(trans, i, &reconstructed_root)); + if (ret) + break; - printbuf_reset(&buf); - bch2_btree_id_to_text(&buf, i); - - if (r->error) { -reconstruct_root: - bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); - - r->alive = false; - r->error = 0; - - if (!bch2_btree_has_scanned_nodes(c, i)) { - __fsck_err(trans, - FSCK_CAN_FIX|(!btree_id_important(i) ? FSCK_AUTOFIX : 0), - btree_root_unreadable_and_scan_found_nothing, - "no nodes found for btree %s, continue?", buf.buf); - bch2_btree_root_alloc_fake_trans(trans, i, 0); - } else { - bch2_btree_root_alloc_fake_trans(trans, i, 1); - bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); - if (ret) - break; - } - - reconstructed_root = true; - } - + struct btree_root *r = bch2_btree_id_root(c, i); struct btree *b = r->b; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); @@ -575,17 +596,21 @@ int bch2_check_topology(struct bch_fs *c) r->b = NULL; - if (!reconstructed_root) - goto reconstruct_root; + if (!reconstructed_root) { + r->error = -EIO; + goto recover; + } + struct printbuf buf = PRINTBUF; + bch2_btree_id_to_text(&buf, i); bch_err(c, "empty btree root %s", buf.buf); + printbuf_exit(&buf); bch2_btree_root_alloc_fake_trans(trans, i, 0); r->alive = false; ret = 0; } } -fsck_err: - printbuf_exit(&buf); + bch2_trans_put(trans); return ret; } From b43f724927686387589b98eb762e1a4109bb1bac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 1 Jun 2025 13:07:31 -0400 Subject: [PATCH 096/242] bcachefs: Log fsck errors in the journal Log the specific error being corrected in the journal when we're repairing, this helps greatly with 'bcachefs list_journal' analysis. Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 63951e293c47..ff49cebfd0e8 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -620,6 +620,9 @@ int __bch2_fsck_err(struct bch_fs *c, if (s) s->ret = ret; + + if (trans) + ret = bch2_trans_log_str(trans, bch2_sb_error_strs[err]) ?: ret; err_unlock: mutex_unlock(&c->fsck_error_msgs_lock); err: From c7e351be7aa4e176223f0128e4d1b959ffad9598 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Jun 2025 19:00:11 -0400 Subject: [PATCH 097/242] bcachefs: Add range being updated to btree_update_to_text() We had a deadlock during recovery where interior btree updates became wedged and all open_buckets were consumed; start adding more introspection. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 26 ++++++++++++++++++++++++-- fs/bcachefs/btree_update_interior.h | 7 +++++++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d2ecb782919b..340812b28d08 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1232,6 +1232,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, list_add_tail(&as->list, &c->btree_interior_update_list); mutex_unlock(&c->btree_interior_update_lock); + struct btree *b = btree_path_node(path, path->level); + as->node_start = b->data->min_key; + as->node_end = b->data->max_key; + as->node_needed_rewrite = btree_node_need_rewrite(b); + as->node_written = b->written; + as->node_sectors = btree_buf_bytes(b) >> 9; + as->node_remaining = __bch2_btree_u64s_remaining(b, + btree_bkey_last(b, bset_tree_last(b))); + /* * We don't want to allocate if we're in an error state, that can cause * deadlock on emergency shutdown due to open buckets getting stuck in @@ -2108,6 +2117,9 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, if (ret) goto err; + as->node_start = prev->data->min_key; + as->node_end = next->data->max_key; + trace_and_count(c, btree_node_merge, trans, b); n = bch2_btree_node_alloc(as, trans, b->c.level); @@ -2681,9 +2693,19 @@ static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update prt_str(out, " "); bch2_btree_id_to_text(out, as->btree_id); - prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + prt_printf(out, " l=%u-%u ", as->update_level_start, - as->update_level_end, + as->update_level_end); + bch2_bpos_to_text(out, as->node_start); + prt_char(out, ' '); + bch2_bpos_to_text(out, as->node_end); + prt_printf(out, "\nwritten %u/%u u64s_remaining %u need_rewrite %u", + as->node_written, + as->node_sectors, + as->node_remaining, + as->node_needed_rewrite); + + prt_printf(out, "\nmode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", bch2_btree_update_modes[as->mode], as->nodes_written, closure_nr_remaining(&as->cl), diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7fe793788a79..85682a13de4d 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -57,6 +57,13 @@ struct btree_update { unsigned took_gc_lock:1; enum btree_id btree_id; + struct bpos node_start; + struct bpos node_end; + bool node_needed_rewrite; + u16 node_written; + u16 node_sectors; + u16 node_remaining; + unsigned update_level_start; unsigned update_level_end; From b76cce12700b8bb8d59b9ff444504b94db96dd4b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Jun 2025 20:53:01 -0400 Subject: [PATCH 098/242] bcachefs: Add more flags to btree nodes for rewrite reason It seems excessive forced btree node rewrites can cause interior btree updates to become wedged during recovery, before we're using the write buffer for backpointer updates. Add more flags so we can determine where these are coming from. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 10 ++++++++-- fs/bcachefs/btree_types.h | 29 +++++++++++++++++++++++++++++ fs/bcachefs/btree_update_interior.c | 13 ++++++++++--- fs/bcachefs/btree_update_interior.h | 2 +- 4 files changed, 48 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 57eff3012a7b..6787d5b91eab 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1045,6 +1045,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, le16_add_cpu(&i->u64s, -next_good_key); memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k); set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_error(b); } fsck_err: printbuf_exit(&buf); @@ -1305,6 +1306,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, (u64 *) vstruct_end(i) - (u64 *) k); set_btree_bset_end(b, b->set); set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_error(b); continue; } if (ret) @@ -1329,12 +1331,16 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); - if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) + if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) { set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_degraded(b); + } } - if (!ptr_written) + if (!ptr_written) { set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_ptr_written_zero(b); + } fsck_err: mempool_free(iter, &c->fill_iter); printbuf_exit(&buf); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index c61c4171ae50..3aa4a602bd02 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -617,6 +617,9 @@ enum btree_write_type { x(dying) \ x(fake) \ x(need_rewrite) \ + x(need_rewrite_error) \ + x(need_rewrite_degraded) \ + x(need_rewrite_ptr_written_zero) \ x(never_write) \ x(pinned) @@ -641,6 +644,32 @@ static inline void clear_btree_node_ ## flag(struct btree *b) \ BTREE_FLAGS() #undef x +#define BTREE_NODE_REWRITE_REASON() \ + x(none) \ + x(unknown) \ + x(error) \ + x(degraded) \ + x(ptr_written_zero) + +enum btree_node_rewrite_reason { +#define x(n) BTREE_NODE_REWRITE_##n, + BTREE_NODE_REWRITE_REASON() +#undef x +}; + +static inline enum btree_node_rewrite_reason btree_node_rewrite_reason(struct btree *b) +{ + if (btree_node_need_rewrite_ptr_written_zero(b)) + return BTREE_NODE_REWRITE_ptr_written_zero; + if (btree_node_need_rewrite_degraded(b)) + return BTREE_NODE_REWRITE_degraded; + if (btree_node_need_rewrite_error(b)) + return BTREE_NODE_REWRITE_error; + if (btree_node_need_rewrite(b)) + return BTREE_NODE_REWRITE_unknown; + return BTREE_NODE_REWRITE_none; +} + static inline struct btree_write *btree_current_write(struct btree *b) { return b->writes + btree_node_write_idx(b); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 340812b28d08..e77584607f0d 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1138,6 +1138,13 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * start_time); } +static const char * const btree_node_reawrite_reason_strs[] = { +#define x(n) #n, + BTREE_NODE_REWRITE_REASON() +#undef x + NULL, +}; + static struct btree_update * bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, unsigned level_start, bool split, @@ -1235,7 +1242,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, struct btree *b = btree_path_node(path, path->level); as->node_start = b->data->min_key; as->node_end = b->data->max_key; - as->node_needed_rewrite = btree_node_need_rewrite(b); + as->node_needed_rewrite = btree_node_rewrite_reason(b); as->node_written = b->written; as->node_sectors = btree_buf_bytes(b) >> 9; as->node_remaining = __bch2_btree_u64s_remaining(b, @@ -2699,11 +2706,11 @@ static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update bch2_bpos_to_text(out, as->node_start); prt_char(out, ' '); bch2_bpos_to_text(out, as->node_end); - prt_printf(out, "\nwritten %u/%u u64s_remaining %u need_rewrite %u", + prt_printf(out, "\nwritten %u/%u u64s_remaining %u need_rewrite %s", as->node_written, as->node_sectors, as->node_remaining, - as->node_needed_rewrite); + btree_node_reawrite_reason_strs[as->node_needed_rewrite]); prt_printf(out, "\nmode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", bch2_btree_update_modes[as->mode], diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 85682a13de4d..b649c36c3fbb 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -59,7 +59,7 @@ struct btree_update { enum btree_id btree_id; struct bpos node_start; struct bpos node_end; - bool node_needed_rewrite; + enum btree_node_rewrite_reason node_needed_rewrite; u16 node_written; u16 node_sectors; u16 node_remaining; From af5b88618a38371545251ad92ce42bc4bfa1142a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Jun 2025 12:01:22 -0400 Subject: [PATCH 099/242] bcachefs: Update /dev/disk/by-uuid on device add Invalidate pagecache after we write the new superblock and send a uevent. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 397a69da5a75..8648f22411be 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1995,6 +1995,22 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err_late; } + /* + * We just changed the superblock UUID, invalidate cache and send a + * uevent to update /dev/disk/by-uuid + */ + invalidate_bdev(ca->disk_sb.bdev); + + char uuid_str[37]; + snprintf(uuid_str, sizeof(uuid_str), "UUID=%pUb", &c->sb.uuid); + + char *envp[] = { + "CHANGE=uuid", + uuid_str, + NULL, + }; + kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp); + up_write(&c->state_lock); out: printbuf_exit(&label); From 7b0e6b198e4ef39ec89285d953778c5a3a04d390 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Jun 2025 12:56:15 -0400 Subject: [PATCH 100/242] bcachefs: Mark need_discard_freespace_key_bad autofix Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 6fdbf265e4c0..90c878f289b2 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -134,7 +134,7 @@ enum bch_fsck_flags { x(bucket_gens_to_invalid_buckets, 121, FSCK_AUTOFIX) \ x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \ x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \ - x(need_discard_freespace_key_bad, 124, 0) \ + x(need_discard_freespace_key_bad, 124, FSCK_AUTOFIX) \ x(discarding_bucket_not_in_need_discard_btree, 291, 0) \ x(backpointer_bucket_offset_wrong, 125, 0) \ x(backpointer_level_bad, 294, 0) \ From b47a82ff4772ea9d7091b85ef5f34dc78c866a02 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Jun 2025 12:56:33 -0400 Subject: [PATCH 101/242] bcachefs: Only run 'increase_depth' for keys from btree node csan bch2_btree_increase_depth() was originally for disaster recovery, to get some data back from the journal when a btree root was bad. We don't need it for that purpose anymore; on bad btree root we'll launch btree node scan and reconstruct all the interior nodes. If there's a key in the journal for a depth that doesn't exists, and it's not from check_topology/btree node scan, we should just ignore it. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1e68e61f08e8..dbd066260725 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -271,13 +271,24 @@ static int bch2_journal_replay_key(struct btree_trans *trans, goto out; struct btree_path *path = btree_iter_path(trans, &iter); - if (unlikely(!btree_path_node(path, k->level))) { + if (unlikely(!btree_path_node(path, k->level) && + !k->allocated)) { + struct bch_fs *c = trans->c; + + if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)| + BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) { + bch_err(c, "have key in journal replay for btree depth that does not exist, confused"); + ret = -EINVAL; + } +#if 0 bch2_trans_iter_exit(trans, &iter); bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, 0, iter_flags); ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_btree_increase_depth(trans, iter.path, 0) ?: -BCH_ERR_transaction_restart_nested; +#endif + k->overwritten = true; goto out; } From dd22844f48a71a414bd7e08cd4a9a5847974c07e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Jun 2025 14:22:24 -0400 Subject: [PATCH 102/242] bcachefs: Read error message now prints if self healing Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 11 +++++++++-- fs/bcachefs/io_read.h | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index a77779afad01..04bbdcf58e40 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -343,6 +343,10 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, *bounce = true; *read_full = promote_full; + + if (have_io_error(failed)) + orig->self_healing = true; + return promote; nopromote: trace_io_read_nopromote(c, ret); @@ -635,12 +639,15 @@ static void bch2_rbio_retry(struct work_struct *work) prt_str(&buf, "(internal move) "); prt_str(&buf, "data read error, "); - if (!ret) + if (!ret) { prt_str(&buf, "successful retry"); - else + if (rbio->self_healing) + prt_str(&buf, ", self healing"); + } else prt_str(&buf, bch2_err_str(ret)); prt_newline(&buf); + if (!bkey_deleted(&sk.k->k)) { bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k)); prt_newline(&buf); diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 45c959018919..9c5ddbf861b3 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -44,6 +44,7 @@ struct bch_read_bio { have_ioref:1, narrow_crcs:1, saw_error:1, + self_healing:1, context:2; }; u16 _state; From 263561649ee5875a922f4358e96d3deb17a91021 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Jun 2025 14:27:35 -0400 Subject: [PATCH 103/242] bcachefs: Don't persistently run scan_for_btree_nodes bch2_btree_lost_data() gets called on btree node read error, but the error might be transient. btree_node_scan is expensive, and there's no need to run it persistently (marking it in the superblock as required to run) - check_topology will run it if required, via bch2_get_scanned_nodes(). Running it non-persistently is fine, to avoid check_topology having to rewind recovery to run it. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 2 ++ fs/bcachefs/recovery_passes.c | 14 +++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index dbd066260725..06d0ba0fbffb 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -99,9 +99,11 @@ int bch2_btree_lost_data(struct bch_fs *c, goto out; case BTREE_ID_snapshots: ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; goto out; default: + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; goto out; } diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 605588e33fb3..35ac0d64d73a 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -294,8 +294,13 @@ static bool recovery_pass_needs_set(struct bch_fs *c, enum bch_run_recovery_pass_flags *flags) { struct bch_fs_recovery *r = &c->recovery; - bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); - bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); + + /* + * Never run scan_for_btree_nodes persistently: check_topology will run + * it if required + */ + if (pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) + *flags |= RUN_RECOVERY_PASS_nopersistent; if ((*flags & RUN_RECOVERY_PASS_ratelimit) && !bch2_recovery_pass_want_ratelimit(c, pass)) @@ -310,6 +315,8 @@ static bool recovery_pass_needs_set(struct bch_fs *c, * Otherwise, we run run_explicit_recovery_pass when we find damage, so * it should run again even if it's already run: */ + bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); + bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); if (persistent ? !(c->sb.recovery_passes_required & BIT_ULL(pass)) @@ -334,6 +341,7 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c, struct bch_fs_recovery *r = &c->recovery; int ret = 0; + lockdep_assert_held(&c->sb_lock); bch2_printbuf_make_room(out, 1024); @@ -446,7 +454,7 @@ int bch2_require_recovery_pass(struct bch_fs *c, int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - enum bch_run_recovery_pass_flags flags = RUN_RECOVERY_PASS_nopersistent; + enum bch_run_recovery_pass_flags flags = 0; if (!recovery_pass_needs_set(c, pass, &flags)) return 0; From 3315113af178041ab474723804e1322cee7d0a97 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Jun 2025 18:56:28 -0400 Subject: [PATCH 104/242] bcachefs: mark more errors autofix Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 90c878f289b2..d06e73884871 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -165,7 +165,7 @@ enum bch_fsck_flags { x(ptr_to_missing_replicas_entry, 149, FSCK_AUTOFIX) \ x(ptr_to_missing_stripe, 150, 0) \ x(ptr_to_incorrect_stripe, 151, 0) \ - x(ptr_gen_newer_than_bucket_gen, 152, 0) \ + x(ptr_gen_newer_than_bucket_gen, 152, FSCK_AUTOFIX) \ x(ptr_too_stale, 153, 0) \ x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \ x(ptr_bucket_data_type_mismatch, 155, 0) \ @@ -236,7 +236,7 @@ enum bch_fsck_flags { x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ - x(inode_has_child_snapshots_wrong, 287, 0) \ + x(inode_has_child_snapshots_wrong, 287, FSCK_AUTOFIX) \ x(inode_unreachable, 210, FSCK_AUTOFIX) \ x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \ x(inode_i_sectors_underflow, 312, FSCK_AUTOFIX) \ @@ -279,8 +279,8 @@ enum bch_fsck_flags { x(root_dir_missing, 239, 0) \ x(root_inode_not_dir, 240, 0) \ x(dir_loop, 241, 0) \ - x(hash_table_key_duplicate, 242, 0) \ - x(hash_table_key_wrong_offset, 243, 0) \ + x(hash_table_key_duplicate, 242, FSCK_AUTOFIX) \ + x(hash_table_key_wrong_offset, 243, FSCK_AUTOFIX) \ x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \ x(reflink_p_front_pad_bad, 245, 0) \ x(journal_entry_dup_same_device, 246, 0) \ From 0acb385ec19c99b213d28a309a941790758c53a8 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Tue, 20 May 2025 15:34:28 +0800 Subject: [PATCH 105/242] bcachefs: Fix possible console lock involved deadlock Link: https://lore.kernel.org/all/6822ab02.050a0220.f2294.00cb.GAE@google.com/T/ Reported-by: syzbot+2c3ef91c9523c3d1a25c@syzkaller.appspotmail.com Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/btree_locking.c | 2 +- fs/bcachefs/error.c | 2 +- fs/bcachefs/super.c | 11 +++-------- fs/bcachefs/util.c | 10 ++-------- fs/bcachefs/util.h | 2 +- 6 files changed, 8 insertions(+), 20 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 3651a296d506..5a1cede2febf 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -296,7 +296,6 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") void bch2_print_str(struct bch_fs *, const char *, const char *); -void bch2_print_str_nonblocking(struct bch_fs *, const char *, const char *); __printf(2, 3) void bch2_print_opts(struct bch_opts *, const char *, ...); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 47035aae232e..91a51aef82f1 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -213,7 +213,7 @@ static noinline __noreturn void break_cycle_fail(struct lock_graph *g) prt_newline(&buf); } - bch2_print_str_nonblocking(g->g->trans->c, KERN_ERR, buf.buf); + bch2_print_str(g->g->trans->c, KERN_ERR, buf.buf); printbuf_exit(&buf); BUG(); } diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index ff49cebfd0e8..a8ec6aae5738 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -69,7 +69,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra if (trans) bch2_trans_updates_to_text(&buf, trans); bool ret = __bch2_inconsistent_error(c, &buf); - bch2_print_str_nonblocking(c, KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); return ret; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 8648f22411be..ae8fa58a208d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -104,7 +104,7 @@ const char * const bch2_dev_write_refs[] = { #undef x static void __bch2_print_str(struct bch_fs *c, const char *prefix, - const char *str, bool nonblocking) + const char *str) { #ifdef __KERNEL__ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); @@ -114,17 +114,12 @@ static void __bch2_print_str(struct bch_fs *c, const char *prefix, return; } #endif - bch2_print_string_as_lines(KERN_ERR, str, nonblocking); + bch2_print_string_as_lines(KERN_ERR, str); } void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str) { - __bch2_print_str(c, prefix, str, false); -} - -void bch2_print_str_nonblocking(struct bch_fs *c, const char *prefix, const char *str) -{ - __bch2_print_str(c, prefix, str, true); + __bch2_print_str(c, prefix, str); } __printf(2, 0) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index dc3817f545fa..df9a6071fe18 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -262,8 +262,7 @@ static bool string_is_spaces(const char *str) return true; } -void bch2_print_string_as_lines(const char *prefix, const char *lines, - bool nonblocking) +void bch2_print_string_as_lines(const char *prefix, const char *lines) { bool locked = false; const char *p; @@ -273,12 +272,7 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines, return; } - if (!nonblocking) { - console_lock(); - locked = true; - } else { - locked = console_trylock(); - } + locked = console_trylock(); while (*lines) { p = strchrnul(lines, '\n'); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 0a4b1d433621..6488f098d140 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -214,7 +214,7 @@ u64 bch2_read_flag_list(const char *, const char * const[]); void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); void bch2_prt_u64_base2(struct printbuf *, u64); -void bch2_print_string_as_lines(const char *, const char *, bool); +void bch2_print_string_as_lines(const char *, const char *); typedef DARRAY(unsigned long) bch_stacktrace; int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t); From f946ce0be45e75801583a5371fccf7466961d9d0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Jun 2025 20:18:16 -0400 Subject: [PATCH 106/242] bcachefs: Make sure opts.read_only gets propagated back to VFS If we think we're read-only but the VFS doesn't, fun will ensue. And now that we know we have to be able to do this safely, just make nochanges imply ro. Reported-by: syzbot+a7d6ceaba099cc21dee4@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 8 ++++++++ fs/bcachefs/recovery.c | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 85d13f800165..3063a8ddc2df 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2490,6 +2490,14 @@ static int bch2_fs_get_tree(struct fs_context *fc) if (ret) goto err_stop_fs; + /* + * We might be doing a RO mount because other options required it, or we + * have no alloc info and it's a small image with no room to regenerate + * it + */ + if (c->opts.read_only) + fc->sb_flags |= SB_RDONLY; + sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); ret = PTR_ERR_OR_ZERO(sb); if (ret) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 06d0ba0fbffb..520eb72c4971 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -752,9 +752,11 @@ int bch2_fs_recovery(struct bch_fs *c) ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read) : BCH_RECOVERY_PASS_snapshots_read; c->opts.nochanges = true; - c->opts.read_only = true; } + if (c->opts.nochanges) + c->opts.read_only = true; + mutex_lock(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); bool write_sb = false; From 757601ef853359fe2d57d75c00b5045f62efc608 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Jun 2025 11:40:00 -0400 Subject: [PATCH 107/242] bcachefs: Don't put rhashtable on stack Object debugging generally needs special provisions for putting said objects on the stack, which rhashtable does not have. Reported-by: syzbot+bcc38a9556d0324c2ec2@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 6d7b1d5f7697..27e68d470ad0 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -28,7 +28,7 @@ #include struct buckets_in_flight { - struct rhashtable table; + struct rhashtable *table; struct move_bucket *first; struct move_bucket *last; size_t nr; @@ -98,7 +98,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, static void move_bucket_free(struct buckets_in_flight *list, struct move_bucket *b) { - int ret = rhashtable_remove_fast(&list->table, &b->hash, + int ret = rhashtable_remove_fast(list->table, &b->hash, bch_move_bucket_params); BUG_ON(ret); kfree(b); @@ -133,7 +133,7 @@ static void move_buckets_wait(struct moving_context *ctxt, static bool bucket_in_flight(struct buckets_in_flight *list, struct move_bucket_key k) { - return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params); + return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params); } static int bch2_copygc_get_buckets(struct moving_context *ctxt, @@ -185,7 +185,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, goto err; } - ret2 = rhashtable_lookup_insert_fast(&buckets_in_flight->table, &b_i->hash, + ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash, bch_move_bucket_params); BUG_ON(ret2); @@ -350,10 +350,13 @@ static int bch2_copygc_thread(void *arg) struct buckets_in_flight buckets = {}; u64 last, wait; - int ret = rhashtable_init(&buckets.table, &bch_move_bucket_params); + buckets.table = kzalloc(sizeof(*buckets.table), GFP_KERNEL); + int ret = !buckets.table + ? -ENOMEM + : rhashtable_init(buckets.table, &bch_move_bucket_params); bch_err_msg(c, ret, "allocating copygc buckets in flight"); if (ret) - return ret; + goto err; set_freezable(); @@ -421,11 +424,12 @@ static int bch2_copygc_thread(void *arg) } move_buckets_wait(&ctxt, &buckets, true); - rhashtable_destroy(&buckets.table); + rhashtable_destroy(buckets.table); bch2_moving_ctxt_exit(&ctxt); bch2_move_stats_exit(&move_stats, c); - - return 0; +err: + kfree(buckets.table); + return ret; } void bch2_copygc_stop(struct bch_fs *c) From 082c74411491f8b0d31465fc104b8342e66c4056 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Jun 2025 11:58:59 -0400 Subject: [PATCH 108/242] bcachefs: Fix downgrade_table_extra() Fix a UAF: we were calling darray_make_room() and retaining a pointer to the old buffer. And fix an UBSAN warning: struct bch_sb_field_downgrade_entry uses __counted_by, so set dst->nr_errors before assigning to the array entry. Reported-by: syzbot+14c52d86ddbd89bea13e@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-downgrade.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index b61f88450a6d..1506d05e0665 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -253,6 +253,7 @@ DOWNGRADE_TABLE() static int downgrade_table_extra(struct bch_fs *c, darray_char *table) { + unsigned dst_offset = table->nr; struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table); unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors); int ret = 0; @@ -268,6 +269,9 @@ static int downgrade_table_extra(struct bch_fs *c, darray_char *table) if (ret) return ret; + dst = (void *) &table->data[dst_offset]; + dst->nr_errors = cpu_to_le16(nr_errors + 1); + /* open coded __set_bit_le64, as dst is packed and * dst->recovery_passes is misaligned */ unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations; @@ -278,7 +282,6 @@ static int downgrade_table_extra(struct bch_fs *c, darray_char *table) break; } - dst->nr_errors = cpu_to_le16(nr_errors); return ret; } From 54aacfe3976893ee04582a0bd61967bbee5a7e04 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Jun 2025 12:17:02 -0400 Subject: [PATCH 109/242] bcachefs: Fix rcu_pending for PREEMPT_RT PREEMPT_RT redefines how standard spinlocks work, so local_irq_save() + spin_lock() is no longer equivalent to spin_lock_irqsave(). Fortunately, we don't strictly need to do it that way. Signed-off-by: Kent Overstreet --- fs/bcachefs/rcu_pending.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c index bef2aa1b8bcd..b1438be9d690 100644 --- a/fs/bcachefs/rcu_pending.c +++ b/fs/bcachefs/rcu_pending.c @@ -182,11 +182,6 @@ static inline void kfree_bulk(size_t nr, void ** p) while (nr--) kfree(*p); } - -#define local_irq_save(flags) \ -do { \ - flags = 0; \ -} while (0) #endif static noinline void __process_finished_items(struct rcu_pending *pending, @@ -429,9 +424,15 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN)); - local_irq_save(flags); - p = this_cpu_ptr(pending->p); - spin_lock(&p->lock); + /* We could technically be scheduled before taking the lock and end up + * using a different cpu's rcu_pending_pcpu: that's ok, it needs a lock + * anyways + * + * And we have to do it this way to avoid breaking PREEMPT_RT, which + * redefines how spinlocks work: + */ + p = raw_cpu_ptr(pending->p); + spin_lock_irqsave(&p->lock, flags); rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu); restart: if (may_sleep && @@ -520,9 +521,8 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, goto free_node; } - local_irq_save(flags); - p = this_cpu_ptr(pending->p); - spin_lock(&p->lock); + p = raw_cpu_ptr(pending->p); + spin_lock_irqsave(&p->lock, flags); goto restart; } From 9e48f574e55731106b26dc33d8b0be1adedf3f20 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 9 Jun 2025 17:30:40 -0400 Subject: [PATCH 110/242] bcachefs: Fix leak in bch2_fs_recovery() error path Fix a small leak of the superblock 'clean' section. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 520eb72c4971..0b21fa6ff062 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1108,9 +1108,6 @@ int bch2_fs_recovery(struct bch_fs *c) out: bch2_flush_fsck_errs(c); - if (!IS_ERR(clean)) - kfree(clean); - if (!ret && test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) && !c->opts.nochanges) { @@ -1119,6 +1116,9 @@ int bch2_fs_recovery(struct bch_fs *c) } bch_err_fn(c, ret); +final_out: + if (!IS_ERR(clean)) + kfree(clean); return ret; err: fsck_err: @@ -1132,7 +1132,7 @@ int bch2_fs_recovery(struct bch_fs *c) bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); } - return ret; + goto final_out; } int bch2_fs_initialize(struct bch_fs *c) From c3dd25319c1818e067f41f41127f935f153498e6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 9 Jun 2025 17:28:00 -0400 Subject: [PATCH 111/242] bcachefs: Don't pass trans to fsck_err() in gc_accounting_done fsck_err() can return a transaction restart if passed a transaction object - this has always been true when it has to drop locks to prompt for user input, but we're seeing this more now that we're logging the error being corrected in the journal. gc_accounting_done() doesn't call fsck_err() from an actual commit loop, and it doesn't need to be holding btree locks when it calls fsck_err(), so the easy fix here for the unhandled transaction restart is to just not pass it the transaction object. We'll miss out on the fancy new logging, but that's ok. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 3d59a57a5256..f7528cd69c73 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -618,7 +618,9 @@ int bch2_gc_accounting_done(struct bch_fs *c) for (unsigned j = 0; j < nr; j++) src_v[j] -= dst_v[j]; - if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) { + bch2_trans_unlock_long(trans); + + if (fsck_err(c, accounting_mismatch, "%s", buf.buf)) { percpu_up_write(&c->mark_lock); ret = commit_do(trans, NULL, NULL, 0, bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false)); From e82b3a63a9a91cc5c585efb3e28f32100f24e3e1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Jun 2025 11:24:04 +0200 Subject: [PATCH 112/242] bcachefs: ioctl: avoid stack overflow warning Multiple ioctl handlers individually use a lot of stack space, and clang chooses to inline them into the bch2_fs_ioctl() function, blowing through the warning limit: fs/bcachefs/chardev.c:655:6: error: stack frame size (1032) exceeds limit (1024) in 'bch2_fs_ioctl' [-Werror,-Wframe-larger-than] 655 | long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) By marking the largest two of them as noinline_for_stack, no indidual code path ends up using this much, which avoids the warning and reduces the possible total stack usage in the ioctl handler. Signed-off-by: Arnd Bergmann Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 2d38466eddfd..fde3c2380e28 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -399,7 +399,7 @@ static long bch2_ioctl_data(struct bch_fs *c, return ret; } -static long bch2_ioctl_fs_usage(struct bch_fs *c, +static noinline_for_stack long bch2_ioctl_fs_usage(struct bch_fs *c, struct bch_ioctl_fs_usage __user *user_arg) { struct bch_ioctl_fs_usage arg = {}; @@ -469,7 +469,7 @@ static long bch2_ioctl_query_accounting(struct bch_fs *c, } /* obsolete, didn't allow for new data types: */ -static long bch2_ioctl_dev_usage(struct bch_fs *c, +static noinline_for_stack long bch2_ioctl_dev_usage(struct bch_fs *c, struct bch_ioctl_dev_usage __user *user_arg) { struct bch_ioctl_dev_usage arg; From 625c494db95624f09f5e7b81b64d4da34e45bd2a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 11 Jun 2025 13:32:58 -0400 Subject: [PATCH 113/242] bcachefs: Fix version checks in validate_bset() It seems btree node scan picked up a partially overwritten btree node, and corrected the "bset version older than sb version_min" error - resulting in an invalid superblock with a bad version_min field. Don't run this check at all when we're in btree node scan, and when we do run it, do something saner if the bset version is totally crazy. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 6787d5b91eab..d8f3c4c65e90 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -741,16 +741,22 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BCH_VERSION_MAJOR(version), BCH_VERSION_MINOR(version)); - if (btree_err_on(version < c->sb.version_min, + if (c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes && + btree_err_on(version < c->sb.version_min, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, NULL, btree_node_bset_older_than_sb_min, "bset version %u older than superblock version_min %u", version, c->sb.version_min)) { - mutex_lock(&c->sb_lock); - c->disk_sb.sb->version_min = cpu_to_le16(version); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); + if (bch2_version_compatible(version)) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->version_min = cpu_to_le16(version); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } else { + /* We have no idea what's going on: */ + i->version = cpu_to_le16(c->sb.version); + } } if (btree_err_on(BCH_VERSION_MAJOR(version) > From 205da7c026739d965e605d39001049c7b6728e87 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Jun 2025 11:31:23 -0400 Subject: [PATCH 114/242] bcachefs: Don't trust sb->nr_devices in members_to_text() We have to be able to print superblock sections even if they fail to validate (for debugging), so we have to calculate the number of entries from the field size. Reported-by: syzbot+5138f00559ffb3cb3610@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-members.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 363eb0c6eb7c..6245e342a8a8 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -325,9 +325,17 @@ static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); - unsigned i; - for (i = 0; i < sb->nr_devices; i++) + if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) { + prt_printf(out, "field ends before start of entries"); + return; + } + + unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / sizeof(mi->_members[0]); + if (nr != sb->nr_devices) + prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices); + + for (unsigned i = 0; i < min(sb->nr_devices, nr); i++) member_to_text(out, members_v1_get(mi, i), gi, sb, i); } @@ -341,9 +349,27 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); - unsigned i; - for (i = 0; i < sb->nr_devices; i++) + if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) { + prt_printf(out, "field ends before start of entries"); + return; + } + + if (!le16_to_cpu(mi->member_bytes)) { + prt_printf(out, "member_bytes 0"); + return; + } + + unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / le16_to_cpu(mi->member_bytes); + if (nr != sb->nr_devices) + prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices); + + /* + * We call to_text() on superblock sections that haven't passed + * validate, so we can't trust sb->nr_devices. + */ + + for (unsigned i = 0; i < min(sb->nr_devices, nr); i++) member_to_text(out, members_v2_get(mi, i), gi, sb, i); } From b68baf9a87330148d3ad074e48503a4e9c5c3929 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 11 Jun 2025 15:57:48 -0400 Subject: [PATCH 115/242] bcachefs: Print devices we're mounting on multi device filesystems Previously, we only ever logged the filesystem UUID. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ae8fa58a208d..a5b97c9c5163 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1067,12 +1067,13 @@ noinline_for_stack static void print_mount_opts(struct bch_fs *c) { enum bch_opt_id i; - struct printbuf p = PRINTBUF; - bool first = true; + CLASS(printbuf, p)(); + bch2_log_msg_start(c, &p); prt_str(&p, "starting version "); bch2_version_to_text(&p, c->sb.version); + bool first = true; for (i = 0; i < bch2_opts_nr; i++) { const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); @@ -1089,17 +1090,24 @@ static void print_mount_opts(struct bch_fs *c) } if (c->sb.version_incompat_allowed != c->sb.version) { - prt_printf(&p, "\n allowing incompatible features above "); + prt_printf(&p, "\nallowing incompatible features above "); bch2_version_to_text(&p, c->sb.version_incompat_allowed); } if (c->opts.verbose) { - prt_printf(&p, "\n features: "); + prt_printf(&p, "\nfeatures: "); prt_bitflags(&p, bch2_sb_features, c->sb.features); } - bch_info(c, "%s", p.buf); - printbuf_exit(&p); + if (c->sb.multi_device) { + prt_printf(&p, "\nwith devices"); + for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) { + prt_char(&p, ' '); + prt_str(&p, ca->name); + } + } + + bch2_print_str(c, KERN_INFO, p.buf); } static bool bch2_fs_may_start(struct bch_fs *c) From cd1124244be30fd3e87da9186508aab371e9307d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Jun 2025 12:35:20 -0400 Subject: [PATCH 116/242] bcachefs: Ensure that snapshot creation propagates has_case_insensitive We normally can't create a new directory with the case-insensitive option already set - except when we're creating a snapshot. And if casefolding is enabled filesystem wide, we should still set it even though not strictly required, for consistency. Signed-off-by: Kent Overstreet --- fs/bcachefs/namei.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index 24120037c031..779c22eb3979 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -175,6 +175,16 @@ int bch2_create_trans(struct btree_trans *trans, new_inode->bi_dir_offset = dir_offset; } + if (S_ISDIR(mode)) { + ret = bch2_maybe_propagate_has_case_insensitive(trans, + (subvol_inum) { + new_inode->bi_subvol ?: dir.subvol, + new_inode->bi_inum }, + new_inode); + if (ret) + goto err; + } + if (S_ISDIR(mode) && !new_inode->bi_subvol) new_inode->bi_depth = dir_u->bi_depth + 1; From aef22f6fe7a630d536f9eaa0a7a2ed0f90ea369e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Jun 2025 22:32:14 -0400 Subject: [PATCH 117/242] bcachefs: Don't trace should_be_locked unless changing Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 9adca77e2580..f2173a3316f4 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -417,8 +417,10 @@ static inline void btree_path_set_should_be_locked(struct btree_trans *trans, st EBUG_ON(!btree_node_locked(path, path->level)); EBUG_ON(path->uptodate); - path->should_be_locked = true; - trace_btree_path_should_be_locked(trans, path); + if (!path->should_be_locked) { + path->should_be_locked = true; + trace_btree_path_should_be_locked(trans, path); + } } static inline void __btree_path_set_level_up(struct btree_trans *trans, From aa2024c01a9afba6728b362626f868811ca872ee Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Jun 2025 20:10:18 -0400 Subject: [PATCH 118/242] KVM: x86/mmu: Embed direct bits into gpa for KVM_PRE_FAULT_MEMORY Bug[*] reported for TDX case when enabling KVM_PRE_FAULT_MEMORY in QEMU. It turns out that @gpa passed to kvm_mmu_do_page_fault() doesn't have shared bit set when the memory attribute of it is shared, and it leads to wrong root in tdp_mmu_get_root_for_fault(). Fix it by embedding the direct bits in the gpa that is passed to kvm_tdp_map_page(), when the memory of the gpa is not private. [*] https://lore.kernel.org/qemu-devel/4a757796-11c2-47f1-ae0d-335626e818fd@intel.com/ Reported-by: Xiaoyao Li Closes: https://lore.kernel.org/qemu-devel/4a757796-11c2-47f1-ae0d-335626e818fd@intel.com/ Signed-off-by: Paolo Bonzini Signed-off-by: Xiaoyao Li Message-ID: <20250611001018.2179964-1-xiaoyao.li@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index cbc84c6abc2e..a4040578b537 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4896,6 +4896,7 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, { u64 error_code = PFERR_GUEST_FINAL_MASK; u8 level = PG_LEVEL_4K; + u64 direct_bits; u64 end; int r; @@ -4910,15 +4911,18 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, if (r) return r; + direct_bits = 0; if (kvm_arch_has_private_mem(vcpu->kvm) && kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa))) error_code |= PFERR_PRIVATE_ACCESS; + else + direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); /* * Shadow paging uses GVA for kvm page fault, so restrict to * two-dimensional paging. */ - r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level); + r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level); if (r < 0) return r; From 8046d29dde17002523f94d3e6e0ebe486ce52166 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 12 Jun 2025 00:47:44 -0400 Subject: [PATCH 119/242] KVM: x86/mmu: Reject direct bits in gpa passed to KVM_PRE_FAULT_MEMORY Only let userspace pass the same addresses that were used in KVM_SET_USER_MEMORY_REGION (or KVM_SET_USER_MEMORY_REGION2); gpas in the the upper half of the address space are an implementation detail of TDX and KVM. Extracted from a patch by Sean Christopherson . Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a4040578b537..4e06e2e89a8f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4903,6 +4903,9 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, if (!vcpu->kvm->arch.pre_fault_allowed) return -EOPNOTSUPP; + if (kvm_is_gfn_alias(vcpu->kvm, gpa_to_gfn(range->gpa))) + return -EINVAL; + /* * reload is efficient when called repeatedly, so we can do it on * every iteration. From 33db8c97b4cfa0328054fb755dfbcd6e7f3c7a9d Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 9 Jun 2025 14:26:54 +0530 Subject: [PATCH 120/242] rust: Use CpuId in place of raw CPU numbers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the newly defined `CpuId` abstraction instead of raw CPU numbers. This also fixes a doctest failure for configurations where `nr_cpu_ids < 4`. The C `cpumask_{set|clear}_cpu()` APIs emit a warning when given an invalid CPU number — but only if `CONFIG_DEBUG_PER_CPU_MAPS=y` is set. Meanwhile, `cpumask_weight()` only considers CPUs up to `nr_cpu_ids`, which can cause inconsistencies: a CPU number greater than `nr_cpu_ids` may be set in the mask, yet the weight calculation won't reflect it. This leads to doctest failures when `nr_cpu_ids < 4`, as the test tries to set CPUs 2 and 3: rust_doctest_kernel_cpumask_rs_0.location: rust/kernel/cpumask.rs:180 rust_doctest_kernel_cpumask_rs_0: ASSERTION FAILED at rust/kernel/cpumask.rs:190 Fixes: 8961b8cb3099 ("rust: cpumask: Add initial abstractions") Reported-by: Miguel Ojeda Closes: https://lore.kernel.org/rust-for-linux/CANiq72k3ozKkLMinTLQwvkyg9K=BeRxs1oYZSKhJHY-veEyZdg@mail.gmail.com/ Reported-by: Andreas Hindborg Closes: https://lore.kernel.org/all/87qzzy3ric.fsf@kernel.org/ Suggested-by: Boqun Feng Signed-off-by: Viresh Kumar Reviewed-by: Boqun Feng --- drivers/cpufreq/rcpufreq_dt.rs | 4 +-- rust/kernel/cpu.rs | 4 +-- rust/kernel/cpufreq.rs | 27 ++++++++++++------ rust/kernel/cpumask.rs | 51 ++++++++++++++++++++++++---------- 4 files changed, 59 insertions(+), 27 deletions(-) diff --git a/drivers/cpufreq/rcpufreq_dt.rs b/drivers/cpufreq/rcpufreq_dt.rs index 94ed81644fe1..43c87d0259b6 100644 --- a/drivers/cpufreq/rcpufreq_dt.rs +++ b/drivers/cpufreq/rcpufreq_dt.rs @@ -26,9 +26,9 @@ fn find_supply_name_exact(dev: &Device, name: &str) -> Option { } /// Finds supply name for the CPU from DT. -fn find_supply_names(dev: &Device, cpu: u32) -> Option> { +fn find_supply_names(dev: &Device, cpu: cpu::CpuId) -> Option> { // Try "cpu0" for older DTs, fallback to "cpu". - let name = (cpu == 0) + let name = (cpu.as_u32() == 0) .then(|| find_supply_name_exact(dev, "cpu0")) .flatten() .or_else(|| find_supply_name_exact(dev, "cpu"))?; diff --git a/rust/kernel/cpu.rs b/rust/kernel/cpu.rs index 6a3aecb12468..abc780d7a8ec 100644 --- a/rust/kernel/cpu.rs +++ b/rust/kernel/cpu.rs @@ -127,9 +127,9 @@ fn from(id: CpuId) -> Self { /// Callers must ensure that the CPU device is not used after it has been unregistered. /// This can be achieved, for example, by registering a CPU hotplug notifier and removing /// any references to the CPU device within the notifier's callback. -pub unsafe fn from_cpu(cpu: u32) -> Result<&'static Device> { +pub unsafe fn from_cpu(cpu: CpuId) -> Result<&'static Device> { // SAFETY: It is safe to call `get_cpu_device()` for any CPU. - let ptr = unsafe { bindings::get_cpu_device(cpu) }; + let ptr = unsafe { bindings::get_cpu_device(u32::from(cpu)) }; if ptr.is_null() { return Err(ENODEV); } diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs index 9b995f18aac6..11b03e9d7e89 100644 --- a/rust/kernel/cpufreq.rs +++ b/rust/kernel/cpufreq.rs @@ -10,6 +10,7 @@ use crate::{ clk::Hertz, + cpu::CpuId, cpumask, device::{Bound, Device}, devres::Devres, @@ -465,8 +466,9 @@ fn as_mut_ref(&mut self) -> &mut bindings::cpufreq_policy { /// Returns the primary CPU for the [`Policy`]. #[inline] - pub fn cpu(&self) -> u32 { - self.as_ref().cpu + pub fn cpu(&self) -> CpuId { + // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number. + unsafe { CpuId::from_u32_unchecked(self.as_ref().cpu) } } /// Returns the minimum frequency for the [`Policy`]. @@ -525,7 +527,7 @@ pub fn generic_suspend(&mut self) -> Result { #[inline] pub fn generic_get(&self) -> Result { // SAFETY: By the type invariant, the pointer stored in `self` is valid. - Ok(unsafe { bindings::cpufreq_generic_get(self.cpu()) }) + Ok(unsafe { bindings::cpufreq_generic_get(u32::from(self.cpu())) }) } /// Provides a wrapper to the register with energy model using the OPP core. @@ -678,9 +680,9 @@ fn clear_data(&mut self) -> Option { struct PolicyCpu<'a>(&'a mut Policy); impl<'a> PolicyCpu<'a> { - fn from_cpu(cpu: u32) -> Result { + fn from_cpu(cpu: CpuId) -> Result { // SAFETY: It is safe to call `cpufreq_cpu_get` for any valid CPU. - let ptr = from_err_ptr(unsafe { bindings::cpufreq_cpu_get(cpu) })?; + let ptr = from_err_ptr(unsafe { bindings::cpufreq_cpu_get(u32::from(cpu)) })?; Ok(Self( // SAFETY: The `ptr` is guaranteed to be valid and remains valid for the lifetime of @@ -1266,7 +1268,10 @@ impl Registration { target_perf: usize, capacity: usize, ) { - if let Ok(mut policy) = PolicyCpu::from_cpu(cpu) { + // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number. + let cpu_id = unsafe { CpuId::from_u32_unchecked(cpu) }; + + if let Ok(mut policy) = PolicyCpu::from_cpu(cpu_id) { T::adjust_perf(&mut policy, min_perf, target_perf, capacity); } } @@ -1321,7 +1326,10 @@ impl Registration { /// /// - This function may only be called from the cpufreq C infrastructure. unsafe extern "C" fn get_callback(cpu: u32) -> kernel::ffi::c_uint { - PolicyCpu::from_cpu(cpu).map_or(0, |mut policy| T::get(&mut policy).map_or(0, |f| f)) + // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number. + let cpu_id = unsafe { CpuId::from_u32_unchecked(cpu) }; + + PolicyCpu::from_cpu(cpu_id).map_or(0, |mut policy| T::get(&mut policy).map_or(0, |f| f)) } /// Driver's `update_limit` callback. @@ -1344,8 +1352,11 @@ impl Registration { /// - This function may only be called from the cpufreq C infrastructure. /// - The pointer arguments must be valid pointers. unsafe extern "C" fn bios_limit_callback(cpu: i32, limit: *mut u32) -> kernel::ffi::c_int { + // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number. + let cpu_id = unsafe { CpuId::from_i32_unchecked(cpu) }; + from_result(|| { - let mut policy = PolicyCpu::from_cpu(cpu as u32)?; + let mut policy = PolicyCpu::from_cpu(cpu_id)?; // SAFETY: `limit` is guaranteed by the C code to be valid. T::bios_limit(&mut policy, &mut (unsafe { *limit })).map(|()| 0) diff --git a/rust/kernel/cpumask.rs b/rust/kernel/cpumask.rs index c90bfac9346a..19c607709b5f 100644 --- a/rust/kernel/cpumask.rs +++ b/rust/kernel/cpumask.rs @@ -6,6 +6,7 @@ use crate::{ alloc::{AllocError, Flags}, + cpu::CpuId, prelude::*, types::Opaque, }; @@ -35,9 +36,10 @@ /// /// ``` /// use kernel::bindings; +/// use kernel::cpu::CpuId; /// use kernel::cpumask::Cpumask; /// -/// fn set_clear_cpu(ptr: *mut bindings::cpumask, set_cpu: u32, clear_cpu: i32) { +/// fn set_clear_cpu(ptr: *mut bindings::cpumask, set_cpu: CpuId, clear_cpu: CpuId) { /// // SAFETY: The `ptr` is valid for writing and remains valid for the lifetime of the /// // returned reference. /// let mask = unsafe { Cpumask::as_mut_ref(ptr) }; @@ -90,9 +92,9 @@ pub fn as_raw(&self) -> *mut bindings::cpumask { /// This mismatches kernel naming convention and corresponds to the C /// function `__cpumask_set_cpu()`. #[inline] - pub fn set(&mut self, cpu: u32) { + pub fn set(&mut self, cpu: CpuId) { // SAFETY: By the type invariant, `self.as_raw` is a valid argument to `__cpumask_set_cpu`. - unsafe { bindings::__cpumask_set_cpu(cpu, self.as_raw()) }; + unsafe { bindings::__cpumask_set_cpu(u32::from(cpu), self.as_raw()) }; } /// Clear `cpu` in the cpumask. @@ -101,19 +103,19 @@ pub fn set(&mut self, cpu: u32) { /// This mismatches kernel naming convention and corresponds to the C /// function `__cpumask_clear_cpu()`. #[inline] - pub fn clear(&mut self, cpu: i32) { + pub fn clear(&mut self, cpu: CpuId) { // SAFETY: By the type invariant, `self.as_raw` is a valid argument to // `__cpumask_clear_cpu`. - unsafe { bindings::__cpumask_clear_cpu(cpu, self.as_raw()) }; + unsafe { bindings::__cpumask_clear_cpu(i32::from(cpu), self.as_raw()) }; } /// Test `cpu` in the cpumask. /// /// Equivalent to the kernel's `cpumask_test_cpu` API. #[inline] - pub fn test(&self, cpu: i32) -> bool { + pub fn test(&self, cpu: CpuId) -> bool { // SAFETY: By the type invariant, `self.as_raw` is a valid argument to `cpumask_test_cpu`. - unsafe { bindings::cpumask_test_cpu(cpu, self.as_raw()) } + unsafe { bindings::cpumask_test_cpu(i32::from(cpu), self.as_raw()) } } /// Set all CPUs in the cpumask. @@ -178,21 +180,40 @@ pub fn copy(&self, dstp: &mut Self) { /// The following example demonstrates how to create and update a [`CpumaskVar`]. /// /// ``` +/// use kernel::cpu::CpuId; /// use kernel::cpumask::CpumaskVar; /// /// let mut mask = CpumaskVar::new_zero(GFP_KERNEL).unwrap(); /// /// assert!(mask.empty()); -/// mask.set(2); -/// assert!(mask.test(2)); -/// mask.set(3); -/// assert!(mask.test(3)); -/// assert_eq!(mask.weight(), 2); +/// let mut count = 0; +/// +/// let cpu2 = CpuId::from_u32(2); +/// if let Some(cpu) = cpu2 { +/// mask.set(cpu); +/// assert!(mask.test(cpu)); +/// count += 1; +/// } +/// +/// let cpu3 = CpuId::from_u32(3); +/// if let Some(cpu) = cpu3 { +/// mask.set(cpu); +/// assert!(mask.test(cpu)); +/// count += 1; +/// } +/// +/// assert_eq!(mask.weight(), count); /// /// let mask2 = CpumaskVar::try_clone(&mask).unwrap(); -/// assert!(mask2.test(2)); -/// assert!(mask2.test(3)); -/// assert_eq!(mask2.weight(), 2); +/// +/// if let Some(cpu) = cpu2 { +/// assert!(mask2.test(cpu)); +/// } +/// +/// if let Some(cpu) = cpu3 { +/// assert!(mask2.test(cpu)); +/// } +/// assert_eq!(mask2.weight(), count); /// ``` pub struct CpumaskVar { #[cfg(CONFIG_CPUMASK_OFFSTACK)] From c7f005f70d22cd5613cac30bf6d34867189e36a9 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 9 Jun 2025 16:44:16 +0530 Subject: [PATCH 121/242] rust: cpu: Add CpuId::current() to retrieve current CPU ID Introduce `CpuId::current()`, a constructor that wraps the C function `raw_smp_processor_id()` to retrieve the current CPU identifier without guaranteeing stability. This function should be used only when the caller can ensure that the CPU ID won't change unexpectedly due to preemption or migration. Suggested-by: Boqun Feng Signed-off-by: Viresh Kumar Reviewed-by: Boqun Feng --- MAINTAINERS | 1 + rust/helpers/cpu.c | 8 ++++++++ rust/helpers/helpers.c | 1 + rust/kernel/cpu.rs | 11 +++++++++++ 4 files changed, 21 insertions(+) create mode 100644 rust/helpers/cpu.c diff --git a/MAINTAINERS b/MAINTAINERS index a92290fffa16..4255186784c4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6254,6 +6254,7 @@ F: include/linux/cpuhotplug.h F: include/linux/smpboot.h F: kernel/cpu.c F: kernel/smpboot.* +F: rust/helper/cpu.c F: rust/kernel/cpu.rs CPU IDLE TIME MANAGEMENT FRAMEWORK diff --git a/rust/helpers/cpu.c b/rust/helpers/cpu.c new file mode 100644 index 000000000000..824e0adb19d4 --- /dev/null +++ b/rust/helpers/cpu.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +unsigned int rust_helper_raw_smp_processor_id(void) +{ + return raw_smp_processor_id(); +} diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index 0f1b5d115985..16fa9bca5949 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -13,6 +13,7 @@ #include "build_assert.c" #include "build_bug.c" #include "clk.c" +#include "cpu.c" #include "cpufreq.c" #include "cpumask.c" #include "cred.c" diff --git a/rust/kernel/cpu.rs b/rust/kernel/cpu.rs index abc780d7a8ec..b75403b0eb56 100644 --- a/rust/kernel/cpu.rs +++ b/rust/kernel/cpu.rs @@ -102,6 +102,17 @@ pub fn from_u32(id: u32) -> Option { pub fn as_u32(&self) -> u32 { self.0 } + + /// Returns the ID of the CPU the code is currently running on. + /// + /// The returned value is considered unstable because it may change + /// unexpectedly due to preemption or CPU migration. It should only be + /// used when the context ensures that the task remains on the same CPU + /// or the users could use a stale (yet valid) CPU ID. + pub fn current() -> Self { + // SAFETY: raw_smp_processor_id() always returns a valid CPU ID. + unsafe { Self::from_u32_unchecked(bindings::raw_smp_processor_id()) } + } } impl From for u32 { From b1a529bdb9641392b4f5fc91545598793c0b3b96 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 5 Jun 2025 22:34:31 +0100 Subject: [PATCH 122/242] selftests/mm: skip failed memfd setups in gup_longterm Unlike the other cases gup_longterm's memfd tests previously skipped the test when failing to set up the file descriptor to test. Restore this behavior to avoid hitting failures when hugetlb isn't configured. Link: https://lkml.kernel.org/r/20250605-selftest-mm-gup-longterm-tweaks-v1-1-2fae34b05958@kernel.org Fixes: 66bce7afbaca ("selftests/mm: fix test result reporting in gup_longterm") Signed-off-by: Mark Brown Reported-by: Lorenzo Stoakes Closes: https://lkml.kernel.org/r/a76fc252-0fe3-4d4b-a9a1-4a2895c2680d@lucifer.local Reviewed-by: Lorenzo Stoakes Tested-by: Lorenzo Stoakes Acked-by: David Hildenbrand Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/gup_longterm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c index 8a97ac5176a4..29047d2e0c49 100644 --- a/tools/testing/selftests/mm/gup_longterm.c +++ b/tools/testing/selftests/mm/gup_longterm.c @@ -298,8 +298,11 @@ static void run_with_memfd(test_fn fn, const char *desc) log_test_start("%s ... with memfd", desc); fd = memfd_create("test", 0); - if (fd < 0) + if (fd < 0) { ksft_print_msg("memfd_create() failed (%s)\n", strerror(errno)); + log_test_result(KSFT_SKIP); + return; + } fn(fd, pagesize); close(fd); @@ -366,6 +369,8 @@ static void run_with_memfd_hugetlb(test_fn fn, const char *desc, fd = memfd_create("test", flags); if (fd < 0) { ksft_print_msg("memfd_create() failed (%s)\n", strerror(errno)); + log_test_result(KSFT_SKIP); + return; } fn(fd, hugetlbsize); From 331843c845d15413e9d89fd91cdcfa6912e291c3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 4 Jun 2025 17:23:37 -0700 Subject: [PATCH 123/242] scatterlist: fix extraneous '@'-sign kernel-doc notation Using "@argname@" in kernel-doc produces "argname****" (with "argname" in bold) in the generated html output, so use the expected kernel-doc notation of just "@argname" instead. "Fixes:" lines are added in case Matthew's patch [1] is backported. Link: https://lkml.kernel.org/r/20250605002337.2842659-1-rdunlap@infradead.org Link: https://lore.kernel.org/linux-doc/3bc4e779-7a79-42c1-8867-024f643a22fc@infradead.org/T/#m5d2bd9d21fb34f297aa4e7db069f09bc27b89007 [1] Fixes: 0db9299f48eb ("SG: Move functions to lib/scatterlist.c and add sg chaining allocator helpers") Fixes: 8d1d4b538bb1 ("scatterlist: inline sg_next()") Fixes: 18dabf473e15 ("Change table chaining layout") Signed-off-by: Randy Dunlap Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/scatterlist.h | 4 ++-- lib/scatterlist.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 0cdbfc42f153..6f8a4965f9b9 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -99,7 +99,7 @@ static inline bool sg_is_last(struct scatterlist *sg) * @sg: The current sg entry * * Description: - * Usually the next entry will be @sg@ + 1, but if this sg element is part + * Usually the next entry will be @sg + 1, but if this sg element is part * of a chained scatterlist, it could jump to the start of a new * scatterlist array. * @@ -254,7 +254,7 @@ static inline void __sg_chain(struct scatterlist *chain_sg, * @sgl: Second scatterlist * * Description: - * Links @prv@ and @sgl@ together, to form a longer scatterlist. + * Links @prv and @sgl together, to form a longer scatterlist. * **/ static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 7582dfab7fe3..4af1c8b0775a 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -73,9 +73,9 @@ EXPORT_SYMBOL(sg_nents_for_len); * Should only be used casually, it (currently) scans the entire list * to get the last entry. * - * Note that the @sgl@ pointer passed in need not be the first one, - * the important bit is that @nents@ denotes the number of entries that - * exist from @sgl@. + * Note that the @sgl pointer passed in need not be the first one, + * the important bit is that @nents denotes the number of entries that + * exist from @sgl. * **/ struct scatterlist *sg_last(struct scatterlist *sgl, unsigned int nents) @@ -345,7 +345,7 @@ EXPORT_SYMBOL(__sg_alloc_table); * @gfp_mask: GFP allocation mask * * Description: - * Allocate and initialize an sg table. If @nents@ is larger than + * Allocate and initialize an sg table. If @nents is larger than * SG_MAX_SINGLE_ALLOC a chained sg table will be setup. * **/ From 1b8e4091ffb4655c761354eb33f41b618e809427 Mon Sep 17 00:00:00 2001 From: wangfushuai Date: Sat, 7 Jun 2025 23:36:14 +0800 Subject: [PATCH 124/242] docs: proc: update VmFlags documentation in smaps Remove outdated VM_DENYWRITE("dw") reference and add missing VM_LOCKONFAULT("lf") and VM_UFFD_MINOR("ui") flags. [akpm@linux-foundation.org: add "dp" (VM_DROPPABLE), per Tal] Link: https://lkml.kernel.org/r/20250607153614.81914-1-wangfushuai@baidu.com Signed-off-by: wangfushuai Cc: Andrii Nakryiko Cc: Catalin Marinas Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Mariano Pache Cc: xu xin Cc: Tal Zussman Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 2a17865dfe39..5236cb52e357 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -584,7 +584,6 @@ encoded manner. The codes are the following: ms may share gd stack segment growns down pf pure PFN range - dw disabled write to the mapped file lo pages are locked in memory io memory mapped I/O area sr sequential read advise provided @@ -607,8 +606,11 @@ encoded manner. The codes are the following: mt arm64 MTE allocation tags are enabled um userfaultfd missing tracking uw userfaultfd wr-protect tracking + ui userfaultfd minor fault ss shadow/guarded control stack page sl sealed + lf lock on fault pages + dp always lazily freeable mapping == ======================================= Note that there is no guarantee that every flag and associated mnemonic will From 0cf4b1687a187ba9247c71721d8b064634eda1f7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 6 Jun 2025 13:50:32 +0100 Subject: [PATCH 125/242] mm/vma: reset VMA iterator on commit_merge() OOM failure While an OOM failure in commit_merge() isn't really feasible due to the allocation which might fail (a maple tree pre-allocation) being 'too small to fail', we do need to handle this case correctly regardless. In vma_merge_existing_range(), we can theoretically encounter failures which result in an OOM error in two ways - firstly dup_anon_vma() might fail with an OOM error, and secondly commit_merge() failing, ultimately, to pre-allocate a maple tree node. The abort logic for dup_anon_vma() resets the VMA iterator to the initial range, ensuring that any logic looping on this iterator will correctly proceed to the next VMA. However the commit_merge() abort logic does not do the same thing. This resulted in a syzbot report occurring because mlockall() iterates through VMAs, is tolerant of errors, but ended up with an incorrect previous VMA being specified due to incorrect iterator state. While making this change, it became apparent we are duplicating logic - the logic introduced in commit 41e6ddcaa0f1 ("mm/vma: add give_up_on_oom option on modify/merge, use in uffd release") duplicates the vmg->give_up_on_oom check in both abort branches. Additionally, we observe that we can perform the anon_dup check safely on dup_anon_vma() failure, as this will not be modified should this call fail. Finally, we need to reset the iterator in both cases, so now we can simply use the exact same code to abort for both. We remove the VM_WARN_ON(err != -ENOMEM) as it would be silly for this to be otherwise and it allows us to implement the abort check more neatly. Link: https://lkml.kernel.org/r/20250606125032.164249-1-lorenzo.stoakes@oracle.com Fixes: 47b16d0462a4 ("mm: abort vma_modify() on merge out of memory failure") Signed-off-by: Lorenzo Stoakes Reported-by: syzbot+d16409ea9ecc16ed261a@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-mm/6842cc67.a00a0220.29ac89.003b.GAE@google.com/ Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Cc: Jann Horn Cc: Signed-off-by: Andrew Morton --- mm/vma.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/mm/vma.c b/mm/vma.c index 726b2a31ce59..0fb9b2c7b734 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -967,26 +967,9 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( err = dup_anon_vma(next, middle, &anon_dup); } - if (err) + if (err || commit_merge(vmg)) goto abort; - err = commit_merge(vmg); - if (err) { - VM_WARN_ON(err != -ENOMEM); - - if (anon_dup) - unlink_anon_vmas(anon_dup); - - /* - * We've cleaned up any cloned anon_vma's, no VMAs have been - * modified, no harm no foul if the user requests that we not - * report this and just give up, leaving the VMAs unmerged. - */ - if (!vmg->give_up_on_oom) - vmg->state = VMA_MERGE_ERROR_NOMEM; - return NULL; - } - khugepaged_enter_vma(vmg->target, vmg->flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; @@ -995,6 +978,9 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( vma_iter_set(vmg->vmi, start); vma_iter_load(vmg->vmi); + if (anon_dup) + unlink_anon_vmas(anon_dup); + /* * This means we have failed to clone anon_vma's correctly, but no * actual changes to VMAs have occurred, so no harm no foul - if the From 383c4613c67c26e90e8eebb72e3083457d02033f Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 6 Jun 2025 10:28:07 +0100 Subject: [PATCH 126/242] mm: close theoretical race where stale TLB entries could linger Commit 3ea277194daa ("mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries") described a theoretical race as such: """ Nadav Amit identified a theoretical race between page reclaim and mprotect due to TLB flushes being batched outside of the PTL being held. He described the race as follows: CPU0 CPU1 ---- ---- user accesses memory using RW PTE [PTE now cached in TLB] try_to_unmap_one() ==> ptep_get_and_clear() ==> set_tlb_ubc_flush_pending() mprotect(addr, PROT_READ) ==> change_pte_range() ==> [ PTE non-present - no flush ] user writes using cached RW PTE ... try_to_unmap_flush() The same type of race exists for reads when protecting for PROT_NONE and also exists for operations that can leave an old TLB entry behind such as munmap, mremap and madvise. """ The solution was to introduce flush_tlb_batched_pending() and call it under the PTL from mprotect/madvise/munmap/mremap to complete any pending tlb flushes. However, while madvise_free_pte_range() and madvise_cold_or_pageout_pte_range() were both retro-fitted to call flush_tlb_batched_pending() immediately after initially acquiring the PTL, they both temporarily release the PTL to split a large folio if they stumble upon one. In this case, where re-acquiring the PTL flush_tlb_batched_pending() must be called again, but it previously was not. Let's fix that. There are 2 Fixes: tags here: the first is the commit that fixed madvise_free_pte_range(). The second is the commit that added madvise_cold_or_pageout_pte_range(), which looks like it copy/pasted the faulty pattern from madvise_free_pte_range(). This is a theoretical bug discovered during code review. Link: https://lkml.kernel.org/r/20250606092809.4194056-1-ryan.roberts@arm.com Fixes: 3ea277194daa ("mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries") Fixes: 9c276cc65a58 ("mm: introduce MADV_COLD") Signed-off-by: Ryan Roberts Reviewed-by: Jann Horn Acked-by: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mel Gorman Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/madvise.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/madvise.c b/mm/madvise.c index 5f7a66a1617e..1d44a35ae85c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -508,6 +508,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) break; + flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); if (!err) nr = 0; @@ -741,6 +742,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, start_pte = pte; if (!start_pte) break; + flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); if (!err) nr = 0; From 50695153d7ddde3b1696dbf0085be0033bf3ddb3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 7 Jun 2025 17:43:18 -0700 Subject: [PATCH 127/242] drivers/rapidio/rio_cm.c: prevent possible heap overwrite In riocm_cdev_ioctl(RIO_CM_CHAN_SEND) -> cm_chan_msg_send() -> riocm_ch_send() cm_chan_msg_send() checks that userspace didn't send too much data but riocm_ch_send() failed to check that userspace sent sufficient data. The result is that riocm_ch_send() can write to fields in the rio_ch_chan_hdr which were outside the bounds of the space which cm_chan_msg_send() allocated. Address this by teaching riocm_ch_send() to check that the entire rio_ch_chan_hdr was copied in from userspace. Reported-by: maher azz Cc: Matt Porter Cc: Alexandre Bounine Cc: Linus Torvalds Cc: Signed-off-by: Andrew Morton --- drivers/rapidio/rio_cm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/rapidio/rio_cm.c b/drivers/rapidio/rio_cm.c index 97287e838ce1..66464674223f 100644 --- a/drivers/rapidio/rio_cm.c +++ b/drivers/rapidio/rio_cm.c @@ -783,6 +783,9 @@ static int riocm_ch_send(u16 ch_id, void *buf, int len) if (buf == NULL || ch_id == 0 || len == 0 || len > RIO_MAX_MSG_SIZE) return -EINVAL; + if (len < sizeof(struct rio_ch_chan_hdr)) + return -EINVAL; /* insufficient data from user */ + ch = riocm_get_channel(ch_id); if (!ch) { riocm_error("%s(%d) ch_%d not found", current->comm, From 02fb36505c61c35d5fd879f5a66a97935dbcb2ee Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 9 Jun 2025 12:24:42 +1200 Subject: [PATCH 128/242] MAINTAINERS: add Barry as a THP reviewer I have been actively contributing to mTHP and reviewing related patches for an extended period, and I would like to continue supporting patch reviews. Link: https://lkml.kernel.org/r/20250609002442.1856-1-21cnbao@gmail.com Signed-off-by: Barry Song Acked-by: Zi Yan Acked-by: Baolin Wang Acked-by: Dev Jain Acked-by: Lorenzo Stoakes Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Liam R. Howlett Cc: Nico Pache Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index a92290fffa16..8315c84df075 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15919,6 +15919,7 @@ R: Liam R. Howlett R: Nico Pache R: Ryan Roberts R: Dev Jain +R: Barry Song L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org From 66ac1a4d366d3faa95fcf6082b555f8d77f1e8db Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sun, 8 Jun 2025 22:12:35 +0800 Subject: [PATCH 129/242] init: fix build warnings about export.h After commit a934a57a42f64a4 ("scripts/misc-check: check missing #include when W=1") and 7d95680d64ac8e836c ("scripts/misc-check: check unnecessary #include when W=1"), we get some build warnings with W=1: init/main.c: warning: EXPORT_SYMBOL() is used, but #include is missing init/initramfs.c: warning: EXPORT_SYMBOL() is used, but #include is missing So fix these build warnings for the init code. Link: https://lkml.kernel.org/r/20250608141235.155206-1-chenhuacai@loongson.cn Fixes: a934a57a42f6 ("scripts/misc-check: check missing #include when W=1") Signed-off-by: Huacai Chen Reviewed-by: Masahiro Yamada Cc: Al Viro Cc: Christian Brauner Cc: Jan Kara Signed-off-by: Andrew Morton --- init/initramfs.c | 1 + init/main.c | 1 + 2 files changed, 2 insertions(+) diff --git a/init/initramfs.c b/init/initramfs.c index 72bad44a1d41..097673b97784 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include diff --git a/init/main.c b/init/main.c index ed576c7f475d..225a58279acd 100644 --- a/init/main.c +++ b/init/main.c @@ -13,6 +13,7 @@ #define DEBUG /* Enable initcall_debug */ #include +#include #include #include #include From 2d5a84c3ecc075d87bcb16c1cc80277b5837c153 Mon Sep 17 00:00:00 2001 From: Xi Pardee Date: Tue, 10 Jun 2025 16:04:06 -0700 Subject: [PATCH 130/242] platform/x86/intel/pmc: Add Lunar Lake support to Intel PMC SSRAM Telemetry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Lunar Lake support to Intel PMC SSRAM Telemetry driver. Signed-off-by: Xi Pardee Link: https://lore.kernel.org/r/20250610230416.622970-1-xi.pardee@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/pmc/core.h | 3 +++ drivers/platform/x86/intel/pmc/ssram_telemetry.c | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/platform/x86/intel/pmc/core.h b/drivers/platform/x86/intel/pmc/core.h index e136d18b1d38..c1db41cb8334 100644 --- a/drivers/platform/x86/intel/pmc/core.h +++ b/drivers/platform/x86/intel/pmc/core.h @@ -299,6 +299,9 @@ enum ppfear_regs { #define PTL_PCD_PMC_MMIO_REG_LEN 0x31A8 /* SSRAM PMC Device ID */ +/* LNL */ +#define PMC_DEVID_LNL_SOCM 0xa87f + /* ARL */ #define PMC_DEVID_ARL_SOCM 0x777f #define PMC_DEVID_ARL_SOCS 0xae7f diff --git a/drivers/platform/x86/intel/pmc/ssram_telemetry.c b/drivers/platform/x86/intel/pmc/ssram_telemetry.c index b207247eb5dd..24d5d01805c8 100644 --- a/drivers/platform/x86/intel/pmc/ssram_telemetry.c +++ b/drivers/platform/x86/intel/pmc/ssram_telemetry.c @@ -187,6 +187,7 @@ static const struct pci_device_id intel_pmc_ssram_telemetry_pci_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PMC_DEVID_MTL_SOCM) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PMC_DEVID_ARL_SOCS) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PMC_DEVID_ARL_SOCM) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PMC_DEVID_LNL_SOCM) }, { } }; MODULE_DEVICE_TABLE(pci, intel_pmc_ssram_telemetry_pci_ids); From be574d5eee9808a422cff0293da9b08c0e434ed0 Mon Sep 17 00:00:00 2001 From: Xi Pardee Date: Tue, 10 Jun 2025 16:04:07 -0700 Subject: [PATCH 131/242] platform/x86/intel/pmc: Add Panther Lake support to Intel PMC SSRAM Telemetry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Panther Lake support to Intel PMC SSRAM Telemetry driver. Signed-off-by: Xi Pardee Link: https://lore.kernel.org/r/20250610230416.622970-2-xi.pardee@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/pmc/core.h | 4 ++++ drivers/platform/x86/intel/pmc/ssram_telemetry.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/drivers/platform/x86/intel/pmc/core.h b/drivers/platform/x86/intel/pmc/core.h index c1db41cb8334..4a94a4ee031e 100644 --- a/drivers/platform/x86/intel/pmc/core.h +++ b/drivers/platform/x86/intel/pmc/core.h @@ -302,6 +302,10 @@ enum ppfear_regs { /* LNL */ #define PMC_DEVID_LNL_SOCM 0xa87f +/* PTL */ +#define PMC_DEVID_PTL_PCDH 0xe37f +#define PMC_DEVID_PTL_PCDP 0xe47f + /* ARL */ #define PMC_DEVID_ARL_SOCM 0x777f #define PMC_DEVID_ARL_SOCS 0xae7f diff --git a/drivers/platform/x86/intel/pmc/ssram_telemetry.c b/drivers/platform/x86/intel/pmc/ssram_telemetry.c index 24d5d01805c8..93579152188e 100644 --- a/drivers/platform/x86/intel/pmc/ssram_telemetry.c +++ b/drivers/platform/x86/intel/pmc/ssram_telemetry.c @@ -188,6 +188,8 @@ static const struct pci_device_id intel_pmc_ssram_telemetry_pci_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PMC_DEVID_ARL_SOCS) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PMC_DEVID_ARL_SOCM) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PMC_DEVID_LNL_SOCM) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PMC_DEVID_PTL_PCDH) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PMC_DEVID_PTL_PCDP) }, { } }; MODULE_DEVICE_TABLE(pci, intel_pmc_ssram_telemetry_pci_ids); From 0c44b46f51a17baa7ab67de1464427116e9c4eaa Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Jun 2025 11:34:55 +0200 Subject: [PATCH 132/242] platform/x86/intel-uncore-freq: avoid non-literal format string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using a string variable in place of a format string causes a W=1 build warning: drivers/platform/x86/intel/uncore-frequency/uncore-frequency-common.c:61:40: error: format string is not a string literal (potentially insecure) [-Werror,-Wformat-security] 61 | length += sysfs_emit_at(buf, length, agent_name[agent]); | ^~~~~~~~~~~~~~~~~ Use the safer "%s" format string to print it instead. Fixes: b98fa870fce2 ("platform/x86/intel-uncore-freq: Add attributes to show agent types") Signed-off-by: Arnd Bergmann Tested-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20250610093459.2646337-1-arnd@kernel.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- .../x86/intel/uncore-frequency/uncore-frequency-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-common.c b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-common.c index 0f8aea18275b..65897fae17df 100644 --- a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-common.c +++ b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-common.c @@ -58,7 +58,7 @@ static ssize_t show_agent_types(struct kobject *kobj, struct kobj_attribute *att if (length) length += sysfs_emit_at(buf, length, " "); - length += sysfs_emit_at(buf, length, agent_name[agent]); + length += sysfs_emit_at(buf, length, "%s", agent_name[agent]); } length += sysfs_emit_at(buf, length, "\n"); From 527c88d8390d6c0358dea4d71696795c05328925 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 12 Jun 2025 09:22:45 +0200 Subject: [PATCH 133/242] ovl: fix debug print in case of mkdir error We want to print the name in case of mkdir failure and now we will get a cryptic (efault) as name. Fixes: c54b386969a5 ("VFS: Change vfs_mkdir() to return the dentry.") Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/20250612072245.2825938-1-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/overlayfs/overlayfs.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 8baaba0a3fe5..497323128e5f 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -246,9 +246,11 @@ static inline struct dentry *ovl_do_mkdir(struct ovl_fs *ofs, struct dentry *dentry, umode_t mode) { - dentry = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); - pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(dentry)); - return dentry; + struct dentry *ret; + + ret = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); + pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(ret)); + return ret; } static inline int ovl_do_mknod(struct ovl_fs *ofs, From 0b9d62a47149083d581d8b2abb04124b6175cb29 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 11 Jun 2025 09:40:44 -0700 Subject: [PATCH 134/242] fs: unlock the superblock during iterate_supers_type This function takes super_lock in shared mode, so it should release the same lock. Cc: stable@vger.kernel.org # v6.16-rc1 Fixes: af7551cf13cf7f ("super: remove pointless s_root checks") Signed-off-by: "Darrick J. Wong" Link: https://lore.kernel.org/20250611164044.GF6138@frogsfrogsfrogs Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/super.c b/fs/super.c index 21799e213fd7..80418ca8e215 100644 --- a/fs/super.c +++ b/fs/super.c @@ -964,8 +964,10 @@ void iterate_supers_type(struct file_system_type *type, spin_unlock(&sb_lock); locked = super_lock_shared(sb); - if (locked) + if (locked) { f(sb, arg); + super_unlock_shared(sb); + } spin_lock(&sb_lock); if (p) From c538f400fae22725580842deb2bef546701b64bd Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 11 Jun 2025 13:53:43 -0700 Subject: [PATCH 135/242] io_uring: consistently use rcu semantics with sqpoll thread The sqpoll thread is dereferenced with rcu read protection in one place, so it needs to be annotated as an __rcu type, and should consistently use rcu helpers for access and assignment to make sparse happy. Since most of the accesses occur under the sqd->lock, we can use rcu_dereference_protected() without declaring an rcu read section. Provide a simple helper to get the thread from a locked context. Fixes: ac0b8b327a5677d ("io_uring: fix use-after-free of sq->thread in __io_uring_show_fdinfo()") Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20250611205343.1821117-1-kbusch@meta.com [axboe: fold in fix for register.c] Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- io_uring/register.c | 7 +++++-- io_uring/sqpoll.c | 34 ++++++++++++++++++++++++---------- io_uring/sqpoll.h | 8 +++++++- 4 files changed, 38 insertions(+), 15 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cf759c172083..4e32f808d07d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2906,7 +2906,7 @@ static __cold void io_ring_exit_work(struct work_struct *work) struct task_struct *tsk; io_sq_thread_park(sqd); - tsk = sqd->thread; + tsk = sqpoll_task_locked(sqd); if (tsk && tsk->io_uring && tsk->io_uring->io_wq) io_wq_cancel_cb(tsk->io_uring->io_wq, io_cancel_ctx_cb, ctx, true); @@ -3142,7 +3142,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) s64 inflight; DEFINE_WAIT(wait); - WARN_ON_ONCE(sqd && sqd->thread != current); + WARN_ON_ONCE(sqd && sqpoll_task_locked(sqd) != current); if (!current->io_uring) return; diff --git a/io_uring/register.c b/io_uring/register.c index cc23a4c205cd..a59589249fce 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -273,6 +273,8 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, if (ctx->flags & IORING_SETUP_SQPOLL) { sqd = ctx->sq_data; if (sqd) { + struct task_struct *tsk; + /* * Observe the correct sqd->lock -> ctx->uring_lock * ordering. Fine to drop uring_lock here, we hold @@ -282,8 +284,9 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, mutex_unlock(&ctx->uring_lock); mutex_lock(&sqd->lock); mutex_lock(&ctx->uring_lock); - if (sqd->thread) - tctx = sqd->thread->io_uring; + tsk = sqpoll_task_locked(sqd); + if (tsk) + tctx = tsk->io_uring; } } else { tctx = current->io_uring; diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 0625a421626f..268d2fbe6160 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -30,7 +30,7 @@ enum { void io_sq_thread_unpark(struct io_sq_data *sqd) __releases(&sqd->lock) { - WARN_ON_ONCE(sqd->thread == current); + WARN_ON_ONCE(sqpoll_task_locked(sqd) == current); /* * Do the dance but not conditional clear_bit() because it'd race with @@ -46,24 +46,32 @@ void io_sq_thread_unpark(struct io_sq_data *sqd) void io_sq_thread_park(struct io_sq_data *sqd) __acquires(&sqd->lock) { - WARN_ON_ONCE(data_race(sqd->thread) == current); + struct task_struct *tsk; atomic_inc(&sqd->park_pending); set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); mutex_lock(&sqd->lock); - if (sqd->thread) - wake_up_process(sqd->thread); + + tsk = sqpoll_task_locked(sqd); + if (tsk) { + WARN_ON_ONCE(tsk == current); + wake_up_process(tsk); + } } void io_sq_thread_stop(struct io_sq_data *sqd) { - WARN_ON_ONCE(sqd->thread == current); + struct task_struct *tsk; + WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); mutex_lock(&sqd->lock); - if (sqd->thread) - wake_up_process(sqd->thread); + tsk = sqpoll_task_locked(sqd); + if (tsk) { + WARN_ON_ONCE(tsk == current); + wake_up_process(tsk); + } mutex_unlock(&sqd->lock); wait_for_completion(&sqd->exited); } @@ -486,7 +494,10 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, goto err_sqpoll; } - sqd->thread = tsk; + mutex_lock(&sqd->lock); + rcu_assign_pointer(sqd->thread, tsk); + mutex_unlock(&sqd->lock); + task_to_put = get_task_struct(tsk); ret = io_uring_alloc_task_context(tsk, ctx); wake_up_new_task(tsk); @@ -514,10 +525,13 @@ __cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, int ret = -EINVAL; if (sqd) { + struct task_struct *tsk; + io_sq_thread_park(sqd); /* Don't set affinity for a dying thread */ - if (sqd->thread) - ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask); + tsk = sqpoll_task_locked(sqd); + if (tsk) + ret = io_wq_cpu_affinity(tsk->io_uring, mask); io_sq_thread_unpark(sqd); } diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h index 4171666b1cf4..b83dcdec9765 100644 --- a/io_uring/sqpoll.h +++ b/io_uring/sqpoll.h @@ -8,7 +8,7 @@ struct io_sq_data { /* ctx's that are using this sqd */ struct list_head ctx_list; - struct task_struct *thread; + struct task_struct __rcu *thread; struct wait_queue_head wait; unsigned sq_thread_idle; @@ -29,3 +29,9 @@ void io_sq_thread_unpark(struct io_sq_data *sqd); void io_put_sq_data(struct io_sq_data *sqd); void io_sqpoll_wait_sq(struct io_ring_ctx *ctx); int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask); + +static inline struct task_struct *sqpoll_task_locked(struct io_sq_data *sqd) +{ + return rcu_dereference_protected(sqd->thread, + lockdep_is_held(&sqd->lock)); +} From 9c7632faad434c98f1f2cc06f3647a5a5d05ddbf Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 4 Jun 2025 08:03:05 -0700 Subject: [PATCH 136/242] drm/xe/lrc: Use a temporary buffer for WA BB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case the BO is in iomem, we can't simply take the vaddr and write to it. Instead, prepare a separate buffer that is later copied into io memory. Right now it's just a few words that could be using xe_map_write32(), but the intention is to grow the WA BB for other uses. Fixes: 617d824c5323 ("drm/xe: Add WA BB to capture active context utilization") Cc: Umesh Nerlige Ramappa Cc: Tvrtko Ursulin Reviewed-by: Matthew Brost Reviewed-by: Umesh Nerlige Ramappa Link: https://lore.kernel.org/r/20250604-wa-bb-fix-v1-1-0dfc5dafcef0@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit ef48715b2d3df17c060e23b9aa636af3d95652f8) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_lrc.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c index 63d74e27f54c..bf7c3981897d 100644 --- a/drivers/gpu/drm/xe/xe_lrc.c +++ b/drivers/gpu/drm/xe/xe_lrc.c @@ -941,11 +941,18 @@ static void xe_lrc_finish(struct xe_lrc *lrc) * store it in the PPHSWP. */ #define CONTEXT_ACTIVE 1ULL -static void xe_lrc_setup_utilization(struct xe_lrc *lrc) +static int xe_lrc_setup_utilization(struct xe_lrc *lrc) { - u32 *cmd; + u32 *cmd, *buf = NULL; - cmd = lrc->bb_per_ctx_bo->vmap.vaddr; + if (lrc->bb_per_ctx_bo->vmap.is_iomem) { + buf = kmalloc(lrc->bb_per_ctx_bo->size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + cmd = buf; + } else { + cmd = lrc->bb_per_ctx_bo->vmap.vaddr; + } *cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET; *cmd++ = ENGINE_ID(0).addr; @@ -966,9 +973,16 @@ static void xe_lrc_setup_utilization(struct xe_lrc *lrc) *cmd++ = MI_BATCH_BUFFER_END; + if (buf) { + xe_map_memcpy_to(gt_to_xe(lrc->gt), &lrc->bb_per_ctx_bo->vmap, 0, + buf, (cmd - buf) * sizeof(*cmd)); + kfree(buf); + } + xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR, xe_bo_ggtt_addr(lrc->bb_per_ctx_bo) | 1); + return 0; } #define PVC_CTX_ASID (0x2e + 1) @@ -1125,7 +1139,9 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, map = __xe_lrc_start_seqno_map(lrc); xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1); - xe_lrc_setup_utilization(lrc); + err = xe_lrc_setup_utilization(lrc); + if (err) + goto err_lrc_finish; return 0; From b64af6bcd3b0f3fc633d6a70adb0991737abfef4 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Thu, 12 Jun 2025 12:45:04 -0300 Subject: [PATCH 137/242] smb: client: fix perf regression with deferred closes Customer reported that one of their applications started failing to open files with STATUS_INSUFFICIENT_RESOURCES due to NetApp server hitting the maximum number of opens to same file that it would allow for a single client connection. It turned out the client was failing to reuse open handles with deferred closes because matching ->f_flags directly without masking off O_CREAT|O_EXCL|O_TRUNC bits first broke the comparision and then client ended up with thousands of deferred closes to same file. Those bits are already satisfied on the original open, so no need to check them against existing open handles. Reproducer: #include #include #include #include #include #include #define NR_THREADS 4 #define NR_ITERATIONS 2500 #define TEST_FILE "/mnt/1/test/dir/foo" static char buf[64]; static void *worker(void *arg) { int i, j; int fd; for (i = 0; i < NR_ITERATIONS; i++) { fd = open(TEST_FILE, O_WRONLY|O_CREAT|O_APPEND, 0666); for (j = 0; j < 16; j++) write(fd, buf, sizeof(buf)); close(fd); } } int main(int argc, char *argv[]) { pthread_t t[NR_THREADS]; int fd; int i; fd = open(TEST_FILE, O_WRONLY|O_CREAT|O_TRUNC, 0666); close(fd); memset(buf, 'a', sizeof(buf)); for (i = 0; i < NR_THREADS; i++) pthread_create(&t[i], NULL, worker, NULL); for (i = 0; i < NR_THREADS; i++) pthread_join(t[i], NULL); return 0; } Before patch: $ mount.cifs //srv/share /mnt/1 -o ... $ mkdir -p /mnt/1/test/dir $ gcc repro.c && ./a.out ... number of opens: 1391 After patch: $ mount.cifs //srv/share /mnt/1 -o ... $ mkdir -p /mnt/1/test/dir $ gcc repro.c && ./a.out ... number of opens: 1 Cc: linux-cifs@vger.kernel.org Cc: David Howells Cc: Jay Shin Cc: Pierguido Lambri Fixes: b8ea3b1ff544 ("smb: enable reuse of deferred file handles for write operations") Acked-by: Shyam Prasad N Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/file.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index d2df10b8e6fd..9835672267d2 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -999,15 +999,18 @@ int cifs_open(struct inode *inode, struct file *file) rc = cifs_get_readable_path(tcon, full_path, &cfile); } if (rc == 0) { - if (file->f_flags == cfile->f_flags) { + unsigned int oflags = file->f_flags & ~(O_CREAT|O_EXCL|O_TRUNC); + unsigned int cflags = cfile->f_flags & ~(O_CREAT|O_EXCL|O_TRUNC); + + if (cifs_convert_flags(oflags, 0) == cifs_convert_flags(cflags, 0) && + (oflags & (O_SYNC|O_DIRECT)) == (cflags & (O_SYNC|O_DIRECT))) { file->private_data = cfile; spin_lock(&CIFS_I(inode)->deferred_lock); cifs_del_deferred_close(cfile); spin_unlock(&CIFS_I(inode)->deferred_lock); goto use_cache; - } else { - _cifsFileInfo_put(cfile, true, false); } + _cifsFileInfo_put(cfile, true, false); } else { /* hard link on the defeered close file */ rc = cifs_get_hardlink_path(tcon, inode, file); From 72dd7961a4bb4fa1fc456169a61dd12e68e50645 Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Wed, 11 Jun 2025 16:59:02 +0530 Subject: [PATCH 138/242] smb: improve directory cache reuse for readdir operations Currently, cached directory contents were not reused across subsequent 'ls' operations because the cache validity check relied on comparing the ctx pointer, which changes with each readdir invocation. As a result, the cached dir entries was not marked as valid and the cache was not utilized for subsequent 'ls' operations. This change uses the file pointer, which remains consistent across all readdir calls for a given directory instance, to associate and validate the cache. As a result, cached directory contents can now be correctly reused, improving performance for repeated directory listings. Performance gains with local windows SMB server: Without the patch and default actimeo=1: 1000 directory enumeration operations on dir with 10k files took 135.0s With this patch and actimeo=0: 1000 directory enumeration operations on dir with 10k files took just 5.1s Signed-off-by: Bharath SM Reviewed-by: Shyam Prasad N Cc: stable@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cached_dir.h | 8 ++++---- fs/smb/client/readdir.c | 28 +++++++++++++++------------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h index 1dfe79d947a6..bc8a812ff95f 100644 --- a/fs/smb/client/cached_dir.h +++ b/fs/smb/client/cached_dir.h @@ -21,10 +21,10 @@ struct cached_dirent { struct cached_dirents { bool is_valid:1; bool is_failed:1; - struct dir_context *ctx; /* - * Only used to make sure we only take entries - * from a single context. Never dereferenced. - */ + struct file *file; /* + * Used to associate the cache with a single + * open file instance. + */ struct mutex de_mutex; int pos; /* Expected ctx->pos */ struct list_head entries; diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index f9f11cbf89be..ba0193cf9033 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -851,9 +851,9 @@ static bool emit_cached_dirents(struct cached_dirents *cde, } static void update_cached_dirents_count(struct cached_dirents *cde, - struct dir_context *ctx) + struct file *file) { - if (cde->ctx != ctx) + if (cde->file != file) return; if (cde->is_valid || cde->is_failed) return; @@ -862,9 +862,9 @@ static void update_cached_dirents_count(struct cached_dirents *cde, } static void finished_cached_dirents_count(struct cached_dirents *cde, - struct dir_context *ctx) + struct dir_context *ctx, struct file *file) { - if (cde->ctx != ctx) + if (cde->file != file) return; if (cde->is_valid || cde->is_failed) return; @@ -877,11 +877,12 @@ static void finished_cached_dirents_count(struct cached_dirents *cde, static void add_cached_dirent(struct cached_dirents *cde, struct dir_context *ctx, const char *name, int namelen, - struct cifs_fattr *fattr) + struct cifs_fattr *fattr, + struct file *file) { struct cached_dirent *de; - if (cde->ctx != ctx) + if (cde->file != file) return; if (cde->is_valid || cde->is_failed) return; @@ -911,7 +912,8 @@ static void add_cached_dirent(struct cached_dirents *cde, static bool cifs_dir_emit(struct dir_context *ctx, const char *name, int namelen, struct cifs_fattr *fattr, - struct cached_fid *cfid) + struct cached_fid *cfid, + struct file *file) { bool rc; ino_t ino = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid); @@ -923,7 +925,7 @@ static bool cifs_dir_emit(struct dir_context *ctx, if (cfid) { mutex_lock(&cfid->dirents.de_mutex); add_cached_dirent(&cfid->dirents, ctx, name, namelen, - fattr); + fattr, file); mutex_unlock(&cfid->dirents.de_mutex); } @@ -1023,7 +1025,7 @@ static int cifs_filldir(char *find_entry, struct file *file, cifs_prime_dcache(file_dentry(file), &name, &fattr); return !cifs_dir_emit(ctx, name.name, name.len, - &fattr, cfid); + &fattr, cfid, file); } @@ -1074,8 +1076,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) * we need to initialize scanning and storing the * directory content. */ - if (ctx->pos == 0 && cfid->dirents.ctx == NULL) { - cfid->dirents.ctx = ctx; + if (ctx->pos == 0 && cfid->dirents.file == NULL) { + cfid->dirents.file = file; cfid->dirents.pos = 2; } /* @@ -1143,7 +1145,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) } else { if (cfid) { mutex_lock(&cfid->dirents.de_mutex); - finished_cached_dirents_count(&cfid->dirents, ctx); + finished_cached_dirents_count(&cfid->dirents, ctx, file); mutex_unlock(&cfid->dirents.de_mutex); } cifs_dbg(FYI, "Could not find entry\n"); @@ -1184,7 +1186,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) ctx->pos++; if (cfid) { mutex_lock(&cfid->dirents.de_mutex); - update_cached_dirents_count(&cfid->dirents, ctx); + update_cached_dirents_count(&cfid->dirents, file); mutex_unlock(&cfid->dirents.de_mutex); } From 5466491c9e3309ed5c7adbb8fad6e93fcc9a8fe9 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Mon, 9 Jun 2025 14:28:27 -0700 Subject: [PATCH 139/242] ionic: Prevent driver/fw getting out of sync on devcmd(s) Some stress/negative firmware testing around devcmd(s) returning EAGAIN found that the done bit could get out of sync in the firmware when it wasn't cleared in a retry case. While here, change the type of the local done variable to a bool to match the return type from ionic_dev_cmd_done(). Fixes: ec8ee714736e ("ionic: stretch heartbeat detection") Signed-off-by: Brett Creeley Signed-off-by: Shannon Nelson Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250609212827.53842-1-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c b/drivers/net/ethernet/pensando/ionic/ionic_main.c index daf1e82cb76b..0e60a6bef99a 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_main.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c @@ -516,9 +516,9 @@ static int __ionic_dev_cmd_wait(struct ionic *ionic, unsigned long max_seconds, unsigned long start_time; unsigned long max_wait; unsigned long duration; - int done = 0; bool fw_up; int opcode; + bool done; int err; /* Wait for dev cmd to complete, retrying if we get EAGAIN, @@ -526,6 +526,7 @@ static int __ionic_dev_cmd_wait(struct ionic *ionic, unsigned long max_seconds, */ max_wait = jiffies + (max_seconds * HZ); try_again: + done = false; opcode = idev->opcode; start_time = jiffies; for (fw_up = ionic_is_fw_running(idev); From bb666b7c27073b986b75699e51a7102910f58060 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 9 Jun 2025 17:57:49 +0100 Subject: [PATCH 140/242] mm: add mmap_prepare() compatibility layer for nested file systems Nested file systems, that is those which invoke call_mmap() within their own f_op->mmap() handlers, may encounter underlying file systems which provide the f_op->mmap_prepare() hook introduced by commit c84bf6dd2b83 ("mm: introduce new .mmap_prepare() file callback"). We have a chicken-and-egg scenario here - until all file systems are converted to using .mmap_prepare(), we cannot convert these nested handlers, as we can't call f_op->mmap from an .mmap_prepare() hook. So we have to do it the other way round - invoke the .mmap_prepare() hook from an .mmap() one. in order to do so, we need to convert VMA state into a struct vm_area_desc descriptor, invoking the underlying file system's f_op->mmap_prepare() callback passing a pointer to this, and then setting VMA state accordingly and safely. This patch achieves this via the compat_vma_mmap_prepare() function, which we invoke from call_mmap() if f_op->mmap_prepare() is specified in the passed in file pointer. We place the fundamental logic into mm/vma.h where VMA manipulation belongs. We also update the VMA userland tests to accommodate the changes. The compat_vma_mmap_prepare() function and its associated machinery is temporary, and will be removed once the conversion of file systems is complete. We carefully place this code so it can be used with CONFIG_MMU and also with cutting edge nommu silicon. [akpm@linux-foundation.org: export compat_vma_mmap_prepare tp fix build] [lorenzo.stoakes@oracle.com: remove unused declarations] Link: https://lkml.kernel.org/r/ac3ae324-4c65-432a-8c6d-2af988b18ac8@lucifer.local Link: https://lkml.kernel.org/r/20250609165749.344976-1-lorenzo.stoakes@oracle.com Fixes: c84bf6dd2b83 ("mm: introduce new .mmap_prepare() file callback"). Signed-off-by: Lorenzo Stoakes Reported-by: Jann Horn Closes: https://lore.kernel.org/linux-mm/CAG48ez04yOEVx1ekzOChARDDBZzAKwet8PEoPM4Ln3_rk91AzQ@mail.gmail.com/ Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Cc: Al Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jann Horn Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/fs.h | 6 ++-- mm/util.c | 40 +++++++++++++++++++++++++++ mm/vma.c | 1 - mm/vma.h | 47 ++++++++++++++++++++++++++++++++ tools/testing/vma/vma_internal.h | 16 +++++++++++ 5 files changed, 107 insertions(+), 3 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 96c7925a6551..4ec77da65f14 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2274,10 +2274,12 @@ static inline bool file_has_valid_mmap_hooks(struct file *file) return true; } +int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma); + static inline int call_mmap(struct file *file, struct vm_area_struct *vma) { - if (WARN_ON_ONCE(file->f_op->mmap_prepare)) - return -EINVAL; + if (file->f_op->mmap_prepare) + return compat_vma_mmap_prepare(file, vma); return file->f_op->mmap(file, vma); } diff --git a/mm/util.c b/mm/util.c index 448117da071f..0b270c43d7d1 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1131,3 +1131,43 @@ void flush_dcache_folio(struct folio *folio) } EXPORT_SYMBOL(flush_dcache_folio); #endif + +/** + * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an + * existing VMA + * @file: The file which possesss an f_op->mmap_prepare() hook + * @vma: The VMA to apply the .mmap_prepare() hook to. + * + * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain + * 'wrapper' file systems invoke a nested mmap hook of an underlying file. + * + * Until all filesystems are converted to use .mmap_prepare(), we must be + * conservative and continue to invoke these 'wrapper' filesystems using the + * deprecated .mmap() hook. + * + * However we have a problem if the underlying file system possesses an + * .mmap_prepare() hook, as we are in a different context when we invoke the + * .mmap() hook, already having a VMA to deal with. + * + * compat_vma_mmap_prepare() is a compatibility function that takes VMA state, + * establishes a struct vm_area_desc descriptor, passes to the underlying + * .mmap_prepare() hook and applies any changes performed by it. + * + * Once the conversion of filesystems is complete this function will no longer + * be required and will be removed. + * + * Returns: 0 on success or error. + */ +int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) +{ + struct vm_area_desc desc; + int err; + + err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc)); + if (err) + return err; + set_vma_from_desc(vma, &desc); + + return 0; +} +EXPORT_SYMBOL(compat_vma_mmap_prepare); diff --git a/mm/vma.c b/mm/vma.c index 0fb9b2c7b734..fef67a66a095 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -3113,7 +3113,6 @@ int __vm_munmap(unsigned long start, size_t len, bool unlock) return ret; } - /* Insert vm structure into process list sorted by address * and into the inode's i_mmap tree. If vm_file is non-NULL * then i_mmap_rwsem is taken here. diff --git a/mm/vma.h b/mm/vma.h index 0db066e7a45d..f47112a352db 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -222,6 +222,53 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, return 0; } + +/* + * Temporary helper functions for file systems which wrap an invocation of + * f_op->mmap() but which might have an underlying file system which implements + * f_op->mmap_prepare(). + */ + +static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma, + struct vm_area_desc *desc) +{ + desc->mm = vma->vm_mm; + desc->start = vma->vm_start; + desc->end = vma->vm_end; + + desc->pgoff = vma->vm_pgoff; + desc->file = vma->vm_file; + desc->vm_flags = vma->vm_flags; + desc->page_prot = vma->vm_page_prot; + + desc->vm_ops = NULL; + desc->private_data = NULL; + + return desc; +} + +static inline void set_vma_from_desc(struct vm_area_struct *vma, + struct vm_area_desc *desc) +{ + /* + * Since we're invoking .mmap_prepare() despite having a partially + * established VMA, we must take care to handle setting fields + * correctly. + */ + + /* Mutable fields. Populated with initial state. */ + vma->vm_pgoff = desc->pgoff; + if (vma->vm_file != desc->file) + vma_set_file(vma, desc->file); + if (vma->vm_flags != desc->vm_flags) + vm_flags_set(vma, desc->vm_flags); + vma->vm_page_prot = desc->page_prot; + + /* User-defined fields. */ + vma->vm_ops = desc->vm_ops; + vma->vm_private_data = desc->private_data; +} + int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 4505b1c31be1..14718ca23a05 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -159,6 +159,14 @@ typedef __bitwise unsigned int vm_fault_t; #define ASSERT_EXCLUSIVE_WRITER(x) +/** + * swap - swap values of @a and @b + * @a: first value + * @b: second value + */ +#define swap(a, b) \ + do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) + struct kref { refcount_t refcount; }; @@ -1468,4 +1476,12 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) (void)vma; } +static inline void vma_set_file(struct vm_area_struct *vma, struct file *file) +{ + /* Changing an anonymous vma with this is illegal */ + get_file(file); + swap(vma->vm_file, file); + fput(file); +} + #endif /* __MM_VMA_INTERNAL_H */ From b93755f408325170edb2156c6a894ed1cae5f4f6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 12 May 2025 20:14:55 +0200 Subject: [PATCH 141/242] powerpc/vdso: Fix build of VDSO32 with pcrel Building vdso32 on power10 with pcrel leads to following errors: VDSO32A arch/powerpc/kernel/vdso/gettimeofday-32.o arch/powerpc/kernel/vdso/gettimeofday.S: Assembler messages: arch/powerpc/kernel/vdso/gettimeofday.S:40: Error: syntax error; found `@', expected `,' arch/powerpc/kernel/vdso/gettimeofday.S:71: Info: macro invoked from here arch/powerpc/kernel/vdso/gettimeofday.S:40: Error: junk at end of line: `@notoc' arch/powerpc/kernel/vdso/gettimeofday.S:71: Info: macro invoked from here ... make[2]: *** [arch/powerpc/kernel/vdso/Makefile:85: arch/powerpc/kernel/vdso/gettimeofday-32.o] Error 1 make[1]: *** [arch/powerpc/Makefile:388: vdso_prepare] Error 2 Once the above is fixed, the following happens: VDSO32C arch/powerpc/kernel/vdso/vgettimeofday-32.o cc1: error: '-mpcrel' requires '-mcmodel=medium' make[2]: *** [arch/powerpc/kernel/vdso/Makefile:89: arch/powerpc/kernel/vdso/vgettimeofday-32.o] Error 1 make[1]: *** [arch/powerpc/Makefile:388: vdso_prepare] Error 2 make: *** [Makefile:251: __sub-make] Error 2 Make sure pcrel version of CFUNC() macro is used only for powerpc64 builds and remove -mpcrel for powerpc32 builds. Fixes: 7e3a68be42e1 ("powerpc/64: vmlinux support building with PCREL addresing") Signed-off-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/1fa3453f07d42a50a70114da9905bf7b73304fca.1747073669.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ppc_asm.h | 2 +- arch/powerpc/kernel/vdso/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 02897f4b0dbf..b891910fce8a 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -183,7 +183,7 @@ /* * Used to name C functions called from asm */ -#ifdef CONFIG_PPC_KERNEL_PCREL +#if defined(__powerpc64__) && defined(CONFIG_PPC_KERNEL_PCREL) #define CFUNC(name) name@notoc #else #define CFUNC(name) name diff --git a/arch/powerpc/kernel/vdso/Makefile b/arch/powerpc/kernel/vdso/Makefile index e8824f933326..8834dfe9d727 100644 --- a/arch/powerpc/kernel/vdso/Makefile +++ b/arch/powerpc/kernel/vdso/Makefile @@ -53,7 +53,7 @@ ldflags-$(CONFIG_LD_ORPHAN_WARN) += -Wl,--orphan-handling=$(CONFIG_LD_ORPHAN_WAR ldflags-y += $(filter-out $(CC_AUTO_VAR_INIT_ZERO_ENABLER) $(CC_FLAGS_FTRACE) -Wa$(comma)%, $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS)) CC32FLAGS := -m32 -CC32FLAGSREMOVE := -mcmodel=medium -mabi=elfv1 -mabi=elfv2 -mcall-aixdesc +CC32FLAGSREMOVE := -mcmodel=medium -mabi=elfv1 -mabi=elfv2 -mcall-aixdesc -mpcrel ifdef CONFIG_CC_IS_CLANG # This flag is supported by clang for 64-bit but not 32-bit so it will cause # an unused command line flag warning for this file. From 33bc69cf6655cf60829a803a45275f11a74899e5 Mon Sep 17 00:00:00 2001 From: Narayana Murty N Date: Thu, 8 May 2025 02:29:28 -0400 Subject: [PATCH 142/242] powerpc/eeh: Fix missing PE bridge reconfiguration during VFIO EEH recovery VFIO EEH recovery for PCI passthrough devices fails on PowerNV and pseries platforms due to missing host-side PE bridge reconfiguration. In the current implementation, eeh_pe_configure() only performs RTAS or OPAL-based bridge reconfiguration for native host devices, but skips it entirely for PEs managed through VFIO in guest passthrough scenarios. This leads to incomplete EEH recovery when a PCI error affects a passthrough device assigned to a QEMU/KVM guest. Although VFIO triggers the EEH recovery flow through VFIO_EEH_PE_ENABLE ioctl, the platform-specific bridge reconfiguration step is silently bypassed. As a result, the PE's config space is not fully restored, causing subsequent config space access failures or EEH freeze-on-access errors inside the guest. This patch fixes the issue by ensuring that eeh_pe_configure() always invokes the platform's configure_bridge() callback (e.g., pseries_eeh_phb_configure_bridge) even for VFIO-managed PEs. This ensures that RTAS or OPAL calls to reconfigure the PE bridge are correctly issued on the host side, restoring the PE's configuration space after an EEH event. This fix is essential for reliable EEH recovery in QEMU/KVM guests using VFIO PCI passthrough on PowerNV and pseries systems. Tested with: - QEMU/KVM guest using VFIO passthrough (IBM Power9,(lpar)Power11 host) - Injected EEH errors with pseries EEH errinjct tool on host, recovery verified on qemu guest. - Verified successful config space access and CAP_EXP DevCtl restoration after recovery Fixes: 212d16cdca2d ("powerpc/eeh: EEH support for VFIO PCI device") Signed-off-by: Narayana Murty N Reviewed-by: Vaibhav Jain Reviewed-by: Ganesh Goudar Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250508062928.146043-1-nnmlinux@linux.ibm.com --- arch/powerpc/kernel/eeh.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 83fe99861eb1..ca7f7bb2b478 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1509,6 +1509,8 @@ int eeh_pe_configure(struct eeh_pe *pe) /* Invalid PE ? */ if (!pe) return -ENODEV; + else + ret = eeh_ops->configure_bridge(pe); return ret; } From 4e6d080acfda5344ccbc63afe778830e22be4be9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=2E=20Neusch=C3=A4fer?= Date: Wed, 11 Jun 2025 23:07:49 +0200 Subject: [PATCH 143/242] powerpc/microwatt: Fix model property in device tree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The standard property for the model name is called "model". Signed-off-by: J. Neuschäfer Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250611-microwatt-v2-1-80847bbc5f9c@posteo.net --- arch/powerpc/boot/dts/microwatt.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/boot/dts/microwatt.dts b/arch/powerpc/boot/dts/microwatt.dts index c4e4d2a9b460..b7eac4e56019 100644 --- a/arch/powerpc/boot/dts/microwatt.dts +++ b/arch/powerpc/boot/dts/microwatt.dts @@ -4,7 +4,7 @@ / { #size-cells = <0x02>; #address-cells = <0x02>; - model-name = "microwatt"; + model = "microwatt"; compatible = "microwatt-soc"; aliases { From e75cb6010838f61b9d63e921d1763a8ab177e38e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=2E=20Neusch=C3=A4fer?= Date: Wed, 11 Jun 2025 21:01:01 +0200 Subject: [PATCH 144/242] powerpc: dts: mpc8315erdb: Add GPIO controller node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MPC8315E SoC and variants have a GPIO controller at IMMR + 0xc00. This node was previously missing from the device tree. Signed-off-by: J. Neuschäfer Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250611-mpc-gpio-v1-1-02d1f75336e2@posteo.net --- arch/powerpc/boot/dts/mpc8315erdb.dts | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/powerpc/boot/dts/mpc8315erdb.dts b/arch/powerpc/boot/dts/mpc8315erdb.dts index e09b37d7489d..a89cb3139ca8 100644 --- a/arch/powerpc/boot/dts/mpc8315erdb.dts +++ b/arch/powerpc/boot/dts/mpc8315erdb.dts @@ -6,6 +6,7 @@ */ /dts-v1/; +#include / { compatible = "fsl,mpc8315erdb"; @@ -358,6 +359,15 @@ pmc: power@b00 { interrupt-parent = <&ipic>; fsl,mpc8313-wakeup-timer = <>m1>; }; + + gpio: gpio-controller@c00 { + compatible = "fsl,mpc8314-gpio"; + reg = <0xc00 0x100>; + interrupts = <74 IRQ_TYPE_LEVEL_LOW>; + interrupt-parent = <&ipic>; + gpio-controller; + #gpio-cells = <2>; + }; }; pci0: pci@e0008500 { From 9f0ad43b158d07bc7144d219ceabdea36e28e392 Mon Sep 17 00:00:00 2001 From: Thangaraj Samynathan Date: Thu, 12 Jun 2025 08:00:59 +0530 Subject: [PATCH 145/242] spi: spi-pci1xxxx: Drop MSI-X usage as unsupported by DMA engine Removes MSI-X from the interrupt request path, as the DMA engine used by the SPI controller does not support MSI-X interrupts. Signed-off-by: Thangaraj Samynathan Link: https://patch.msgid.link/20250612023059.71726-1-thangaraj.s@microchip.com Signed-off-by: Mark Brown --- drivers/spi/spi-pci1xxxx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-pci1xxxx.c b/drivers/spi/spi-pci1xxxx.c index 9112d8a1a0c8..e27642c4dea4 100644 --- a/drivers/spi/spi-pci1xxxx.c +++ b/drivers/spi/spi-pci1xxxx.c @@ -762,7 +762,7 @@ static int pci1xxxx_spi_probe(struct pci_dev *pdev, const struct pci_device_id * return -EINVAL; num_vector = pci_alloc_irq_vectors(pdev, 1, hw_inst_cnt, - PCI_IRQ_ALL_TYPES); + PCI_IRQ_INTX | PCI_IRQ_MSI); if (num_vector < 0) { dev_err(&pdev->dev, "Error allocating MSI vectors\n"); return num_vector; From 9ba75ccad85708c5a484637dccc1fc59295b0a83 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 11 Jun 2025 15:33:40 -0500 Subject: [PATCH 146/242] platform/x86/amd/pmc: Add PCSpecialist Lafite Pro V 14M to 8042 quirks list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every other s2idle cycle fails to reach hardware sleep when keyboard wakeup is enabled. This appears to be an EC bug, but the vendor refuses to fix it. It was confirmed that turning off i8042 wakeup avoids ths issue (albeit keyboard wakeup is disabled). Take the lesser of two evils and add it to the i8042 quirk list. Reported-by: Raoul Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220116 Tested-by: Raoul Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20250611203341.3733478-1-superm1@kernel.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmc/pmc-quirks.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/platform/x86/amd/pmc/pmc-quirks.c b/drivers/platform/x86/amd/pmc/pmc-quirks.c index 5c7c01f66cde..f292111bd065 100644 --- a/drivers/platform/x86/amd/pmc/pmc-quirks.c +++ b/drivers/platform/x86/amd/pmc/pmc-quirks.c @@ -225,6 +225,15 @@ static const struct dmi_system_id fwbug_list[] = { DMI_MATCH(DMI_BOARD_NAME, "WUJIE14-GX4HRXL"), } }, + /* https://bugzilla.kernel.org/show_bug.cgi?id=220116 */ + { + .ident = "PCSpecialist Lafite Pro V 14M", + .driver_data = &quirk_spurious_8042, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "PCSpecialist"), + DMI_MATCH(DMI_PRODUCT_NAME, "Lafite Pro V 14M"), + } + }, {} }; From e2468dc700743683e1d1793bbd855e2536fd3de2 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 11 Jun 2025 18:30:40 -0300 Subject: [PATCH 147/242] Revert "platform/x86: alienware-wmi-wmax: Add G-Mode support to Alienware m16 R1" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 5ff79cabb23a2f14d2ed29e9596aec908905a0e6. Although the Alienware m16 R1 AMD model supports G-Mode, it actually has a lower power ceiling than plain "performance" profile, which results in lower performance. Reported-by: Cihan Ozakca Cc: stable@vger.kernel.org # 6.15.x Signed-off-by: Kurt Borja Link: https://lore.kernel.org/r/20250611-m16-rev-v1-1-72d13bad03c9@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/alienware-wmi-wmax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/dell/alienware-wmi-wmax.c b/drivers/platform/x86/dell/alienware-wmi-wmax.c index c42f9228b0b2..20ec122a9fe0 100644 --- a/drivers/platform/x86/dell/alienware-wmi-wmax.c +++ b/drivers/platform/x86/dell/alienware-wmi-wmax.c @@ -119,7 +119,7 @@ static const struct dmi_system_id awcc_dmi_table[] __initconst = { DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m16 R1 AMD"), }, - .driver_data = &g_series_quirks, + .driver_data = &generic_quirks, }, { .ident = "Alienware m16 R2", From f826ec7966a63d48e16e0868af4e038bf9a1a3ae Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 Jun 2025 15:41:25 +0100 Subject: [PATCH 148/242] bio: Fix bio_first_folio() for SPARSEMEM without VMEMMAP It is possible for physically contiguous folios to have discontiguous struct pages if SPARSEMEM is enabled and SPARSEMEM_VMEMMAP is not. This is correctly handled by folio_page_idx(), so remove this open-coded implementation. Fixes: 640d1930bef4 (block: Add bio_for_each_folio_all()) Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20250612144126.2849931-1-willy@infradead.org Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index 9c37c66ef9ca..46ffac5caab7 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -291,7 +291,7 @@ static inline void bio_first_folio(struct folio_iter *fi, struct bio *bio, fi->folio = page_folio(bvec->bv_page); fi->offset = bvec->bv_offset + - PAGE_SIZE * (bvec->bv_page - &fi->folio->page); + PAGE_SIZE * folio_page_idx(fi->folio, bvec->bv_page); fi->_seg_count = bvec->bv_len; fi->length = min(folio_size(fi->folio) - fi->offset, fi->_seg_count); fi->_next = folio_next(fi->folio); From 5e223e06ee7c6d8f630041a0645ac90e39a42cc6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 Jun 2025 15:42:53 +0100 Subject: [PATCH 149/242] block: Fix bvec_set_folio() for very large folios Similarly to 26064d3e2b4d ("block: fix adding folio to bio"), if we attempt to add a folio that is larger than 4GB, we'll silently truncate the offset and len. Widen the parameters to size_t, assert that the length is less than 4GB and set the first page that contains the interesting data rather than the first page of the folio. Fixes: 26db5ee15851 (block: add a bvec_set_folio helper) Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20250612144255.2850278-1-willy@infradead.org Signed-off-by: Jens Axboe --- include/linux/bvec.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 204b22a99c4b..0a80e1f9aa20 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -57,9 +57,12 @@ static inline void bvec_set_page(struct bio_vec *bv, struct page *page, * @offset: offset into the folio */ static inline void bvec_set_folio(struct bio_vec *bv, struct folio *folio, - unsigned int len, unsigned int offset) + size_t len, size_t offset) { - bvec_set_page(bv, &folio->page, len, offset); + unsigned long nr = offset / PAGE_SIZE; + + WARN_ON_ONCE(len > UINT_MAX); + bvec_set_page(bv, folio_page(folio, nr), len, offset % PAGE_SIZE); } /** From b5acc3628898baa63658bc4125f9525f9b3dd4f3 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 12 Jun 2025 16:17:51 +0200 Subject: [PATCH 150/242] ata: ahci: Disallow LPM for ASUSPRO-D840SA motherboard A user has bisected a regression which causes graphical corruptions on his screen to commit 7627a0edef54 ("ata: ahci: Drop low power policy board type"). Simply reverting commit 7627a0edef54 ("ata: ahci: Drop low power policy board type") makes the graphical corruptions on his screen to go away. (Note: there are no visible messages in dmesg that indicates a problem with AHCI.) The user also reports that the problem occurs regardless if there is an HDD or an SSD connected via AHCI, so the problem is not device related. The devices also work fine on other motherboards, so it seems specific to the ASUSPRO-D840SA motherboard. While enabling low power modes for AHCI is not supposed to affect completely unrelated hardware, like a graphics card, it does however allow the system to enter deeper PC-states, which could expose ACPI issues that were previously not visible (because the system never entered these lower power states before). There are previous examples where enabling LPM exposed serious BIOS/ACPI bugs, see e.g. commit 240630e61870 ("ahci: Disable LPM on Lenovo 50 series laptops with a too old BIOS"). Since there hasn't been any BIOS update in years for the ASUSPRO-D840SA motherboard, disable LPM for this board, in order to avoid entering lower PC-states, which triggers graphical corruptions. Cc: stable@vger.kernel.org Reported-by: Andy Yang Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220111 Fixes: 7627a0edef54 ("ata: ahci: Drop low power policy board type") Reviewed-by: Damien Le Moal Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20250612141750.2108342-2-cassel@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/ahci.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index e7c8357cbc54..c8ad8ace7496 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1410,8 +1410,15 @@ static bool ahci_broken_suspend(struct pci_dev *pdev) static bool ahci_broken_lpm(struct pci_dev *pdev) { + /* + * Platforms with LPM problems. + * If driver_data is NULL, there is no existing BIOS version with + * functioning LPM. + * If driver_data is non-NULL, then driver_data contains the DMI BIOS + * build date of the first BIOS version with functioning LPM (i.e. older + * BIOS versions have broken LPM). + */ static const struct dmi_system_id sysids[] = { - /* Various Lenovo 50 series have LPM issues with older BIOSen */ { .matches = { DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), @@ -1440,6 +1447,13 @@ static bool ahci_broken_lpm(struct pci_dev *pdev) }, .driver_data = "20180409", /* 2.35 */ }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_VERSION, "ASUSPRO D840MB_M840SA"), + }, + /* 320 is broken, there is no known good version yet. */ + }, { } /* terminate list */ }; const struct dmi_system_id *dmi = dmi_first_match(sysids); @@ -1449,6 +1463,9 @@ static bool ahci_broken_lpm(struct pci_dev *pdev) if (!dmi) return false; + if (!dmi->driver_data) + return true; + dmi_get_date(DMI_BIOS_DATE, &year, &month, &date); snprintf(buf, sizeof(buf), "%04d%02d%02d", year, month, date); From f9705d66fa7107fcd619083f7aae2afb0554a593 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 3 Jun 2025 16:14:45 -0300 Subject: [PATCH 151/242] iommu/tegra: Fix incorrect size calculation This driver uses a mixture of ways to get the size of a PTE, tegra_smmu_set_pde() did it as sizeof(*pd) which became wrong when pd switched to a struct tegra_pd. Switch pd back to a u32* in tegra_smmu_set_pde() so the sizeof(*pd) returns 4. Fixes: 50568f87d1e2 ("iommu/terga: Do not use struct page as the handle for as->pd memory") Reported-by: Diogo Ivo Closes: https://lore.kernel.org/all/62e7f7fe-6200-4e4f-ad42-d58ad272baa6@tecnico.ulisboa.pt/ Signed-off-by: Jason Gunthorpe Acked-by: Thierry Reding Reviewed-by: Jerry Snitselaar Tested-by: Diogo Ivo Link: https://lore.kernel.org/r/0-v1-da7b8b3d57eb+ce-iommu_terga_sizeof_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/tegra-smmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 61897d50162d..e58fe9d8b9e7 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -559,11 +559,11 @@ static void tegra_smmu_set_pde(struct tegra_smmu_as *as, unsigned long iova, { unsigned int pd_index = iova_pd_index(iova); struct tegra_smmu *smmu = as->smmu; - struct tegra_pd *pd = as->pd; + u32 *pd = &as->pd->val[pd_index]; unsigned long offset = pd_index * sizeof(*pd); /* Set the page directory entry first */ - pd->val[pd_index] = value; + *pd = value; /* The flush the page directory entry from caches */ dma_sync_single_range_for_device(smmu->dev, as->pd_dma, offset, From db3dfae1a2f662e69d535827703bcdbb04b8d72b Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 13 Jun 2025 09:38:57 +0700 Subject: [PATCH 152/242] Documentation: ublk: Separate UBLK_F_AUTO_BUF_REG fallback behavior sublists Stephen Rothwell reports htmldocs warning on ublk docs: Documentation/block/ublk.rst:414: ERROR: Unexpected indentation. [docutils] Fix the warning by separating sublists of auto buffer registration fallback behavior from their appropriate parent list item. Fixes: ff20c516485e ("ublk: document auto buffer registration(UBLK_F_AUTO_BUF_REG)") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20250612132638.193de386@canb.auug.org.au/ Signed-off-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20250613023857.15971-1-bagasdotme@gmail.com Signed-off-by: Jens Axboe --- Documentation/block/ublk.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index abec524a04ed..8c4030bcabb6 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -408,6 +408,7 @@ Fallback Behavior If auto buffer registration fails: 1. When ``UBLK_AUTO_BUF_REG_FALLBACK`` is enabled: + - The uring_cmd is completed - ``UBLK_IO_F_NEED_REG_BUF`` is set in ``ublksrv_io_desc.op_flags`` - The ublk server must manually deal with the failure, such as, register @@ -415,6 +416,7 @@ If auto buffer registration fails: for handling ublk IO 2. If fallback is not enabled: + - The ublk I/O request fails silently - The uring_cmd won't be completed From ab107276607af90b13a5994997e19b7b9731e251 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Sat, 17 May 2025 19:52:37 +0530 Subject: [PATCH 153/242] powerpc: Fix struct termio related ioctl macros Since termio interface is now obsolete, include/uapi/asm/ioctls.h has some constant macros referring to "struct termio", this caused build failure at userspace. In file included from /usr/include/asm/ioctl.h:12, from /usr/include/asm/ioctls.h:5, from tst-ioctls.c:3: tst-ioctls.c: In function 'get_TCGETA': tst-ioctls.c:12:10: error: invalid application of 'sizeof' to incomplete type 'struct termio' 12 | return TCGETA; | ^~~~~~ Even though termios.h provides "struct termio", trying to juggle definitions around to make it compile could introduce regressions. So better to open code it. Reported-by: Tulio Magno Suggested-by: Nicholas Piggin Tested-by: Justin M. Forbes Reviewed-by: Michael Ellerman Closes: https://lore.kernel.org/linuxppc-dev/8734dji5wl.fsf@ascii.art.br/ Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250517142237.156665-1-maddy@linux.ibm.com --- arch/powerpc/include/uapi/asm/ioctls.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/uapi/asm/ioctls.h b/arch/powerpc/include/uapi/asm/ioctls.h index 2c145da3b774..b5211e413829 100644 --- a/arch/powerpc/include/uapi/asm/ioctls.h +++ b/arch/powerpc/include/uapi/asm/ioctls.h @@ -23,10 +23,10 @@ #define TCSETSW _IOW('t', 21, struct termios) #define TCSETSF _IOW('t', 22, struct termios) -#define TCGETA _IOR('t', 23, struct termio) -#define TCSETA _IOW('t', 24, struct termio) -#define TCSETAW _IOW('t', 25, struct termio) -#define TCSETAF _IOW('t', 28, struct termio) +#define TCGETA 0x40147417 /* _IOR('t', 23, struct termio) */ +#define TCSETA 0x80147418 /* _IOW('t', 24, struct termio) */ +#define TCSETAW 0x80147419 /* _IOW('t', 25, struct termio) */ +#define TCSETAF 0x8014741c /* _IOW('t', 28, struct termio) */ #define TCSBRK _IO('t', 29) #define TCXONC _IO('t', 30) From 26ec15e4b0c1d7b25214d9c0be1d50492e2f006c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Jun 2025 11:01:49 -0600 Subject: [PATCH 154/242] io_uring/kbuf: don't truncate end buffer for multiple buffer peeks If peeking a bunch of buffers, normally io_ring_buffers_peek() will truncate the end buffer. This isn't optimal as presumably more data will be arriving later, and hence it's better to stop with the last full buffer rather than truncate the end buffer. Cc: stable@vger.kernel.org Fixes: 35c8711c8fc4 ("io_uring/kbuf: add helpers for getting/peeking multiple buffers") Reported-by: Christian Mazakas Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 2ea65f3cef72..ce95e3af44a9 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -270,8 +270,11 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, /* truncate end piece, if needed, for non partial buffers */ if (len > arg->max_len) { len = arg->max_len; - if (!(bl->flags & IOBL_INC)) + if (!(bl->flags & IOBL_INC)) { + if (iov != arg->iovs) + break; buf->len = len; + } } iov->iov_base = u64_to_user_ptr(buf->addr); From f90fff1e152dedf52b932240ebbd670d83330eca Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 13 Jun 2025 19:26:50 +0200 Subject: [PATCH 155/242] posix-cpu-timers: fix race between handle_posix_cpu_timers() and posix_cpu_timer_del() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If an exiting non-autoreaping task has already passed exit_notify() and calls handle_posix_cpu_timers() from IRQ, it can be reaped by its parent or debugger right after unlock_task_sighand(). If a concurrent posix_cpu_timer_del() runs at that moment, it won't be able to detect timer->it.cpu.firing != 0: cpu_timer_task_rcu() and/or lock_task_sighand() will fail. Add the tsk->exit_state check into run_posix_cpu_timers() to fix this. This fix is not needed if CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y, because exit_task_work() is called before exit_notify(). But the check still makes sense, task_work_add(&tsk->posix_cputimers_work.work) will fail anyway in this case. Cc: stable@vger.kernel.org Reported-by: Benoît Sevens Fixes: 0bdd2ed4138e ("sched: run_posix_cpu_timers: Don't check ->exit_state, use lock_task_sighand()") Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/time/posix-cpu-timers.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 50e8d04ab661..2e5b89d7d866 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1405,6 +1405,15 @@ void run_posix_cpu_timers(void) lockdep_assert_irqs_disabled(); + /* + * Ensure that release_task(tsk) can't happen while + * handle_posix_cpu_timers() is running. Otherwise, a concurrent + * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and + * miss timer->it.cpu.firing != 0. + */ + if (tsk->exit_state) + return; + /* * If the actual expiry is deferred to task work context and the * work is already scheduled there is no point to do anything here. From 9ce6c9875f3e995be5fd720b65835291f8a609b1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Jun 2025 13:37:41 -0600 Subject: [PATCH 156/242] nvme: always punt polled uring_cmd end_io work to task_work Currently NVMe uring_cmd completions will complete locally, if they are polled. This is done because those completions are always invoked from task context. And while that is true, there's no guarantee that it's invoked under the right ring context, or even task. If someone does NVMe passthrough via multiple threads and with a limited number of poll queues, then ringA may find completions from ringB. For that case, completing the request may not be sound. Always just punt the passthrough completions via task_work, which will redirect the completion, if needed. Cc: stable@vger.kernel.org Fixes: 585079b6e425 ("nvme: wire up async polling for io passthrough commands") Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 0b50da2f1175..6b3ac8ae3f34 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -429,21 +429,14 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, pdu->result = le64_to_cpu(nvme_req(req)->result.u64); /* - * For iopoll, complete it directly. Note that using the uring_cmd - * helper for this is safe only because we check blk_rq_is_poll(). - * As that returns false if we're NOT on a polled queue, then it's - * safe to use the polled completion helper. - * - * Otherwise, move the completion to task work. + * IOPOLL could potentially complete this request directly, but + * if multiple rings are polling on the same queue, then it's possible + * for one ring to find completions for another ring. Punting the + * completion via task_work will always direct it to the right + * location, rather than potentially complete requests for ringA + * under iopoll invocations from ringB. */ - if (blk_rq_is_poll(req)) { - if (pdu->bio) - blk_rq_unmap_user(pdu->bio); - io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status); - } else { - io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); - } - + io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); return RQ_END_IO_FREE; } From b62e0efd8a8571460d05922862a451855ebdf3c6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Jun 2025 15:24:53 -0600 Subject: [PATCH 157/242] io_uring: run local task_work from ring exit IOPOLL reaping In preparation for needing to shift NVMe passthrough to always use task_work for polled IO completions, ensure that those are suitably run at exit time. See commit: 9ce6c9875f3e ("nvme: always punt polled uring_cmd end_io work to task_work") for details on why that is necessary. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4e32f808d07d..5111ec040c53 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1523,6 +1523,9 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) } } mutex_unlock(&ctx->uring_lock); + + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + io_move_task_work_from_local(ctx); } static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) From 1b56e765bf8990f1f60e124926c11fc4ac63d752 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Thu, 12 Jun 2025 14:17:13 +0200 Subject: [PATCH 158/242] rust: completion: implement initial abstraction Implement a minimal abstraction for the completion synchronization primitive. This initial abstraction only adds complete_all() and wait_for_completion(), since that is what is required for the subsequent Devres patch. Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Valentin Schneider Reviewed-by: Alice Ryhl Reviewed-by: Boqun Feng Reviewed-by: Benno Lossin Acked-by: Miguel Ojeda Link: https://lore.kernel.org/r/20250612121817.1621-2-dakr@kernel.org Signed-off-by: Danilo Krummrich --- rust/bindings/bindings_helper.h | 1 + rust/helpers/completion.c | 8 +++ rust/helpers/helpers.c | 1 + rust/kernel/sync.rs | 2 + rust/kernel/sync/completion.rs | 112 ++++++++++++++++++++++++++++++++ 5 files changed, 124 insertions(+) create mode 100644 rust/helpers/completion.c create mode 100644 rust/kernel/sync/completion.rs diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index bc494745f67b..8cbb660e2ec2 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include diff --git a/rust/helpers/completion.c b/rust/helpers/completion.c new file mode 100644 index 000000000000..b2443262a2ae --- /dev/null +++ b/rust/helpers/completion.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +void rust_helper_init_completion(struct completion *x) +{ + init_completion(x); +} diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index 0f1b5d115985..6451cfd94a8d 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -13,6 +13,7 @@ #include "build_assert.c" #include "build_bug.c" #include "clk.c" +#include "completion.c" #include "cpufreq.c" #include "cpumask.c" #include "cred.c" diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs index 36a719015583..c23a12639924 100644 --- a/rust/kernel/sync.rs +++ b/rust/kernel/sync.rs @@ -10,6 +10,7 @@ use pin_init; mod arc; +pub mod completion; mod condvar; pub mod lock; mod locked_by; @@ -17,6 +18,7 @@ pub mod rcu; pub use arc::{Arc, ArcBorrow, UniqueArc}; +pub use completion::Completion; pub use condvar::{new_condvar, CondVar, CondVarTimeoutResult}; pub use lock::global::{global_lock, GlobalGuard, GlobalLock, GlobalLockBackend, GlobalLockedBy}; pub use lock::mutex::{new_mutex, Mutex, MutexGuard}; diff --git a/rust/kernel/sync/completion.rs b/rust/kernel/sync/completion.rs new file mode 100644 index 000000000000..c50012a940a3 --- /dev/null +++ b/rust/kernel/sync/completion.rs @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Completion support. +//! +//! Reference: +//! +//! C header: [`include/linux/completion.h`](srctree/include/linux/completion.h) + +use crate::{bindings, prelude::*, types::Opaque}; + +/// Synchronization primitive to signal when a certain task has been completed. +/// +/// The [`Completion`] synchronization primitive signals when a certain task has been completed by +/// waking up other tasks that have been queued up to wait for the [`Completion`] to be completed. +/// +/// # Examples +/// +/// ``` +/// use kernel::sync::{Arc, Completion}; +/// use kernel::workqueue::{self, impl_has_work, new_work, Work, WorkItem}; +/// +/// #[pin_data] +/// struct MyTask { +/// #[pin] +/// work: Work, +/// #[pin] +/// done: Completion, +/// } +/// +/// impl_has_work! { +/// impl HasWork for MyTask { self.work } +/// } +/// +/// impl MyTask { +/// fn new() -> Result> { +/// let this = Arc::pin_init(pin_init!(MyTask { +/// work <- new_work!("MyTask::work"), +/// done <- Completion::new(), +/// }), GFP_KERNEL)?; +/// +/// let _ = workqueue::system().enqueue(this.clone()); +/// +/// Ok(this) +/// } +/// +/// fn wait_for_completion(&self) { +/// self.done.wait_for_completion(); +/// +/// pr_info!("Completion: task complete\n"); +/// } +/// } +/// +/// impl WorkItem for MyTask { +/// type Pointer = Arc; +/// +/// fn run(this: Arc) { +/// // process this task +/// this.done.complete_all(); +/// } +/// } +/// +/// let task = MyTask::new()?; +/// task.wait_for_completion(); +/// # Ok::<(), Error>(()) +/// ``` +#[pin_data] +pub struct Completion { + #[pin] + inner: Opaque, +} + +// SAFETY: `Completion` is safe to be send to any task. +unsafe impl Send for Completion {} + +// SAFETY: `Completion` is safe to be accessed concurrently. +unsafe impl Sync for Completion {} + +impl Completion { + /// Create an initializer for a new [`Completion`]. + pub fn new() -> impl PinInit { + pin_init!(Self { + inner <- Opaque::ffi_init(|slot: *mut bindings::completion| { + // SAFETY: `slot` is a valid pointer to an uninitialized `struct completion`. + unsafe { bindings::init_completion(slot) }; + }), + }) + } + + fn as_raw(&self) -> *mut bindings::completion { + self.inner.get() + } + + /// Signal all tasks waiting on this completion. + /// + /// This method wakes up all tasks waiting on this completion; after this operation the + /// completion is permanently done, i.e. signals all current and future waiters. + pub fn complete_all(&self) { + // SAFETY: `self.as_raw()` is a pointer to a valid `struct completion`. + unsafe { bindings::complete_all(self.as_raw()) }; + } + + /// Wait for completion of a task. + /// + /// This method waits for the completion of a task; it is not interruptible and there is no + /// timeout. + /// + /// See also [`Completion::complete_all`]. + pub fn wait_for_completion(&self) { + // SAFETY: `self.as_raw()` is a pointer to a valid `struct completion`. + unsafe { bindings::wait_for_completion(self.as_raw()) }; + } +} From 4b76fafb20dd4a2becb94949d78e86bc88006509 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Thu, 12 Jun 2025 14:17:14 +0200 Subject: [PATCH 159/242] rust: revocable: indicate whether `data` has been revoked already Return a boolean from Revocable::revoke() and Revocable::revoke_nosync() to indicate whether the data has been revoked already. Return true if the data hasn't been revoked yet (i.e. this call revoked the data), false otherwise. This is required by Devres in order to synchronize the completion of the revoke process. Reviewed-by: Benno Lossin Acked-by: Miguel Ojeda Link: https://lore.kernel.org/r/20250612121817.1621-3-dakr@kernel.org Signed-off-by: Danilo Krummrich --- rust/kernel/revocable.rs | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/rust/kernel/revocable.rs b/rust/kernel/revocable.rs index db4aa46bb121..06a3cdfce344 100644 --- a/rust/kernel/revocable.rs +++ b/rust/kernel/revocable.rs @@ -154,8 +154,10 @@ pub unsafe fn access(&self) -> &T { /// # Safety /// /// Callers must ensure that there are no more concurrent users of the revocable object. - unsafe fn revoke_internal(&self) { - if self.is_available.swap(false, Ordering::Relaxed) { + unsafe fn revoke_internal(&self) -> bool { + let revoke = self.is_available.swap(false, Ordering::Relaxed); + + if revoke { if SYNC { // SAFETY: Just an FFI call, there are no further requirements. unsafe { bindings::synchronize_rcu() }; @@ -165,6 +167,8 @@ unsafe fn revoke_internal(&self) { // `compare_exchange` above that takes `is_available` from `true` to `false`. unsafe { drop_in_place(self.data.get()) }; } + + revoke } /// Revokes access to and drops the wrapped object. @@ -172,10 +176,13 @@ unsafe fn revoke_internal(&self) { /// Access to the object is revoked immediately to new callers of [`Revocable::try_access`], /// expecting that there are no concurrent users of the object. /// + /// Returns `true` if `&self` has been revoked with this call, `false` if it was revoked + /// already. + /// /// # Safety /// /// Callers must ensure that there are no more concurrent users of the revocable object. - pub unsafe fn revoke_nosync(&self) { + pub unsafe fn revoke_nosync(&self) -> bool { // SAFETY: By the safety requirement of this function, the caller ensures that nobody is // accessing the data anymore and hence we don't have to wait for the grace period to // finish. @@ -189,7 +196,10 @@ pub unsafe fn revoke_nosync(&self) { /// If there are concurrent users of the object (i.e., ones that called /// [`Revocable::try_access`] beforehand and still haven't dropped the returned guard), this /// function waits for the concurrent access to complete before dropping the wrapped object. - pub fn revoke(&self) { + /// + /// Returns `true` if `&self` has been revoked with this call, `false` if it was revoked + /// already. + pub fn revoke(&self) -> bool { // SAFETY: By passing `true` we ask `revoke_internal` to wait for the grace period to // finish. unsafe { self.revoke_internal::() } From f744201c6159fc7323c40936fd079525f7063598 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Thu, 12 Jun 2025 14:17:15 +0200 Subject: [PATCH 160/242] rust: devres: fix race in Devres::drop() In Devres::drop() we first remove the devres action and then drop the wrapped device resource. The design goal is to give the owner of a Devres object control over when the device resource is dropped, but limit the overall scope to the corresponding device being bound to a driver. However, there's a race that was introduced with commit 8ff656643d30 ("rust: devres: remove action in `Devres::drop`"), but also has been (partially) present from the initial version on. In Devres::drop(), the devres action is removed successfully and subsequently the destructor of the wrapped device resource runs. However, there is no guarantee that the destructor of the wrapped device resource completes before the driver core is done unbinding the corresponding device. If in Devres::drop(), the devres action can't be removed, it means that the devres callback has been executed already, or is still running concurrently. In case of the latter, either Devres::drop() wins revoking the Revocable or the devres callback wins revoking the Revocable. If Devres::drop() wins, we (again) have no guarantee that the destructor of the wrapped device resource completes before the driver core is done unbinding the corresponding device. CPU0 CPU1 ------------------------------------------------------------------------ Devres::drop() { Devres::devres_callback() { self.data.revoke() { this.data.revoke() { is_available.swap() == true is_available.swap == false } } // [...] // device fully unbound drop_in_place() { // release device resource } } } Depending on the specific device resource, this can potentially lead to user-after-free bugs. In order to fix this, implement the following logic. In the devres callback, we're always good when we get to revoke the device resource ourselves, i.e. Revocable::revoke() returns true. If Revocable::revoke() returns false, it means that Devres::drop(), concurrently, already drops the device resource and we have to wait for Devres::drop() to signal that it finished dropping the device resource. Note that if we hit the case where we need to wait for the completion of Devres::drop() in the devres callback, it means that we're actually racing with a concurrent Devres::drop() call, which already started revoking the device resource for us. This is rather unlikely and means that the concurrent Devres::drop() already started doing our work and we just need to wait for it to complete it for us. Hence, there should not be any additional overhead from that. (Actually, for now it's even better if Devres::drop() does the work for us, since it can bypass the synchronize_rcu() call implied by Revocable::revoke(), but this goes away anyways once I get to implement the split devres callback approach, which allows us to first flip the atomics of all registered Devres objects of a certain device, execute a single synchronize_rcu() and then drop all revocable objects.) In Devres::drop() we try to revoke the device resource. If that is *not* successful, it means that the devres callback already did and we're good. Otherwise, we try to remove the devres action, which, if successful, means that we're good, since the device resource has just been revoked by us *before* we removed the devres action successfully. If the devres action could not be removed, it means that the devres callback must be running concurrently, hence we signal that the device resource has been revoked by us, using the completion. This makes it safe to drop a Devres object from any task and at any point of time, which is one of the design goals. Fixes: 76c01ded724b ("rust: add devres abstraction") Reported-by: Alice Ryhl Closes: https://lore.kernel.org/lkml/aD64YNuqbPPZHAa5@google.com/ Reviewed-by: Benno Lossin Link: https://lore.kernel.org/r/20250612121817.1621-4-dakr@kernel.org Signed-off-by: Danilo Krummrich --- rust/kernel/devres.rs | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/rust/kernel/devres.rs b/rust/kernel/devres.rs index 0f79a2ec9474..2f74bce5ed9d 100644 --- a/rust/kernel/devres.rs +++ b/rust/kernel/devres.rs @@ -13,7 +13,7 @@ ffi::c_void, prelude::*, revocable::Revocable, - sync::Arc, + sync::{Arc, Completion}, types::ARef, }; @@ -25,13 +25,17 @@ struct DevresInner { callback: unsafe extern "C" fn(*mut c_void), #[pin] data: Revocable, + #[pin] + revoke: Completion, } /// This abstraction is meant to be used by subsystems to containerize [`Device`] bound resources to /// manage their lifetime. /// /// [`Device`] bound resources should be freed when either the resource goes out of scope or the -/// [`Device`] is unbound respectively, depending on what happens first. +/// [`Device`] is unbound respectively, depending on what happens first. In any case, it is always +/// guaranteed that revoking the device resource is completed before the corresponding [`Device`] +/// is unbound. /// /// To achieve that [`Devres`] registers a devres callback on creation, which is called once the /// [`Device`] is unbound, revoking access to the encapsulated resource (see also [`Revocable`]). @@ -102,6 +106,7 @@ fn new(dev: &Device, data: T, flags: Flags) -> Result> dev: dev.into(), callback: Self::devres_callback, data <- Revocable::new(data), + revoke <- Completion::new(), }), flags, )?; @@ -130,26 +135,28 @@ fn as_ptr(&self) -> *const Self { self as _ } - fn remove_action(this: &Arc) { + fn remove_action(this: &Arc) -> bool { // SAFETY: // - `self.inner.dev` is a valid `Device`, // - the `action` and `data` pointers are the exact same ones as given to devm_add_action() // previously, // - `self` is always valid, even if the action has been released already. - let ret = unsafe { + let success = unsafe { bindings::devm_remove_action_nowarn( this.dev.as_raw(), Some(this.callback), this.as_ptr() as _, ) - }; + } == 0; - if ret == 0 { + if success { // SAFETY: We leaked an `Arc` reference to devm_add_action() in `DevresInner::new`; if // devm_remove_action_nowarn() was successful we can (and have to) claim back ownership // of this reference. let _ = unsafe { Arc::from_raw(this.as_ptr()) }; } + + success } #[allow(clippy::missing_safety_doc)] @@ -161,7 +168,12 @@ fn remove_action(this: &Arc) { // `DevresInner::new`. let inner = unsafe { Arc::from_raw(ptr) }; - inner.data.revoke(); + if !inner.data.revoke() { + // If `revoke()` returns false, it means that `Devres::drop` already started revoking + // `inner.data` for us. Hence we have to wait until `Devres::drop()` signals that it + // completed revoking `inner.data`. + inner.revoke.wait_for_completion(); + } } } @@ -232,6 +244,15 @@ fn deref(&self) -> &Self::Target { impl Drop for Devres { fn drop(&mut self) { - DevresInner::remove_action(&self.0); + // SAFETY: When `drop` runs, it is guaranteed that nobody is accessing the revocable data + // anymore, hence it is safe not to wait for the grace period to finish. + if unsafe { self.revoke_nosync() } { + // We revoked `self.0.data` before the devres action did, hence try to remove it. + if !DevresInner::remove_action(&self.0) { + // We could not remove the devres action, which means that it now runs concurrently, + // hence signal that `self.0.data` has been revoked successfully. + self.0.revoke.complete_all(); + } + } } } From 20c96ed278e362ae4e324ed7d8c69fb48c508d3c Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Wed, 11 Jun 2025 19:48:25 +0200 Subject: [PATCH 161/242] rust: devres: do not dereference to the internal Revocable We can't expose direct access to the internal Revocable, since this allows users to directly revoke the internal Revocable without Devres having the chance to synchronize with the devres callback -- we have to guarantee that the internal Revocable has been fully revoked before the device is fully unbound. Hence, remove the corresponding Deref implementation and, instead, provide indirect accessors for the internal Revocable. Note that we can still support Devres::revoke() by implementing the required synchronization (which would be almost identical to the synchronization in Devres::drop()). Fixes: 76c01ded724b ("rust: add devres abstraction") Reviewed-by: Benno Lossin Link: https://lore.kernel.org/r/20250611174827.380555-1-dakr@kernel.org Signed-off-by: Danilo Krummrich --- rust/kernel/devres.rs | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/rust/kernel/devres.rs b/rust/kernel/devres.rs index 2f74bce5ed9d..57502534d985 100644 --- a/rust/kernel/devres.rs +++ b/rust/kernel/devres.rs @@ -12,13 +12,11 @@ error::{Error, Result}, ffi::c_void, prelude::*, - revocable::Revocable, - sync::{Arc, Completion}, + revocable::{Revocable, RevocableGuard}, + sync::{rcu, Arc, Completion}, types::ARef, }; -use core::ops::Deref; - #[pin_data] struct DevresInner { dev: ARef, @@ -230,15 +228,22 @@ pub fn access<'a>(&'a self, dev: &'a Device) -> Result<&'a T> { // SAFETY: `dev` being the same device as the device this `Devres` has been created for // proves that `self.0.data` hasn't been revoked and is guaranteed to not be revoked as // long as `dev` lives; `dev` lives at least as long as `self`. - Ok(unsafe { self.deref().access() }) + Ok(unsafe { self.0.data.access() }) } -} -impl Deref for Devres { - type Target = Revocable; + /// [`Devres`] accessor for [`Revocable::try_access`]. + pub fn try_access(&self) -> Option> { + self.0.data.try_access() + } - fn deref(&self) -> &Self::Target { - &self.0.data + /// [`Devres`] accessor for [`Revocable::try_access_with`]. + pub fn try_access_with R>(&self, f: F) -> Option { + self.0.data.try_access_with(f) + } + + /// [`Devres`] accessor for [`Revocable::try_access_with_guard`]. + pub fn try_access_with_guard<'a>(&'a self, guard: &'a rcu::Guard) -> Option<&'a T> { + self.0.data.try_access_with_guard(guard) } } @@ -246,7 +251,7 @@ impl Drop for Devres { fn drop(&mut self) { // SAFETY: When `drop` runs, it is guaranteed that nobody is accessing the revocable data // anymore, hence it is safe not to wait for the grace period to finish. - if unsafe { self.revoke_nosync() } { + if unsafe { self.0.data.revoke_nosync() } { // We revoked `self.0.data` before the devres action did, hence try to remove it. if !DevresInner::remove_action(&self.0) { // We could not remove the devres action, which means that it now runs concurrently, From a6a7946bd691940cfe7289ae6dfb1f077516df72 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 13 Jun 2025 01:08:48 +0900 Subject: [PATCH 162/242] kbuild: move warnings about linux/export.h from W=1 to W=2 This hides excessive warnings, as nobody builds with W=2. Fixes: a934a57a42f6 ("scripts/misc-check: check missing #include when W=1") Fixes: 7d95680d64ac ("scripts/misc-check: check unnecessary #include when W=1") Signed-off-by: Masahiro Yamada Acked-by: Arnd Bergmann Reviewed-by: Nathan Chancellor Acked-by: Heiko Carstens --- Makefile | 3 --- scripts/misc-check | 15 ++++++++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 35e6e5240c61..69c534982415 100644 --- a/Makefile +++ b/Makefile @@ -1832,12 +1832,9 @@ rustfmtcheck: rustfmt # Misc # --------------------------------------------------------------------------- -# Run misc checks when ${KBUILD_EXTRA_WARN} contains 1 PHONY += misc-check -ifneq ($(findstring 1,$(KBUILD_EXTRA_WARN)),) misc-check: $(Q)$(srctree)/scripts/misc-check -endif all: misc-check diff --git a/scripts/misc-check b/scripts/misc-check index a74450e799d1..84f08da17b2c 100755 --- a/scripts/misc-check +++ b/scripts/misc-check @@ -62,6 +62,15 @@ check_unnecessary_include_linux_export_h () { xargs -r printf "%s: warning: EXPORT_SYMBOL() is not used, but #include is present\n" >&2 } -check_tracked_ignored_files -check_missing_include_linux_export_h -check_unnecessary_include_linux_export_h +case "${KBUILD_EXTRA_WARN}" in +*1*) + check_tracked_ignored_files + ;; +esac + +case "${KBUILD_EXTRA_WARN}" in +*2*) + check_missing_include_linux_export_h + check_unnecessary_include_linux_export_h + ;; +esac From 2f6b47b295518c3ba16fabb1dddbe6a319899acb Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Sat, 14 Jun 2025 00:55:33 +0000 Subject: [PATCH 163/242] gendwarfksyms: Fix structure type overrides As we always iterate through the entire die_map when expanding type strings, recursively processing referenced types in type_expand_child() is not actually necessary. Furthermore, the type_string kABI rule added in commit c9083467f7b9 ("gendwarfksyms: Add a kABI rule to override type strings") can fail to override type strings for structures due to a missing kabi_get_type_string() check in this function. Fix the issue by dropping the unnecessary recursion and moving the override check to type_expand(). Note that symbol versions are otherwise unchanged with this patch. Fixes: c9083467f7b9 ("gendwarfksyms: Add a kABI rule to override type strings") Reported-by: Giuliano Procida Signed-off-by: Sami Tolvanen Reviewed-by: Petr Pavlu Signed-off-by: Masahiro Yamada --- scripts/gendwarfksyms/gendwarfksyms.h | 14 +----- scripts/gendwarfksyms/types.c | 65 ++++++++------------------- 2 files changed, 21 insertions(+), 58 deletions(-) diff --git a/scripts/gendwarfksyms/gendwarfksyms.h b/scripts/gendwarfksyms/gendwarfksyms.h index 7dd03ffe0c5c..d9c06d2cb1df 100644 --- a/scripts/gendwarfksyms/gendwarfksyms.h +++ b/scripts/gendwarfksyms/gendwarfksyms.h @@ -216,24 +216,14 @@ int cache_get(struct cache *cache, unsigned long key); void cache_init(struct cache *cache); void cache_free(struct cache *cache); -static inline void __cache_mark_expanded(struct cache *cache, uintptr_t addr) -{ - cache_set(cache, addr, 1); -} - -static inline bool __cache_was_expanded(struct cache *cache, uintptr_t addr) -{ - return cache_get(cache, addr) == 1; -} - static inline void cache_mark_expanded(struct cache *cache, void *addr) { - __cache_mark_expanded(cache, (uintptr_t)addr); + cache_set(cache, (unsigned long)addr, 1); } static inline bool cache_was_expanded(struct cache *cache, void *addr) { - return __cache_was_expanded(cache, (uintptr_t)addr); + return cache_get(cache, (unsigned long)addr) == 1; } /* diff --git a/scripts/gendwarfksyms/types.c b/scripts/gendwarfksyms/types.c index 39ce1770e463..7bd459ea6c59 100644 --- a/scripts/gendwarfksyms/types.c +++ b/scripts/gendwarfksyms/types.c @@ -333,37 +333,11 @@ static void calculate_version(struct version *version, cache_free(&expansion_cache); } -static void __type_expand(struct die *cache, struct type_expansion *type, - bool recursive); - -static void type_expand_child(struct die *cache, struct type_expansion *type, - bool recursive) -{ - struct type_expansion child; - char *name; - - name = get_type_name(cache); - if (!name) { - __type_expand(cache, type, recursive); - return; - } - - if (recursive && !__cache_was_expanded(&expansion_cache, cache->addr)) { - __cache_mark_expanded(&expansion_cache, cache->addr); - type_expansion_init(&child); - __type_expand(cache, &child, true); - type_map_add(name, &child); - type_expansion_free(&child); - } - - type_expansion_append(type, name, name); -} - -static void __type_expand(struct die *cache, struct type_expansion *type, - bool recursive) +static void __type_expand(struct die *cache, struct type_expansion *type) { struct die_fragment *df; struct die *child; + char *name; list_for_each_entry(df, &cache->fragments, list) { switch (df->type) { @@ -379,7 +353,12 @@ static void __type_expand(struct die *cache, struct type_expansion *type, error("unknown child: %" PRIxPTR, df->data.addr); - type_expand_child(child, type, recursive); + name = get_type_name(child); + if (name) + type_expansion_append(type, name, name); + else + __type_expand(child, type); + break; case FRAGMENT_LINEBREAK: /* @@ -397,12 +376,17 @@ static void __type_expand(struct die *cache, struct type_expansion *type, } } -static void type_expand(struct die *cache, struct type_expansion *type, - bool recursive) +static void type_expand(const char *name, struct die *cache, + struct type_expansion *type) { + const char *override; + type_expansion_init(type); - __type_expand(cache, type, recursive); - cache_free(&expansion_cache); + + if (stable && kabi_get_type_string(name, &override)) + type_parse(name, override, type); + else + __type_expand(cache, type); } static void type_parse(const char *name, const char *str, @@ -416,8 +400,6 @@ static void type_parse(const char *name, const char *str, if (!*str) error("empty type string override for '%s'", name); - type_expansion_init(type); - for (pos = 0; str[pos]; ++pos) { bool empty; char marker = ' '; @@ -478,7 +460,6 @@ static void type_parse(const char *name, const char *str, static void expand_type(struct die *cache, void *arg) { struct type_expansion type; - const char *override; char *name; if (cache->mapped) @@ -504,11 +485,7 @@ static void expand_type(struct die *cache, void *arg) debug("%s", name); - if (stable && kabi_get_type_string(name, &override)) - type_parse(name, override, &type); - else - type_expand(cache, &type, true); - + type_expand(name, cache, &type); type_map_add(name, &type); type_expansion_free(&type); free(name); @@ -518,7 +495,6 @@ static void expand_symbol(struct symbol *sym, void *arg) { struct type_expansion type; struct version version; - const char *override; struct die *cache; /* @@ -532,10 +508,7 @@ static void expand_symbol(struct symbol *sym, void *arg) if (__die_map_get(sym->die_addr, DIE_SYMBOL, &cache)) return; /* We'll warn about missing CRCs later. */ - if (stable && kabi_get_type_string(sym->name, &override)) - type_parse(sym->name, override, &type); - else - type_expand(cache, &type, false); + type_expand(sym->name, cache, &type); /* If the symbol already has a version, don't calculate it again. */ if (sym->state != SYMBOL_PROCESSED) { From e04c78d86a9699d136910cfc0bdcf01087e3267e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 15 Jun 2025 13:49:41 -0700 Subject: [PATCH 164/242] Linux 6.16-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 69c534982415..ba0827a1fccd 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 16 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Baby Opossum Posse # *DOCUMENTATION* From e202196b8aa249d78ab87eae56bbe0e71e3dc39c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 29 May 2025 10:31:17 -0700 Subject: [PATCH 165/242] lib/crypto: Annotate crypto strings with nonstring Annotate various keys, ivs, and other byte arrays with __nonstring so that static initializers will not complain about truncating the trailing NUL byte under GCC 15 with -Wunterminated-string-initialization enabled. Silences many warnings like: ../lib/crypto/aesgcm.c:642:27: warning: initializer-string for array of 'unsigned char' truncates NUL terminator but destination lacks 'nonstring' attribute (13 chars into 12 available) [-Wunterminated-string-initialization] 642 | .iv = "\xca\xfe\xba\xbe\xfa\xce\xdb\xad" | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20250529173113.work.760-kees@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/aescfb.c | 8 ++++---- lib/crypto/aesgcm.c | 46 ++++++++++++++++++++++----------------------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/lib/crypto/aescfb.c b/lib/crypto/aescfb.c index 437613265e14..2f09ae92ffa0 100644 --- a/lib/crypto/aescfb.c +++ b/lib/crypto/aescfb.c @@ -106,11 +106,11 @@ MODULE_LICENSE("GPL"); */ static struct { - u8 ptext[64]; - u8 ctext[64]; + u8 ptext[64] __nonstring; + u8 ctext[64] __nonstring; - u8 key[AES_MAX_KEY_SIZE]; - u8 iv[AES_BLOCK_SIZE]; + u8 key[AES_MAX_KEY_SIZE] __nonstring; + u8 iv[AES_BLOCK_SIZE] __nonstring; int klen; int len; diff --git a/lib/crypto/aesgcm.c b/lib/crypto/aesgcm.c index 277824d6b4af..faa4dee9bb1b 100644 --- a/lib/crypto/aesgcm.c +++ b/lib/crypto/aesgcm.c @@ -205,19 +205,19 @@ MODULE_LICENSE("GPL"); * Test code below. Vectors taken from crypto/testmgr.h */ -static const u8 __initconst ctext0[16] = +static const u8 __initconst ctext0[16] __nonstring = "\x58\xe2\xfc\xce\xfa\x7e\x30\x61" "\x36\x7f\x1d\x57\xa4\xe7\x45\x5a"; static const u8 __initconst ptext1[16]; -static const u8 __initconst ctext1[32] = +static const u8 __initconst ctext1[32] __nonstring = "\x03\x88\xda\xce\x60\xb6\xa3\x92" "\xf3\x28\xc2\xb9\x71\xb2\xfe\x78" "\xab\x6e\x47\xd4\x2c\xec\x13\xbd" "\xf5\x3a\x67\xb2\x12\x57\xbd\xdf"; -static const u8 __initconst ptext2[64] = +static const u8 __initconst ptext2[64] __nonstring = "\xd9\x31\x32\x25\xf8\x84\x06\xe5" "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" "\x86\xa7\xa9\x53\x15\x34\xf7\xda" @@ -227,7 +227,7 @@ static const u8 __initconst ptext2[64] = "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" "\xba\x63\x7b\x39\x1a\xaf\xd2\x55"; -static const u8 __initconst ctext2[80] = +static const u8 __initconst ctext2[80] __nonstring = "\x42\x83\x1e\xc2\x21\x77\x74\x24" "\x4b\x72\x21\xb7\x84\xd0\xd4\x9c" "\xe3\xaa\x21\x2f\x2c\x02\xa4\xe0" @@ -239,7 +239,7 @@ static const u8 __initconst ctext2[80] = "\x4d\x5c\x2a\xf3\x27\xcd\x64\xa6" "\x2c\xf3\x5a\xbd\x2b\xa6\xfa\xb4"; -static const u8 __initconst ptext3[60] = +static const u8 __initconst ptext3[60] __nonstring = "\xd9\x31\x32\x25\xf8\x84\x06\xe5" "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" "\x86\xa7\xa9\x53\x15\x34\xf7\xda" @@ -249,7 +249,7 @@ static const u8 __initconst ptext3[60] = "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" "\xba\x63\x7b\x39"; -static const u8 __initconst ctext3[76] = +static const u8 __initconst ctext3[76] __nonstring = "\x42\x83\x1e\xc2\x21\x77\x74\x24" "\x4b\x72\x21\xb7\x84\xd0\xd4\x9c" "\xe3\xaa\x21\x2f\x2c\x02\xa4\xe0" @@ -261,17 +261,17 @@ static const u8 __initconst ctext3[76] = "\x5b\xc9\x4f\xbc\x32\x21\xa5\xdb" "\x94\xfa\xe9\x5a\xe7\x12\x1a\x47"; -static const u8 __initconst ctext4[16] = +static const u8 __initconst ctext4[16] __nonstring = "\xcd\x33\xb2\x8a\xc7\x73\xf7\x4b" "\xa0\x0e\xd1\xf3\x12\x57\x24\x35"; -static const u8 __initconst ctext5[32] = +static const u8 __initconst ctext5[32] __nonstring = "\x98\xe7\x24\x7c\x07\xf0\xfe\x41" "\x1c\x26\x7e\x43\x84\xb0\xf6\x00" "\x2f\xf5\x8d\x80\x03\x39\x27\xab" "\x8e\xf4\xd4\x58\x75\x14\xf0\xfb"; -static const u8 __initconst ptext6[64] = +static const u8 __initconst ptext6[64] __nonstring = "\xd9\x31\x32\x25\xf8\x84\x06\xe5" "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" "\x86\xa7\xa9\x53\x15\x34\xf7\xda" @@ -281,7 +281,7 @@ static const u8 __initconst ptext6[64] = "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" "\xba\x63\x7b\x39\x1a\xaf\xd2\x55"; -static const u8 __initconst ctext6[80] = +static const u8 __initconst ctext6[80] __nonstring = "\x39\x80\xca\x0b\x3c\x00\xe8\x41" "\xeb\x06\xfa\xc4\x87\x2a\x27\x57" "\x85\x9e\x1c\xea\xa6\xef\xd9\x84" @@ -293,17 +293,17 @@ static const u8 __initconst ctext6[80] = "\x99\x24\xa7\xc8\x58\x73\x36\xbf" "\xb1\x18\x02\x4d\xb8\x67\x4a\x14"; -static const u8 __initconst ctext7[16] = +static const u8 __initconst ctext7[16] __nonstring = "\x53\x0f\x8a\xfb\xc7\x45\x36\xb9" "\xa9\x63\xb4\xf1\xc4\xcb\x73\x8b"; -static const u8 __initconst ctext8[32] = +static const u8 __initconst ctext8[32] __nonstring = "\xce\xa7\x40\x3d\x4d\x60\x6b\x6e" "\x07\x4e\xc5\xd3\xba\xf3\x9d\x18" "\xd0\xd1\xc8\xa7\x99\x99\x6b\xf0" "\x26\x5b\x98\xb5\xd4\x8a\xb9\x19"; -static const u8 __initconst ptext9[64] = +static const u8 __initconst ptext9[64] __nonstring = "\xd9\x31\x32\x25\xf8\x84\x06\xe5" "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" "\x86\xa7\xa9\x53\x15\x34\xf7\xda" @@ -313,7 +313,7 @@ static const u8 __initconst ptext9[64] = "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" "\xba\x63\x7b\x39\x1a\xaf\xd2\x55"; -static const u8 __initconst ctext9[80] = +static const u8 __initconst ctext9[80] __nonstring = "\x52\x2d\xc1\xf0\x99\x56\x7d\x07" "\xf4\x7f\x37\xa3\x2a\x84\x42\x7d" "\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9" @@ -325,7 +325,7 @@ static const u8 __initconst ctext9[80] = "\xb0\x94\xda\xc5\xd9\x34\x71\xbd" "\xec\x1a\x50\x22\x70\xe3\xcc\x6c"; -static const u8 __initconst ptext10[60] = +static const u8 __initconst ptext10[60] __nonstring = "\xd9\x31\x32\x25\xf8\x84\x06\xe5" "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" "\x86\xa7\xa9\x53\x15\x34\xf7\xda" @@ -335,7 +335,7 @@ static const u8 __initconst ptext10[60] = "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" "\xba\x63\x7b\x39"; -static const u8 __initconst ctext10[76] = +static const u8 __initconst ctext10[76] __nonstring = "\x52\x2d\xc1\xf0\x99\x56\x7d\x07" "\xf4\x7f\x37\xa3\x2a\x84\x42\x7d" "\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9" @@ -347,7 +347,7 @@ static const u8 __initconst ctext10[76] = "\x76\xfc\x6e\xce\x0f\x4e\x17\x68" "\xcd\xdf\x88\x53\xbb\x2d\x55\x1b"; -static const u8 __initconst ptext11[60] = +static const u8 __initconst ptext11[60] __nonstring = "\xd9\x31\x32\x25\xf8\x84\x06\xe5" "\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" "\x86\xa7\xa9\x53\x15\x34\xf7\xda" @@ -357,7 +357,7 @@ static const u8 __initconst ptext11[60] = "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57" "\xba\x63\x7b\x39"; -static const u8 __initconst ctext11[76] = +static const u8 __initconst ctext11[76] __nonstring = "\x39\x80\xca\x0b\x3c\x00\xe8\x41" "\xeb\x06\xfa\xc4\x87\x2a\x27\x57" "\x85\x9e\x1c\xea\xa6\xef\xd9\x84" @@ -369,7 +369,7 @@ static const u8 __initconst ctext11[76] = "\x25\x19\x49\x8e\x80\xf1\x47\x8f" "\x37\xba\x55\xbd\x6d\x27\x61\x8c"; -static const u8 __initconst ptext12[719] = +static const u8 __initconst ptext12[719] __nonstring = "\x42\xc1\xcc\x08\x48\x6f\x41\x3f" "\x2f\x11\x66\x8b\x2a\x16\xf0\xe0" "\x58\x83\xf0\xc3\x70\x14\xc0\x5b" @@ -461,7 +461,7 @@ static const u8 __initconst ptext12[719] = "\x59\xfa\xfa\xaa\x44\x04\x01\xa7" "\xa4\x78\xdb\x74\x3d\x8b\xb5"; -static const u8 __initconst ctext12[735] = +static const u8 __initconst ctext12[735] __nonstring = "\x84\x0b\xdb\xd5\xb7\xa8\xfe\x20" "\xbb\xb1\x12\x7f\x41\xea\xb3\xc0" "\xa2\xb4\x37\x19\x11\x58\xb6\x0b" @@ -559,9 +559,9 @@ static struct { const u8 *ptext; const u8 *ctext; - u8 key[AES_MAX_KEY_SIZE]; - u8 iv[GCM_AES_IV_SIZE]; - u8 assoc[20]; + u8 key[AES_MAX_KEY_SIZE] __nonstring; + u8 iv[GCM_AES_IV_SIZE] __nonstring; + u8 assoc[20] __nonstring; int klen; int clen; From 2f13daee2a72bb962f5fd356c3a263a6f16da965 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 9 Jun 2025 15:45:20 -0700 Subject: [PATCH 166/242] lib/crypto/curve25519-hacl64: Disable KASAN with clang-17 and older After commit 6f110a5e4f99 ("Disable SLUB_TINY for build testing"), which causes CONFIG_KASAN to be enabled in allmodconfig again, arm64 allmodconfig builds with clang-17 and older show an instance of -Wframe-larger-than (which breaks the build with CONFIG_WERROR=y): lib/crypto/curve25519-hacl64.c:757:6: error: stack frame size (2336) exceeds limit (2048) in 'curve25519_generic' [-Werror,-Wframe-larger-than] 757 | void curve25519_generic(u8 mypublic[CURVE25519_KEY_SIZE], | ^ When KASAN is disabled, the stack usage is roughly quartered: lib/crypto/curve25519-hacl64.c:757:6: error: stack frame size (608) exceeds limit (128) in 'curve25519_generic' [-Werror,-Wframe-larger-than] 757 | void curve25519_generic(u8 mypublic[CURVE25519_KEY_SIZE], | ^ Using '-Rpass-analysis=stack-frame-layout' shows the following variables and many, many 8-byte spills when KASAN is enabled: Offset: [SP-144], Type: Variable, Align: 8, Size: 40 Offset: [SP-464], Type: Variable, Align: 8, Size: 320 Offset: [SP-784], Type: Variable, Align: 8, Size: 320 Offset: [SP-864], Type: Variable, Align: 32, Size: 80 Offset: [SP-896], Type: Variable, Align: 32, Size: 32 Offset: [SP-1016], Type: Variable, Align: 8, Size: 120 When KASAN is disabled, there are still spills but not at many and the variables list is smaller: Offset: [SP-192], Type: Variable, Align: 32, Size: 80 Offset: [SP-224], Type: Variable, Align: 32, Size: 32 Offset: [SP-344], Type: Variable, Align: 8, Size: 120 Disable KASAN for this file when using clang-17 or older to avoid blowing out the stack, clearing up the warning. Signed-off-by: Nathan Chancellor Acked-by: "Jason A. Donenfeld" Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250609-curve25519-hacl64-disable-kasan-clang-v1-1-08ea0ac5ccff@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 3e79283b617d..18664127ecd6 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -35,6 +35,10 @@ obj-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += libcurve25519-generic.o libcurve25519-generic-y := curve25519-fiat32.o libcurve25519-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := curve25519-hacl64.o libcurve25519-generic-y += curve25519-generic.o +# clang versions prior to 18 may blow out the stack with KASAN +ifeq ($(call clang-min-version, 180000),) +KASAN_SANITIZE_curve25519-hacl64.o := n +endif obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o libcurve25519-y += curve25519.o From dd2d6b7f6f519d078a866a36a625b0297d81c5bc Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Fri, 13 Jun 2025 11:11:11 +0100 Subject: [PATCH 167/242] fs: drop assert in file_seek_cur_needs_f_lock The assert in function file_seek_cur_needs_f_lock() can be triggered very easily because there are many users of vfs_llseek() (such as overlayfs) that do their custom locking around llseek instead of relying on fdget_pos(). Just drop the overzealous assertion. Fixes: da06e3c51794 ("fs: don't needlessly acquire f_lock") Suggested-by: Jan Kara Suggested-by: Mateusz Guzik Signed-off-by: Luis Henriques Link: https://lore.kernel.org/20250613101111.17716-1-luis@igalia.com Signed-off-by: Christian Brauner --- fs/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/file.c b/fs/file.c index 3a3146664cf3..b6db031545e6 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1198,8 +1198,12 @@ bool file_seek_cur_needs_f_lock(struct file *file) if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared) return false; - VFS_WARN_ON_ONCE((file_count(file) > 1) && - !mutex_is_locked(&file->f_pos_lock)); + /* + * Note that we are not guaranteed to be called after fdget_pos() on + * this file obj, in which case the caller is expected to provide the + * appropriate locking. + */ + return true; } From b8b8663ac82a2595a0922d30014f60c5547084de Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Sat, 14 Jun 2025 11:20:23 +0200 Subject: [PATCH 168/242] mailmap: add entry for Danilo Krummrich Add an entry to remap my Red Hat address. Link: https://lore.kernel.org/r/20250614092054.161658-1-dakr@kernel.org Signed-off-by: Danilo Krummrich --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index c8a21d72241f..8b0aac19efcd 100644 --- a/.mailmap +++ b/.mailmap @@ -197,6 +197,7 @@ Daniel Borkmann Daniel Borkmann Daniel Borkmann Daniel Borkmann +Danilo Krummrich David Brownell David Collins David Heidelberg From 14c9ede9ca4cd078ad76a6ab9617b81074eb58bf Mon Sep 17 00:00:00 2001 From: Gui-Dong Han Date: Fri, 6 Jun 2025 07:16:40 +0000 Subject: [PATCH 169/242] hwmon: (ftsteutates) Fix TOCTOU race in fts_read() In the fts_read() function, when handling hwmon_pwm_auto_channels_temp, the code accesses the shared variable data->fan_source[channel] twice without holding any locks. It is first checked against FTS_FAN_SOURCE_INVALID, and if the check passes, it is read again when used as an argument to the BIT() macro. This creates a Time-of-Check to Time-of-Use (TOCTOU) race condition. Another thread executing fts_update_device() can modify the value of data->fan_source[channel] between the check and its use. If the value is changed to FTS_FAN_SOURCE_INVALID (0xff) during this window, the BIT() macro will be called with a large shift value (BIT(255)). A bit shift by a value greater than or equal to the type width is undefined behavior and can lead to a crash or incorrect values being returned to userspace. Fix this by reading data->fan_source[channel] into a local variable once, eliminating the race condition. Additionally, add a bounds check to ensure the value is less than BITS_PER_LONG before passing it to the BIT() macro, making the code more robust against undefined behavior. This possible bug was found by an experimental static analysis tool developed by our team. Fixes: 1c5759d8ce05 ("hwmon: (ftsteutates) Replace fanX_source with pwmX_auto_channels_temp") Cc: stable@vger.kernel.org Signed-off-by: Gui-Dong Han Link: https://lore.kernel.org/r/20250606071640.501262-1-hanguidong02@gmail.com Signed-off-by: Guenter Roeck --- drivers/hwmon/ftsteutates.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/hwmon/ftsteutates.c b/drivers/hwmon/ftsteutates.c index a3a07662e491..8aeec16a7a90 100644 --- a/drivers/hwmon/ftsteutates.c +++ b/drivers/hwmon/ftsteutates.c @@ -423,13 +423,16 @@ static int fts_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, break; case hwmon_pwm: switch (attr) { - case hwmon_pwm_auto_channels_temp: - if (data->fan_source[channel] == FTS_FAN_SOURCE_INVALID) + case hwmon_pwm_auto_channels_temp: { + u8 fan_source = data->fan_source[channel]; + + if (fan_source == FTS_FAN_SOURCE_INVALID || fan_source >= BITS_PER_LONG) *val = 0; else - *val = BIT(data->fan_source[channel]); + *val = BIT(fan_source); return 0; + } default: break; } From 744c2fe950e936c4d62430de899d6253424200ed Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Jun 2025 11:23:06 +0200 Subject: [PATCH 170/242] hwmon: (occ) Rework attribute registration for stack usage clang produces an output with excessive stack usage when building the occ_setup_sensor_attrs() function, apparently the result of having a lot of struct literals and building with the -fno-strict-overflow option that leads clang to skip some optimization in case the 'attr' pointer overruns: drivers/hwmon/occ/common.c:775:12: error: stack frame size (1392) exceeds limit (1280) in 'occ_setup_sensor_attrs' [-Werror,-Wframe-larger-than] Replace the custom macros for initializing the attributes with a simpler function call that does not run into this corner case. Link: https://godbolt.org/z/Wf1Yx76a5 Fixes: 54076cb3b5ff ("hwmon (occ): Add sensor attributes and register hwmon device") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20250610092315.2640039-1-arnd@kernel.org Signed-off-by: Guenter Roeck --- drivers/hwmon/occ/common.c | 210 +++++++++++++++---------------------- 1 file changed, 84 insertions(+), 126 deletions(-) diff --git a/drivers/hwmon/occ/common.c b/drivers/hwmon/occ/common.c index 9486db249c64..9029ad53790b 100644 --- a/drivers/hwmon/occ/common.c +++ b/drivers/hwmon/occ/common.c @@ -747,28 +747,29 @@ static ssize_t occ_show_extended(struct device *dev, } /* - * Some helper macros to make it easier to define an occ_attribute. Since these - * are dynamically allocated, we shouldn't use the existing kernel macros which + * A helper to make it easier to define an occ_attribute. Since these + * are dynamically allocated, we cannot use the existing kernel macros which * stringify the name argument. */ -#define ATTR_OCC(_name, _mode, _show, _store) { \ - .attr = { \ - .name = _name, \ - .mode = VERIFY_OCTAL_PERMISSIONS(_mode), \ - }, \ - .show = _show, \ - .store = _store, \ -} +static void occ_init_attribute(struct occ_attribute *attr, int mode, + ssize_t (*show)(struct device *dev, struct device_attribute *attr, char *buf), + ssize_t (*store)(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count), + int nr, int index, const char *fmt, ...) +{ + va_list args; -#define SENSOR_ATTR_OCC(_name, _mode, _show, _store, _nr, _index) { \ - .dev_attr = ATTR_OCC(_name, _mode, _show, _store), \ - .index = _index, \ - .nr = _nr, \ -} + va_start(args, fmt); + vsnprintf(attr->name, sizeof(attr->name), fmt, args); + va_end(args); -#define OCC_INIT_ATTR(_name, _mode, _show, _store, _nr, _index) \ - ((struct sensor_device_attribute_2) \ - SENSOR_ATTR_OCC(_name, _mode, _show, _store, _nr, _index)) + attr->sensor.dev_attr.attr.name = attr->name; + attr->sensor.dev_attr.attr.mode = mode; + attr->sensor.dev_attr.show = show; + attr->sensor.dev_attr.store = store; + attr->sensor.index = index; + attr->sensor.nr = nr; +} /* * Allocate and instatiate sensor_device_attribute_2s. It's most efficient to @@ -855,14 +856,15 @@ static int occ_setup_sensor_attrs(struct occ *occ) sensors->extended.num_sensors = 0; } - occ->attrs = devm_kzalloc(dev, sizeof(*occ->attrs) * num_attrs, + occ->attrs = devm_kcalloc(dev, num_attrs, sizeof(*occ->attrs), GFP_KERNEL); if (!occ->attrs) return -ENOMEM; /* null-terminated list */ - occ->group.attrs = devm_kzalloc(dev, sizeof(*occ->group.attrs) * - num_attrs + 1, GFP_KERNEL); + occ->group.attrs = devm_kcalloc(dev, num_attrs + 1, + sizeof(*occ->group.attrs), + GFP_KERNEL); if (!occ->group.attrs) return -ENOMEM; @@ -872,43 +874,33 @@ static int occ_setup_sensor_attrs(struct occ *occ) s = i + 1; temp = ((struct temp_sensor_2 *)sensors->temp.data) + i; - snprintf(attr->name, sizeof(attr->name), "temp%d_label", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, show_temp, NULL, - 0, i); + occ_init_attribute(attr, 0444, show_temp, NULL, + 0, i, "temp%d_label", s); attr++; if (sensors->temp.version == 2 && temp->fru_type == OCC_FRU_TYPE_VRM) { - snprintf(attr->name, sizeof(attr->name), - "temp%d_alarm", s); + occ_init_attribute(attr, 0444, show_temp, NULL, + 1, i, "temp%d_alarm", s); } else { - snprintf(attr->name, sizeof(attr->name), - "temp%d_input", s); + occ_init_attribute(attr, 0444, show_temp, NULL, + 1, i, "temp%d_input", s); } - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, show_temp, NULL, - 1, i); attr++; if (sensors->temp.version > 1) { - snprintf(attr->name, sizeof(attr->name), - "temp%d_fru_type", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, - show_temp, NULL, 2, i); + occ_init_attribute(attr, 0444, show_temp, NULL, + 2, i, "temp%d_fru_type", s); attr++; - snprintf(attr->name, sizeof(attr->name), - "temp%d_fault", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, - show_temp, NULL, 3, i); + occ_init_attribute(attr, 0444, show_temp, NULL, + 3, i, "temp%d_fault", s); attr++; if (sensors->temp.version == 0x10) { - snprintf(attr->name, sizeof(attr->name), - "temp%d_max", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, - show_temp, NULL, - 4, i); + occ_init_attribute(attr, 0444, show_temp, NULL, + 4, i, "temp%d_max", s); attr++; } } @@ -917,14 +909,12 @@ static int occ_setup_sensor_attrs(struct occ *occ) for (i = 0; i < sensors->freq.num_sensors; ++i) { s = i + 1; - snprintf(attr->name, sizeof(attr->name), "freq%d_label", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, show_freq, NULL, - 0, i); + occ_init_attribute(attr, 0444, show_freq, NULL, + 0, i, "freq%d_label", s); attr++; - snprintf(attr->name, sizeof(attr->name), "freq%d_input", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, show_freq, NULL, - 1, i); + occ_init_attribute(attr, 0444, show_freq, NULL, + 1, i, "freq%d_input", s); attr++; } @@ -940,32 +930,24 @@ static int occ_setup_sensor_attrs(struct occ *occ) s = (i * 4) + 1; for (j = 0; j < 4; ++j) { - snprintf(attr->name, sizeof(attr->name), - "power%d_label", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, - show_power, NULL, - nr++, i); + occ_init_attribute(attr, 0444, show_power, + NULL, nr++, i, + "power%d_label", s); attr++; - snprintf(attr->name, sizeof(attr->name), - "power%d_average", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, - show_power, NULL, - nr++, i); + occ_init_attribute(attr, 0444, show_power, + NULL, nr++, i, + "power%d_average", s); attr++; - snprintf(attr->name, sizeof(attr->name), - "power%d_average_interval", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, - show_power, NULL, - nr++, i); + occ_init_attribute(attr, 0444, show_power, + NULL, nr++, i, + "power%d_average_interval", s); attr++; - snprintf(attr->name, sizeof(attr->name), - "power%d_input", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, - show_power, NULL, - nr++, i); + occ_init_attribute(attr, 0444, show_power, + NULL, nr++, i, + "power%d_input", s); attr++; s++; @@ -977,28 +959,20 @@ static int occ_setup_sensor_attrs(struct occ *occ) for (i = 0; i < sensors->power.num_sensors; ++i) { s = i + 1; - snprintf(attr->name, sizeof(attr->name), - "power%d_label", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, - show_power, NULL, 0, i); + occ_init_attribute(attr, 0444, show_power, NULL, + 0, i, "power%d_label", s); attr++; - snprintf(attr->name, sizeof(attr->name), - "power%d_average", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, - show_power, NULL, 1, i); + occ_init_attribute(attr, 0444, show_power, NULL, + 1, i, "power%d_average", s); attr++; - snprintf(attr->name, sizeof(attr->name), - "power%d_average_interval", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, - show_power, NULL, 2, i); + occ_init_attribute(attr, 0444, show_power, NULL, + 2, i, "power%d_average_interval", s); attr++; - snprintf(attr->name, sizeof(attr->name), - "power%d_input", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, - show_power, NULL, 3, i); + occ_init_attribute(attr, 0444, show_power, NULL, + 3, i, "power%d_input", s); attr++; } @@ -1006,56 +980,43 @@ static int occ_setup_sensor_attrs(struct occ *occ) } if (sensors->caps.num_sensors >= 1) { - snprintf(attr->name, sizeof(attr->name), "power%d_label", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, show_caps, NULL, - 0, 0); + occ_init_attribute(attr, 0444, show_caps, NULL, + 0, 0, "power%d_label", s); attr++; - snprintf(attr->name, sizeof(attr->name), "power%d_cap", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, show_caps, NULL, - 1, 0); + occ_init_attribute(attr, 0444, show_caps, NULL, + 1, 0, "power%d_cap", s); attr++; - snprintf(attr->name, sizeof(attr->name), "power%d_input", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, show_caps, NULL, - 2, 0); + occ_init_attribute(attr, 0444, show_caps, NULL, + 2, 0, "power%d_input", s); attr++; - snprintf(attr->name, sizeof(attr->name), - "power%d_cap_not_redundant", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, show_caps, NULL, - 3, 0); + occ_init_attribute(attr, 0444, show_caps, NULL, + 3, 0, "power%d_cap_not_redundant", s); attr++; - snprintf(attr->name, sizeof(attr->name), "power%d_cap_max", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, show_caps, NULL, - 4, 0); + occ_init_attribute(attr, 0444, show_caps, NULL, + 4, 0, "power%d_cap_max", s); attr++; - snprintf(attr->name, sizeof(attr->name), "power%d_cap_min", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, show_caps, NULL, - 5, 0); + occ_init_attribute(attr, 0444, show_caps, NULL, + 5, 0, "power%d_cap_min", s); attr++; - snprintf(attr->name, sizeof(attr->name), "power%d_cap_user", - s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0644, show_caps, - occ_store_caps_user, 6, 0); + occ_init_attribute(attr, 0644, show_caps, occ_store_caps_user, + 6, 0, "power%d_cap_user", s); attr++; if (sensors->caps.version > 1) { - snprintf(attr->name, sizeof(attr->name), - "power%d_cap_user_source", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, - show_caps, NULL, 7, 0); + occ_init_attribute(attr, 0444, show_caps, NULL, + 7, 0, "power%d_cap_user_source", s); attr++; if (sensors->caps.version > 2) { - snprintf(attr->name, sizeof(attr->name), - "power%d_cap_min_soft", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, - show_caps, NULL, - 8, 0); + occ_init_attribute(attr, 0444, show_caps, NULL, + 8, 0, + "power%d_cap_min_soft", s); attr++; } } @@ -1064,19 +1025,16 @@ static int occ_setup_sensor_attrs(struct occ *occ) for (i = 0; i < sensors->extended.num_sensors; ++i) { s = i + 1; - snprintf(attr->name, sizeof(attr->name), "extn%d_label", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, - occ_show_extended, NULL, 0, i); + occ_init_attribute(attr, 0444, occ_show_extended, NULL, + 0, i, "extn%d_label", s); attr++; - snprintf(attr->name, sizeof(attr->name), "extn%d_flags", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, - occ_show_extended, NULL, 1, i); + occ_init_attribute(attr, 0444, occ_show_extended, NULL, + 1, i, "extn%d_flags", s); attr++; - snprintf(attr->name, sizeof(attr->name), "extn%d_input", s); - attr->sensor = OCC_INIT_ATTR(attr->name, 0444, - occ_show_extended, NULL, 2, i); + occ_init_attribute(attr, 0444, occ_show_extended, NULL, + 2, i, "extn%d_input", s); attr++; } From 2c021b45c154958566aad0cae9f74ab26a2d5732 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Jun 2025 11:25:49 +0200 Subject: [PATCH 171/242] hwmon: (occ) fix unaligned accesses Passing a pointer to an unaligned integer as a function argument is undefined behavior: drivers/hwmon/occ/common.c:492:27: warning: taking address of packed member 'accumulator' of class or structure 'power_sensor_2' may result in an unaligned pointer value [-Waddress-of-packed-member] 492 | val = occ_get_powr_avg(&power->accumulator, | ^~~~~~~~~~~~~~~~~~ drivers/hwmon/occ/common.c:493:13: warning: taking address of packed member 'update_tag' of class or structure 'power_sensor_2' may result in an unaligned pointer value [-Waddress-of-packed-member] 493 | &power->update_tag); | ^~~~~~~~~~~~~~~~~ Move the get_unaligned() calls out of the function and pass these through argument registers instead. Fixes: c10e753d43eb ("hwmon (occ): Add sensor types and versions") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20250610092553.2641094-1-arnd@kernel.org Signed-off-by: Guenter Roeck --- drivers/hwmon/occ/common.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/hwmon/occ/common.c b/drivers/hwmon/occ/common.c index 9029ad53790b..b3694a4209b9 100644 --- a/drivers/hwmon/occ/common.c +++ b/drivers/hwmon/occ/common.c @@ -459,12 +459,10 @@ static ssize_t occ_show_power_1(struct device *dev, return sysfs_emit(buf, "%llu\n", val); } -static u64 occ_get_powr_avg(u64 *accum, u32 *samples) +static u64 occ_get_powr_avg(u64 accum, u32 samples) { - u64 divisor = get_unaligned_be32(samples); - - return (divisor == 0) ? 0 : - div64_u64(get_unaligned_be64(accum) * 1000000ULL, divisor); + return (samples == 0) ? 0 : + mul_u64_u32_div(accum, 1000000UL, samples); } static ssize_t occ_show_power_2(struct device *dev, @@ -489,8 +487,8 @@ static ssize_t occ_show_power_2(struct device *dev, get_unaligned_be32(&power->sensor_id), power->function_id, power->apss_channel); case 1: - val = occ_get_powr_avg(&power->accumulator, - &power->update_tag); + val = occ_get_powr_avg(get_unaligned_be64(&power->accumulator), + get_unaligned_be32(&power->update_tag)); break; case 2: val = (u64)get_unaligned_be32(&power->update_tag) * @@ -527,8 +525,8 @@ static ssize_t occ_show_power_a0(struct device *dev, return sysfs_emit(buf, "%u_system\n", get_unaligned_be32(&power->sensor_id)); case 1: - val = occ_get_powr_avg(&power->system.accumulator, - &power->system.update_tag); + val = occ_get_powr_avg(get_unaligned_be64(&power->system.accumulator), + get_unaligned_be32(&power->system.update_tag)); break; case 2: val = (u64)get_unaligned_be32(&power->system.update_tag) * @@ -541,8 +539,8 @@ static ssize_t occ_show_power_a0(struct device *dev, return sysfs_emit(buf, "%u_proc\n", get_unaligned_be32(&power->sensor_id)); case 5: - val = occ_get_powr_avg(&power->proc.accumulator, - &power->proc.update_tag); + val = occ_get_powr_avg(get_unaligned_be64(&power->proc.accumulator), + get_unaligned_be32(&power->proc.update_tag)); break; case 6: val = (u64)get_unaligned_be32(&power->proc.update_tag) * @@ -555,8 +553,8 @@ static ssize_t occ_show_power_a0(struct device *dev, return sysfs_emit(buf, "%u_vdd\n", get_unaligned_be32(&power->sensor_id)); case 9: - val = occ_get_powr_avg(&power->vdd.accumulator, - &power->vdd.update_tag); + val = occ_get_powr_avg(get_unaligned_be64(&power->vdd.accumulator), + get_unaligned_be32(&power->vdd.update_tag)); break; case 10: val = (u64)get_unaligned_be32(&power->vdd.update_tag) * @@ -569,8 +567,8 @@ static ssize_t occ_show_power_a0(struct device *dev, return sysfs_emit(buf, "%u_vdn\n", get_unaligned_be32(&power->sensor_id)); case 13: - val = occ_get_powr_avg(&power->vdn.accumulator, - &power->vdn.update_tag); + val = occ_get_powr_avg(get_unaligned_be64(&power->vdn.accumulator), + get_unaligned_be32(&power->vdn.update_tag)); break; case 14: val = (u64)get_unaligned_be32(&power->vdn.update_tag) * From c25892b7a1744355e16281cd24a9b59ec15ec974 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Wed, 11 Jun 2025 17:26:12 +0100 Subject: [PATCH 172/242] hwmon: (ltc4282) avoid repeated register write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fault enabled bits were being mistankenly enabled twice in case the FW property is present. Remove one of the writes. Fixes: cbc29538dbf7 ("hwmon: Add driver for LTC4282") Signed-off-by: Nuno Sá Link: https://lore.kernel.org/r/20250611-fix-ltc4282-repetead-write-v1-1-fe46edd08cf1@analog.com Signed-off-by: Guenter Roeck --- drivers/hwmon/ltc4282.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/hwmon/ltc4282.c b/drivers/hwmon/ltc4282.c index 7f38d2696239..f607fe8f7937 100644 --- a/drivers/hwmon/ltc4282.c +++ b/drivers/hwmon/ltc4282.c @@ -1511,13 +1511,6 @@ static int ltc4282_setup(struct ltc4282_state *st, struct device *dev) return ret; } - if (device_property_read_bool(dev, "adi,fault-log-enable")) { - ret = regmap_set_bits(st->map, LTC4282_ADC_CTRL, - LTC4282_FAULT_LOG_EN_MASK); - if (ret) - return ret; - } - if (device_property_read_bool(dev, "adi,fault-log-enable")) { ret = regmap_set_bits(st->map, LTC4282_ADC_CTRL, LTC4282_FAULT_LOG_EN_MASK); if (ret) From 9d4204a8106fe7dc80e3f2e440c8f2ba1ba47319 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 15 Jun 2025 18:06:54 -0700 Subject: [PATCH 173/242] lib/crypto/poly1305: Fix arm64's poly1305_blocks_arch() For some reason arm64's Poly1305 code got changed to ignore the padbit argument. As a result, the output is incorrect when the message length is not a multiple of 16 (which is not reached with the standard ChaCha20Poly1305, but bcachefs could reach this). Fix this. Fixes: a59e5468a921 ("crypto: arm64/poly1305 - Add block-only interface") Reported-by: Kent Overstreet Tested-by: Kent Overstreet Link: https://lore.kernel.org/r/20250616010654.367302-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/lib/crypto/poly1305-glue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/lib/crypto/poly1305-glue.c b/arch/arm64/lib/crypto/poly1305-glue.c index 6a661cf04821..c9a74766785b 100644 --- a/arch/arm64/lib/crypto/poly1305-glue.c +++ b/arch/arm64/lib/crypto/poly1305-glue.c @@ -38,14 +38,14 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, unsigned int todo = min_t(unsigned int, len, SZ_4K); kernel_neon_begin(); - poly1305_blocks_neon(state, src, todo, 1); + poly1305_blocks_neon(state, src, todo, padbit); kernel_neon_end(); len -= todo; src += todo; } while (len); } else - poly1305_blocks(state, src, len, 1); + poly1305_blocks(state, src, len, padbit); } EXPORT_SYMBOL_GPL(poly1305_blocks_arch); From 1224b218a4b9203656ecc932152f4c81a97b4fcc Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 13 Jun 2025 17:46:20 +0100 Subject: [PATCH 174/242] pldmfw: Select CRC32 when PLDMFW is selected pldmfw calls crc32 code and depends on it being enabled, else there is a link error as follows. So PLDMFW should select CRC32. lib/pldmfw/pldmfw.o: In function `pldmfw_flash_image': pldmfw.c:(.text+0x70f): undefined reference to `crc32_le_base' This problem was introduced by commit b8265621f488 ("Add pldmfw library for PLDM firmware update"). It manifests as of commit d69ea414c9b4 ("ice: implement device flash update via devlink"). And is more likely to occur as of commit 9ad19171b6d6 ("lib/crc: remove unnecessary prompt for CONFIG_CRC32 and drop 'default y'"). Found by chance while exercising builds based on tinyconfig. Fixes: b8265621f488 ("Add pldmfw library for PLDM firmware update") Signed-off-by: Simon Horman Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250613-pldmfw-crc32-v1-1-f3fad109eee6@kernel.org Signed-off-by: Jakub Kicinski --- lib/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Kconfig b/lib/Kconfig index 6c1b8f184267..37db228f70a9 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -716,6 +716,7 @@ config GENERIC_LIB_DEVMEM_IS_ALLOWED config PLDMFW bool + select CRC32 default n config ASN1_ENCODER From 86c8db86af43f52f682e53a0f2f0828683be1e52 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Fri, 13 Jun 2025 15:37:05 -0400 Subject: [PATCH 175/242] selinux: fix selinux_xfrm_alloc_user() to set correct ctx_len MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We should count the terminating NUL byte as part of the ctx_len. Otherwise, UBSAN logs a warning: UBSAN: array-index-out-of-bounds in security/selinux/xfrm.c:99:14 index 60 is out of range for type 'char [*]' The allocation itself is correct so there is no actual out of bounds indexing, just a warning. Cc: stable@vger.kernel.org Suggested-by: Christian Göttsche Link: https://lore.kernel.org/selinux/CAEjxPJ6tA5+LxsGfOJokzdPeRomBHjKLBVR6zbrg+_w3ZZbM3A@mail.gmail.com/ Signed-off-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/xfrm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index 90ec4ef1b082..61d56b0c2be1 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -94,7 +94,7 @@ static int selinux_xfrm_alloc_user(struct xfrm_sec_ctx **ctxp, ctx->ctx_doi = XFRM_SC_DOI_LSM; ctx->ctx_alg = XFRM_SC_ALG_SELINUX; - ctx->ctx_len = str_len; + ctx->ctx_len = str_len + 1; memcpy(ctx->ctx_str, &uctx[1], str_len); ctx->ctx_str[str_len] = '\0'; rc = security_context_to_sid(ctx->ctx_str, str_len, From a7b3b77fd111d49f8e25624e4ea1046322a57baf Mon Sep 17 00:00:00 2001 From: Mikko Korhonen Date: Tue, 17 Jun 2025 09:18:41 +0300 Subject: [PATCH 176/242] ata: ahci: Disallow LPM for Asus B550-F motherboard Asus ROG STRIX B550-F GAMING (WI-FI) motherboard has problems on some SATA ports with at least one hard drive model (WDC WD20EFAX-68FB5N0) when LPM is enabled. Disabling LPM solves the issue. Cc: stable@vger.kernel.org Fixes: 7627a0edef54 ("ata: ahci: Drop low power policy board type") Signed-off-by: Mikko Korhonen Link: https://lore.kernel.org/r/20250617062055.784827-1-mjkorhon@gmail.com [cassel: more detailed comment, make single line comments consistent] Signed-off-by: Niklas Cassel --- drivers/ata/ahci.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index c8ad8ace7496..e5e5c2e81d09 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1452,7 +1452,23 @@ static bool ahci_broken_lpm(struct pci_dev *pdev) DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), DMI_MATCH(DMI_PRODUCT_VERSION, "ASUSPRO D840MB_M840SA"), }, - /* 320 is broken, there is no known good version yet. */ + /* 320 is broken, there is no known good version. */ + }, + { + /* + * AMD 500 Series Chipset SATA Controller [1022:43eb] + * on this motherboard timeouts on ports 5 and 6 when + * LPM is enabled, at least with WDC WD20EFAX-68FB5N0 + * hard drives. LPM with the same drive works fine on + * all other ports on the same controller. + */ + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, + "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_BOARD_NAME, + "ROG STRIX B550-F GAMING (WI-FI)"), + }, + /* 3621 is broken, there is no known good version. */ }, { } /* terminate list */ }; From a85b8544d46390469b6ca72d6bfd3ecb7be985ff Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 14 Jun 2025 00:30:37 +0200 Subject: [PATCH 177/242] wifi: remove zero-length arrays All of these are really meant to be variable-length, and in the case of s1g_beacon it's actually accessed. Make that one in particular, and a couple of others (that aren't used as arrays now), actually variable. Reported-by: syzbot+fd222bb38e916df26fa4@syzkaller.appspotmail.com Fixes: 1e1f706fc2ce ("wifi: cfg80211/mac80211: correctly parse S1G beacon optional elements") Link: https://patch.msgid.link/20250614003037.a3e82e882251.I2e8b58e56ff2a9f8b06c66f036578b7c1d4e4685@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index ce377f7fb912..22f39e5e2ff1 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1278,7 +1278,7 @@ struct ieee80211_ext { u8 sa[ETH_ALEN]; __le32 timestamp; u8 change_seq; - u8 variable[0]; + u8 variable[]; } __packed s1g_beacon; } u; } __packed __aligned(2); @@ -1536,7 +1536,7 @@ struct ieee80211_mgmt { u8 action_code; u8 dialog_token; __le16 capability; - u8 variable[0]; + u8 variable[]; } __packed tdls_discover_resp; struct { u8 action_code; @@ -1721,35 +1721,35 @@ struct ieee80211_tdls_data { struct { u8 dialog_token; __le16 capability; - u8 variable[0]; + u8 variable[]; } __packed setup_req; struct { __le16 status_code; u8 dialog_token; __le16 capability; - u8 variable[0]; + u8 variable[]; } __packed setup_resp; struct { __le16 status_code; u8 dialog_token; - u8 variable[0]; + u8 variable[]; } __packed setup_cfm; struct { __le16 reason_code; - u8 variable[0]; + u8 variable[]; } __packed teardown; struct { u8 dialog_token; - u8 variable[0]; + u8 variable[]; } __packed discover_req; struct { u8 target_channel; u8 oper_class; - u8 variable[0]; + u8 variable[]; } __packed chan_switch_req; struct { __le16 status_code; - u8 variable[0]; + u8 variable[]; } __packed chan_switch_resp; } u; } __packed; From d1b1a5eb27c4948e8811cf4dbb05aaf3eb10700c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 16 Jun 2025 17:18:38 +0200 Subject: [PATCH 178/242] wifi: mac80211: drop invalid source address OCB frames In OCB, don't accept frames from invalid source addresses (and in particular don't try to create stations for them), drop the frames instead. Reported-by: syzbot+8b512026a7ec10dcbdd9@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/6788d2d9.050a0220.20d369.0028.GAE@google.com/ Signed-off-by: Johannes Berg Tested-by: syzbot+8b512026a7ec10dcbdd9@syzkaller.appspotmail.com Link: https://patch.msgid.link/20250616171838.7433379cab5d.I47444d63c72a0bd58d2e2b67bb99e1fea37eec6f@changeid Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 09beb65d6108..e73431549ce7 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4432,6 +4432,10 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) if (!multicast && !ether_addr_equal(sdata->dev->dev_addr, hdr->addr1)) return false; + /* reject invalid/our STA address */ + if (!is_valid_ether_addr(hdr->addr2) || + ether_addr_equal(sdata->dev->dev_addr, hdr->addr2)) + return false; if (!rx->sta) { int rate_idx; if (status->encoding != RX_ENC_LEGACY) From d19bac3d4edc4ab92e90d1c6bf68620192254b4b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 17 Jun 2025 10:49:01 +0200 Subject: [PATCH 179/242] wifi: mac80211: don't WARN for late channel/color switch There's really no value in the WARN stack trace etc., the reason for this happening isn't directly related to the calling function anyway. Also, syzbot has been observing it constantly, and there's no way we can resolve it there - those systems are just slow. Instead print an error message (once) and add a comment about what really causes this message. Reported-by: syzbot+468656785707b0e995df@syzkaller.appspotmail.com Reported-by: syzbot+18c783c5cf6a781e3e2c@syzkaller.appspotmail.com Reported-by: syzbot+d5924d5cffddfccab68e@syzkaller.appspotmail.com Reported-by: syzbot+7d73d99525d1ff7752ef@syzkaller.appspotmail.com Reported-by: syzbot+8e6e002c74d1927edaf5@syzkaller.appspotmail.com Reported-by: syzbot+97254a3b10c541879a65@syzkaller.appspotmail.com Reported-by: syzbot+dfd1fd46a1960ad9c6ec@syzkaller.appspotmail.com Reported-by: syzbot+85e0b8d12d9ca877d806@syzkaller.appspotmail.com Link: https://patch.msgid.link/20250617104902.146e10919be1.I85f352ca4a2dce6f556e5ff45ceaa5f3769cb5ce@changeid Signed-off-by: Johannes Berg --- net/mac80211/debug.h | 5 ++++- net/mac80211/tx.c | 29 +++++++++++++++++++++-------- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/net/mac80211/debug.h b/net/mac80211/debug.h index 5b81998cb0c9..ef7c1a68d88d 100644 --- a/net/mac80211/debug.h +++ b/net/mac80211/debug.h @@ -1,10 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Portions - * Copyright (C) 2022 - 2024 Intel Corporation + * Copyright (C) 2022 - 2025 Intel Corporation */ #ifndef __MAC80211_DEBUG_H #define __MAC80211_DEBUG_H +#include #include #ifdef CONFIG_MAC80211_OCB_DEBUG @@ -152,6 +153,8 @@ do { \ else \ _sdata_err((link)->sdata, fmt, ##__VA_ARGS__); \ } while (0) +#define link_err_once(link, fmt, ...) \ + DO_ONCE_LITE(link_err, link, fmt, ##__VA_ARGS__) #define link_id_info(sdata, link_id, fmt, ...) \ do { \ if (ieee80211_vif_is_mld(&sdata->vif)) \ diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index d8d4f3d7d7f2..d58b80813bdd 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation * * Transmit and frame generation functions. */ @@ -5016,12 +5016,25 @@ static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata, } } -static u8 __ieee80211_beacon_update_cntdwn(struct beacon_data *beacon) +static u8 __ieee80211_beacon_update_cntdwn(struct ieee80211_link_data *link, + struct beacon_data *beacon) { - beacon->cntdwn_current_counter--; + if (beacon->cntdwn_current_counter == 1) { + /* + * Channel switch handling is done by a worker thread while + * beacons get pulled from hardware timers. It's therefore + * possible that software threads are slow enough to not be + * able to complete CSA handling in a single beacon interval, + * in which case we get here. There isn't much to do about + * it, other than letting the user know that the AP isn't + * behaving correctly. + */ + link_err_once(link, + "beacon TX faster than countdown (channel/color switch) completion\n"); + return 0; + } - /* the counter should never reach 0 */ - WARN_ON_ONCE(!beacon->cntdwn_current_counter); + beacon->cntdwn_current_counter--; return beacon->cntdwn_current_counter; } @@ -5052,7 +5065,7 @@ u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif, unsigned int link_i if (!beacon) goto unlock; - count = __ieee80211_beacon_update_cntdwn(beacon); + count = __ieee80211_beacon_update_cntdwn(link, beacon); unlock: rcu_read_unlock(); @@ -5450,7 +5463,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, if (beacon->cntdwn_counter_offsets[0]) { if (!is_template) - __ieee80211_beacon_update_cntdwn(beacon); + __ieee80211_beacon_update_cntdwn(link, beacon); ieee80211_set_beacon_cntdwn(sdata, beacon, link); } @@ -5482,7 +5495,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, * for now we leave it consistent with overall * mac80211's behavior. */ - __ieee80211_beacon_update_cntdwn(beacon); + __ieee80211_beacon_update_cntdwn(link, beacon); ieee80211_set_beacon_cntdwn(sdata, beacon, link); } From 7b4ac12cc929e281cf7edc22203e0533790ebc2b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 13 Jun 2025 14:36:29 +0200 Subject: [PATCH 180/242] openvswitch: Allocate struct ovs_pcpu_storage dynamically PERCPU_MODULE_RESERVE defines the maximum size that can by used for the per-CPU data size used by modules. This is 8KiB. Commit 035fcdc4d240c ("openvswitch: Merge three per-CPU structures into one") restructured the per-CPU memory allocation for the module and moved the separate alloc_percpu() invocations at module init time to a static per-CPU variable which is allocated by the module loader. The size of the per-CPU data section for openvswitch is 6488 bytes which is ~80% of the available per-CPU memory. Together with a few other modules it is easy to exhaust the available 8KiB of memory. Allocate ovs_pcpu_storage dynamically at module init time. Reported-by: Gal Pressman Closes: https://lore.kernel.org/all/c401e017-f8db-4f57-a1cd-89beb979a277@nvidia.com Fixes: 035fcdc4d240c ("openvswitch: Merge three per-CPU structures into one") Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Aaron Conole Link: https://patch.msgid.link/20250613123629.-XSoQTCu@linutronix.de Signed-off-by: Paolo Abeni --- net/openvswitch/actions.c | 23 +++++++++------------ net/openvswitch/datapath.c | 42 +++++++++++++++++++++++++++++++------- net/openvswitch/datapath.h | 3 ++- 3 files changed, 47 insertions(+), 21 deletions(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index e7269a3eec79..3add108340bf 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -39,16 +39,14 @@ #include "flow_netlink.h" #include "openvswitch_trace.h" -DEFINE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage) = { - .bh_lock = INIT_LOCAL_LOCK(bh_lock), -}; +struct ovs_pcpu_storage __percpu *ovs_pcpu_storage; /* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys' * space. Return NULL if out of key spaces. */ static struct sw_flow_key *clone_key(const struct sw_flow_key *key_) { - struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(&ovs_pcpu_storage); + struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(ovs_pcpu_storage); struct action_flow_keys *keys = &ovs_pcpu->flow_keys; int level = ovs_pcpu->exec_level; struct sw_flow_key *key = NULL; @@ -94,7 +92,7 @@ static struct deferred_action *add_deferred_actions(struct sk_buff *skb, const struct nlattr *actions, const int actions_len) { - struct action_fifo *fifo = this_cpu_ptr(&ovs_pcpu_storage.action_fifos); + struct action_fifo *fifo = this_cpu_ptr(&ovs_pcpu_storage->action_fifos); struct deferred_action *da; da = action_fifo_put(fifo); @@ -755,7 +753,7 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct ovs_frag_data *data = this_cpu_ptr(&ovs_pcpu_storage.frag_data); + struct ovs_frag_data *data = this_cpu_ptr(&ovs_pcpu_storage->frag_data); struct vport *vport = data->vport; if (skb_cow_head(skb, data->l2_len) < 0) { @@ -807,7 +805,7 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb, unsigned int hlen = skb_network_offset(skb); struct ovs_frag_data *data; - data = this_cpu_ptr(&ovs_pcpu_storage.frag_data); + data = this_cpu_ptr(&ovs_pcpu_storage->frag_data); data->dst = skb->_skb_refdst; data->vport = vport; data->cb = *OVS_CB(skb); @@ -1566,16 +1564,15 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb, clone = clone_flow_key ? clone_key(key) : key; if (clone) { int err = 0; - if (actions) { /* Sample action */ if (clone_flow_key) - __this_cpu_inc(ovs_pcpu_storage.exec_level); + __this_cpu_inc(ovs_pcpu_storage->exec_level); err = do_execute_actions(dp, skb, clone, actions, len); if (clone_flow_key) - __this_cpu_dec(ovs_pcpu_storage.exec_level); + __this_cpu_dec(ovs_pcpu_storage->exec_level); } else { /* Recirc action */ clone->recirc_id = recirc_id; ovs_dp_process_packet(skb, clone); @@ -1611,7 +1608,7 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb, static void process_deferred_actions(struct datapath *dp) { - struct action_fifo *fifo = this_cpu_ptr(&ovs_pcpu_storage.action_fifos); + struct action_fifo *fifo = this_cpu_ptr(&ovs_pcpu_storage->action_fifos); /* Do not touch the FIFO in case there is no deferred actions. */ if (action_fifo_is_empty(fifo)) @@ -1642,7 +1639,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, { int err, level; - level = __this_cpu_inc_return(ovs_pcpu_storage.exec_level); + level = __this_cpu_inc_return(ovs_pcpu_storage->exec_level); if (unlikely(level > OVS_RECURSION_LIMIT)) { net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n", ovs_dp_name(dp)); @@ -1659,6 +1656,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, process_deferred_actions(dp); out: - __this_cpu_dec(ovs_pcpu_storage.exec_level); + __this_cpu_dec(ovs_pcpu_storage->exec_level); return err; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 6a304ae2d959..b990dc83504f 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -244,7 +244,7 @@ void ovs_dp_detach_port(struct vport *p) /* Must be called with rcu_read_lock. */ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) { - struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(&ovs_pcpu_storage); + struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(ovs_pcpu_storage); const struct vport *p = OVS_CB(skb)->input_vport; struct datapath *dp = p->dp; struct sw_flow *flow; @@ -299,7 +299,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) * avoided. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && ovs_pcpu->owner != current) { - local_lock_nested_bh(&ovs_pcpu_storage.bh_lock); + local_lock_nested_bh(&ovs_pcpu_storage->bh_lock); ovs_pcpu->owner = current; ovs_pcpu_locked = true; } @@ -310,7 +310,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) ovs_dp_name(dp), error); if (ovs_pcpu_locked) { ovs_pcpu->owner = NULL; - local_unlock_nested_bh(&ovs_pcpu_storage.bh_lock); + local_unlock_nested_bh(&ovs_pcpu_storage->bh_lock); } stats_counter = &stats->n_hit; @@ -689,13 +689,13 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) sf_acts = rcu_dereference(flow->sf_acts); local_bh_disable(); - local_lock_nested_bh(&ovs_pcpu_storage.bh_lock); + local_lock_nested_bh(&ovs_pcpu_storage->bh_lock); if (IS_ENABLED(CONFIG_PREEMPT_RT)) - this_cpu_write(ovs_pcpu_storage.owner, current); + this_cpu_write(ovs_pcpu_storage->owner, current); err = ovs_execute_actions(dp, packet, sf_acts, &flow->key); if (IS_ENABLED(CONFIG_PREEMPT_RT)) - this_cpu_write(ovs_pcpu_storage.owner, NULL); - local_unlock_nested_bh(&ovs_pcpu_storage.bh_lock); + this_cpu_write(ovs_pcpu_storage->owner, NULL); + local_unlock_nested_bh(&ovs_pcpu_storage->bh_lock); local_bh_enable(); rcu_read_unlock(); @@ -2744,6 +2744,28 @@ static struct drop_reason_list drop_reason_list_ovs = { .n_reasons = ARRAY_SIZE(ovs_drop_reasons), }; +static int __init ovs_alloc_percpu_storage(void) +{ + unsigned int cpu; + + ovs_pcpu_storage = alloc_percpu(*ovs_pcpu_storage); + if (!ovs_pcpu_storage) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct ovs_pcpu_storage *ovs_pcpu; + + ovs_pcpu = per_cpu_ptr(ovs_pcpu_storage, cpu); + local_lock_init(&ovs_pcpu->bh_lock); + } + return 0; +} + +static void ovs_free_percpu_storage(void) +{ + free_percpu(ovs_pcpu_storage); +} + static int __init dp_init(void) { int err; @@ -2753,6 +2775,10 @@ static int __init dp_init(void) pr_info("Open vSwitch switching datapath\n"); + err = ovs_alloc_percpu_storage(); + if (err) + goto error; + err = ovs_internal_dev_rtnl_link_register(); if (err) goto error; @@ -2799,6 +2825,7 @@ static int __init dp_init(void) error_unreg_rtnl_link: ovs_internal_dev_rtnl_link_unregister(); error: + ovs_free_percpu_storage(); return err; } @@ -2813,6 +2840,7 @@ static void dp_cleanup(void) ovs_vport_exit(); ovs_flow_exit(); ovs_internal_dev_rtnl_link_unregister(); + ovs_free_percpu_storage(); } module_init(dp_init); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 1b5348b0f559..cfeb817a1889 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -220,7 +220,8 @@ struct ovs_pcpu_storage { struct task_struct *owner; local_lock_t bh_lock; }; -DECLARE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage); + +extern struct ovs_pcpu_storage __percpu *ovs_pcpu_storage; /** * enum ovs_pkt_hash_types - hash info to include with a packet From db22720545207f734aaa9d9f71637bfc8b0155e0 Mon Sep 17 00:00:00 2001 From: Brett Werling Date: Thu, 12 Jun 2025 14:18:25 -0500 Subject: [PATCH 181/242] can: tcan4x5x: fix power regulator retrieval during probe Fixes the power regulator retrieval in tcan4x5x_can_probe() by ensuring the regulator pointer is not set to NULL in the successful return from devm_regulator_get_optional(). Fixes: 3814ca3a10be ("can: tcan4x5x: tcan4x5x_can_probe(): turn on the power before parsing the config") Signed-off-by: Brett Werling Link: https://patch.msgid.link/20250612191825.3646364-1-brett.werling@garmin.com Cc: stable@vger.kernel.org Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/tcan4x5x-core.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/can/m_can/tcan4x5x-core.c b/drivers/net/can/m_can/tcan4x5x-core.c index e5c162f8c589..8edaa339d590 100644 --- a/drivers/net/can/m_can/tcan4x5x-core.c +++ b/drivers/net/can/m_can/tcan4x5x-core.c @@ -411,10 +411,11 @@ static int tcan4x5x_can_probe(struct spi_device *spi) priv = cdev_to_priv(mcan_class); priv->power = devm_regulator_get_optional(&spi->dev, "vsup"); - if (PTR_ERR(priv->power) == -EPROBE_DEFER) { - ret = -EPROBE_DEFER; - goto out_m_can_class_free_dev; - } else { + if (IS_ERR(priv->power)) { + if (PTR_ERR(priv->power) == -EPROBE_DEFER) { + ret = -EPROBE_DEFER; + goto out_m_can_class_free_dev; + } priv->power = NULL; } From 5d3bc9e5e725aa36cca9b794e340057feb6880b4 Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Tue, 20 May 2025 22:36:56 +0530 Subject: [PATCH 182/242] net: ice: Perform accurate aRFS flow match This patch fixes an issue seen in a large-scale deployment under heavy incoming pkts where the aRFS flow wrongly matches a flow and reprograms the NIC with wrong settings. That mis-steering causes RX-path latency spikes and noisy neighbor effects when many connections collide on the same hash (some of our production servers have 20-30K connections). set_rps_cpu() calls ndo_rx_flow_steer() with flow_id that is calculated by hashing the skb sized by the per rx-queue table size. This results in multiple connections (even across different rx-queues) getting the same hash value. The driver steer function modifies the wrong flow to use this rx-queue, e.g.: Flow#1 is first added: Flow#1: , Hash 'h', q#10 Later when a new flow needs to be added: Flow#2: , Hash 'h', q#20 The driver finds the hash 'h' from Flow#1 and updates it to use q#20. This results in both flows getting un-optimized - packets for Flow#1 goes to q#20, and then reprogrammed back to q#10 later and so on; and Flow #2 programming is never done as Flow#1 is matched first for all misses. Many flows may wrongly share the same hash and reprogram rules of the original flow each with their own q#. Tested on two 144-core servers with 16K netperf sessions for 180s. Netperf clients are pinned to cores 0-71 sequentially (so that wrong packets on q#s 72-143 can be measured). IRQs are set 1:1 for queues -> CPUs, enable XPS, enable aRFS (global value is 144 * rps_flow_cnt). Test notes about results from ice_rx_flow_steer(): --------------------------------------------------- 1. "Skip:" counter increments here: if (fltr_info->q_index == rxq_idx || arfs_entry->fltr_state != ICE_ARFS_ACTIVE) goto out; 2. "Add:" counter increments here: ret = arfs_entry->fltr_info.fltr_id; INIT_HLIST_NODE(&arfs_entry->list_entry); 3. "Update:" counter increments here: /* update the queue to forward to on an already existing flow */ Runtime comparison: original code vs with the patch for different rps_flow_cnt values. +-------------------------------+--------------+--------------+ | rps_flow_cnt | 512 | 2048 | +-------------------------------+--------------+--------------+ | Ratio of Pkts on Good:Bad q's | 214 vs 822K | 1.1M vs 980K | | Avoid wrong aRFS programming | 0 vs 310K | 0 vs 30K | | CPU User | 216 vs 183 | 216 vs 206 | | CPU System | 1441 vs 1171 | 1447 vs 1320 | | CPU Softirq | 1245 vs 920 | 1238 vs 961 | | CPU Total | 29 vs 22.7 | 29 vs 24.9 | | aRFS Update | 533K vs 59 | 521K vs 32 | | aRFS Skip | 82M vs 77M | 7.2M vs 4.5M | +-------------------------------+--------------+--------------+ A separate TCP_STREAM and TCP_RR with 1,4,8,16,64,128,256,512 connections showed no performance degradation. Some points on the patch/aRFS behavior: 1. Enabling full tuple matching ensures flows are always correctly matched, even with smaller hash sizes. 2. 5-6% drop in CPU utilization as the packets arrive at the correct CPUs and fewer calls to driver for programming on misses. 3. Larger hash tables reduces mis-steering due to more unique flow hashes, but still has clashes. However, with larger per-device rps_flow_cnt, old flows take more time to expire and new aRFS flows cannot be added if h/w limits are reached (rps_may_expire_flow() succeeds when 10*rps_flow_cnt pkts have been processed by this cpu that are not part of the flow). Fixes: 28bf26724fdb0 ("ice: Implement aRFS") Signed-off-by: Krishna Kumar Reviewed-by: Simon Horman Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_arfs.c | 48 +++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_arfs.c b/drivers/net/ethernet/intel/ice/ice_arfs.c index 2bc5c7f59844..1f7834c03550 100644 --- a/drivers/net/ethernet/intel/ice/ice_arfs.c +++ b/drivers/net/ethernet/intel/ice/ice_arfs.c @@ -377,6 +377,50 @@ ice_arfs_is_perfect_flow_set(struct ice_hw *hw, __be16 l3_proto, u8 l4_proto) return false; } +/** + * ice_arfs_cmp - Check if aRFS filter matches this flow. + * @fltr_info: filter info of the saved ARFS entry. + * @fk: flow dissector keys. + * @n_proto: One of htons(ETH_P_IP) or htons(ETH_P_IPV6). + * @ip_proto: One of IPPROTO_TCP or IPPROTO_UDP. + * + * Since this function assumes limited values for n_proto and ip_proto, it + * is meant to be called only from ice_rx_flow_steer(). + * + * Return: + * * true - fltr_info refers to the same flow as fk. + * * false - fltr_info and fk refer to different flows. + */ +static bool +ice_arfs_cmp(const struct ice_fdir_fltr *fltr_info, const struct flow_keys *fk, + __be16 n_proto, u8 ip_proto) +{ + /* Determine if the filter is for IPv4 or IPv6 based on flow_type, + * which is one of ICE_FLTR_PTYPE_NONF_IPV{4,6}_{TCP,UDP}. + */ + bool is_v4 = fltr_info->flow_type == ICE_FLTR_PTYPE_NONF_IPV4_TCP || + fltr_info->flow_type == ICE_FLTR_PTYPE_NONF_IPV4_UDP; + + /* Following checks are arranged in the quickest and most discriminative + * fields first for early failure. + */ + if (is_v4) + return n_proto == htons(ETH_P_IP) && + fltr_info->ip.v4.src_port == fk->ports.src && + fltr_info->ip.v4.dst_port == fk->ports.dst && + fltr_info->ip.v4.src_ip == fk->addrs.v4addrs.src && + fltr_info->ip.v4.dst_ip == fk->addrs.v4addrs.dst && + fltr_info->ip.v4.proto == ip_proto; + + return fltr_info->ip.v6.src_port == fk->ports.src && + fltr_info->ip.v6.dst_port == fk->ports.dst && + fltr_info->ip.v6.proto == ip_proto && + !memcmp(&fltr_info->ip.v6.src_ip, &fk->addrs.v6addrs.src, + sizeof(struct in6_addr)) && + !memcmp(&fltr_info->ip.v6.dst_ip, &fk->addrs.v6addrs.dst, + sizeof(struct in6_addr)); +} + /** * ice_rx_flow_steer - steer the Rx flow to where application is being run * @netdev: ptr to the netdev being adjusted @@ -448,6 +492,10 @@ ice_rx_flow_steer(struct net_device *netdev, const struct sk_buff *skb, continue; fltr_info = &arfs_entry->fltr_info; + + if (!ice_arfs_cmp(fltr_info, &fk, n_proto, ip_proto)) + continue; + ret = fltr_info->fltr_id; if (fltr_info->q_index == rxq_idx || From 48c8b214974dc55283bd5f12e3a483b27c403bbc Mon Sep 17 00:00:00 2001 From: Grzegorz Nitka Date: Fri, 16 May 2025 15:09:07 +0200 Subject: [PATCH 183/242] ice: fix eswitch code memory leak in reset scenario Add simple eswitch mode checker in attaching VF procedure and allocate required port representor memory structures only in switchdev mode. The reset flows triggers VF (if present) detach/attach procedure. It might involve VF port representor(s) re-creation if the device is configured is switchdev mode (not legacy one). The memory was blindly allocated in current implementation, regardless of the mode and not freed if in legacy mode. Kmemeleak trace: unreferenced object (percpu) 0x7e3bce5b888458 (size 40): comm "bash", pid 1784, jiffies 4295743894 hex dump (first 32 bytes on cpu 45): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace (crc 0): pcpu_alloc_noprof+0x4c4/0x7c0 ice_repr_create+0x66/0x130 [ice] ice_repr_create_vf+0x22/0x70 [ice] ice_eswitch_attach_vf+0x1b/0xa0 [ice] ice_reset_all_vfs+0x1dd/0x2f0 [ice] ice_pci_err_resume+0x3b/0xb0 [ice] pci_reset_function+0x8f/0x120 reset_store+0x56/0xa0 kernfs_fop_write_iter+0x120/0x1b0 vfs_write+0x31c/0x430 ksys_write+0x61/0xd0 do_syscall_64+0x5b/0x180 entry_SYSCALL_64_after_hwframe+0x76/0x7e Testing hints (ethX is PF netdev): - create at least one VF echo 1 > /sys/class/net/ethX/device/sriov_numvfs - trigger the reset echo 1 > /sys/class/net/ethX/device/reset Fixes: 415db8399d06 ("ice: make representor code generic") Signed-off-by: Grzegorz Nitka Reviewed-by: Przemek Kitszel Reviewed-by: Simon Horman Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_eswitch.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c index 6aae03771746..2e4f0969035f 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.c +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c @@ -508,10 +508,14 @@ ice_eswitch_attach(struct ice_pf *pf, struct ice_repr *repr, unsigned long *id) */ int ice_eswitch_attach_vf(struct ice_pf *pf, struct ice_vf *vf) { - struct ice_repr *repr = ice_repr_create_vf(vf); struct devlink *devlink = priv_to_devlink(pf); + struct ice_repr *repr; int err; + if (!ice_is_eswitch_mode_switchdev(pf)) + return 0; + + repr = ice_repr_create_vf(vf); if (IS_ERR(repr)) return PTR_ERR(repr); From 688a0d61b2d7427189c4eb036ce485d8fc957cbb Mon Sep 17 00:00:00 2001 From: Vitaly Lifshits Date: Sun, 25 May 2025 11:38:43 +0300 Subject: [PATCH 184/242] e1000e: set fixed clock frequency indication for Nahum 11 and Nahum 13 On some systems with Nahum 11 and Nahum 13 the value of the XTAL clock in the software STRAP is incorrect. This causes the PTP timer to run at the wrong rate and can lead to synchronization issues. The STRAP value is configured by the system firmware, and a firmware update is not always possible. Since the XTAL clock on these systems always runs at 38.4MHz, the driver may ignore the STRAP and just set the correct value. Fixes: cc23f4f0b6b9 ("e1000e: Add support for Meteor Lake") Signed-off-by: Vitaly Lifshits Tested-by: Mor Bar-Gabay Reviewed-by: Gil Fine Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/netdev.c | 14 +++++++++++--- drivers/net/ethernet/intel/e1000e/ptp.c | 8 +++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index a96f4cfa6e17..7719e15813ee 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -3534,9 +3534,6 @@ s32 e1000e_get_base_timinca(struct e1000_adapter *adapter, u32 *timinca) case e1000_pch_cnp: case e1000_pch_tgp: case e1000_pch_adp: - case e1000_pch_mtp: - case e1000_pch_lnp: - case e1000_pch_ptp: case e1000_pch_nvp: if (er32(TSYNCRXCTL) & E1000_TSYNCRXCTL_SYSCFI) { /* Stable 24MHz frequency */ @@ -3552,6 +3549,17 @@ s32 e1000e_get_base_timinca(struct e1000_adapter *adapter, u32 *timinca) adapter->cc.shift = shift; } break; + case e1000_pch_mtp: + case e1000_pch_lnp: + case e1000_pch_ptp: + /* System firmware can misreport this value, so set it to a + * stable 38400KHz frequency. + */ + incperiod = INCPERIOD_38400KHZ; + incvalue = INCVALUE_38400KHZ; + shift = INCVALUE_SHIFT_38400KHZ; + adapter->cc.shift = shift; + break; case e1000_82574: case e1000_82583: /* Stable 25MHz frequency */ diff --git a/drivers/net/ethernet/intel/e1000e/ptp.c b/drivers/net/ethernet/intel/e1000e/ptp.c index 89d57dd911dc..ea3c3eb2ef20 100644 --- a/drivers/net/ethernet/intel/e1000e/ptp.c +++ b/drivers/net/ethernet/intel/e1000e/ptp.c @@ -295,15 +295,17 @@ void e1000e_ptp_init(struct e1000_adapter *adapter) case e1000_pch_cnp: case e1000_pch_tgp: case e1000_pch_adp: - case e1000_pch_mtp: - case e1000_pch_lnp: - case e1000_pch_ptp: case e1000_pch_nvp: if (er32(TSYNCRXCTL) & E1000_TSYNCRXCTL_SYSCFI) adapter->ptp_clock_info.max_adj = MAX_PPB_24MHZ; else adapter->ptp_clock_info.max_adj = MAX_PPB_38400KHZ; break; + case e1000_pch_mtp: + case e1000_pch_lnp: + case e1000_pch_ptp: + adapter->ptp_clock_info.max_adj = MAX_PPB_38400KHZ; + break; case e1000_82574: case e1000_82583: adapter->ptp_clock_info.max_adj = MAX_PPB_25MHZ; From c50784e99f0e7199cdb12dbddf02229b102744ef Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jun 2025 13:23:07 -1000 Subject: [PATCH 185/242] sched_ext: Make scx_group_set_weight() always update tg->scx.weight Otherwise, tg->scx.weight can go out of sync while scx_cgroup is not enabled and ops.cgroup_init() may be called with a stale weight value. Signed-off-by: Tejun Heo Fixes: 819513666966 ("sched_ext: Add cgroup support") Cc: stable@vger.kernel.org # v6.12+ --- kernel/sched/ext.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2c41c78be61e..33a0d8c6ff95 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4241,12 +4241,12 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) percpu_down_read(&scx_cgroup_rwsem); - if (scx_cgroup_enabled && tg->scx_weight != weight) { - if (SCX_HAS_OP(sch, cgroup_set_weight)) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL, - tg_cgrp(tg), weight); - tg->scx_weight = weight; - } + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && + tg->scx_weight != weight) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL, + tg_cgrp(tg), weight); + + tg->scx_weight = weight; percpu_up_read(&scx_cgroup_rwsem); } From 33796b91871ad4010c8188372dd1faf97cf0f1c0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 16 Jun 2025 10:13:25 -1000 Subject: [PATCH 186/242] sched_ext, sched/core: Don't call scx_group_set_weight() prematurely from sched_create_group() During task_group creation, sched_create_group() calls scx_group_set_weight() with CGROUP_WEIGHT_DFL to initialize the sched_ext portion. This is premature and ends up calling ops.cgroup_set_weight() with an incorrect @cgrp before ops.cgroup_init() is called. sched_create_group() should just initialize SCX related fields in the new task_group. Fix it by factoring out scx_tg_init() from sched_init() and making sched_create_group() call that function instead of scx_group_set_weight(). v2: Retain CONFIG_EXT_GROUP_SCHED ifdef in sched_init() as removing it leads to build failures on !CONFIG_GROUP_SCHED configs. Signed-off-by: Tejun Heo Fixes: 819513666966 ("sched_ext: Add cgroup support") Cc: stable@vger.kernel.org # v6.12+ --- kernel/sched/core.c | 4 ++-- kernel/sched/ext.c | 5 +++++ kernel/sched/ext.h | 2 ++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dce50fa57471..8988d38d46a3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8545,7 +8545,7 @@ void __init sched_init(void) init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_EXT_GROUP_SCHED - root_task_group.scx_weight = CGROUP_WEIGHT_DFL; + scx_tg_init(&root_task_group); #endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -8985,7 +8985,7 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; - scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); + scx_tg_init(tg); alloc_uclamp_sched_group(tg, parent); return tg; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 33a0d8c6ff95..b498d867ba21 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4092,6 +4092,11 @@ bool scx_can_stop_tick(struct rq *rq) DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); static bool scx_cgroup_enabled; +void scx_tg_init(struct task_group *tg) +{ + tg->scx_weight = CGROUP_WEIGHT_DFL; +} + int scx_tg_online(struct task_group *tg) { struct scx_sched *sch = scx_root; diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 6e5072f57771..a75835c23f15 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -79,6 +79,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {} #ifdef CONFIG_CGROUP_SCHED #ifdef CONFIG_EXT_GROUP_SCHED +void scx_tg_init(struct task_group *tg); int scx_tg_online(struct task_group *tg); void scx_tg_offline(struct task_group *tg); int scx_cgroup_can_attach(struct cgroup_taskset *tset); @@ -88,6 +89,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); void scx_group_set_idle(struct task_group *tg, bool idle); #else /* CONFIG_EXT_GROUP_SCHED */ +static inline void scx_tg_init(struct task_group *tg) {} static inline int scx_tg_online(struct task_group *tg) { return 0; } static inline void scx_tg_offline(struct task_group *tg) {} static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } From 261dce3d64021e7ec828a17b4975ce9182e54ceb Mon Sep 17 00:00:00 2001 From: Chuyi Zhou Date: Tue, 17 Jun 2025 12:42:16 +0800 Subject: [PATCH 187/242] workqueue: Initialize wq_isolated_cpumask in workqueue_init_early() Now when isolcpus is enabled via the cmdline, wq_isolated_cpumask does not include these isolated CPUs, even wq_unbound_cpumask has already excluded them. It is only when we successfully configure an isolate cpuset partition that wq_isolated_cpumask gets overwritten by workqueue_unbound_exclude_cpumask(), including both the cmdline-specified isolated CPUs and the isolated CPUs within the cpuset partitions. Fix this issue by initializing wq_isolated_cpumask properly in workqueue_init_early(). Fixes: fe28f631fa94 ("workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs from wq_unbound_cpumask") Signed-off-by: Chuyi Zhou Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 97f37b5bae66..9f9148075828 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7767,7 +7767,8 @@ void __init workqueue_init_early(void) restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask); cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask); - + cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask, + housekeeping_cpumask(HK_TYPE_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs(); From a89f5fae998bdc4d0505306f93844c9ae059d50c Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 10 Jun 2025 18:52:56 +0900 Subject: [PATCH 188/242] ksmbd: add free_transport ops in ksmbd connection free_transport function for tcp connection can be called from smbdirect. It will cause kernel oops. This patch add free_transport ops in ksmbd connection, and add each free_transports for tcp and smbdirect. Fixes: 21a4e47578d4 ("ksmbd: fix use-after-free in __smb2_lease_break_noti()") Reviewed-by: Stefan Metzmacher Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/connection.c | 2 +- fs/smb/server/connection.h | 1 + fs/smb/server/transport_rdma.c | 10 ++++++++-- fs/smb/server/transport_tcp.c | 3 ++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 83764c230e9d..3f04a2977ba8 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -40,7 +40,7 @@ void ksmbd_conn_free(struct ksmbd_conn *conn) kvfree(conn->request_buf); kfree(conn->preauth_info); if (atomic_dec_and_test(&conn->refcnt)) { - ksmbd_free_transport(conn->transport); + conn->transport->ops->free_transport(conn->transport); kfree(conn); } } diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 6efed923bd68..dd3e0e3f7bf0 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -133,6 +133,7 @@ struct ksmbd_transport_ops { void *buf, unsigned int len, struct smb2_buffer_desc_v1 *desc, unsigned int desc_len); + void (*free_transport)(struct ksmbd_transport *kt); }; struct ksmbd_transport { diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 4998df04ab95..64a428a06ace 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -159,7 +159,8 @@ struct smb_direct_transport { }; #define KSMBD_TRANS(t) ((struct ksmbd_transport *)&((t)->transport)) - +#define SMBD_TRANS(t) ((struct smb_direct_transport *)container_of(t, \ + struct smb_direct_transport, transport)) enum { SMB_DIRECT_MSG_NEGOTIATE_REQ = 0, SMB_DIRECT_MSG_DATA_TRANSFER @@ -410,6 +411,11 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) return NULL; } +static void smb_direct_free_transport(struct ksmbd_transport *kt) +{ + kfree(SMBD_TRANS(kt)); +} + static void free_transport(struct smb_direct_transport *t) { struct smb_direct_recvmsg *recvmsg; @@ -455,7 +461,6 @@ static void free_transport(struct smb_direct_transport *t) smb_direct_destroy_pools(t); ksmbd_conn_free(KSMBD_TRANS(t)->conn); - kfree(t); } static struct smb_direct_sendmsg @@ -2281,4 +2286,5 @@ static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = { .read = smb_direct_read, .rdma_read = smb_direct_rdma_read, .rdma_write = smb_direct_rdma_write, + .free_transport = smb_direct_free_transport, }; diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index abedf510899a..4e9f98db9ff4 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -93,7 +93,7 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk) return t; } -void ksmbd_free_transport(struct ksmbd_transport *kt) +static void ksmbd_tcp_free_transport(struct ksmbd_transport *kt) { struct tcp_transport *t = TCP_TRANS(kt); @@ -656,4 +656,5 @@ static const struct ksmbd_transport_ops ksmbd_tcp_transport_ops = { .read = ksmbd_tcp_read, .writev = ksmbd_tcp_writev, .disconnect = ksmbd_tcp_disconnect, + .free_transport = ksmbd_tcp_free_transport, }; From 7ac5b66acafcc9292fb935d7e03790f2b8b2dc0e Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Fri, 13 Jun 2025 10:12:43 +0900 Subject: [PATCH 189/242] ksmbd: fix null pointer dereference in destroy_previous_session If client set ->PreviousSessionId on kerberos session setup stage, NULL pointer dereference error will happen. Since sess->user is not set yet, It can pass the user argument as NULL to destroy_previous_session. sess->user will be set in ksmbd_krb5_authenticate(). So this patch move calling destroy_previous_session() after ksmbd_krb5_authenticate(). Cc: stable@vger.kernel.org Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-27391 Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 1a308171b599..6645d8fd772e 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1607,17 +1607,18 @@ static int krb5_authenticate(struct ksmbd_work *work, out_len = work->response_sz - (le16_to_cpu(rsp->SecurityBufferOffset) + 4); - /* Check previous session */ - prev_sess_id = le64_to_cpu(req->PreviousSessionId); - if (prev_sess_id && prev_sess_id != sess->id) - destroy_previous_session(conn, sess->user, prev_sess_id); - retval = ksmbd_krb5_authenticate(sess, in_blob, in_len, out_blob, &out_len); if (retval) { ksmbd_debug(SMB, "krb5 authentication failed\n"); return -EINVAL; } + + /* Check previous session */ + prev_sess_id = le64_to_cpu(req->PreviousSessionId); + if (prev_sess_id && prev_sess_id != sess->id) + destroy_previous_session(conn, sess->user, prev_sess_id); + rsp->SecurityBufferLength = cpu_to_le16(out_len); if ((conn->sign || server_conf.enforced_signing) || From 4ea0bb8aaedfad8e695429cda6bd1c8b0dad0844 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Fri, 13 Jun 2025 11:51:29 +0900 Subject: [PATCH 190/242] ksmbd: handle set/get info file for streamed file The bug only appears when: - windows 11 copies a file that has an alternate data stream - streams_xattr is enabled on the share configuration. Microsoft Edge adds a ZoneIdentifier data stream containing the URL for files it downloads. Another way to create a test file: - open cmd.exe - echo "hello from default data stream" > hello.txt - echo "hello again from ads" > hello.txt:ads.txt If you open the file using notepad, we'll see the first message. If you run "notepad hello.txt:ads.txt" in cmd.exe, we should see the second message. dir /s /r should least all streams for the file. The truncation happens because the windows 11 client sends a SetInfo/EndOfFile message on the ADS, but it is instead applied on the main file, because we don't check fp->stream. When receiving set/get info file for a stream file, Change to process requests using stream position and size. Truncate is unnecessary for stream files, so we skip set_file_allocation_info and set_end_of_file_info operations. Reported-by: Marios Makassikis Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 63 +++++++++++++++++++++++++++++++-------- fs/smb/server/vfs.c | 5 ++-- fs/smb/server/vfs_cache.h | 1 + 3 files changed, 54 insertions(+), 15 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 6645d8fd772e..fafa86273f12 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -4872,8 +4872,13 @@ static int get_file_standard_info(struct smb2_query_info_rsp *rsp, sinfo = (struct smb2_file_standard_info *)rsp->Buffer; delete_pending = ksmbd_inode_pending_delete(fp); - sinfo->AllocationSize = cpu_to_le64(stat.blocks << 9); - sinfo->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + if (ksmbd_stream_fd(fp) == false) { + sinfo->AllocationSize = cpu_to_le64(stat.blocks << 9); + sinfo->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + } else { + sinfo->AllocationSize = cpu_to_le64(fp->stream.size); + sinfo->EndOfFile = cpu_to_le64(fp->stream.size); + } sinfo->NumberOfLinks = cpu_to_le32(get_nlink(&stat) - delete_pending); sinfo->DeletePending = delete_pending; sinfo->Directory = S_ISDIR(stat.mode) ? 1 : 0; @@ -4936,9 +4941,14 @@ static int get_file_all_info(struct ksmbd_work *work, file_info->ChangeTime = cpu_to_le64(time); file_info->Attributes = fp->f_ci->m_fattr; file_info->Pad1 = 0; - file_info->AllocationSize = - cpu_to_le64(stat.blocks << 9); - file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + if (ksmbd_stream_fd(fp) == false) { + file_info->AllocationSize = + cpu_to_le64(stat.blocks << 9); + file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + } else { + file_info->AllocationSize = cpu_to_le64(fp->stream.size); + file_info->EndOfFile = cpu_to_le64(fp->stream.size); + } file_info->NumberOfLinks = cpu_to_le32(get_nlink(&stat) - delete_pending); file_info->DeletePending = delete_pending; @@ -4947,7 +4957,10 @@ static int get_file_all_info(struct ksmbd_work *work, file_info->IndexNumber = cpu_to_le64(stat.ino); file_info->EASize = 0; file_info->AccessFlags = fp->daccess; - file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos); + if (ksmbd_stream_fd(fp) == false) + file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos); + else + file_info->CurrentByteOffset = cpu_to_le64(fp->stream.pos); file_info->Mode = fp->coption; file_info->AlignmentRequirement = 0; conv_len = smbConvertToUTF16((__le16 *)file_info->FileName, filename, @@ -5135,8 +5148,13 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp, time = ksmbd_UnixTimeToNT(stat.ctime); file_info->ChangeTime = cpu_to_le64(time); file_info->Attributes = fp->f_ci->m_fattr; - file_info->AllocationSize = cpu_to_le64(stat.blocks << 9); - file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + if (ksmbd_stream_fd(fp) == false) { + file_info->AllocationSize = cpu_to_le64(stat.blocks << 9); + file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + } else { + file_info->AllocationSize = cpu_to_le64(fp->stream.size); + file_info->EndOfFile = cpu_to_le64(fp->stream.size); + } file_info->Reserved = cpu_to_le32(0); rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_ntwrk_info)); @@ -5159,7 +5177,11 @@ static void get_file_position_info(struct smb2_query_info_rsp *rsp, struct smb2_file_pos_info *file_info; file_info = (struct smb2_file_pos_info *)rsp->Buffer; - file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos); + if (ksmbd_stream_fd(fp) == false) + file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos); + else + file_info->CurrentByteOffset = cpu_to_le64(fp->stream.pos); + rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_pos_info)); } @@ -5248,8 +5270,13 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, file_info->ChangeTime = cpu_to_le64(time); file_info->DosAttributes = fp->f_ci->m_fattr; file_info->Inode = cpu_to_le64(stat.ino); - file_info->EndOfFile = cpu_to_le64(stat.size); - file_info->AllocationSize = cpu_to_le64(stat.blocks << 9); + if (ksmbd_stream_fd(fp) == false) { + file_info->EndOfFile = cpu_to_le64(stat.size); + file_info->AllocationSize = cpu_to_le64(stat.blocks << 9); + } else { + file_info->EndOfFile = cpu_to_le64(fp->stream.size); + file_info->AllocationSize = cpu_to_le64(fp->stream.size); + } file_info->HardLinks = cpu_to_le32(stat.nlink); file_info->Mode = cpu_to_le32(stat.mode & 0777); switch (stat.mode & S_IFMT) { @@ -6191,6 +6218,9 @@ static int set_file_allocation_info(struct ksmbd_work *work, if (!(fp->daccess & FILE_WRITE_DATA_LE)) return -EACCES; + if (ksmbd_stream_fd(fp) == true) + return 0; + rc = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); if (rc) @@ -6249,7 +6279,8 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp, * truncate of some filesystem like FAT32 fill zero data in * truncated range. */ - if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC) { + if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC && + ksmbd_stream_fd(fp) == false) { ksmbd_debug(SMB, "truncated to newsize %lld\n", newsize); rc = ksmbd_vfs_truncate(work, fp, newsize); if (rc) { @@ -6322,7 +6353,13 @@ static int set_file_position_info(struct ksmbd_file *fp, return -EINVAL; } - fp->filp->f_pos = current_byte_offset; + if (ksmbd_stream_fd(fp) == false) + fp->filp->f_pos = current_byte_offset; + else { + if (current_byte_offset > XATTR_SIZE_MAX) + current_byte_offset = XATTR_SIZE_MAX; + fp->stream.pos = current_byte_offset; + } return 0; } diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index ba45e809555a..0f3aad12e495 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -293,6 +293,7 @@ static int ksmbd_vfs_stream_read(struct ksmbd_file *fp, char *buf, loff_t *pos, if (v_len - *pos < count) count = v_len - *pos; + fp->stream.pos = v_len; memcpy(buf, &stream_buf[*pos], count); @@ -456,8 +457,8 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, true); if (err < 0) goto out; - - fp->filp->f_pos = *pos; + else + fp->stream.pos = size; err = 0; out: kvfree(stream_buf); diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h index 5bbb179736c2..0708155b5caf 100644 --- a/fs/smb/server/vfs_cache.h +++ b/fs/smb/server/vfs_cache.h @@ -44,6 +44,7 @@ struct ksmbd_lock { struct stream { char *name; ssize_t size; + loff_t pos; }; struct ksmbd_inode { From 60524f1d2bdf222db6dc3f680e0272441f697fe4 Mon Sep 17 00:00:00 2001 From: Meghana Malladi Date: Mon, 16 Jun 2025 12:03:19 +0530 Subject: [PATCH 191/242] net: ti: icssg-prueth: Fix packet handling for XDP_TX While transmitting XDP frames for XDP_TX, page_pool is used to get the DMA buffers (already mapped to the pages) and need to be freed/reycled once the transmission is complete. This need not be explicitly done by the driver as this is handled more gracefully by the xdp driver while returning the xdp frame. __xdp_return() frees the XDP memory based on its memory type, under which page_pool memory is also handled. This change fixes the transmit queue timeout while running XDP_TX. logs: [ 309.069682] icssg-prueth icssg1-eth eth2: NETDEV WATCHDOG: CPU: 0: transmit queue 0 timed out 45860 ms [ 313.933780] icssg-prueth icssg1-eth eth2: NETDEV WATCHDOG: CPU: 0: transmit queue 0 timed out 50724 ms [ 319.053656] icssg-prueth icssg1-eth eth2: NETDEV WATCHDOG: CPU: 0: transmit queue 0 timed out 55844 ms ... Fixes: 62aa3246f462 ("net: ti: icssg-prueth: Add XDP support") Signed-off-by: Meghana Malladi Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250616063319.3347541-1-m-malladi@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/icssg/icssg_common.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_common.c b/drivers/net/ethernet/ti/icssg/icssg_common.c index 5b8fdb882172..12f25cec6255 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_common.c +++ b/drivers/net/ethernet/ti/icssg/icssg_common.c @@ -98,20 +98,11 @@ void prueth_xmit_free(struct prueth_tx_chn *tx_chn, { struct cppi5_host_desc_t *first_desc, *next_desc; dma_addr_t buf_dma, next_desc_dma; - struct prueth_swdata *swdata; - struct page *page; u32 buf_dma_len; first_desc = desc; next_desc = first_desc; - swdata = cppi5_hdesc_get_swdata(desc); - if (swdata->type == PRUETH_SWDATA_PAGE) { - page = swdata->data.page; - page_pool_recycle_direct(page->pp, swdata->data.page); - goto free_desc; - } - cppi5_hdesc_get_obuf(first_desc, &buf_dma, &buf_dma_len); k3_udma_glue_tx_cppi5_to_dma_addr(tx_chn->tx_chn, &buf_dma); @@ -135,7 +126,6 @@ void prueth_xmit_free(struct prueth_tx_chn *tx_chn, k3_cppi_desc_pool_free(tx_chn->desc_pool, next_desc); } -free_desc: k3_cppi_desc_pool_free(tx_chn->desc_pool, first_desc); } EXPORT_SYMBOL_GPL(prueth_xmit_free); @@ -612,13 +602,8 @@ u32 emac_xmit_xdp_frame(struct prueth_emac *emac, k3_udma_glue_tx_dma_to_cppi5_addr(tx_chn->tx_chn, &buf_dma); cppi5_hdesc_attach_buf(first_desc, buf_dma, xdpf->len, buf_dma, xdpf->len); swdata = cppi5_hdesc_get_swdata(first_desc); - if (page) { - swdata->type = PRUETH_SWDATA_PAGE; - swdata->data.page = page; - } else { - swdata->type = PRUETH_SWDATA_XDPF; - swdata->data.xdpf = xdpf; - } + swdata->type = PRUETH_SWDATA_XDPF; + swdata->data.xdpf = xdpf; /* Report BQL before sending the packet */ netif_txq = netdev_get_tx_queue(ndev, tx_chn->id); From 6f793a1d053775f8324b8dba1e7ed224f8b0166f Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Sun, 15 Jun 2025 20:07:33 +0000 Subject: [PATCH 192/242] net: netmem: fix skb_ensure_writable with unreadable skbs skb_ensure_writable should succeed when it's trying to write to the header of the unreadable skbs, so it doesn't need an unconditional skb_frags_readable check. The preceding pskb_may_pull() call will succeed if write_len is within the head and fail if we're trying to write to the unreadable payload, so we don't need an additional check. Removing this check restores DSCP functionality with unreadable skbs as it's called from dscp_tg. Cc: willemb@google.com Cc: asml.silence@gmail.com Fixes: 65249feb6b3d ("net: add support for skbs with unreadable frags") Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250615200733.520113-1-almasrymina@google.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 85fc82f72d26..d6420b74ea9c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6261,9 +6261,6 @@ int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) if (!pskb_may_pull(skb, write_len)) return -ENOMEM; - if (!skb_frags_readable(skb)) - return -EFAULT; - if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) return 0; From 1e9ac33fa271be0d2480fd732f9642d81542500b Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Fri, 13 Jun 2025 16:18:39 -0700 Subject: [PATCH 193/242] bnxt_en: Fix double invocation of bnxt_ulp_stop()/bnxt_ulp_start() Before the commit under the Fixes tag below, bnxt_ulp_stop() and bnxt_ulp_start() were always invoked in pairs. After that commit, the new bnxt_ulp_restart() can be invoked after bnxt_ulp_stop() has been called. This may result in the RoCE driver's aux driver .suspend() method being invoked twice. The 2nd bnxt_re_suspend() call will crash when it dereferences a NULL pointer: (NULL ib_device): Handle device suspend call BUG: kernel NULL pointer dereference, address: 0000000000000b78 PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP PTI CPU: 20 UID: 0 PID: 181 Comm: kworker/u96:5 Tainted: G S 6.15.0-rc1 #4 PREEMPT(voluntary) Tainted: [S]=CPU_OUT_OF_SPEC Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.4.3 01/17/2017 Workqueue: bnxt_pf_wq bnxt_sp_task [bnxt_en] RIP: 0010:bnxt_re_suspend+0x45/0x1f0 [bnxt_re] Code: 8b 05 a7 3c 5b f5 48 89 44 24 18 31 c0 49 8b 5c 24 08 4d 8b 2c 24 e8 ea 06 0a f4 48 c7 c6 04 60 52 c0 48 89 df e8 1b ce f9 ff <48> 8b 83 78 0b 00 00 48 8b 80 38 03 00 00 a8 40 0f 85 b5 00 00 00 RSP: 0018:ffffa2e84084fd88 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001 RDX: 0000000000000000 RSI: ffffffffb4b6b934 RDI: 00000000ffffffff RBP: ffffa1760954c9c0 R08: 0000000000000000 R09: c0000000ffffdfff R10: 0000000000000001 R11: ffffa2e84084fb50 R12: ffffa176031ef070 R13: ffffa17609775000 R14: ffffa17603adc180 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffffa17daa397000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000b78 CR3: 00000004aaa30003 CR4: 00000000003706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: bnxt_ulp_stop+0x69/0x90 [bnxt_en] bnxt_sp_task+0x678/0x920 [bnxt_en] ? __schedule+0x514/0xf50 process_scheduled_works+0x9d/0x400 worker_thread+0x11c/0x260 ? __pfx_worker_thread+0x10/0x10 kthread+0xfe/0x1e0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2b/0x40 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Check the BNXT_EN_FLAG_ULP_STOPPED flag and do not proceed if the flag is already set. This will preserve the original symmetrical bnxt_ulp_stop() and bnxt_ulp_start(). Also, inside bnxt_ulp_start(), clear the BNXT_EN_FLAG_ULP_STOPPED flag after taking the mutex to avoid any race condition. And for symmetry, only proceed in bnxt_ulp_start() if the BNXT_EN_FLAG_ULP_STOPPED is set. Fixes: 3c163f35bd50 ("bnxt_en: Optimize recovery path ULP locking in the driver") Signed-off-by: Kalesh AP Co-developed-by: Michael Chan Signed-off-by: Michael Chan Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250613231841.377988-2-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index 84c4812414fd..2450a369b792 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -231,10 +231,9 @@ void bnxt_ulp_stop(struct bnxt *bp) return; mutex_lock(&edev->en_dev_lock); - if (!bnxt_ulp_registered(edev)) { - mutex_unlock(&edev->en_dev_lock); - return; - } + if (!bnxt_ulp_registered(edev) || + (edev->flags & BNXT_EN_FLAG_ULP_STOPPED)) + goto ulp_stop_exit; edev->flags |= BNXT_EN_FLAG_ULP_STOPPED; if (aux_priv) { @@ -250,6 +249,7 @@ void bnxt_ulp_stop(struct bnxt *bp) adrv->suspend(adev, pm); } } +ulp_stop_exit: mutex_unlock(&edev->en_dev_lock); } @@ -258,19 +258,13 @@ void bnxt_ulp_start(struct bnxt *bp, int err) struct bnxt_aux_priv *aux_priv = bp->aux_priv; struct bnxt_en_dev *edev = bp->edev; - if (!edev) - return; - - edev->flags &= ~BNXT_EN_FLAG_ULP_STOPPED; - - if (err) + if (!edev || err) return; mutex_lock(&edev->en_dev_lock); - if (!bnxt_ulp_registered(edev)) { - mutex_unlock(&edev->en_dev_lock); - return; - } + if (!bnxt_ulp_registered(edev) || + !(edev->flags & BNXT_EN_FLAG_ULP_STOPPED)) + goto ulp_start_exit; if (edev->ulp_tbl->msix_requested) bnxt_fill_msix_vecs(bp, edev->msix_entries); @@ -287,6 +281,8 @@ void bnxt_ulp_start(struct bnxt *bp, int err) adrv->resume(adev); } } +ulp_start_exit: + edev->flags &= ~BNXT_EN_FLAG_ULP_STOPPED; mutex_unlock(&edev->en_dev_lock); } From e11baaea94e2923739a98abeee85eb0667c04fd3 Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Fri, 13 Jun 2025 16:18:40 -0700 Subject: [PATCH 194/242] bnxt_en: Add a helper function to configure MRU and RSS Add a new helper function that will configure MRU and RSS table of a VNIC. This will be useful when we configure both on a VNIC when resetting an RX ring. This function will be used again in the next bug fix patch where we have to reconfigure VNICs for RSS contexts. Suggested-by: Michael Chan Reviewed-by: Simon Horman Reviewed-by: David Wei Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250613231841.377988-3-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 37 ++++++++++++++++------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 869580b6f70d..dfd2366d4c8c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -10780,6 +10780,26 @@ void bnxt_del_one_rss_ctx(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx, bp->num_rss_ctx--; } +static int bnxt_set_vnic_mru_p5(struct bnxt *bp, struct bnxt_vnic_info *vnic, + u16 mru) +{ + int rc; + + if (mru) { + rc = bnxt_hwrm_vnic_set_rss_p5(bp, vnic, true); + if (rc) { + netdev_err(bp->dev, "hwrm vnic %d set rss failure rc: %d\n", + vnic->vnic_id, rc); + return rc; + } + } + vnic->mru = mru; + bnxt_hwrm_vnic_update(bp, vnic, + VNIC_UPDATE_REQ_ENABLES_MRU_VALID); + + return 0; +} + static void bnxt_hwrm_realloc_rss_ctx_vnic(struct bnxt *bp) { bool set_tpa = !!(bp->flags & BNXT_FLAG_TPA); @@ -15927,6 +15947,7 @@ static int bnxt_queue_start(struct net_device *dev, void *qmem, int idx) struct bnxt_vnic_info *vnic; struct bnxt_napi *bnapi; int i, rc; + u16 mru; rxr = &bp->rx_ring[idx]; clone = qmem; @@ -15977,18 +15998,13 @@ static int bnxt_queue_start(struct net_device *dev, void *qmem, int idx) napi_enable_locked(&bnapi->napi); bnxt_db_nq_arm(bp, &cpr->cp_db, cpr->cp_raw_cons); + mru = bp->dev->mtu + ETH_HLEN + VLAN_HLEN; for (i = 0; i < bp->nr_vnics; i++) { vnic = &bp->vnic_info[i]; - rc = bnxt_hwrm_vnic_set_rss_p5(bp, vnic, true); - if (rc) { - netdev_err(bp->dev, "hwrm vnic %d set rss failure rc: %d\n", - vnic->vnic_id, rc); + rc = bnxt_set_vnic_mru_p5(bp, vnic, mru); + if (rc) return rc; - } - vnic->mru = bp->dev->mtu + ETH_HLEN + VLAN_HLEN; - bnxt_hwrm_vnic_update(bp, vnic, - VNIC_UPDATE_REQ_ENABLES_MRU_VALID); } return 0; @@ -16013,9 +16029,8 @@ static int bnxt_queue_stop(struct net_device *dev, void *qmem, int idx) for (i = 0; i < bp->nr_vnics; i++) { vnic = &bp->vnic_info[i]; - vnic->mru = 0; - bnxt_hwrm_vnic_update(bp, vnic, - VNIC_UPDATE_REQ_ENABLES_MRU_VALID); + + bnxt_set_vnic_mru_p5(bp, vnic, 0); } /* Make sure NAPI sees that the VNIC is disabled */ synchronize_net(); From 5dacc94c6fe61cde6f700e95cf35af9944b022c4 Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Fri, 13 Jun 2025 16:18:41 -0700 Subject: [PATCH 195/242] bnxt_en: Update MRU and RSS table of RSS contexts on queue reset The commit under the Fixes tag below which updates the VNICs' RSS and MRU during .ndo_queue_start(), needs to be extended to cover any non-default RSS contexts which have their own VNICs. Without this step, packets that are destined to a non-default RSS context may be dropped after .ndo_queue_start(). We further optimize this scheme by updating the VNIC only if the RX ring being restarted is in the RSS table of the VNIC. Updating the VNIC (in particular setting the MRU to 0) will momentarily stop all traffic to all rings in the RSS table. Any VNIC that has the RX ring excluded from the RSS table can skip this step and avoid the traffic disruption. Note that this scheme is just an improvement. A VNIC with multiple rings in the RSS table will still see traffic disruptions to all rings in the RSS table when one of the rings is being restarted. We are working on a FW scheme that will improve upon this further. Fixes: 5ac066b7b062 ("bnxt_en: Fix queue start to update vnic RSS table") Reported-by: David Wei Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250613231841.377988-4-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 56 +++++++++++++++++++++-- 1 file changed, 51 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index dfd2366d4c8c..2cb3185c442c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -10780,11 +10780,39 @@ void bnxt_del_one_rss_ctx(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx, bp->num_rss_ctx--; } +static bool bnxt_vnic_has_rx_ring(struct bnxt *bp, struct bnxt_vnic_info *vnic, + int rxr_id) +{ + u16 tbl_size = bnxt_get_rxfh_indir_size(bp->dev); + int i, vnic_rx; + + /* Ntuple VNIC always has all the rx rings. Any change of ring id + * must be updated because a future filter may use it. + */ + if (vnic->flags & BNXT_VNIC_NTUPLE_FLAG) + return true; + + for (i = 0; i < tbl_size; i++) { + if (vnic->flags & BNXT_VNIC_RSSCTX_FLAG) + vnic_rx = ethtool_rxfh_context_indir(vnic->rss_ctx)[i]; + else + vnic_rx = bp->rss_indir_tbl[i]; + + if (rxr_id == vnic_rx) + return true; + } + + return false; +} + static int bnxt_set_vnic_mru_p5(struct bnxt *bp, struct bnxt_vnic_info *vnic, - u16 mru) + u16 mru, int rxr_id) { int rc; + if (!bnxt_vnic_has_rx_ring(bp, vnic, rxr_id)) + return 0; + if (mru) { rc = bnxt_hwrm_vnic_set_rss_p5(bp, vnic, true); if (rc) { @@ -10800,6 +10828,24 @@ static int bnxt_set_vnic_mru_p5(struct bnxt *bp, struct bnxt_vnic_info *vnic, return 0; } +static int bnxt_set_rss_ctx_vnic_mru(struct bnxt *bp, u16 mru, int rxr_id) +{ + struct ethtool_rxfh_context *ctx; + unsigned long context; + int rc; + + xa_for_each(&bp->dev->ethtool->rss_ctx, context, ctx) { + struct bnxt_rss_ctx *rss_ctx = ethtool_rxfh_context_priv(ctx); + struct bnxt_vnic_info *vnic = &rss_ctx->vnic; + + rc = bnxt_set_vnic_mru_p5(bp, vnic, mru, rxr_id); + if (rc) + return rc; + } + + return 0; +} + static void bnxt_hwrm_realloc_rss_ctx_vnic(struct bnxt *bp) { bool set_tpa = !!(bp->flags & BNXT_FLAG_TPA); @@ -16002,12 +16048,11 @@ static int bnxt_queue_start(struct net_device *dev, void *qmem, int idx) for (i = 0; i < bp->nr_vnics; i++) { vnic = &bp->vnic_info[i]; - rc = bnxt_set_vnic_mru_p5(bp, vnic, mru); + rc = bnxt_set_vnic_mru_p5(bp, vnic, mru, idx); if (rc) return rc; } - - return 0; + return bnxt_set_rss_ctx_vnic_mru(bp, mru, idx); err_reset: netdev_err(bp->dev, "Unexpected HWRM error during queue start rc: %d\n", @@ -16030,8 +16075,9 @@ static int bnxt_queue_stop(struct net_device *dev, void *qmem, int idx) for (i = 0; i < bp->nr_vnics; i++) { vnic = &bp->vnic_info[i]; - bnxt_set_vnic_mru_p5(bp, vnic, 0); + bnxt_set_vnic_mru_p5(bp, vnic, 0, idx); } + bnxt_set_rss_ctx_vnic_mru(bp, 0, idx); /* Make sure NAPI sees that the VNIC is disabled */ synchronize_net(); rxr = &bp->rx_ring[idx]; From 5ab73b010cad294851e558f1d4714a85c6f206c7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 13 Jun 2025 20:47:48 +0300 Subject: [PATCH 196/242] ptp: fix breakage after ptp_vclock_in_use() rework What is broken -------------- ptp4l, and any other application which calls clock_adjtime() on a physical clock, is greeted with error -EBUSY after commit 87f7ce260a3c ("ptp: remove ptp->n_vclocks check logic in ptp_vclock_in_use()"). Explanation for the breakage ---------------------------- The blamed commit was based on the false assumption that ptp_vclock_in_use() callers already test for n_vclocks prior to calling this function. This is notably incorrect for the code path below, in which there is, in fact, no n_vclocks test: ptp_clock_adjtime() -> ptp_clock_freerun() -> ptp_vclock_in_use() The result is that any clock adjustment on any physical clock is now impossible. This is _despite_ there not being any vclock over this physical clock. $ ptp4l -i eno0 -2 -P -m ptp4l[58.425]: selected /dev/ptp0 as PTP clock [ 58.429749] ptp: physical clock is free running ptp4l[58.431]: Failed to open /dev/ptp0: Device or resource busy failed to create a clock $ cat /sys/class/ptp/ptp0/n_vclocks 0 The patch makes the ptp_vclock_in_use() function say "if it's not a virtual clock, then this physical clock does have virtual clocks on top". Then ptp_clock_freerun() uses this information to say "this physical clock has virtual clocks on top, so it must stay free-running". Then ptp_clock_adjtime() uses this information to say "well, if this physical clock has to be free-running, I can't do it, return -EBUSY". Simply put, ptp_vclock_in_use() cannot be simplified so as to remove the test whether vclocks are in use. What did the blamed commit intend to fix ---------------------------------------- The blamed commit presents a lockdep warning stating "possible recursive locking detected", with the n_vclocks_store() and ptp_clock_unregister() functions involved. The recursive locking seems this: n_vclocks_store() -> mutex_lock_interruptible(&ptp->n_vclocks_mux) // 1 -> device_for_each_child_reverse(..., unregister_vclock) -> unregister_vclock() -> ptp_vclock_unregister() -> ptp_clock_unregister() -> ptp_vclock_in_use() -> mutex_lock_interruptible(&ptp->n_vclocks_mux) // 2 The issue can be triggered by creating and then deleting vclocks: $ echo 2 > /sys/class/ptp/ptp0/n_vclocks $ echo 0 > /sys/class/ptp/ptp0/n_vclocks But note that in the original stack trace, the address of the first lock is different from the address of the second lock. This is because at step 1 marked above, &ptp->n_vclocks_mux is the lock of the parent (physical) PTP clock, and at step 2, the lock is of the child (virtual) PTP clock. They are different locks of different devices. In this situation there is no real deadlock, the lockdep warning is caused by the fact that the mutexes have the same lock class on both the parent and the child. Functionally it is fine. Proposed alternative solution ----------------------------- We must reintroduce the body of ptp_vclock_in_use() mostly as it was structured prior to the blamed commit, but avoid the lockdep warning. Based on the fact that vclocks cannot be nested on top of one another (ptp_is_attribute_visible() hides n_vclocks for virtual clocks), we already know that ptp->n_vclocks is zero for a virtual clock. And ptp->is_virtual_clock is a runtime invariant, established at ptp_clock_register() time and never changed. There is no need to serialize on any mutex in order to read ptp->is_virtual_clock, and we take advantage of that by moving it outside the lock. Thus, virtual clocks do not need to acquire &ptp->n_vclocks_mux at all, and step 2 in the code walkthrough above can simply go away. We can simply return false to the question "ptp_vclock_in_use(a virtual clock)". Other notes ----------- Releasing &ptp->n_vclocks_mux before ptp_vclock_in_use() returns execution seems racy, because the returned value can become stale as soon as the function returns and before the return value is used (i.e. n_vclocks_store() can run any time). The locking requirement should somehow be transferred to the caller, to ensure a longer life time for the returned value, but this seems out of scope for this severe bug fix. Because we are also fixing up the logic from the original commit, there is another Fixes: tag for that. Fixes: 87f7ce260a3c ("ptp: remove ptp->n_vclocks check logic in ptp_vclock_in_use()") Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20250613174749.406826-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_private.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h index 528d86a33f37..a6aad743c282 100644 --- a/drivers/ptp/ptp_private.h +++ b/drivers/ptp/ptp_private.h @@ -98,7 +98,27 @@ static inline int queue_cnt(const struct timestamp_event_queue *q) /* Check if ptp virtual clock is in use */ static inline bool ptp_vclock_in_use(struct ptp_clock *ptp) { - return !ptp->is_virtual_clock; + bool in_use = false; + + /* Virtual clocks can't be stacked on top of virtual clocks. + * Avoid acquiring the n_vclocks_mux on virtual clocks, to allow this + * function to be called from code paths where the n_vclocks_mux of the + * parent physical clock is already held. Functionally that's not an + * issue, but lockdep would complain, because they have the same lock + * class. + */ + if (ptp->is_virtual_clock) + return false; + + if (mutex_lock_interruptible(&ptp->n_vclocks_mux)) + return true; + + if (ptp->n_vclocks) + in_use = true; + + mutex_unlock(&ptp->n_vclocks_mux); + + return in_use; } /* Check if ptp clock shall be free running */ From aa112cbc5f0ac6f3b44d829005bf34005d9fe9bb Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 13 Jun 2025 20:47:49 +0300 Subject: [PATCH 197/242] ptp: allow reading of currently dialed frequency to succeed on free-running clocks There is a bug in ptp_clock_adjtime() which makes it refuse the operation even if we just want to read the current clock dialed frequency, not modify anything (tx->modes == 0). That should be possible even if the clock is free-running. For context, the kernel UAPI is the same for getting and setting the frequency of a POSIX clock. For example, ptp4l errors out at clock_create() -> clockadj_get_freq() -> clock_adjtime() time, when it should logically only have failed on actual adjustments to the clock, aka if the clock was configured as slave. But in master mode it should work. This was discovered when examining the issue described in the previous commit, where ptp_clock_freerun() returned true despite n_vclocks being zero. Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20250613174749.406826-3-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_clock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 35a5994bf64f..36f57d7b4a66 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -121,7 +121,8 @@ static int ptp_clock_adjtime(struct posix_clock *pc, struct __kernel_timex *tx) struct ptp_clock_info *ops; int err = -EOPNOTSUPP; - if (ptp_clock_freerun(ptp)) { + if (tx->modes & (ADJ_SETOFFSET | ADJ_FREQUENCY | ADJ_OFFSET) && + ptp_clock_freerun(ptp)) { pr_err("ptp: physical clock is free running\n"); return -EBUSY; } From b160766e26d4e2e2d6fe2294e0b02f92baefcec5 Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim Date: Fri, 13 Jun 2025 20:54:57 -0400 Subject: [PATCH 198/242] net/sched: fix use-after-free in taprio_dev_notifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since taprio’s taprio_dev_notifier() isn’t protected by an RCU read-side critical section, a race with advance_sched() can lead to a use-after-free. Adding rcu_read_lock() inside taprio_dev_notifier() prevents this. Fixes: fed87cc6718a ("net/sched: taprio: automatically calculate queueMaxSDU based on TC gate durations") Cc: stable@vger.kernel.org Signed-off-by: Hyunwoo Kim Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/aEzIYYxt0is9upYG@v4bel-B760M-AORUS-ELITE-AX Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 14021b812329..2b14c81a87e5 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1328,13 +1328,15 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, stab = rtnl_dereference(q->root->stab); - oper = rtnl_dereference(q->oper_sched); + rcu_read_lock(); + oper = rcu_dereference(q->oper_sched); if (oper) taprio_update_queue_max_sdu(q, oper, stab); - admin = rtnl_dereference(q->admin_sched); + admin = rcu_dereference(q->admin_sched); if (admin) taprio_update_queue_max_sdu(q, admin, stab); + rcu_read_unlock(); break; } From 18ae7d0cdd76420e80f6ab15ada063708f14ba40 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 9 Jun 2025 22:06:22 -0500 Subject: [PATCH 199/242] wifi: ath12k: Avoid CPU busy-wait by handling VDEV_STAT and BCN_STAT When the ath12k driver is built without CONFIG_ATH12K_DEBUG, the recently refactored stats code can cause any user space application (such at NetworkManager) to consume 100% CPU for 3 seconds, every time stats are read. Commit 'b8a0d83fe4c7 ("wifi: ath12k: move firmware stats out of debugfs")' moved ath12k_debugfs_fw_stats_request() out of debugfs, by merging the additional logic into ath12k_mac_get_fw_stats(). Among the added responsibility of ath12k_mac_get_fw_stats() was the busy-wait for `fw_stats_done`. Signalling of `fw_stats_done` happens when one of the WMI_REQUEST_PDEV_STAT, WMI_REQUEST_VDEV_STAT, and WMI_REQUEST_BCN_STAT messages are received, but the handling of the latter two commands remained in the debugfs code. As `fw_stats_done` isn't signalled, the calling processes will spin until the timeout (3 seconds) is reached. Moving the handling of these two additional responses out of debugfs resolves the issue. Fixes: b8a0d83fe4c7 ("wifi: ath12k: move firmware stats out of debugfs") Signed-off-by: Bjorn Andersson Tested-by: Abel Vesa Link: https://patch.msgid.link/20250609-ath12k-fw-stats-done-v1-1-2b3624656697@oss.qualcomm.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/debugfs.c | 58 -------------------- drivers/net/wireless/ath/ath12k/debugfs.h | 7 --- drivers/net/wireless/ath/ath12k/wmi.c | 67 ++++++++++++++++++++--- 3 files changed, 60 insertions(+), 72 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/debugfs.c b/drivers/net/wireless/ath/ath12k/debugfs.c index dd624d73b8b2..23da93afaa5c 100644 --- a/drivers/net/wireless/ath/ath12k/debugfs.c +++ b/drivers/net/wireless/ath/ath12k/debugfs.c @@ -1251,64 +1251,6 @@ void ath12k_debugfs_soc_destroy(struct ath12k_base *ab) */ } -void -ath12k_debugfs_fw_stats_process(struct ath12k *ar, - struct ath12k_fw_stats *stats) -{ - struct ath12k_base *ab = ar->ab; - struct ath12k_pdev *pdev; - bool is_end; - static unsigned int num_vdev, num_bcn; - size_t total_vdevs_started = 0; - int i; - - if (stats->stats_id == WMI_REQUEST_VDEV_STAT) { - if (list_empty(&stats->vdevs)) { - ath12k_warn(ab, "empty vdev stats"); - return; - } - /* FW sends all the active VDEV stats irrespective of PDEV, - * hence limit until the count of all VDEVs started - */ - rcu_read_lock(); - for (i = 0; i < ab->num_radios; i++) { - pdev = rcu_dereference(ab->pdevs_active[i]); - if (pdev && pdev->ar) - total_vdevs_started += pdev->ar->num_started_vdevs; - } - rcu_read_unlock(); - - is_end = ((++num_vdev) == total_vdevs_started); - - list_splice_tail_init(&stats->vdevs, - &ar->fw_stats.vdevs); - - if (is_end) { - ar->fw_stats.fw_stats_done = true; - num_vdev = 0; - } - return; - } - if (stats->stats_id == WMI_REQUEST_BCN_STAT) { - if (list_empty(&stats->bcn)) { - ath12k_warn(ab, "empty beacon stats"); - return; - } - /* Mark end until we reached the count of all started VDEVs - * within the PDEV - */ - is_end = ((++num_bcn) == ar->num_started_vdevs); - - list_splice_tail_init(&stats->bcn, - &ar->fw_stats.bcn); - - if (is_end) { - ar->fw_stats.fw_stats_done = true; - num_bcn = 0; - } - } -} - static int ath12k_open_vdev_stats(struct inode *inode, struct file *file) { struct ath12k *ar = inode->i_private; diff --git a/drivers/net/wireless/ath/ath12k/debugfs.h b/drivers/net/wireless/ath/ath12k/debugfs.h index ebef7dace344..21641a8a0346 100644 --- a/drivers/net/wireless/ath/ath12k/debugfs.h +++ b/drivers/net/wireless/ath/ath12k/debugfs.h @@ -12,8 +12,6 @@ void ath12k_debugfs_soc_create(struct ath12k_base *ab); void ath12k_debugfs_soc_destroy(struct ath12k_base *ab); void ath12k_debugfs_register(struct ath12k *ar); void ath12k_debugfs_unregister(struct ath12k *ar); -void ath12k_debugfs_fw_stats_process(struct ath12k *ar, - struct ath12k_fw_stats *stats); void ath12k_debugfs_op_vif_add(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void ath12k_debugfs_pdev_create(struct ath12k_base *ab); @@ -126,11 +124,6 @@ static inline void ath12k_debugfs_unregister(struct ath12k *ar) { } -static inline void ath12k_debugfs_fw_stats_process(struct ath12k *ar, - struct ath12k_fw_stats *stats) -{ -} - static inline bool ath12k_debugfs_is_extd_rx_stats_enabled(struct ath12k *ar) { return false; diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index 60e2444fe08c..2d2444417e2b 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -7626,6 +7626,63 @@ static int ath12k_wmi_pull_fw_stats(struct ath12k_base *ab, struct sk_buff *skb, &parse); } +static void ath12k_wmi_fw_stats_process(struct ath12k *ar, + struct ath12k_fw_stats *stats) +{ + struct ath12k_base *ab = ar->ab; + struct ath12k_pdev *pdev; + bool is_end; + static unsigned int num_vdev, num_bcn; + size_t total_vdevs_started = 0; + int i; + + if (stats->stats_id == WMI_REQUEST_VDEV_STAT) { + if (list_empty(&stats->vdevs)) { + ath12k_warn(ab, "empty vdev stats"); + return; + } + /* FW sends all the active VDEV stats irrespective of PDEV, + * hence limit until the count of all VDEVs started + */ + rcu_read_lock(); + for (i = 0; i < ab->num_radios; i++) { + pdev = rcu_dereference(ab->pdevs_active[i]); + if (pdev && pdev->ar) + total_vdevs_started += pdev->ar->num_started_vdevs; + } + rcu_read_unlock(); + + is_end = ((++num_vdev) == total_vdevs_started); + + list_splice_tail_init(&stats->vdevs, + &ar->fw_stats.vdevs); + + if (is_end) { + ar->fw_stats.fw_stats_done = true; + num_vdev = 0; + } + return; + } + if (stats->stats_id == WMI_REQUEST_BCN_STAT) { + if (list_empty(&stats->bcn)) { + ath12k_warn(ab, "empty beacon stats"); + return; + } + /* Mark end until we reached the count of all started VDEVs + * within the PDEV + */ + is_end = ((++num_bcn) == ar->num_started_vdevs); + + list_splice_tail_init(&stats->bcn, + &ar->fw_stats.bcn); + + if (is_end) { + ar->fw_stats.fw_stats_done = true; + num_bcn = 0; + } + } +} + static void ath12k_update_stats_event(struct ath12k_base *ab, struct sk_buff *skb) { struct ath12k_fw_stats stats = {}; @@ -7655,19 +7712,15 @@ static void ath12k_update_stats_event(struct ath12k_base *ab, struct sk_buff *sk spin_lock_bh(&ar->data_lock); - /* WMI_REQUEST_PDEV_STAT can be requested via .get_txpower mac ops or via - * debugfs fw stats. Therefore, processing it separately. - */ + /* Handle WMI_REQUEST_PDEV_STAT status update */ if (stats.stats_id == WMI_REQUEST_PDEV_STAT) { list_splice_tail_init(&stats.pdevs, &ar->fw_stats.pdevs); ar->fw_stats.fw_stats_done = true; goto complete; } - /* WMI_REQUEST_VDEV_STAT and WMI_REQUEST_BCN_STAT are currently requested only - * via debugfs fw stats. Hence, processing these in debugfs context. - */ - ath12k_debugfs_fw_stats_process(ar, &stats); + /* Handle WMI_REQUEST_VDEV_STAT and WMI_REQUEST_BCN_STAT updates. */ + ath12k_wmi_fw_stats_process(ar, &stats); complete: complete(&ar->fw_stats_complete); From 062ade23991e556a14bbb27c1cf873c34dbd9d28 Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Thu, 22 May 2025 16:54:10 +0800 Subject: [PATCH 200/242] wifi: ath12k: parse and save hardware mode info from WMI_SERVICE_READY_EXT_EVENTID event for later use WLAN hardware might support various hardware modes such as DBS (dual band simultaneously), SBS (single band simultaneously) and DBS_OR_SBS etc, see enum wmi_host_hw_mode_config_type. Firmware advertises actual supported modes in WMI_SERVICE_READY_EXT_EVENTID event. For each mode, firmware advertises frequency range each hardware MAC can operate on. In MLO case such information is necessary during vdev activation and link selection (which is done in following patches), so add a new structure ath12k_svc_ext_info to ath12k_wmi_base, then parse and save those information to it for later use. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00284-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1 Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.4.1-00199-QCAHKSWPL_SILICONZ-1 Signed-off-by: Baochen Qiang Reviewed-by: Vasanthakumar Thiagarajan Link: https://patch.msgid.link/20250522-ath12k-sbs-dbs-v1-1-54a29e7a3a88@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/wmi.c | 69 ++++++++++++++++++++++++++- drivers/net/wireless/ath/ath12k/wmi.h | 31 ++++++++++++ 2 files changed, 98 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index 2d2444417e2b..edb38fbb4ed4 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -4395,6 +4395,7 @@ static int ath12k_wmi_hw_mode_caps_parse(struct ath12k_base *soc, static int ath12k_wmi_hw_mode_caps(struct ath12k_base *soc, u16 len, const void *ptr, void *data) { + struct ath12k_svc_ext_info *svc_ext_info = &soc->wmi_ab.svc_ext_info; struct ath12k_wmi_svc_rdy_ext_parse *svc_rdy_ext = data; const struct ath12k_wmi_hw_mode_cap_params *hw_mode_caps; enum wmi_host_hw_mode_config_type mode, pref; @@ -4427,8 +4428,11 @@ static int ath12k_wmi_hw_mode_caps(struct ath12k_base *soc, } } - ath12k_dbg(soc, ATH12K_DBG_WMI, "preferred_hw_mode:%d\n", - soc->wmi_ab.preferred_hw_mode); + svc_ext_info->num_hw_modes = svc_rdy_ext->n_hw_mode_caps; + + ath12k_dbg(soc, ATH12K_DBG_WMI, "num hw modes %u preferred_hw_mode %d\n", + svc_ext_info->num_hw_modes, soc->wmi_ab.preferred_hw_mode); + if (soc->wmi_ab.preferred_hw_mode == WMI_HOST_HW_MODE_MAX) return -EINVAL; @@ -4658,6 +4662,65 @@ static int ath12k_wmi_dma_ring_caps(struct ath12k_base *ab, return ret; } +static void +ath12k_wmi_save_mac_phy_info(struct ath12k_base *ab, + const struct ath12k_wmi_mac_phy_caps_params *mac_phy_cap, + struct ath12k_svc_ext_mac_phy_info *mac_phy_info) +{ + mac_phy_info->phy_id = __le32_to_cpu(mac_phy_cap->phy_id); + mac_phy_info->supported_bands = __le32_to_cpu(mac_phy_cap->supported_bands); + mac_phy_info->hw_freq_range.low_2ghz_freq = + __le32_to_cpu(mac_phy_cap->low_2ghz_chan_freq); + mac_phy_info->hw_freq_range.high_2ghz_freq = + __le32_to_cpu(mac_phy_cap->high_2ghz_chan_freq); + mac_phy_info->hw_freq_range.low_5ghz_freq = + __le32_to_cpu(mac_phy_cap->low_5ghz_chan_freq); + mac_phy_info->hw_freq_range.high_5ghz_freq = + __le32_to_cpu(mac_phy_cap->high_5ghz_chan_freq); +} + +static void +ath12k_wmi_save_all_mac_phy_info(struct ath12k_base *ab, + struct ath12k_wmi_svc_rdy_ext_parse *svc_rdy_ext) +{ + struct ath12k_svc_ext_info *svc_ext_info = &ab->wmi_ab.svc_ext_info; + const struct ath12k_wmi_mac_phy_caps_params *mac_phy_cap; + const struct ath12k_wmi_hw_mode_cap_params *hw_mode_cap; + struct ath12k_svc_ext_mac_phy_info *mac_phy_info; + u32 hw_mode_id, phy_bit_map; + u8 hw_idx; + + mac_phy_info = &svc_ext_info->mac_phy_info[0]; + mac_phy_cap = svc_rdy_ext->mac_phy_caps; + + for (hw_idx = 0; hw_idx < svc_ext_info->num_hw_modes; hw_idx++) { + hw_mode_cap = &svc_rdy_ext->hw_mode_caps[hw_idx]; + hw_mode_id = __le32_to_cpu(hw_mode_cap->hw_mode_id); + phy_bit_map = __le32_to_cpu(hw_mode_cap->phy_id_map); + + while (phy_bit_map) { + ath12k_wmi_save_mac_phy_info(ab, mac_phy_cap, mac_phy_info); + mac_phy_info->hw_mode_config_type = + le32_get_bits(hw_mode_cap->hw_mode_config_type, + WMI_HW_MODE_CAP_CFG_TYPE); + ath12k_dbg(ab, ATH12K_DBG_WMI, + "hw_idx %u hw_mode_id %u hw_mode_config_type %u supported_bands %u phy_id %u 2 GHz [%u - %u] 5 GHz [%u - %u]\n", + hw_idx, hw_mode_id, + mac_phy_info->hw_mode_config_type, + mac_phy_info->supported_bands, mac_phy_info->phy_id, + mac_phy_info->hw_freq_range.low_2ghz_freq, + mac_phy_info->hw_freq_range.high_2ghz_freq, + mac_phy_info->hw_freq_range.low_5ghz_freq, + mac_phy_info->hw_freq_range.high_5ghz_freq); + + mac_phy_cap++; + mac_phy_info++; + + phy_bit_map >>= 1; + } + } +} + static int ath12k_wmi_svc_rdy_ext_parse(struct ath12k_base *ab, u16 tag, u16 len, const void *ptr, void *data) @@ -4706,6 +4769,8 @@ static int ath12k_wmi_svc_rdy_ext_parse(struct ath12k_base *ab, return ret; } + ath12k_wmi_save_all_mac_phy_info(ab, svc_rdy_ext); + svc_rdy_ext->mac_phy_done = true; } else if (!svc_rdy_ext->ext_hal_reg_done) { ret = ath12k_wmi_ext_hal_reg_caps(ab, len, ptr, svc_rdy_ext); diff --git a/drivers/net/wireless/ath/ath12k/wmi.h b/drivers/net/wireless/ath/ath12k/wmi.h index ac18f75e0449..96c31b7820ec 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.h +++ b/drivers/net/wireless/ath/ath12k/wmi.h @@ -2617,6 +2617,8 @@ struct ath12k_wmi_soc_mac_phy_hw_mode_caps_params { __le32 num_chainmask_tables; } __packed; +#define WMI_HW_MODE_CAP_CFG_TYPE GENMASK(27, 0) + struct ath12k_wmi_hw_mode_cap_params { __le32 tlv_header; __le32 hw_mode_id; @@ -2666,6 +2668,12 @@ struct ath12k_wmi_mac_phy_caps_params { __le32 he_cap_info_2g_ext; __le32 he_cap_info_5g_ext; __le32 he_cap_info_internal; + __le32 wireless_modes; + __le32 low_2ghz_chan_freq; + __le32 high_2ghz_chan_freq; + __le32 low_5ghz_chan_freq; + __le32 high_5ghz_chan_freq; + __le32 nss_ratio; } __packed; struct ath12k_wmi_hal_reg_caps_ext_params { @@ -5049,6 +5057,27 @@ struct ath12k_wmi_pdev { u32 rx_decap_mode; }; +struct ath12k_hw_mode_freq_range_arg { + u32 low_2ghz_freq; + u32 high_2ghz_freq; + u32 low_5ghz_freq; + u32 high_5ghz_freq; +}; + +struct ath12k_svc_ext_mac_phy_info { + enum wmi_host_hw_mode_config_type hw_mode_config_type; + u32 phy_id; + u32 supported_bands; + struct ath12k_hw_mode_freq_range_arg hw_freq_range; +}; + +#define ATH12K_MAX_MAC_PHY_CAP 8 + +struct ath12k_svc_ext_info { + u32 num_hw_modes; + struct ath12k_svc_ext_mac_phy_info mac_phy_info[ATH12K_MAX_MAC_PHY_CAP]; +}; + struct ath12k_wmi_base { struct ath12k_base *ab; struct ath12k_wmi_pdev wmi[MAX_RADIOS]; @@ -5066,6 +5095,8 @@ struct ath12k_wmi_base { enum wmi_host_hw_mode_config_type preferred_hw_mode; struct ath12k_wmi_target_cap_arg *targ_cap; + + struct ath12k_svc_ext_info svc_ext_info; }; struct wmi_pdev_set_bios_interface_cmd { From 241d130f1419fd99923d99aff6e40490a4c34d93 Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Thu, 22 May 2025 16:54:11 +0800 Subject: [PATCH 201/242] wifi: ath12k: parse and save sbs_lower_band_end_freq from WMI_SERVICE_READY_EXT2_EVENTID event Firmware sends the boundary between lower and higher bands in ath12k_wmi_dbs_or_sbs_cap_params structure embedded in WMI_SERVICE_READY_EXT2_EVENTID event. The boundary is needed when updating frequency range in the following patch. So parse and save it for later use. Note ath12k_wmi_dbs_or_sbs_cap_params is placed after some other structures, so placeholders for them are added as well. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00284-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1 Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.4.1-00199-QCAHKSWPL_SILICONZ-1 Signed-off-by: Baochen Qiang Reviewed-by: Vasanthakumar Thiagarajan Link: https://patch.msgid.link/20250522-ath12k-sbs-dbs-v1-2-54a29e7a3a88@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/wmi.c | 23 +++++++++++++++++++++++ drivers/net/wireless/ath/ath12k/wmi.h | 6 ++++++ 2 files changed, 29 insertions(+) diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index edb38fbb4ed4..824c910bf1d6 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -91,6 +91,11 @@ struct ath12k_wmi_svc_rdy_ext2_parse { bool dma_ring_cap_done; bool spectral_bin_scaling_done; bool mac_phy_caps_ext_done; + bool hal_reg_caps_ext2_done; + bool scan_radio_caps_ext2_done; + bool twt_caps_done; + bool htt_msdu_idx_to_qtype_map_done; + bool dbs_or_sbs_cap_ext_done; }; struct ath12k_wmi_rdy_parse { @@ -4991,6 +4996,7 @@ static int ath12k_wmi_svc_rdy_ext2_parse(struct ath12k_base *ab, u16 tag, u16 len, const void *ptr, void *data) { + const struct ath12k_wmi_dbs_or_sbs_cap_params *dbs_or_sbs_caps; struct ath12k_wmi_pdev *wmi_handle = &ab->wmi_ab.wmi[0]; struct ath12k_wmi_svc_rdy_ext2_parse *parse = data; int ret; @@ -5032,6 +5038,23 @@ static int ath12k_wmi_svc_rdy_ext2_parse(struct ath12k_base *ab, } parse->mac_phy_caps_ext_done = true; + } else if (!parse->hal_reg_caps_ext2_done) { + parse->hal_reg_caps_ext2_done = true; + } else if (!parse->scan_radio_caps_ext2_done) { + parse->scan_radio_caps_ext2_done = true; + } else if (!parse->twt_caps_done) { + parse->twt_caps_done = true; + } else if (!parse->htt_msdu_idx_to_qtype_map_done) { + parse->htt_msdu_idx_to_qtype_map_done = true; + } else if (!parse->dbs_or_sbs_cap_ext_done) { + dbs_or_sbs_caps = ptr; + ab->wmi_ab.sbs_lower_band_end_freq = + __le32_to_cpu(dbs_or_sbs_caps->sbs_lower_band_end_freq); + + ath12k_dbg(ab, ATH12K_DBG_WMI, "sbs_lower_band_end_freq %u\n", + ab->wmi_ab.sbs_lower_band_end_freq); + + parse->dbs_or_sbs_cap_ext_done = true; } break; default: diff --git a/drivers/net/wireless/ath/ath12k/wmi.h b/drivers/net/wireless/ath/ath12k/wmi.h index 96c31b7820ec..e69d53054f6d 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.h +++ b/drivers/net/wireless/ath/ath12k/wmi.h @@ -2747,6 +2747,11 @@ struct wmi_service_ready_ext2_event { __le32 default_num_msduq_supported_per_tid; } __packed; +struct ath12k_wmi_dbs_or_sbs_cap_params { + __le32 hw_mode_id; + __le32 sbs_lower_band_end_freq; +} __packed; + struct ath12k_wmi_caps_ext_params { __le32 hw_mode_id; __le32 pdev_and_hw_link_ids; @@ -5097,6 +5102,7 @@ struct ath12k_wmi_base { struct ath12k_wmi_target_cap_arg *targ_cap; struct ath12k_svc_ext_info svc_ext_info; + u32 sbs_lower_band_end_freq; }; struct wmi_pdev_set_bios_interface_cmd { From e47b11e3bd34177af6669ef0945eb23dd4da3bcb Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Thu, 22 May 2025 16:54:12 +0800 Subject: [PATCH 202/242] wifi: ath12k: update freq range for each hardware mode Previous patches parse and save hardware MAC frequency range information in ath12k_svc_ext_info structure. Such range represents hardware capability hence needs to be updated based on host information, e.g. guard the range based on host's low/high boundary. So update frequency range. The updated range is saved in ath12k_hw_mode_info structure and would be used when doing vdev activation and link selection in following patches. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00284-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1 Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.4.1-00199-QCAHKSWPL_SILICONZ-1 Signed-off-by: Baochen Qiang Reviewed-by: Vasanthakumar Thiagarajan Link: https://patch.msgid.link/20250522-ath12k-sbs-dbs-v1-3-54a29e7a3a88@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/wmi.c | 446 ++++++++++++++++++++++++++ drivers/net/wireless/ath/ath12k/wmi.h | 27 ++ 2 files changed, 473 insertions(+) diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index 824c910bf1d6..034c18f6a2d2 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -4992,6 +4992,444 @@ static int ath12k_wmi_tlv_mac_phy_caps_ext(struct ath12k_base *ab, u16 tag, return 0; } +static void +ath12k_wmi_update_freq_info(struct ath12k_base *ab, + struct ath12k_svc_ext_mac_phy_info *mac_cap, + enum ath12k_hw_mode mode, + u32 phy_id) +{ + struct ath12k_hw_mode_info *hw_mode_info = &ab->wmi_ab.hw_mode_info; + struct ath12k_hw_mode_freq_range_arg *mac_range; + + mac_range = &hw_mode_info->freq_range_caps[mode][phy_id]; + + if (mac_cap->supported_bands & WMI_HOST_WLAN_2GHZ_CAP) { + mac_range->low_2ghz_freq = max_t(u32, + mac_cap->hw_freq_range.low_2ghz_freq, + ATH12K_MIN_2GHZ_FREQ); + mac_range->high_2ghz_freq = mac_cap->hw_freq_range.high_2ghz_freq ? + min_t(u32, + mac_cap->hw_freq_range.high_2ghz_freq, + ATH12K_MAX_2GHZ_FREQ) : + ATH12K_MAX_2GHZ_FREQ; + } + + if (mac_cap->supported_bands & WMI_HOST_WLAN_5GHZ_CAP) { + mac_range->low_5ghz_freq = max_t(u32, + mac_cap->hw_freq_range.low_5ghz_freq, + ATH12K_MIN_5GHZ_FREQ); + mac_range->high_5ghz_freq = mac_cap->hw_freq_range.high_5ghz_freq ? + min_t(u32, + mac_cap->hw_freq_range.high_5ghz_freq, + ATH12K_MAX_6GHZ_FREQ) : + ATH12K_MAX_6GHZ_FREQ; + } +} + +static bool +ath12k_wmi_all_phy_range_updated(struct ath12k_base *ab, + enum ath12k_hw_mode hwmode) +{ + struct ath12k_hw_mode_info *hw_mode_info = &ab->wmi_ab.hw_mode_info; + struct ath12k_hw_mode_freq_range_arg *mac_range; + u8 phy_id; + + for (phy_id = 0; phy_id < MAX_RADIOS; phy_id++) { + mac_range = &hw_mode_info->freq_range_caps[hwmode][phy_id]; + /* modify SBS/DBS range only when both phy for DBS are filled */ + if (!mac_range->low_2ghz_freq && !mac_range->low_5ghz_freq) + return false; + } + + return true; +} + +static void ath12k_wmi_update_dbs_freq_info(struct ath12k_base *ab) +{ + struct ath12k_hw_mode_info *hw_mode_info = &ab->wmi_ab.hw_mode_info; + struct ath12k_hw_mode_freq_range_arg *mac_range; + u8 phy_id; + + mac_range = hw_mode_info->freq_range_caps[ATH12K_HW_MODE_DBS]; + /* Reset 5 GHz range for shared mac for DBS */ + for (phy_id = 0; phy_id < MAX_RADIOS; phy_id++) { + if (mac_range[phy_id].low_2ghz_freq && + mac_range[phy_id].low_5ghz_freq) { + mac_range[phy_id].low_5ghz_freq = 0; + mac_range[phy_id].high_5ghz_freq = 0; + } + } +} + +static u32 +ath12k_wmi_get_highest_5ghz_freq_from_range(struct ath12k_hw_mode_freq_range_arg *range) +{ + u32 highest_freq = 0; + u8 phy_id; + + for (phy_id = 0; phy_id < MAX_RADIOS; phy_id++) { + if (range[phy_id].high_5ghz_freq > highest_freq) + highest_freq = range[phy_id].high_5ghz_freq; + } + + return highest_freq ? highest_freq : ATH12K_MAX_6GHZ_FREQ; +} + +static u32 +ath12k_wmi_get_lowest_5ghz_freq_from_range(struct ath12k_hw_mode_freq_range_arg *range) +{ + u32 lowest_freq = 0; + u8 phy_id; + + for (phy_id = 0; phy_id < MAX_RADIOS; phy_id++) { + if ((!lowest_freq && range[phy_id].low_5ghz_freq) || + range[phy_id].low_5ghz_freq < lowest_freq) + lowest_freq = range[phy_id].low_5ghz_freq; + } + + return lowest_freq ? lowest_freq : ATH12K_MIN_5GHZ_FREQ; +} + +static void +ath12k_wmi_fill_upper_share_sbs_freq(struct ath12k_base *ab, + u16 sbs_range_sep, + struct ath12k_hw_mode_freq_range_arg *ref_freq) +{ + struct ath12k_hw_mode_info *hw_mode_info = &ab->wmi_ab.hw_mode_info; + struct ath12k_hw_mode_freq_range_arg *upper_sbs_freq_range; + u8 phy_id; + + upper_sbs_freq_range = + hw_mode_info->freq_range_caps[ATH12K_HW_MODE_SBS_UPPER_SHARE]; + + for (phy_id = 0; phy_id < MAX_RADIOS; phy_id++) { + upper_sbs_freq_range[phy_id].low_2ghz_freq = + ref_freq[phy_id].low_2ghz_freq; + upper_sbs_freq_range[phy_id].high_2ghz_freq = + ref_freq[phy_id].high_2ghz_freq; + + /* update for shared mac */ + if (upper_sbs_freq_range[phy_id].low_2ghz_freq) { + upper_sbs_freq_range[phy_id].low_5ghz_freq = sbs_range_sep + 10; + upper_sbs_freq_range[phy_id].high_5ghz_freq = + ath12k_wmi_get_highest_5ghz_freq_from_range(ref_freq); + } else { + upper_sbs_freq_range[phy_id].low_5ghz_freq = + ath12k_wmi_get_lowest_5ghz_freq_from_range(ref_freq); + upper_sbs_freq_range[phy_id].high_5ghz_freq = sbs_range_sep; + } + } +} + +static void +ath12k_wmi_fill_lower_share_sbs_freq(struct ath12k_base *ab, + u16 sbs_range_sep, + struct ath12k_hw_mode_freq_range_arg *ref_freq) +{ + struct ath12k_hw_mode_info *hw_mode_info = &ab->wmi_ab.hw_mode_info; + struct ath12k_hw_mode_freq_range_arg *lower_sbs_freq_range; + u8 phy_id; + + lower_sbs_freq_range = + hw_mode_info->freq_range_caps[ATH12K_HW_MODE_SBS_LOWER_SHARE]; + + for (phy_id = 0; phy_id < MAX_RADIOS; phy_id++) { + lower_sbs_freq_range[phy_id].low_2ghz_freq = + ref_freq[phy_id].low_2ghz_freq; + lower_sbs_freq_range[phy_id].high_2ghz_freq = + ref_freq[phy_id].high_2ghz_freq; + + /* update for shared mac */ + if (lower_sbs_freq_range[phy_id].low_2ghz_freq) { + lower_sbs_freq_range[phy_id].low_5ghz_freq = + ath12k_wmi_get_lowest_5ghz_freq_from_range(ref_freq); + lower_sbs_freq_range[phy_id].high_5ghz_freq = sbs_range_sep; + } else { + lower_sbs_freq_range[phy_id].low_5ghz_freq = sbs_range_sep + 10; + lower_sbs_freq_range[phy_id].high_5ghz_freq = + ath12k_wmi_get_highest_5ghz_freq_from_range(ref_freq); + } + } +} + +static const char *ath12k_wmi_hw_mode_to_str(enum ath12k_hw_mode hw_mode) +{ + static const char * const mode_str[] = { + [ATH12K_HW_MODE_SMM] = "SMM", + [ATH12K_HW_MODE_DBS] = "DBS", + [ATH12K_HW_MODE_SBS] = "SBS", + [ATH12K_HW_MODE_SBS_UPPER_SHARE] = "SBS_UPPER_SHARE", + [ATH12K_HW_MODE_SBS_LOWER_SHARE] = "SBS_LOWER_SHARE", + }; + + if (hw_mode >= ARRAY_SIZE(mode_str)) + return "Unknown"; + + return mode_str[hw_mode]; +} + +static void +ath12k_wmi_dump_freq_range_per_mac(struct ath12k_base *ab, + struct ath12k_hw_mode_freq_range_arg *freq_range, + enum ath12k_hw_mode hw_mode) +{ + u8 i; + + for (i = 0; i < MAX_RADIOS; i++) + if (freq_range[i].low_2ghz_freq || freq_range[i].low_5ghz_freq) + ath12k_dbg(ab, ATH12K_DBG_WMI, + "frequency range: %s(%d) mac %d 2 GHz [%d - %d] 5 GHz [%d - %d]", + ath12k_wmi_hw_mode_to_str(hw_mode), + hw_mode, i, + freq_range[i].low_2ghz_freq, + freq_range[i].high_2ghz_freq, + freq_range[i].low_5ghz_freq, + freq_range[i].high_5ghz_freq); +} + +static void ath12k_wmi_dump_freq_range(struct ath12k_base *ab) +{ + struct ath12k_hw_mode_freq_range_arg *freq_range; + u8 i; + + for (i = ATH12K_HW_MODE_SMM; i < ATH12K_HW_MODE_MAX; i++) { + freq_range = ab->wmi_ab.hw_mode_info.freq_range_caps[i]; + ath12k_wmi_dump_freq_range_per_mac(ab, freq_range, i); + } +} + +static int ath12k_wmi_modify_sbs_freq(struct ath12k_base *ab, u8 phy_id) +{ + struct ath12k_hw_mode_info *hw_mode_info = &ab->wmi_ab.hw_mode_info; + struct ath12k_hw_mode_freq_range_arg *sbs_mac_range, *shared_mac_range; + struct ath12k_hw_mode_freq_range_arg *non_shared_range; + u8 shared_phy_id; + + sbs_mac_range = &hw_mode_info->freq_range_caps[ATH12K_HW_MODE_SBS][phy_id]; + + /* if SBS mac range has both 2.4 and 5 GHz ranges, i.e. shared phy_id + * keep the range as it is in SBS + */ + if (sbs_mac_range->low_2ghz_freq && sbs_mac_range->low_5ghz_freq) + return 0; + + if (sbs_mac_range->low_2ghz_freq && !sbs_mac_range->low_5ghz_freq) { + ath12k_err(ab, "Invalid DBS/SBS mode with only 2.4Ghz"); + ath12k_wmi_dump_freq_range_per_mac(ab, sbs_mac_range, ATH12K_HW_MODE_SBS); + return -EINVAL; + } + + non_shared_range = sbs_mac_range; + /* if SBS mac range has only 5 GHz then it's the non-shared phy, so + * modify the range as per the shared mac. + */ + shared_phy_id = phy_id ? 0 : 1; + shared_mac_range = + &hw_mode_info->freq_range_caps[ATH12K_HW_MODE_SBS][shared_phy_id]; + + if (shared_mac_range->low_5ghz_freq > non_shared_range->low_5ghz_freq) { + ath12k_dbg(ab, ATH12K_DBG_WMI, "high 5 GHz shared"); + /* If the shared mac lower 5 GHz frequency is greater than + * non-shared mac lower 5 GHz frequency then the shared mac has + * high 5 GHz shared with 2.4 GHz. So non-shared mac's 5 GHz high + * freq should be less than the shared mac's low 5 GHz freq. + */ + if (non_shared_range->high_5ghz_freq >= + shared_mac_range->low_5ghz_freq) + non_shared_range->high_5ghz_freq = + max_t(u32, shared_mac_range->low_5ghz_freq - 10, + non_shared_range->low_5ghz_freq); + } else if (shared_mac_range->high_5ghz_freq < + non_shared_range->high_5ghz_freq) { + ath12k_dbg(ab, ATH12K_DBG_WMI, "low 5 GHz shared"); + /* If the shared mac high 5 GHz frequency is less than + * non-shared mac high 5 GHz frequency then the shared mac has + * low 5 GHz shared with 2.4 GHz. So non-shared mac's 5 GHz low + * freq should be greater than the shared mac's high 5 GHz freq. + */ + if (shared_mac_range->high_5ghz_freq >= + non_shared_range->low_5ghz_freq) + non_shared_range->low_5ghz_freq = + min_t(u32, shared_mac_range->high_5ghz_freq + 10, + non_shared_range->high_5ghz_freq); + } else { + ath12k_warn(ab, "invalid SBS range with all 5 GHz shared"); + return -EINVAL; + } + + return 0; +} + +static void ath12k_wmi_update_sbs_freq_info(struct ath12k_base *ab) +{ + struct ath12k_hw_mode_info *hw_mode_info = &ab->wmi_ab.hw_mode_info; + struct ath12k_hw_mode_freq_range_arg *mac_range; + u16 sbs_range_sep; + u8 phy_id; + int ret; + + mac_range = hw_mode_info->freq_range_caps[ATH12K_HW_MODE_SBS]; + + /* If sbs_lower_band_end_freq has a value, then the frequency range + * will be split using that value. + */ + sbs_range_sep = ab->wmi_ab.sbs_lower_band_end_freq; + if (sbs_range_sep) { + ath12k_wmi_fill_upper_share_sbs_freq(ab, sbs_range_sep, + mac_range); + ath12k_wmi_fill_lower_share_sbs_freq(ab, sbs_range_sep, + mac_range); + /* Hardware specifies the range boundary with sbs_range_sep, + * (i.e. the boundary between 5 GHz high and 5 GHz low), + * reset the original one to make sure it will not get used. + */ + memset(mac_range, 0, sizeof(*mac_range) * MAX_RADIOS); + return; + } + + /* If sbs_lower_band_end_freq is not set that means firmware will send one + * shared mac range and one non-shared mac range. so update that freq. + */ + for (phy_id = 0; phy_id < MAX_RADIOS; phy_id++) { + ret = ath12k_wmi_modify_sbs_freq(ab, phy_id); + if (ret) { + memset(mac_range, 0, sizeof(*mac_range) * MAX_RADIOS); + break; + } + } +} + +static void +ath12k_wmi_update_mac_freq_info(struct ath12k_base *ab, + enum wmi_host_hw_mode_config_type hw_config_type, + u32 phy_id, + struct ath12k_svc_ext_mac_phy_info *mac_cap) +{ + if (phy_id >= MAX_RADIOS) { + ath12k_err(ab, "mac more than two not supported: %d", phy_id); + return; + } + + ath12k_dbg(ab, ATH12K_DBG_WMI, + "hw_mode_cfg %d mac %d band 0x%x SBS cutoff freq %d 2 GHz [%d - %d] 5 GHz [%d - %d]", + hw_config_type, phy_id, mac_cap->supported_bands, + ab->wmi_ab.sbs_lower_band_end_freq, + mac_cap->hw_freq_range.low_2ghz_freq, + mac_cap->hw_freq_range.high_2ghz_freq, + mac_cap->hw_freq_range.low_5ghz_freq, + mac_cap->hw_freq_range.high_5ghz_freq); + + switch (hw_config_type) { + case WMI_HOST_HW_MODE_SINGLE: + if (phy_id) { + ath12k_dbg(ab, ATH12K_DBG_WMI, "mac phy 1 is not supported"); + break; + } + ath12k_wmi_update_freq_info(ab, mac_cap, ATH12K_HW_MODE_SMM, phy_id); + break; + + case WMI_HOST_HW_MODE_DBS: + if (!ath12k_wmi_all_phy_range_updated(ab, ATH12K_HW_MODE_DBS)) + ath12k_wmi_update_freq_info(ab, mac_cap, + ATH12K_HW_MODE_DBS, phy_id); + break; + case WMI_HOST_HW_MODE_DBS_SBS: + case WMI_HOST_HW_MODE_DBS_OR_SBS: + ath12k_wmi_update_freq_info(ab, mac_cap, ATH12K_HW_MODE_DBS, phy_id); + if (ab->wmi_ab.sbs_lower_band_end_freq || + mac_cap->hw_freq_range.low_5ghz_freq || + mac_cap->hw_freq_range.low_2ghz_freq) + ath12k_wmi_update_freq_info(ab, mac_cap, ATH12K_HW_MODE_SBS, + phy_id); + + if (ath12k_wmi_all_phy_range_updated(ab, ATH12K_HW_MODE_DBS)) + ath12k_wmi_update_dbs_freq_info(ab); + if (ath12k_wmi_all_phy_range_updated(ab, ATH12K_HW_MODE_SBS)) + ath12k_wmi_update_sbs_freq_info(ab); + break; + case WMI_HOST_HW_MODE_SBS: + case WMI_HOST_HW_MODE_SBS_PASSIVE: + ath12k_wmi_update_freq_info(ab, mac_cap, ATH12K_HW_MODE_SBS, phy_id); + if (ath12k_wmi_all_phy_range_updated(ab, ATH12K_HW_MODE_SBS)) + ath12k_wmi_update_sbs_freq_info(ab); + + break; + default: + break; + } +} + +static bool ath12k_wmi_sbs_range_present(struct ath12k_base *ab) +{ + if (ath12k_wmi_all_phy_range_updated(ab, ATH12K_HW_MODE_SBS) || + (ab->wmi_ab.sbs_lower_band_end_freq && + ath12k_wmi_all_phy_range_updated(ab, ATH12K_HW_MODE_SBS_LOWER_SHARE) && + ath12k_wmi_all_phy_range_updated(ab, ATH12K_HW_MODE_SBS_UPPER_SHARE))) + return true; + + return false; +} + +static int ath12k_wmi_update_hw_mode_list(struct ath12k_base *ab) +{ + struct ath12k_svc_ext_info *svc_ext_info = &ab->wmi_ab.svc_ext_info; + struct ath12k_hw_mode_info *info = &ab->wmi_ab.hw_mode_info; + enum wmi_host_hw_mode_config_type hw_config_type; + struct ath12k_svc_ext_mac_phy_info *tmp; + bool dbs_mode = false, sbs_mode = false; + u32 i, j = 0; + + if (!svc_ext_info->num_hw_modes) { + ath12k_err(ab, "invalid number of hw modes"); + return -EINVAL; + } + + ath12k_dbg(ab, ATH12K_DBG_WMI, "updated HW mode list: num modes %d", + svc_ext_info->num_hw_modes); + + memset(info->freq_range_caps, 0, sizeof(info->freq_range_caps)); + + for (i = 0; i < svc_ext_info->num_hw_modes; i++) { + if (j >= ATH12K_MAX_MAC_PHY_CAP) + return -EINVAL; + + /* Update for MAC0 */ + tmp = &svc_ext_info->mac_phy_info[j++]; + hw_config_type = tmp->hw_mode_config_type; + ath12k_wmi_update_mac_freq_info(ab, hw_config_type, tmp->phy_id, tmp); + + /* SBS and DBS have dual MAC. Up to 2 MACs are considered. */ + if (hw_config_type == WMI_HOST_HW_MODE_DBS || + hw_config_type == WMI_HOST_HW_MODE_SBS_PASSIVE || + hw_config_type == WMI_HOST_HW_MODE_SBS || + hw_config_type == WMI_HOST_HW_MODE_DBS_OR_SBS) { + if (j >= ATH12K_MAX_MAC_PHY_CAP) + return -EINVAL; + /* Update for MAC1 */ + tmp = &svc_ext_info->mac_phy_info[j++]; + ath12k_wmi_update_mac_freq_info(ab, hw_config_type, + tmp->phy_id, tmp); + + if (hw_config_type == WMI_HOST_HW_MODE_DBS || + hw_config_type == WMI_HOST_HW_MODE_DBS_OR_SBS) + dbs_mode = true; + + if (ath12k_wmi_sbs_range_present(ab) && + (hw_config_type == WMI_HOST_HW_MODE_SBS_PASSIVE || + hw_config_type == WMI_HOST_HW_MODE_SBS || + hw_config_type == WMI_HOST_HW_MODE_DBS_OR_SBS)) + sbs_mode = true; + } + } + + info->support_dbs = dbs_mode; + info->support_sbs = sbs_mode; + + ath12k_wmi_dump_freq_range(ab); + + return 0; +} + static int ath12k_wmi_svc_rdy_ext2_parse(struct ath12k_base *ab, u16 tag, u16 len, const void *ptr, void *data) @@ -5054,8 +5492,16 @@ static int ath12k_wmi_svc_rdy_ext2_parse(struct ath12k_base *ab, ath12k_dbg(ab, ATH12K_DBG_WMI, "sbs_lower_band_end_freq %u\n", ab->wmi_ab.sbs_lower_band_end_freq); + ret = ath12k_wmi_update_hw_mode_list(ab); + if (ret) { + ath12k_warn(ab, "failed to update hw mode list: %d\n", + ret); + return ret; + } + parse->dbs_or_sbs_cap_ext_done = true; } + break; default: break; diff --git a/drivers/net/wireless/ath/ath12k/wmi.h b/drivers/net/wireless/ath/ath12k/wmi.h index e69d53054f6d..f2a04a7bd91a 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.h +++ b/drivers/net/wireless/ath/ath12k/wmi.h @@ -5083,6 +5083,32 @@ struct ath12k_svc_ext_info { struct ath12k_svc_ext_mac_phy_info mac_phy_info[ATH12K_MAX_MAC_PHY_CAP]; }; +/** + * enum ath12k_hw_mode - enum for host mode + * @ATH12K_HW_MODE_SMM: Single mac mode + * @ATH12K_HW_MODE_DBS: DBS mode + * @ATH12K_HW_MODE_SBS: SBS mode with either high share or low share + * @ATH12K_HW_MODE_SBS_UPPER_SHARE: Higher 5 GHz shared with 2.4 GHz + * @ATH12K_HW_MODE_SBS_LOWER_SHARE: Lower 5 GHz shared with 2.4 GHz + * @ATH12K_HW_MODE_MAX: Max, used to indicate invalid mode + */ +enum ath12k_hw_mode { + ATH12K_HW_MODE_SMM, + ATH12K_HW_MODE_DBS, + ATH12K_HW_MODE_SBS, + ATH12K_HW_MODE_SBS_UPPER_SHARE, + ATH12K_HW_MODE_SBS_LOWER_SHARE, + ATH12K_HW_MODE_MAX, +}; + +struct ath12k_hw_mode_info { + bool support_dbs:1; + bool support_sbs:1; + + struct ath12k_hw_mode_freq_range_arg freq_range_caps[ATH12K_HW_MODE_MAX] + [MAX_RADIOS]; +}; + struct ath12k_wmi_base { struct ath12k_base *ab; struct ath12k_wmi_pdev wmi[MAX_RADIOS]; @@ -5103,6 +5129,7 @@ struct ath12k_wmi_base { struct ath12k_svc_ext_info svc_ext_info; u32 sbs_lower_band_end_freq; + struct ath12k_hw_mode_info hw_mode_info; }; struct wmi_pdev_set_bios_interface_cmd { From c36f2cd628f9df6ffb19c23eedaa666440b8f5c5 Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Thu, 22 May 2025 16:54:13 +0800 Subject: [PATCH 203/242] wifi: ath12k: support WMI_MLO_LINK_SET_ACTIVE_CMDID command Add WMI_MLO_LINK_SET_ACTIVE_CMDID command. This command allows host to send required link information to firmware such that firmware can make decision on activating/deactivating links in various scenarios. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00284-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1 Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.4.1-00199-QCAHKSWPL_SILICONZ-1 Signed-off-by: Baochen Qiang Reviewed-by: Vasanthakumar Thiagarajan Link: https://patch.msgid.link/20250522-ath12k-sbs-dbs-v1-4-54a29e7a3a88@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/wmi.c | 221 ++++++++++++++++++++++++++ drivers/net/wireless/ath/ath12k/wmi.h | 116 +++++++++++++- 2 files changed, 336 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index 034c18f6a2d2..6dbd272db1d5 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -10498,3 +10498,224 @@ int ath12k_wmi_send_vdev_set_tpc_power(struct ath12k *ar, return 0; } + +static int +ath12k_wmi_fill_disallowed_bmap(struct ath12k_base *ab, + struct wmi_disallowed_mlo_mode_bitmap_params *dislw_bmap, + struct wmi_mlo_link_set_active_arg *arg) +{ + struct wmi_ml_disallow_mode_bmap_arg *dislw_bmap_arg; + u8 i; + + if (arg->num_disallow_mode_comb > + ARRAY_SIZE(arg->disallow_bmap)) { + ath12k_warn(ab, "invalid num_disallow_mode_comb: %d", + arg->num_disallow_mode_comb); + return -EINVAL; + } + + dislw_bmap_arg = &arg->disallow_bmap[0]; + for (i = 0; i < arg->num_disallow_mode_comb; i++) { + dislw_bmap->tlv_header = + ath12k_wmi_tlv_cmd_hdr(0, sizeof(*dislw_bmap)); + dislw_bmap->disallowed_mode_bitmap = + cpu_to_le32(dislw_bmap_arg->disallowed_mode); + dislw_bmap->ieee_link_id_comb = + le32_encode_bits(dislw_bmap_arg->ieee_link_id[0], + WMI_DISALW_MLO_MODE_BMAP_IEEE_LINK_ID_COMB_1) | + le32_encode_bits(dislw_bmap_arg->ieee_link_id[1], + WMI_DISALW_MLO_MODE_BMAP_IEEE_LINK_ID_COMB_2) | + le32_encode_bits(dislw_bmap_arg->ieee_link_id[2], + WMI_DISALW_MLO_MODE_BMAP_IEEE_LINK_ID_COMB_3) | + le32_encode_bits(dislw_bmap_arg->ieee_link_id[3], + WMI_DISALW_MLO_MODE_BMAP_IEEE_LINK_ID_COMB_4); + + ath12k_dbg(ab, ATH12K_DBG_WMI, + "entry %d disallowed_mode %d ieee_link_id_comb 0x%x", + i, dislw_bmap_arg->disallowed_mode, + dislw_bmap_arg->ieee_link_id_comb); + dislw_bmap++; + dislw_bmap_arg++; + } + + return 0; +} + +int ath12k_wmi_send_mlo_link_set_active_cmd(struct ath12k_base *ab, + struct wmi_mlo_link_set_active_arg *arg) +{ + struct wmi_disallowed_mlo_mode_bitmap_params *disallowed_mode_bmap; + struct wmi_mlo_set_active_link_number_params *link_num_param; + u32 num_link_num_param = 0, num_vdev_bitmap = 0; + struct ath12k_wmi_base *wmi_ab = &ab->wmi_ab; + struct wmi_mlo_link_set_active_cmd *cmd; + u32 num_inactive_vdev_bitmap = 0; + u32 num_disallow_mode_comb = 0; + struct wmi_tlv *tlv; + struct sk_buff *skb; + __le32 *vdev_bitmap; + void *buf_ptr; + int i, ret; + u32 len; + + if (!arg->num_vdev_bitmap && !arg->num_link_entry) { + ath12k_warn(ab, "Invalid num_vdev_bitmap and num_link_entry"); + return -EINVAL; + } + + switch (arg->force_mode) { + case WMI_MLO_LINK_FORCE_MODE_ACTIVE_LINK_NUM: + case WMI_MLO_LINK_FORCE_MODE_INACTIVE_LINK_NUM: + num_link_num_param = arg->num_link_entry; + fallthrough; + case WMI_MLO_LINK_FORCE_MODE_ACTIVE: + case WMI_MLO_LINK_FORCE_MODE_INACTIVE: + case WMI_MLO_LINK_FORCE_MODE_NO_FORCE: + num_vdev_bitmap = arg->num_vdev_bitmap; + break; + case WMI_MLO_LINK_FORCE_MODE_ACTIVE_INACTIVE: + num_vdev_bitmap = arg->num_vdev_bitmap; + num_inactive_vdev_bitmap = arg->num_inactive_vdev_bitmap; + break; + default: + ath12k_warn(ab, "Invalid force mode: %u", arg->force_mode); + return -EINVAL; + } + + num_disallow_mode_comb = arg->num_disallow_mode_comb; + len = sizeof(*cmd) + + TLV_HDR_SIZE + sizeof(*link_num_param) * num_link_num_param + + TLV_HDR_SIZE + sizeof(*vdev_bitmap) * num_vdev_bitmap + + TLV_HDR_SIZE + TLV_HDR_SIZE + TLV_HDR_SIZE + + TLV_HDR_SIZE + sizeof(*disallowed_mode_bmap) * num_disallow_mode_comb; + if (arg->force_mode == WMI_MLO_LINK_FORCE_MODE_ACTIVE_INACTIVE) + len += sizeof(*vdev_bitmap) * num_inactive_vdev_bitmap; + + skb = ath12k_wmi_alloc_skb(wmi_ab, len); + if (!skb) + return -ENOMEM; + + cmd = (struct wmi_mlo_link_set_active_cmd *)skb->data; + cmd->tlv_header = ath12k_wmi_tlv_cmd_hdr(WMI_TAG_MLO_LINK_SET_ACTIVE_CMD, + sizeof(*cmd)); + cmd->force_mode = cpu_to_le32(arg->force_mode); + cmd->reason = cpu_to_le32(arg->reason); + ath12k_dbg(ab, ATH12K_DBG_WMI, + "mode %d reason %d num_link_num_param %d num_vdev_bitmap %d inactive %d num_disallow_mode_comb %d", + arg->force_mode, arg->reason, num_link_num_param, + num_vdev_bitmap, num_inactive_vdev_bitmap, + num_disallow_mode_comb); + + buf_ptr = skb->data + sizeof(*cmd); + tlv = buf_ptr; + tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_STRUCT, + sizeof(*link_num_param) * num_link_num_param); + buf_ptr += TLV_HDR_SIZE; + + if (num_link_num_param) { + cmd->ctrl_flags = + le32_encode_bits(arg->ctrl_flags.dync_force_link_num ? 1 : 0, + CRTL_F_DYNC_FORCE_LINK_NUM); + + link_num_param = buf_ptr; + for (i = 0; i < num_link_num_param; i++) { + link_num_param->tlv_header = + ath12k_wmi_tlv_cmd_hdr(0, sizeof(*link_num_param)); + link_num_param->num_of_link = + cpu_to_le32(arg->link_num[i].num_of_link); + link_num_param->vdev_type = + cpu_to_le32(arg->link_num[i].vdev_type); + link_num_param->vdev_subtype = + cpu_to_le32(arg->link_num[i].vdev_subtype); + link_num_param->home_freq = + cpu_to_le32(arg->link_num[i].home_freq); + ath12k_dbg(ab, ATH12K_DBG_WMI, + "entry %d num_of_link %d vdev type %d subtype %d freq %d control_flags %d", + i, arg->link_num[i].num_of_link, + arg->link_num[i].vdev_type, + arg->link_num[i].vdev_subtype, + arg->link_num[i].home_freq, + __le32_to_cpu(cmd->ctrl_flags)); + link_num_param++; + } + + buf_ptr += sizeof(*link_num_param) * num_link_num_param; + } + + tlv = buf_ptr; + tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_UINT32, + sizeof(*vdev_bitmap) * num_vdev_bitmap); + buf_ptr += TLV_HDR_SIZE; + + if (num_vdev_bitmap) { + vdev_bitmap = buf_ptr; + for (i = 0; i < num_vdev_bitmap; i++) { + vdev_bitmap[i] = cpu_to_le32(arg->vdev_bitmap[i]); + ath12k_dbg(ab, ATH12K_DBG_WMI, "entry %d vdev_id_bitmap 0x%x", + i, arg->vdev_bitmap[i]); + } + + buf_ptr += sizeof(*vdev_bitmap) * num_vdev_bitmap; + } + + if (arg->force_mode == WMI_MLO_LINK_FORCE_MODE_ACTIVE_INACTIVE) { + tlv = buf_ptr; + tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_UINT32, + sizeof(*vdev_bitmap) * + num_inactive_vdev_bitmap); + buf_ptr += TLV_HDR_SIZE; + + if (num_inactive_vdev_bitmap) { + vdev_bitmap = buf_ptr; + for (i = 0; i < num_inactive_vdev_bitmap; i++) { + vdev_bitmap[i] = + cpu_to_le32(arg->inactive_vdev_bitmap[i]); + ath12k_dbg(ab, ATH12K_DBG_WMI, + "entry %d inactive_vdev_id_bitmap 0x%x", + i, arg->inactive_vdev_bitmap[i]); + } + + buf_ptr += sizeof(*vdev_bitmap) * num_inactive_vdev_bitmap; + } + } else { + /* add empty vdev bitmap2 tlv */ + tlv = buf_ptr; + tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_UINT32, 0); + buf_ptr += TLV_HDR_SIZE; + } + + /* add empty ieee_link_id_bitmap tlv */ + tlv = buf_ptr; + tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_UINT32, 0); + buf_ptr += TLV_HDR_SIZE; + + /* add empty ieee_link_id_bitmap2 tlv */ + tlv = buf_ptr; + tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_UINT32, 0); + buf_ptr += TLV_HDR_SIZE; + + tlv = buf_ptr; + tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_STRUCT, + sizeof(*disallowed_mode_bmap) * + arg->num_disallow_mode_comb); + buf_ptr += TLV_HDR_SIZE; + + ret = ath12k_wmi_fill_disallowed_bmap(ab, buf_ptr, arg); + if (ret) + goto free_skb; + + ret = ath12k_wmi_cmd_send(&wmi_ab->wmi[0], skb, WMI_MLO_LINK_SET_ACTIVE_CMDID); + if (ret) { + ath12k_warn(ab, + "failed to send WMI_MLO_LINK_SET_ACTIVE_CMDID: %d\n", ret); + goto free_skb; + } + + ath12k_dbg(ab, ATH12K_DBG_WMI, "WMI mlo link set active cmd"); + + return ret; + +free_skb: + dev_kfree_skb(skb); + return ret; +} diff --git a/drivers/net/wireless/ath/ath12k/wmi.h b/drivers/net/wireless/ath/ath12k/wmi.h index f2a04a7bd91a..c640ffa180c8 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.h +++ b/drivers/net/wireless/ath/ath12k/wmi.h @@ -1974,6 +1974,7 @@ enum wmi_tlv_tag { WMI_TAG_TPC_STATS_CTL_PWR_TABLE_EVENT, WMI_TAG_VDEV_SET_TPC_POWER_CMD = 0x3B5, WMI_TAG_VDEV_CH_POWER_INFO, + WMI_TAG_MLO_LINK_SET_ACTIVE_CMD = 0x3BE, WMI_TAG_EHT_RATE_SET = 0x3C4, WMI_TAG_DCS_AWGN_INT_TYPE = 0x3C5, WMI_TAG_MLO_TX_SEND_PARAMS, @@ -6061,6 +6062,118 @@ struct wmi_vdev_set_tpc_power_cmd { */ } __packed; +#define CRTL_F_DYNC_FORCE_LINK_NUM GENMASK(3, 2) + +struct wmi_mlo_link_set_active_cmd { + __le32 tlv_header; + __le32 force_mode; + __le32 reason; + __le32 use_ieee_link_id_bitmap; + struct ath12k_wmi_mac_addr_params ap_mld_mac_addr; + __le32 ctrl_flags; +} __packed; + +struct wmi_mlo_set_active_link_number_params { + __le32 tlv_header; + __le32 num_of_link; + __le32 vdev_type; + __le32 vdev_subtype; + __le32 home_freq; +} __packed; + +#define WMI_DISALW_MLO_MODE_BMAP_IEEE_LINK_ID_COMB_1 GENMASK(7, 0) +#define WMI_DISALW_MLO_MODE_BMAP_IEEE_LINK_ID_COMB_2 GENMASK(15, 8) +#define WMI_DISALW_MLO_MODE_BMAP_IEEE_LINK_ID_COMB_3 GENMASK(23, 16) +#define WMI_DISALW_MLO_MODE_BMAP_IEEE_LINK_ID_COMB_4 GENMASK(31, 24) + +struct wmi_disallowed_mlo_mode_bitmap_params { + __le32 tlv_header; + __le32 disallowed_mode_bitmap; + __le32 ieee_link_id_comb; +} __packed; + +enum wmi_mlo_link_force_mode { + WMI_MLO_LINK_FORCE_MODE_ACTIVE = 1, + WMI_MLO_LINK_FORCE_MODE_INACTIVE = 2, + WMI_MLO_LINK_FORCE_MODE_ACTIVE_LINK_NUM = 3, + WMI_MLO_LINK_FORCE_MODE_INACTIVE_LINK_NUM = 4, + WMI_MLO_LINK_FORCE_MODE_NO_FORCE = 5, + WMI_MLO_LINK_FORCE_MODE_ACTIVE_INACTIVE = 6, + WMI_MLO_LINK_FORCE_MODE_NON_FORCE_UPDATE = 7, +}; + +enum wmi_mlo_link_force_reason { + WMI_MLO_LINK_FORCE_REASON_NEW_CONNECT = 1, + WMI_MLO_LINK_FORCE_REASON_NEW_DISCONNECT = 2, + WMI_MLO_LINK_FORCE_REASON_LINK_REMOVAL = 3, + WMI_MLO_LINK_FORCE_REASON_TDLS = 4, + WMI_MLO_LINK_FORCE_REASON_REVERT_FAILURE = 5, + WMI_MLO_LINK_FORCE_REASON_LINK_DELETE = 6, + WMI_MLO_LINK_FORCE_REASON_SINGLE_LINK_EMLSR_OP = 7, +}; + +struct wmi_mlo_link_num_arg { + u32 num_of_link; + u32 vdev_type; + u32 vdev_subtype; + u32 home_freq; +}; + +struct wmi_mlo_control_flags_arg { + bool overwrite_force_active_bitmap; + bool overwrite_force_inactive_bitmap; + bool dync_force_link_num; + bool post_re_evaluate; + u8 post_re_evaluate_loops; + bool dont_reschedule_workqueue; +}; + +struct wmi_ml_link_force_cmd_arg { + u8 ap_mld_mac_addr[ETH_ALEN]; + u16 ieee_link_id_bitmap; + u16 ieee_link_id_bitmap2; + u8 link_num; +}; + +struct wmi_ml_disallow_mode_bmap_arg { + u32 disallowed_mode; + union { + u32 ieee_link_id_comb; + u8 ieee_link_id[4]; + }; +}; + +/* maximum size of link number param array + * for MLO link set active command + */ +#define WMI_MLO_LINK_NUM_SZ 2 + +/* maximum size of vdev bitmap array for + * MLO link set active command + */ +#define WMI_MLO_VDEV_BITMAP_SZ 2 + +/* Max number of disallowed bitmap combination + * sent to firmware + */ +#define WMI_ML_MAX_DISALLOW_BMAP_COMB 4 + +struct wmi_mlo_link_set_active_arg { + enum wmi_mlo_link_force_mode force_mode; + enum wmi_mlo_link_force_reason reason; + u32 num_link_entry; + u32 num_vdev_bitmap; + u32 num_inactive_vdev_bitmap; + struct wmi_mlo_link_num_arg link_num[WMI_MLO_LINK_NUM_SZ]; + u32 vdev_bitmap[WMI_MLO_VDEV_BITMAP_SZ]; + u32 inactive_vdev_bitmap[WMI_MLO_VDEV_BITMAP_SZ]; + struct wmi_mlo_control_flags_arg ctrl_flags; + bool use_ieee_link_id; + struct wmi_ml_link_force_cmd_arg force_cmd; + u32 num_disallow_mode_comb; + struct wmi_ml_disallow_mode_bmap_arg disallow_bmap[WMI_ML_MAX_DISALLOW_BMAP_COMB]; +}; + void ath12k_wmi_init_qcn9274(struct ath12k_base *ab, struct ath12k_wmi_resource_config_arg *config); void ath12k_wmi_init_wcn7850(struct ath12k_base *ab, @@ -6259,5 +6372,6 @@ bool ath12k_wmi_supports_6ghz_cc_ext(struct ath12k *ar); int ath12k_wmi_send_vdev_set_tpc_power(struct ath12k *ar, u32 vdev_id, struct ath12k_reg_tpc_power_info *param); - +int ath12k_wmi_send_mlo_link_set_active_cmd(struct ath12k_base *ab, + struct wmi_mlo_link_set_active_arg *param); #endif From 2296038fd35a6e57af77496affe9044293ed2947 Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Thu, 22 May 2025 16:54:14 +0800 Subject: [PATCH 204/242] wifi: ath12k: update link active in case two links fall on the same MAC In case of two links established on the same device in an ML connection, depending on device's hardware mode capability, it is possible that both links fall on the same MAC. Currently, no specific action is taken to address this but just keep both links active. However this would result in lower throughput compared to even one link, because switching between these two links on the resulted MAC significantly impacts throughput. Check if both links fall in the frequency range of a single MAC. If so, send WMI_MLO_LINK_SET_ACTIVE_CMDID command to firmware such that firmware can deactivate one of them. Note the decision of which link getting deactivated is made by firmware, host only sends the vdev lists. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00284-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1 Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.4.1-00199-QCAHKSWPL_SILICONZ-1 Signed-off-by: Baochen Qiang Reviewed-by: Vasanthakumar Thiagarajan Link: https://patch.msgid.link/20250522-ath12k-sbs-dbs-v1-5-54a29e7a3a88@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/mac.c | 227 +++++++++++++++++++++++++- drivers/net/wireless/ath/ath12k/mac.h | 2 + 2 files changed, 228 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 88b59f3ff87a..2fae4b01ec90 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -5890,6 +5890,211 @@ static int ath12k_mac_handle_link_sta_state(struct ieee80211_hw *hw, return ret; } +static bool ath12k_mac_is_freq_on_mac(struct ath12k_hw_mode_freq_range_arg *freq_range, + u32 freq, u8 mac_id) +{ + return (freq >= freq_range[mac_id].low_2ghz_freq && + freq <= freq_range[mac_id].high_2ghz_freq) || + (freq >= freq_range[mac_id].low_5ghz_freq && + freq <= freq_range[mac_id].high_5ghz_freq); +} + +static bool +ath12k_mac_2_freq_same_mac_in_freq_range(struct ath12k_base *ab, + struct ath12k_hw_mode_freq_range_arg *freq_range, + u32 freq_link1, u32 freq_link2) +{ + u8 i; + + for (i = 0; i < MAX_RADIOS; i++) { + if (ath12k_mac_is_freq_on_mac(freq_range, freq_link1, i) && + ath12k_mac_is_freq_on_mac(freq_range, freq_link2, i)) + return true; + } + + return false; +} + +static bool ath12k_mac_is_hw_dbs_capable(struct ath12k_base *ab) +{ + return test_bit(WMI_TLV_SERVICE_DUAL_BAND_SIMULTANEOUS_SUPPORT, + ab->wmi_ab.svc_map) && + ab->wmi_ab.hw_mode_info.support_dbs; +} + +static bool ath12k_mac_2_freq_same_mac_in_dbs(struct ath12k_base *ab, + u32 freq_link1, u32 freq_link2) +{ + struct ath12k_hw_mode_freq_range_arg *freq_range; + + if (!ath12k_mac_is_hw_dbs_capable(ab)) + return true; + + freq_range = ab->wmi_ab.hw_mode_info.freq_range_caps[ATH12K_HW_MODE_DBS]; + return ath12k_mac_2_freq_same_mac_in_freq_range(ab, freq_range, + freq_link1, freq_link2); +} + +static bool ath12k_mac_is_hw_sbs_capable(struct ath12k_base *ab) +{ + return test_bit(WMI_TLV_SERVICE_DUAL_BAND_SIMULTANEOUS_SUPPORT, + ab->wmi_ab.svc_map) && + ab->wmi_ab.hw_mode_info.support_sbs; +} + +static bool ath12k_mac_2_freq_same_mac_in_sbs(struct ath12k_base *ab, + u32 freq_link1, u32 freq_link2) +{ + struct ath12k_hw_mode_info *info = &ab->wmi_ab.hw_mode_info; + struct ath12k_hw_mode_freq_range_arg *sbs_uppr_share; + struct ath12k_hw_mode_freq_range_arg *sbs_low_share; + struct ath12k_hw_mode_freq_range_arg *sbs_range; + + if (!ath12k_mac_is_hw_sbs_capable(ab)) + return true; + + if (ab->wmi_ab.sbs_lower_band_end_freq) { + sbs_uppr_share = info->freq_range_caps[ATH12K_HW_MODE_SBS_UPPER_SHARE]; + sbs_low_share = info->freq_range_caps[ATH12K_HW_MODE_SBS_LOWER_SHARE]; + + return ath12k_mac_2_freq_same_mac_in_freq_range(ab, sbs_low_share, + freq_link1, freq_link2) || + ath12k_mac_2_freq_same_mac_in_freq_range(ab, sbs_uppr_share, + freq_link1, freq_link2); + } + + sbs_range = info->freq_range_caps[ATH12K_HW_MODE_SBS]; + return ath12k_mac_2_freq_same_mac_in_freq_range(ab, sbs_range, + freq_link1, freq_link2); +} + +static bool ath12k_mac_freqs_on_same_mac(struct ath12k_base *ab, + u32 freq_link1, u32 freq_link2) +{ + return ath12k_mac_2_freq_same_mac_in_dbs(ab, freq_link1, freq_link2) && + ath12k_mac_2_freq_same_mac_in_sbs(ab, freq_link1, freq_link2); +} + +static int ath12k_mac_mlo_sta_set_link_active(struct ath12k_base *ab, + enum wmi_mlo_link_force_reason reason, + enum wmi_mlo_link_force_mode mode, + u8 *mlo_vdev_id_lst, + u8 num_mlo_vdev, + u8 *mlo_inactive_vdev_lst, + u8 num_mlo_inactive_vdev) +{ + struct wmi_mlo_link_set_active_arg param = {0}; + u32 entry_idx, entry_offset, vdev_idx; + u8 vdev_id; + + param.reason = reason; + param.force_mode = mode; + + for (vdev_idx = 0; vdev_idx < num_mlo_vdev; vdev_idx++) { + vdev_id = mlo_vdev_id_lst[vdev_idx]; + entry_idx = vdev_id / 32; + entry_offset = vdev_id % 32; + if (entry_idx >= WMI_MLO_LINK_NUM_SZ) { + ath12k_warn(ab, "Invalid entry_idx %d num_mlo_vdev %d vdev %d", + entry_idx, num_mlo_vdev, vdev_id); + return -EINVAL; + } + param.vdev_bitmap[entry_idx] |= 1 << entry_offset; + /* update entry number if entry index changed */ + if (param.num_vdev_bitmap < entry_idx + 1) + param.num_vdev_bitmap = entry_idx + 1; + } + + ath12k_dbg(ab, ATH12K_DBG_MAC, + "num_vdev_bitmap %d vdev_bitmap[0] = 0x%x, vdev_bitmap[1] = 0x%x", + param.num_vdev_bitmap, param.vdev_bitmap[0], param.vdev_bitmap[1]); + + if (mode == WMI_MLO_LINK_FORCE_MODE_ACTIVE_INACTIVE) { + for (vdev_idx = 0; vdev_idx < num_mlo_inactive_vdev; vdev_idx++) { + vdev_id = mlo_inactive_vdev_lst[vdev_idx]; + entry_idx = vdev_id / 32; + entry_offset = vdev_id % 32; + if (entry_idx >= WMI_MLO_LINK_NUM_SZ) { + ath12k_warn(ab, "Invalid entry_idx %d num_mlo_vdev %d vdev %d", + entry_idx, num_mlo_inactive_vdev, vdev_id); + return -EINVAL; + } + param.inactive_vdev_bitmap[entry_idx] |= 1 << entry_offset; + /* update entry number if entry index changed */ + if (param.num_inactive_vdev_bitmap < entry_idx + 1) + param.num_inactive_vdev_bitmap = entry_idx + 1; + } + + ath12k_dbg(ab, ATH12K_DBG_MAC, + "num_vdev_bitmap %d inactive_vdev_bitmap[0] = 0x%x, inactive_vdev_bitmap[1] = 0x%x", + param.num_inactive_vdev_bitmap, + param.inactive_vdev_bitmap[0], + param.inactive_vdev_bitmap[1]); + } + + if (mode == WMI_MLO_LINK_FORCE_MODE_ACTIVE_LINK_NUM || + mode == WMI_MLO_LINK_FORCE_MODE_INACTIVE_LINK_NUM) { + param.num_link_entry = 1; + param.link_num[0].num_of_link = num_mlo_vdev - 1; + } + + return ath12k_wmi_send_mlo_link_set_active_cmd(ab, ¶m); +} + +static int ath12k_mac_mlo_sta_update_link_active(struct ath12k_base *ab, + struct ieee80211_hw *hw, + struct ath12k_vif *ahvif) +{ + u8 mlo_vdev_id_lst[IEEE80211_MLD_MAX_NUM_LINKS] = {0}; + u32 mlo_freq_list[IEEE80211_MLD_MAX_NUM_LINKS] = {0}; + unsigned long links = ahvif->links_map; + enum wmi_mlo_link_force_reason reason; + struct ieee80211_chanctx_conf *conf; + enum wmi_mlo_link_force_mode mode; + struct ieee80211_bss_conf *info; + struct ath12k_link_vif *arvif; + u8 num_mlo_vdev = 0; + u8 link_id; + + for_each_set_bit(link_id, &links, IEEE80211_MLD_MAX_NUM_LINKS) { + arvif = wiphy_dereference(hw->wiphy, ahvif->link[link_id]); + /* make sure vdev is created on this device */ + if (!arvif || !arvif->is_created || arvif->ar->ab != ab) + continue; + + info = ath12k_mac_get_link_bss_conf(arvif); + conf = wiphy_dereference(hw->wiphy, info->chanctx_conf); + mlo_freq_list[num_mlo_vdev] = conf->def.chan->center_freq; + + mlo_vdev_id_lst[num_mlo_vdev] = arvif->vdev_id; + num_mlo_vdev++; + } + + /* It is not allowed to activate more links than a single device + * supported. Something goes wrong if we reach here. + */ + if (num_mlo_vdev > ATH12K_NUM_MAX_ACTIVE_LINKS_PER_DEVICE) { + WARN_ON_ONCE(1); + return -EINVAL; + } + + /* if 2 links are established and both link channels fall on the + * same hardware MAC, send command to firmware to deactivate one + * of them. + */ + if (num_mlo_vdev == 2 && + ath12k_mac_freqs_on_same_mac(ab, mlo_freq_list[0], + mlo_freq_list[1])) { + mode = WMI_MLO_LINK_FORCE_MODE_INACTIVE_LINK_NUM; + reason = WMI_MLO_LINK_FORCE_REASON_NEW_CONNECT; + return ath12k_mac_mlo_sta_set_link_active(ab, reason, mode, + mlo_vdev_id_lst, num_mlo_vdev, + NULL, 0); + } + + return 0; +} + static int ath12k_mac_op_sta_state(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, @@ -5899,10 +6104,12 @@ static int ath12k_mac_op_sta_state(struct ieee80211_hw *hw, struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); struct ath12k_sta *ahsta = ath12k_sta_to_ahsta(sta); struct ath12k_hw *ah = ath12k_hw_to_ah(hw); + struct ath12k_base *prev_ab = NULL, *ab; struct ath12k_link_vif *arvif; struct ath12k_link_sta *arsta; unsigned long valid_links; - u8 link_id = 0; + u8 link_id = 0, i; + struct ath12k *ar; int ret; lockdep_assert_wiphy(hw->wiphy); @@ -5997,6 +6204,24 @@ static int ath12k_mac_op_sta_state(struct ieee80211_hw *hw, } } + if (ieee80211_vif_is_mld(vif) && vif->type == NL80211_IFTYPE_STATION && + old_state == IEEE80211_STA_ASSOC && new_state == IEEE80211_STA_AUTHORIZED) { + for_each_ar(ah, ar, i) { + ab = ar->ab; + if (prev_ab == ab) + continue; + + ret = ath12k_mac_mlo_sta_update_link_active(ab, hw, ahvif); + if (ret) { + ath12k_warn(ab, + "failed to update link active state on connect %d\n", + ret); + goto exit; + } + + prev_ab = ab; + } + } /* IEEE80211_STA_NONE -> IEEE80211_STA_NOTEXIST: * Remove the station from driver (handle ML sta here since that * needs special handling. Normal sta will be handled in generic diff --git a/drivers/net/wireless/ath/ath12k/mac.h b/drivers/net/wireless/ath/ath12k/mac.h index e6e74b45bfa4..cc81b1f5680f 100644 --- a/drivers/net/wireless/ath/ath12k/mac.h +++ b/drivers/net/wireless/ath/ath12k/mac.h @@ -54,6 +54,8 @@ struct ath12k_generic_iter { #define ATH12K_DEFAULT_SCAN_LINK IEEE80211_MLD_MAX_NUM_LINKS #define ATH12K_NUM_MAX_LINKS (IEEE80211_MLD_MAX_NUM_LINKS + 1) +#define ATH12K_NUM_MAX_ACTIVE_LINKS_PER_DEVICE 2 + enum ath12k_supported_bw { ATH12K_BW_20 = 0, ATH12K_BW_40 = 1, From d9dbc6b8b94ab41fb8b9dbb5a7be024029d46309 Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Thu, 22 May 2025 16:54:15 +0800 Subject: [PATCH 205/242] wifi: ath12k: don't activate more links than firmware supports In case of ML connection, currently all useful links are activated at ASSOC stage: ieee80211_set_active_links(vif, ieee80211_vif_usable_links(vif)) this results in firmware crash when the number of links activated on the same device is more than supported. Since firmware supports activating at most 2 links for a ML connection, to avoid firmware crash, host needs to select 2 links out of the useful links. As the assoc link has already been chosen, the question becomes how to determine partner links. A straightforward principle applied here is that the resulted combination should achieve the best throughput. For that purpose, ideally various factors like bandwidth, RSSI etc should be considered. But that would be too complicate. To make it easy, the choice is to only take hardware modes into consideration. The SBS (single band simultaneously) mode frequency range covers 5 GHz and 6 GHz bands. In this mode, the two individual MACs are both active, with one working on 5g-high band and the other on 5g-low band (from hardware perspective 5 GHz and 6 GHz bands are referred to as a 'large' single 5 GHz band). The DBS (dual band simultaneously) mode covers 2 GHz band and the 'large' 5 GHz band, with one MAC working on 2 GHz band and the other working on 5 GHz band or 6 GHz band. Since 5,6 GHz bands could provide higher bandwidth than 2 GHz band, the preference is given to SBS mode. Other hardware modes results in only one working MAC at any given time, so it is chosen only when both SBS are DBS are not possible. For each hardware mode, if there are more than one partner candidate, just choose the first one. For now only single device MLO case is handled as it is easy. Other cases could be addressed in the future. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00284-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1 Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.4.1-00199-QCAHKSWPL_SILICONZ-1 Signed-off-by: Baochen Qiang Reviewed-by: Vasanthakumar Thiagarajan Link: https://patch.msgid.link/20250522-ath12k-sbs-dbs-v1-6-54a29e7a3a88@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/core.h | 6 ++ drivers/net/wireless/ath/ath12k/mac.c | 137 ++++++++++++++++++++++++- 2 files changed, 141 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index 941db6e49d6e..efbd6e8ce275 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -601,6 +601,12 @@ struct ath12k_sta { #define ATH12K_NUM_CHANS 101 #define ATH12K_MAX_5GHZ_CHAN 173 +static inline bool ath12k_is_2ghz_channel_freq(u32 freq) +{ + return freq >= ATH12K_MIN_2GHZ_FREQ && + freq <= ATH12K_MAX_2GHZ_FREQ; +} + enum ath12k_hw_state { ATH12K_HW_STATE_OFF, ATH12K_HW_STATE_ON, diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 2fae4b01ec90..b5b31cd8de99 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -6095,6 +6095,122 @@ static int ath12k_mac_mlo_sta_update_link_active(struct ath12k_base *ab, return 0; } +static bool ath12k_mac_are_sbs_chan(struct ath12k_base *ab, u32 freq_1, u32 freq_2) +{ + if (!ath12k_mac_is_hw_sbs_capable(ab)) + return false; + + if (ath12k_is_2ghz_channel_freq(freq_1) || + ath12k_is_2ghz_channel_freq(freq_2)) + return false; + + return !ath12k_mac_2_freq_same_mac_in_sbs(ab, freq_1, freq_2); +} + +static bool ath12k_mac_are_dbs_chan(struct ath12k_base *ab, u32 freq_1, u32 freq_2) +{ + if (!ath12k_mac_is_hw_dbs_capable(ab)) + return false; + + return !ath12k_mac_2_freq_same_mac_in_dbs(ab, freq_1, freq_2); +} + +static int ath12k_mac_select_links(struct ath12k_base *ab, + struct ieee80211_vif *vif, + struct ieee80211_hw *hw, + u16 *selected_links) +{ + unsigned long useful_links = ieee80211_vif_usable_links(vif); + struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); + u8 num_useful_links = hweight_long(useful_links); + struct ieee80211_chanctx_conf *chanctx; + struct ath12k_link_vif *assoc_arvif; + u32 assoc_link_freq, partner_freq; + u16 sbs_links = 0, dbs_links = 0; + struct ieee80211_bss_conf *info; + struct ieee80211_channel *chan; + struct ieee80211_sta *sta; + struct ath12k_sta *ahsta; + u8 link_id; + + /* activate all useful links if less than max supported */ + if (num_useful_links <= ATH12K_NUM_MAX_ACTIVE_LINKS_PER_DEVICE) { + *selected_links = useful_links; + return 0; + } + + /* only in station mode we can get here, so it's safe + * to use ap_addr + */ + rcu_read_lock(); + sta = ieee80211_find_sta(vif, vif->cfg.ap_addr); + if (!sta) { + rcu_read_unlock(); + ath12k_warn(ab, "failed to find sta with addr %pM\n", vif->cfg.ap_addr); + return -EINVAL; + } + + ahsta = ath12k_sta_to_ahsta(sta); + assoc_arvif = wiphy_dereference(hw->wiphy, ahvif->link[ahsta->assoc_link_id]); + info = ath12k_mac_get_link_bss_conf(assoc_arvif); + chanctx = rcu_dereference(info->chanctx_conf); + assoc_link_freq = chanctx->def.chan->center_freq; + rcu_read_unlock(); + ath12k_dbg(ab, ATH12K_DBG_MAC, "assoc link %u freq %u\n", + assoc_arvif->link_id, assoc_link_freq); + + /* assoc link is already activated and has to be kept active, + * only need to select a partner link from others. + */ + useful_links &= ~BIT(assoc_arvif->link_id); + for_each_set_bit(link_id, &useful_links, IEEE80211_MLD_MAX_NUM_LINKS) { + info = wiphy_dereference(hw->wiphy, vif->link_conf[link_id]); + if (!info) { + ath12k_warn(ab, "failed to get link info for link: %u\n", + link_id); + return -ENOLINK; + } + + chan = info->chanreq.oper.chan; + if (!chan) { + ath12k_warn(ab, "failed to get chan for link: %u\n", link_id); + return -EINVAL; + } + + partner_freq = chan->center_freq; + if (ath12k_mac_are_sbs_chan(ab, assoc_link_freq, partner_freq)) { + sbs_links |= BIT(link_id); + ath12k_dbg(ab, ATH12K_DBG_MAC, "new SBS link %u freq %u\n", + link_id, partner_freq); + continue; + } + + if (ath12k_mac_are_dbs_chan(ab, assoc_link_freq, partner_freq)) { + dbs_links |= BIT(link_id); + ath12k_dbg(ab, ATH12K_DBG_MAC, "new DBS link %u freq %u\n", + link_id, partner_freq); + continue; + } + + ath12k_dbg(ab, ATH12K_DBG_MAC, "non DBS/SBS link %u freq %u\n", + link_id, partner_freq); + } + + /* choose the first candidate no matter how many is in the list */ + if (sbs_links) + link_id = __ffs(sbs_links); + else if (dbs_links) + link_id = __ffs(dbs_links); + else + link_id = ffs(useful_links) - 1; + + ath12k_dbg(ab, ATH12K_DBG_MAC, "select partner link %u\n", link_id); + + *selected_links = BIT(assoc_arvif->link_id) | BIT(link_id); + + return 0; +} + static int ath12k_mac_op_sta_state(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, @@ -6108,6 +6224,7 @@ static int ath12k_mac_op_sta_state(struct ieee80211_hw *hw, struct ath12k_link_vif *arvif; struct ath12k_link_sta *arsta; unsigned long valid_links; + u16 selected_links = 0; u8 link_id = 0, i; struct ath12k *ar; int ret; @@ -6179,8 +6296,24 @@ static int ath12k_mac_op_sta_state(struct ieee80211_hw *hw, * about to move to the associated state. */ if (ieee80211_vif_is_mld(vif) && vif->type == NL80211_IFTYPE_STATION && - old_state == IEEE80211_STA_AUTH && new_state == IEEE80211_STA_ASSOC) - ieee80211_set_active_links(vif, ieee80211_vif_usable_links(vif)); + old_state == IEEE80211_STA_AUTH && new_state == IEEE80211_STA_ASSOC) { + /* TODO: for now only do link selection for single device + * MLO case. Other cases would be handled in the future. + */ + ab = ah->radio[0].ab; + if (ab->ag->num_devices == 1) { + ret = ath12k_mac_select_links(ab, vif, hw, &selected_links); + if (ret) { + ath12k_warn(ab, + "failed to get selected links: %d\n", ret); + goto exit; + } + } else { + selected_links = ieee80211_vif_usable_links(vif); + } + + ieee80211_set_active_links(vif, selected_links); + } /* Handle all the other state transitions in generic way */ valid_links = ahsta->links_map; From a48a931a32f88157db2baa31c10738a7fe660e93 Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Thu, 12 Jun 2025 09:31:49 +0800 Subject: [PATCH 206/242] wifi: ath12k: fix documentation on firmware stats Regarding the firmware stats events handling, the comment in ath12k_mac_get_fw_stats() says host determines whether all events have been received based on 'end' tag in TLV. This is wrong as there is no such tag at all, actually host makes the decision totally by itself based on the stats type and active pdev/vdev counts etc. Fix it to correctly reflect the logic. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00284.1-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Tested-on: QCN9274 hw2.0 WLAN.WBE.1.5-01651-QCAHKSWPL_SILICONZ-1 Signed-off-by: Baochen Qiang Link: https://patch.msgid.link/20250612-ath12k-fw-fixes-v1-1-12f594f3b857@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/mac.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index b5b31cd8de99..581b6c9c84ea 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -4400,9 +4400,8 @@ int ath12k_mac_get_fw_stats(struct ath12k *ar, /* Firmware sends WMI_UPDATE_STATS_EVENTID back-to-back * when stats data buffer limit is reached. fw_stats_complete * is completed once host receives first event from firmware, but - * still end might not be marked in the TLV. - * Below loop is to confirm that firmware completed sending all the event - * and fw_stats_done is marked true when end is marked in the TLV. + * still there could be more events following. Below loop is to wait + * until firmware completes sending all the events. */ for (;;) { if (time_after(jiffies, timeout)) From 9a353a4a11a475578be3c02dd17e70afe3a4a7f6 Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Thu, 12 Jun 2025 09:31:50 +0800 Subject: [PATCH 207/242] wifi: ath12k: avoid burning CPU while waiting for firmware stats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ath12k_mac_get_fw_stats() is busy polling fw_stats_done flag while waiting firmware finishing sending all events. This is not good as CPU is monopolized and kept burning during the wait. Change to the completion mechanism to fix it. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00284.1-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Tested-on: QCN9274 hw2.0 WLAN.WBE.1.5-01651-QCAHKSWPL_SILICONZ-1 Fixes: e367c924768b ("wifi: ath12k: Request vdev stats from firmware") Reported-by: Grégoire Stein Closes: https://lore.kernel.org/ath12k/AS8P190MB120575BBB25FCE697CD7D4988763A@AS8P190MB1205.EURP190.PROD.OUTLOOK.COM/ Signed-off-by: Baochen Qiang Tested-by: Grégoire Stein Link: https://patch.msgid.link/20250612-ath12k-fw-fixes-v1-2-12f594f3b857@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/core.c | 2 +- drivers/net/wireless/ath/ath12k/core.h | 2 +- drivers/net/wireless/ath/ath12k/mac.c | 27 ++++++++------------------ drivers/net/wireless/ath/ath12k/wmi.c | 7 ++++--- 4 files changed, 14 insertions(+), 24 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.c b/drivers/net/wireless/ath/ath12k/core.c index ebc0560d40e3..fbc62209086f 100644 --- a/drivers/net/wireless/ath/ath12k/core.c +++ b/drivers/net/wireless/ath/ath12k/core.c @@ -1216,6 +1216,7 @@ void ath12k_fw_stats_init(struct ath12k *ar) INIT_LIST_HEAD(&ar->fw_stats.pdevs); INIT_LIST_HEAD(&ar->fw_stats.bcn); init_completion(&ar->fw_stats_complete); + init_completion(&ar->fw_stats_done); } void ath12k_fw_stats_free(struct ath12k_fw_stats *stats) @@ -1228,7 +1229,6 @@ void ath12k_fw_stats_free(struct ath12k_fw_stats *stats) void ath12k_fw_stats_reset(struct ath12k *ar) { spin_lock_bh(&ar->data_lock); - ar->fw_stats.fw_stats_done = false; ath12k_fw_stats_free(&ar->fw_stats); spin_unlock_bh(&ar->data_lock); } diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index efbd6e8ce275..2b2e0c16b64e 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -632,7 +632,6 @@ struct ath12k_fw_stats { struct list_head pdevs; struct list_head vdevs; struct list_head bcn; - bool fw_stats_done; }; struct ath12k_dbg_htt_stats { @@ -812,6 +811,7 @@ struct ath12k { bool regdom_set_by_user; struct completion fw_stats_complete; + struct completion fw_stats_done; struct completion mlo_setup_done; u32 mlo_setup_status; diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 581b6c9c84ea..59ec422992d3 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -4360,7 +4360,7 @@ int ath12k_mac_get_fw_stats(struct ath12k *ar, { struct ath12k_base *ab = ar->ab; struct ath12k_hw *ah = ath12k_ar_to_ah(ar); - unsigned long timeout, time_left; + unsigned long time_left; int ret; guard(mutex)(&ah->hw_mutex); @@ -4368,19 +4368,13 @@ int ath12k_mac_get_fw_stats(struct ath12k *ar, if (ah->state != ATH12K_HW_STATE_ON) return -ENETDOWN; - /* FW stats can get split when exceeding the stats data buffer limit. - * In that case, since there is no end marking for the back-to-back - * received 'update stats' event, we keep a 3 seconds timeout in case, - * fw_stats_done is not marked yet - */ - timeout = jiffies + msecs_to_jiffies(3 * 1000); ath12k_fw_stats_reset(ar); reinit_completion(&ar->fw_stats_complete); + reinit_completion(&ar->fw_stats_done); ret = ath12k_wmi_send_stats_request_cmd(ar, param->stats_id, param->vdev_id, param->pdev_id); - if (ret) { ath12k_warn(ab, "failed to request fw stats: %d\n", ret); return ret; @@ -4391,7 +4385,6 @@ int ath12k_mac_get_fw_stats(struct ath12k *ar, param->pdev_id, param->vdev_id, param->stats_id); time_left = wait_for_completion_timeout(&ar->fw_stats_complete, 1 * HZ); - if (!time_left) { ath12k_warn(ab, "time out while waiting for get fw stats\n"); return -ETIMEDOUT; @@ -4400,19 +4393,15 @@ int ath12k_mac_get_fw_stats(struct ath12k *ar, /* Firmware sends WMI_UPDATE_STATS_EVENTID back-to-back * when stats data buffer limit is reached. fw_stats_complete * is completed once host receives first event from firmware, but - * still there could be more events following. Below loop is to wait + * still there could be more events following. Below is to wait * until firmware completes sending all the events. */ - for (;;) { - if (time_after(jiffies, timeout)) - break; - spin_lock_bh(&ar->data_lock); - if (ar->fw_stats.fw_stats_done) { - spin_unlock_bh(&ar->data_lock); - break; - } - spin_unlock_bh(&ar->data_lock); + time_left = wait_for_completion_timeout(&ar->fw_stats_done, 3 * HZ); + if (!time_left) { + ath12k_warn(ab, "time out while waiting for fw stats done\n"); + return -ETIMEDOUT; } + return 0; } diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index 6dbd272db1d5..e8323581a5af 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -8192,11 +8192,12 @@ static void ath12k_wmi_fw_stats_process(struct ath12k *ar, &ar->fw_stats.vdevs); if (is_end) { - ar->fw_stats.fw_stats_done = true; + complete(&ar->fw_stats_done); num_vdev = 0; } return; } + if (stats->stats_id == WMI_REQUEST_BCN_STAT) { if (list_empty(&stats->bcn)) { ath12k_warn(ab, "empty beacon stats"); @@ -8211,7 +8212,7 @@ static void ath12k_wmi_fw_stats_process(struct ath12k *ar, &ar->fw_stats.bcn); if (is_end) { - ar->fw_stats.fw_stats_done = true; + complete(&ar->fw_stats_done); num_bcn = 0; } } @@ -8249,7 +8250,7 @@ static void ath12k_update_stats_event(struct ath12k_base *ab, struct sk_buff *sk /* Handle WMI_REQUEST_PDEV_STAT status update */ if (stats.stats_id == WMI_REQUEST_PDEV_STAT) { list_splice_tail_init(&stats.pdevs, &ar->fw_stats.pdevs); - ar->fw_stats.fw_stats_done = true; + complete(&ar->fw_stats_done); goto complete; } From ac7b8ff7839ded757806317ab8979be844766770 Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Thu, 12 Jun 2025 09:31:51 +0800 Subject: [PATCH 208/242] wifi: ath12k: don't use static variables in ath12k_wmi_fw_stats_process() Currently ath12k_wmi_fw_stats_process() is using static variables to count firmware stat events. Taking num_vdev as an example, if for whatever reason (say ar->num_started_vdevs is 0 or firmware bug etc.) the following condition (++num_vdev) == total_vdevs_started is not met, is_end is not set thus num_vdev won't be cleared. Next time when firmware stats is requested again, even if everything is working fine, failure is expected due to the condition above will never be satisfied. The same applies to num_bcn as well. Change to use non-static counters and reset them each time before firmware stats is requested. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00284.1-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Tested-on: QCN9274 hw2.0 WLAN.WBE.1.5-01651-QCAHKSWPL_SILICONZ-1 Fixes: e367c924768b ("wifi: ath12k: Request vdev stats from firmware") Signed-off-by: Baochen Qiang Link: https://patch.msgid.link/20250612-ath12k-fw-fixes-v1-3-12f594f3b857@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/core.c | 2 ++ drivers/net/wireless/ath/ath12k/core.h | 2 ++ drivers/net/wireless/ath/ath12k/wmi.c | 14 +++++--------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.c b/drivers/net/wireless/ath/ath12k/core.c index fbc62209086f..89ae80934b30 100644 --- a/drivers/net/wireless/ath/ath12k/core.c +++ b/drivers/net/wireless/ath/ath12k/core.c @@ -1230,6 +1230,8 @@ void ath12k_fw_stats_reset(struct ath12k *ar) { spin_lock_bh(&ar->data_lock); ath12k_fw_stats_free(&ar->fw_stats); + ar->fw_stats.num_vdev_recvd = 0; + ar->fw_stats.num_bcn_recvd = 0; spin_unlock_bh(&ar->data_lock); } diff --git a/drivers/net/wireless/ath/ath12k/core.h b/drivers/net/wireless/ath/ath12k/core.h index 2b2e0c16b64e..7bcd9c70309f 100644 --- a/drivers/net/wireless/ath/ath12k/core.h +++ b/drivers/net/wireless/ath/ath12k/core.h @@ -632,6 +632,8 @@ struct ath12k_fw_stats { struct list_head pdevs; struct list_head vdevs; struct list_head bcn; + u32 num_vdev_recvd; + u32 num_bcn_recvd; }; struct ath12k_dbg_htt_stats { diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index e8323581a5af..533e4ddb9a50 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -8166,7 +8166,6 @@ static void ath12k_wmi_fw_stats_process(struct ath12k *ar, struct ath12k_base *ab = ar->ab; struct ath12k_pdev *pdev; bool is_end; - static unsigned int num_vdev, num_bcn; size_t total_vdevs_started = 0; int i; @@ -8186,15 +8185,14 @@ static void ath12k_wmi_fw_stats_process(struct ath12k *ar, } rcu_read_unlock(); - is_end = ((++num_vdev) == total_vdevs_started); + is_end = ((++ar->fw_stats.num_vdev_recvd) == total_vdevs_started); list_splice_tail_init(&stats->vdevs, &ar->fw_stats.vdevs); - if (is_end) { + if (is_end) complete(&ar->fw_stats_done); - num_vdev = 0; - } + return; } @@ -8206,15 +8204,13 @@ static void ath12k_wmi_fw_stats_process(struct ath12k *ar, /* Mark end until we reached the count of all started VDEVs * within the PDEV */ - is_end = ((++num_bcn) == ar->num_started_vdevs); + is_end = ((++ar->fw_stats.num_bcn_recvd) == ar->num_started_vdevs); list_splice_tail_init(&stats->bcn, &ar->fw_stats.bcn); - if (is_end) { + if (is_end) complete(&ar->fw_stats_done); - num_bcn = 0; - } } } From ad5e9178cec589bf3af589dc43e638e5a5bf56fa Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Thu, 12 Jun 2025 09:31:52 +0800 Subject: [PATCH 209/242] wifi: ath12k: don't wait when there is no vdev started For WMI_REQUEST_VDEV_STAT request, firmware might split response into multiple events dut to buffer limit, hence currently in ath12k_wmi_fw_stats_process() host waits until all events received. In case there is no vdev started, this results in that below condition would never get satisfied ((++ar->fw_stats.num_vdev_recvd) == total_vdevs_started) consequently the requestor would be blocked until time out. The same applies to WMI_REQUEST_BCN_STAT request as well due to: ((++ar->fw_stats.num_bcn_recvd) == ar->num_started_vdevs) Change to check the number of started vdev first: if it is zero, finish directly; if not, follow the old way. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00284.1-QCAHMTSWPL_V1.0_V2.0_SILICONZ-3 Tested-on: QCN9274 hw2.0 WLAN.WBE.1.5-01651-QCAHKSWPL_SILICONZ-1 Fixes: e367c924768b ("wifi: ath12k: Request vdev stats from firmware") Signed-off-by: Baochen Qiang Link: https://patch.msgid.link/20250612-ath12k-fw-fixes-v1-4-12f594f3b857@quicinc.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/wmi.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index 533e4ddb9a50..465f877fc0fb 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -8165,7 +8165,7 @@ static void ath12k_wmi_fw_stats_process(struct ath12k *ar, { struct ath12k_base *ab = ar->ab; struct ath12k_pdev *pdev; - bool is_end; + bool is_end = true; size_t total_vdevs_started = 0; int i; @@ -8185,7 +8185,9 @@ static void ath12k_wmi_fw_stats_process(struct ath12k *ar, } rcu_read_unlock(); - is_end = ((++ar->fw_stats.num_vdev_recvd) == total_vdevs_started); + if (total_vdevs_started) + is_end = ((++ar->fw_stats.num_vdev_recvd) == + total_vdevs_started); list_splice_tail_init(&stats->vdevs, &ar->fw_stats.vdevs); @@ -8204,7 +8206,9 @@ static void ath12k_wmi_fw_stats_process(struct ath12k *ar, /* Mark end until we reached the count of all started VDEVs * within the PDEV */ - is_end = ((++ar->fw_stats.num_bcn_recvd) == ar->num_started_vdevs); + if (ar->num_started_vdevs) + is_end = ((++ar->fw_stats.num_bcn_recvd) == + ar->num_started_vdevs); list_splice_tail_init(&stats->bcn, &ar->fw_stats.bcn); From 15d25307692312cec4b57052da73387f91a2e870 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Mon, 16 Jun 2025 21:12:05 +0300 Subject: [PATCH 210/242] wifi: carl9170: do not ping device which has failed to load firmware Syzkaller reports [1, 2] crashes caused by an attempts to ping the device which has failed to load firmware. Since such a device doesn't pass 'ieee80211_register_hw()', an internal workqueue managed by 'ieee80211_queue_work()' is not yet created and an attempt to queue work on it causes null-ptr-deref. [1] https://syzkaller.appspot.com/bug?extid=9a4aec827829942045ff [2] https://syzkaller.appspot.com/bug?extid=0d8afba53e8fb2633217 Fixes: e4a668c59080 ("carl9170: fix spurious restart due to high latency") Signed-off-by: Dmitry Antipov Acked-by: Christian Lamparter Link: https://patch.msgid.link/20250616181205.38883-1-dmantipov@yandex.ru Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/carl9170/usb.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/ath/carl9170/usb.c b/drivers/net/wireless/ath/carl9170/usb.c index a3e03580cd9f..564ca6a61985 100644 --- a/drivers/net/wireless/ath/carl9170/usb.c +++ b/drivers/net/wireless/ath/carl9170/usb.c @@ -438,14 +438,21 @@ static void carl9170_usb_rx_complete(struct urb *urb) if (atomic_read(&ar->rx_anch_urbs) == 0) { /* - * The system is too slow to cope with - * the enormous workload. We have simply - * run out of active rx urbs and this - * unfortunately leads to an unpredictable - * device. + * At this point, either the system is too slow to + * cope with the enormous workload (so we have simply + * run out of active rx urbs and this unfortunately + * leads to an unpredictable device), or the device + * is not fully functional after an unsuccessful + * firmware loading attempts (so it doesn't pass + * ieee80211_register_hw() and there is no internal + * workqueue at all). */ - ieee80211_queue_work(ar->hw, &ar->ping_work); + if (ar->registered) + ieee80211_queue_work(ar->hw, &ar->ping_work); + else + pr_warn_once("device %s is not registered\n", + dev_name(&ar->udev->dev)); } } else { /* From 6dbb0d97c5096072c78a6abffe393584e57ae945 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 16 Jun 2025 13:15:12 -0700 Subject: [PATCH 211/242] mpls: Use rcu_dereference_rtnl() in mpls_route_input_rcu(). As syzbot reported [0], mpls_route_input_rcu() can be called from mpls_getroute(), where is under RTNL. net->mpls.platform_label is only updated under RTNL. Let's use rcu_dereference_rtnl() in mpls_route_input_rcu() to silence the splat. [0]: WARNING: suspicious RCU usage 6.15.0-rc7-syzkaller-00082-g5cdb2c77c4c3 #0 Not tainted ---------------------------- net/mpls/af_mpls.c:84 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by syz.2.4451/17730: #0: ffffffff9012a3e8 (rtnl_mutex){+.+.}-{4:4}, at: rtnl_lock net/core/rtnetlink.c:80 [inline] #0: ffffffff9012a3e8 (rtnl_mutex){+.+.}-{4:4}, at: rtnetlink_rcv_msg+0x371/0xe90 net/core/rtnetlink.c:6961 stack backtrace: CPU: 1 UID: 0 PID: 17730 Comm: syz.2.4451 Not tainted 6.15.0-rc7-syzkaller-00082-g5cdb2c77c4c3 #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x16c/0x1f0 lib/dump_stack.c:120 lockdep_rcu_suspicious+0x166/0x260 kernel/locking/lockdep.c:6865 mpls_route_input_rcu+0x1d4/0x200 net/mpls/af_mpls.c:84 mpls_getroute+0x621/0x1ea0 net/mpls/af_mpls.c:2381 rtnetlink_rcv_msg+0x3c9/0xe90 net/core/rtnetlink.c:6964 netlink_rcv_skb+0x16d/0x440 net/netlink/af_netlink.c:2534 netlink_unicast_kernel net/netlink/af_netlink.c:1313 [inline] netlink_unicast+0x53a/0x7f0 net/netlink/af_netlink.c:1339 netlink_sendmsg+0x8d1/0xdd0 net/netlink/af_netlink.c:1883 sock_sendmsg_nosec net/socket.c:712 [inline] __sock_sendmsg net/socket.c:727 [inline] ____sys_sendmsg+0xa98/0xc70 net/socket.c:2566 ___sys_sendmsg+0x134/0x1d0 net/socket.c:2620 __sys_sendmmsg+0x200/0x420 net/socket.c:2709 __do_sys_sendmmsg net/socket.c:2736 [inline] __se_sys_sendmmsg net/socket.c:2733 [inline] __x64_sys_sendmmsg+0x9c/0x100 net/socket.c:2733 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xcd/0x230 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f0a2818e969 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f0a28f52038 EFLAGS: 00000246 ORIG_RAX: 0000000000000133 RAX: ffffffffffffffda RBX: 00007f0a283b5fa0 RCX: 00007f0a2818e969 RDX: 0000000000000003 RSI: 0000200000000080 RDI: 0000000000000003 RBP: 00007f0a28210ab1 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00007f0a283b5fa0 R15: 00007ffce5e9f268 Fixes: 0189197f4416 ("mpls: Basic routing support") Reported-by: syzbot+8a583bdd1a5cc0b0e068@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/68507981.a70a0220.395abc.01ef.GAE@google.com/ Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250616201532.1036568-1-kuni1840@gmail.com Signed-off-by: Jakub Kicinski --- net/mpls/af_mpls.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index d536c97144e9..47d7dfd9ad09 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -81,8 +81,8 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) if (index < net->mpls.platform_labels) { struct mpls_route __rcu **platform_label = - rcu_dereference(net->mpls.platform_label); - rt = rcu_dereference(platform_label[index]); + rcu_dereference_rtnl(net->mpls.platform_label); + rt = rcu_dereference_rtnl(platform_label[index]); } return rt; } From 2f370ae1fb6317985f3497b1bb80d457508ca2f7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 16 Jun 2025 11:21:14 -0700 Subject: [PATCH 212/242] atm: atmtcp: Free invalid length skb in atmtcp_c_send(). syzbot reported the splat below. [0] vcc_sendmsg() copies data passed from userspace to skb and passes it to vcc->dev->ops->send(). atmtcp_c_send() accesses skb->data as struct atmtcp_hdr after checking if skb->len is 0, but it's not enough. Also, when skb->len == 0, skb and sk (vcc) were leaked because dev_kfree_skb() is not called and sk_wmem_alloc adjustment is missing to revert atm_account_tx() in vcc_sendmsg(), which is expected to be done in atm_pop_raw(). Let's properly free skb with an invalid length in atmtcp_c_send(). [0]: BUG: KMSAN: uninit-value in atmtcp_c_send+0x255/0xed0 drivers/atm/atmtcp.c:294 atmtcp_c_send+0x255/0xed0 drivers/atm/atmtcp.c:294 vcc_sendmsg+0xd7c/0xff0 net/atm/common.c:644 sock_sendmsg_nosec net/socket.c:712 [inline] __sock_sendmsg+0x330/0x3d0 net/socket.c:727 ____sys_sendmsg+0x7e0/0xd80 net/socket.c:2566 ___sys_sendmsg+0x271/0x3b0 net/socket.c:2620 __sys_sendmsg net/socket.c:2652 [inline] __do_sys_sendmsg net/socket.c:2657 [inline] __se_sys_sendmsg net/socket.c:2655 [inline] __x64_sys_sendmsg+0x211/0x3e0 net/socket.c:2655 x64_sys_call+0x32fb/0x3db0 arch/x86/include/generated/asm/syscalls_64.h:47 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xd9/0x210 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Uninit was created at: slab_post_alloc_hook mm/slub.c:4154 [inline] slab_alloc_node mm/slub.c:4197 [inline] kmem_cache_alloc_node_noprof+0x818/0xf00 mm/slub.c:4249 kmalloc_reserve+0x13c/0x4b0 net/core/skbuff.c:579 __alloc_skb+0x347/0x7d0 net/core/skbuff.c:670 alloc_skb include/linux/skbuff.h:1336 [inline] vcc_sendmsg+0xb40/0xff0 net/atm/common.c:628 sock_sendmsg_nosec net/socket.c:712 [inline] __sock_sendmsg+0x330/0x3d0 net/socket.c:727 ____sys_sendmsg+0x7e0/0xd80 net/socket.c:2566 ___sys_sendmsg+0x271/0x3b0 net/socket.c:2620 __sys_sendmsg net/socket.c:2652 [inline] __do_sys_sendmsg net/socket.c:2657 [inline] __se_sys_sendmsg net/socket.c:2655 [inline] __x64_sys_sendmsg+0x211/0x3e0 net/socket.c:2655 x64_sys_call+0x32fb/0x3db0 arch/x86/include/generated/asm/syscalls_64.h:47 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xd9/0x210 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f CPU: 1 UID: 0 PID: 5798 Comm: syz-executor192 Not tainted 6.16.0-rc1-syzkaller-00010-g2c4a1f3fe03e #0 PREEMPT(undef) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+1d3c235276f62963e93a@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=1d3c235276f62963e93a Tested-by: syzbot+1d3c235276f62963e93a@syzkaller.appspotmail.com Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250616182147.963333-2-kuni1840@gmail.com Signed-off-by: Jakub Kicinski --- drivers/atm/atmtcp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/atm/atmtcp.c b/drivers/atm/atmtcp.c index d4aa0f353b6c..eeae160c898d 100644 --- a/drivers/atm/atmtcp.c +++ b/drivers/atm/atmtcp.c @@ -288,7 +288,9 @@ static int atmtcp_c_send(struct atm_vcc *vcc,struct sk_buff *skb) struct sk_buff *new_skb; int result = 0; - if (!skb->len) return 0; + if (skb->len < sizeof(struct atmtcp_hdr)) + goto done; + dev = vcc->dev_data; hdr = (struct atmtcp_hdr *) skb->data; if (hdr->length == ATMTCP_HDR_MAGIC) { From 7851263998d4269125fd6cb3fdbfc7c6db853859 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 16 Jun 2025 11:21:15 -0700 Subject: [PATCH 213/242] atm: Revert atm_account_tx() if copy_from_iter_full() fails. In vcc_sendmsg(), we account skb->truesize to sk->sk_wmem_alloc by atm_account_tx(). It is expected to be reverted by atm_pop_raw() later called by vcc->dev->ops->send(vcc, skb). However, vcc_sendmsg() misses the same revert when copy_from_iter_full() fails, and then we will leak a socket. Let's factorise the revert part as atm_return_tx() and call it in the failure path. Note that the corresponding sk_wmem_alloc operation can be found in alloc_tx() as of the blamed commit. $ git blame -L:alloc_tx net/atm/common.c c55fa3cccbc2c~ Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Simon Horman Closes: https://lore.kernel.org/netdev/20250614161959.GR414686@horms.kernel.org/ Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250616182147.963333-3-kuni1840@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/atmdev.h | 6 ++++++ net/atm/common.c | 1 + net/atm/raw.c | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h index 9b02961d65ee..45f2f278b50a 100644 --- a/include/linux/atmdev.h +++ b/include/linux/atmdev.h @@ -249,6 +249,12 @@ static inline void atm_account_tx(struct atm_vcc *vcc, struct sk_buff *skb) ATM_SKB(skb)->atm_options = vcc->atm_options; } +static inline void atm_return_tx(struct atm_vcc *vcc, struct sk_buff *skb) +{ + WARN_ON_ONCE(refcount_sub_and_test(ATM_SKB(skb)->acct_truesize, + &sk_atm(vcc)->sk_wmem_alloc)); +} + static inline void atm_force_charge(struct atm_vcc *vcc,int truesize) { atomic_add(truesize, &sk_atm(vcc)->sk_rmem_alloc); diff --git a/net/atm/common.c b/net/atm/common.c index 9b75699992ff..d7f7976ea13a 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -635,6 +635,7 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size) skb->dev = NULL; /* for paths shared with net_device interfaces */ if (!copy_from_iter_full(skb_put(skb, size), size, &m->msg_iter)) { + atm_return_tx(vcc, skb); kfree_skb(skb); error = -EFAULT; goto out; diff --git a/net/atm/raw.c b/net/atm/raw.c index 2b5f78a7ec3e..1e6511ec842c 100644 --- a/net/atm/raw.c +++ b/net/atm/raw.c @@ -36,7 +36,7 @@ static void atm_pop_raw(struct atm_vcc *vcc, struct sk_buff *skb) pr_debug("(%d) %d -= %d\n", vcc->vci, sk_wmem_alloc_get(sk), ATM_SKB(skb)->acct_truesize); - WARN_ON(refcount_sub_and_test(ATM_SKB(skb)->acct_truesize, &sk->sk_wmem_alloc)); + atm_return_tx(vcc, skb); dev_kfree_skb_any(skb); sk->sk_write_space(sk); } From e7417421d89358da071fd2930f91e67c7128fbff Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 17 Jun 2025 11:45:29 +0200 Subject: [PATCH 214/242] wifi: ath6kl: remove WARN on bad firmware input If the firmware gives bad input, that's nothing to do with the driver's stack at this point etc., so the WARN_ON() doesn't add any value. Additionally, this is one of the top syzbot reports now. Just print a message, and as an added bonus, print the sizes too. Reported-by: syzbot+92c6dd14aaa230be6855@syzkaller.appspotmail.com Tested-by: syzbot+92c6dd14aaa230be6855@syzkaller.appspotmail.com Acked-by: Jeff Johnson Link: https://patch.msgid.link/20250617114529.031a677a348e.I58bf1eb4ac16a82c546725ff010f3f0d2b0cca49@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/bmi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath6kl/bmi.c b/drivers/net/wireless/ath/ath6kl/bmi.c index af98e871199d..5a9e93fd1ef4 100644 --- a/drivers/net/wireless/ath/ath6kl/bmi.c +++ b/drivers/net/wireless/ath/ath6kl/bmi.c @@ -87,7 +87,9 @@ int ath6kl_bmi_get_target_info(struct ath6kl *ar, * We need to do some backwards compatibility to make this work. */ if (le32_to_cpu(targ_info->byte_count) != sizeof(*targ_info)) { - WARN_ON(1); + ath6kl_err("mismatched byte count %d vs. expected %zd\n", + le32_to_cpu(targ_info->byte_count), + sizeof(*targ_info)); return -EINVAL; } From db5957ab85204a02093ac5ab2d0bbfe253955b71 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Tue, 17 Jun 2025 15:00:06 +0300 Subject: [PATCH 215/242] wifi: iwlwifi: restore missing initialization of async_handlers_list (again) The initialization of async_handlers_list was accidentally removed in a previous change. Then it was restoted by commit 175e69e33c66 ("wifi: iwlwifi: restore missing initialization of async_handlers_list"). Somehow, the initialization disappeared again. Restote it. Signed-off-by: Miri Korenblit --- drivers/net/wireless/intel/iwlwifi/mld/mld.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/intel/iwlwifi/mld/mld.c b/drivers/net/wireless/intel/iwlwifi/mld/mld.c index e8820e7cf8fa..1774bb84dd3f 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/mld.c +++ b/drivers/net/wireless/intel/iwlwifi/mld/mld.c @@ -77,6 +77,7 @@ void iwl_construct_mld(struct iwl_mld *mld, struct iwl_trans *trans, /* Setup async RX handling */ spin_lock_init(&mld->async_handlers_lock); + INIT_LIST_HEAD(&mld->async_handlers_list); wiphy_work_init(&mld->async_handlers_wk, iwl_mld_async_handlers_wk); From d5352b491a3a2628f1a798952a4ae76bde5d42e4 Mon Sep 17 00:00:00 2001 From: Pei Xiao Date: Tue, 27 May 2025 16:03:55 +0800 Subject: [PATCH 216/242] wifi: iwlwifi: cfg: Limit cb_size to valid range on arm64 defconfig build failed with gcc-8: drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c:208:3: include/linux/bitfield.h:195:3: error: call to '__field_overflow' declared with attribute error: value doesn't fit into mask __field_overflow(); \ ^~~~~~~~~~~~~~~~~~ include/linux/bitfield.h:215:2: note: in expansion of macro '____MAKE_OP' ____MAKE_OP(u##size,u##size,,) ^~~~~~~~~~~ include/linux/bitfield.h:218:1: note: in expansion of macro '__MAKE_OP' __MAKE_OP(32) Limit cb_size to valid range to fix it. Signed-off-by: Pei Xiao Link: https://patch.msgid.link/7b373a4426070d50b5afb3269fd116c18ce3aea8.1748332709.git.xiaopei01@kylinos.cn Signed-off-by: Miri Korenblit --- drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c b/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c index cb36baac14da..4f2be0c1bd97 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c @@ -166,7 +166,7 @@ int iwl_pcie_ctxt_info_init(struct iwl_trans *trans, struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); struct iwl_context_info *ctxt_info; struct iwl_context_info_rbd_cfg *rx_cfg; - u32 control_flags = 0, rb_size; + u32 control_flags = 0, rb_size, cb_size; dma_addr_t phys; int ret; @@ -202,11 +202,12 @@ int iwl_pcie_ctxt_info_init(struct iwl_trans *trans, rb_size = IWL_CTXT_INFO_RB_SIZE_4K; } - WARN_ON(RX_QUEUE_CB_SIZE(iwl_trans_get_num_rbds(trans)) > 12); + cb_size = RX_QUEUE_CB_SIZE(iwl_trans_get_num_rbds(trans)); + if (WARN_ON(cb_size > 12)) + cb_size = 12; + control_flags = IWL_CTXT_INFO_TFD_FORMAT_LONG; - control_flags |= - u32_encode_bits(RX_QUEUE_CB_SIZE(iwl_trans_get_num_rbds(trans)), - IWL_CTXT_INFO_RB_CB_SIZE); + control_flags |= u32_encode_bits(cb_size, IWL_CTXT_INFO_RB_CB_SIZE); control_flags |= u32_encode_bits(rb_size, IWL_CTXT_INFO_RB_SIZE); ctxt_info->control.control_flags = cpu_to_le32(control_flags); From 432a41232ca932ecb2330f46105e68e296ba8c7f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 16 Jun 2025 13:48:36 +0200 Subject: [PATCH 217/242] wifi: iwlwifi: dvm: restore n_no_reclaim_cmds setting Apparently I accidentally removed this setting in my transport configuration rework, leading to an endless stream of warnings from the PCIe code when relevant notifications are received by the driver from firmware. Restore it. Reported-by: Woody Suwalski Closes: https://lore.kernel.org/r/e8c45d32-6129-8a5e-af80-ccc42aaf200b@gmail.com/ Fixes: 08e77d5edf70 ("wifi: iwlwifi: rework transport configuration") Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20250616134902.222342908ca4.I47a551c86cbc0e9de4f980ca2fd0d67bf4052e50@changeid Signed-off-by: Miri Korenblit --- drivers/net/wireless/intel/iwlwifi/dvm/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/main.c b/drivers/net/wireless/intel/iwlwifi/dvm/main.c index dbfd45948e8b..66211426aa3a 100644 --- a/drivers/net/wireless/intel/iwlwifi/dvm/main.c +++ b/drivers/net/wireless/intel/iwlwifi/dvm/main.c @@ -1316,6 +1316,7 @@ static struct iwl_op_mode *iwl_op_mode_dvm_start(struct iwl_trans *trans, sizeof(trans->conf.no_reclaim_cmds)); memcpy(trans->conf.no_reclaim_cmds, no_reclaim_cmds, sizeof(no_reclaim_cmds)); + trans->conf.n_no_reclaim_cmds = ARRAY_SIZE(no_reclaim_cmds); switch (iwlwifi_mod_params.amsdu_size) { case IWL_AMSDU_DEF: From 83f3ac2848b46e3e5af5d06b5f176c17e35733a3 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 22 May 2025 13:17:03 +0100 Subject: [PATCH 218/242] wifi: iwlwifi: Fix incorrect logic on cmd_ver range checking The current cmd_ver range checking on cmd_ver < 1 && cmd_ver > 3 can never be true because the logical operator && is being used, cmd_ver can never be less than 1 and also greater than 3. Fix this by using the logical || operator. Fixes: df6146a0296e ("wifi: iwlwifi: Add a new version for mac config command") Signed-off-by: Colin Ian King Link: https://patch.msgid.link/20250522121703.2766764-1-colin.i.king@gmail.com Signed-off-by: Miri Korenblit --- drivers/net/wireless/intel/iwlwifi/mvm/mld-mac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac.c b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac.c index 81ca9ff67be9..3c255ae916c8 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac.c @@ -34,7 +34,7 @@ static void iwl_mvm_mld_mac_ctxt_cmd_common(struct iwl_mvm *mvm, WIDE_ID(MAC_CONF_GROUP, MAC_CONFIG_CMD), 0); - if (WARN_ON(cmd_ver < 1 && cmd_ver > 3)) + if (WARN_ON(cmd_ver < 1 || cmd_ver > 3)) return; cmd->id_and_color = cpu_to_le32(mvmvif->id); From d0fa59897e049e84432600e86df82aab3dce7aa5 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 13 Jun 2025 15:30:56 -0400 Subject: [PATCH 219/242] tcp: fix tcp_packet_delayed() for tcp_is_non_sack_preventing_reopen() behavior After the following commit from 2024: commit e37ab7373696 ("tcp: fix to allow timestamp undo if no retransmits were sent") ...there was buggy behavior where TCP connections without SACK support could easily see erroneous undo events at the end of fast recovery or RTO recovery episodes. The erroneous undo events could cause those connections to suffer repeated loss recovery episodes and high retransmit rates. The problem was an interaction between the non-SACK behavior on these connections and the undo logic. The problem is that, for non-SACK connections at the end of a loss recovery episode, if snd_una == high_seq, then tcp_is_non_sack_preventing_reopen() holds steady in CA_Recovery or CA_Loss, but clears tp->retrans_stamp to 0. Then upon the next ACK the "tcp: fix to allow timestamp undo if no retransmits were sent" logic saw the tp->retrans_stamp at 0 and erroneously concluded that no data was retransmitted, and erroneously performed an undo of the cwnd reduction, restoring cwnd immediately to the value it had before loss recovery. This caused an immediate burst of traffic and build-up of queues and likely another immediate loss recovery episode. This commit fixes tcp_packet_delayed() to ignore zero retrans_stamp values for non-SACK connections when snd_una is at or above high_seq, because tcp_is_non_sack_preventing_reopen() clears retrans_stamp in this case, so it's not a valid signal that we can undo. Note that the commit named in the Fixes footer restored long-present behavior from roughly 2005-2019, so apparently this bug was present for a while during that era, and this was simply not caught. Fixes: e37ab7373696 ("tcp: fix to allow timestamp undo if no retransmits were sent") Reported-by: Eric Wheeler Closes: https://lore.kernel.org/netdev/64ea9333-e7f9-0df-b0f2-8d566143acab@ewheeler.net/ Signed-off-by: Neal Cardwell Co-developed-by: Yuchung Cheng Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8ec92dec321a..12c2e6fc85c6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2479,20 +2479,33 @@ static inline bool tcp_packet_delayed(const struct tcp_sock *tp) { const struct sock *sk = (const struct sock *)tp; - if (tp->retrans_stamp && - tcp_tsopt_ecr_before(tp, tp->retrans_stamp)) - return true; /* got echoed TS before first retransmission */ + /* Received an echoed timestamp before the first retransmission? */ + if (tp->retrans_stamp) + return tcp_tsopt_ecr_before(tp, tp->retrans_stamp); - /* Check if nothing was retransmitted (retrans_stamp==0), which may - * happen in fast recovery due to TSQ. But we ignore zero retrans_stamp - * in TCP_SYN_SENT, since when we set FLAG_SYN_ACKED we also clear - * retrans_stamp even if we had retransmitted the SYN. + /* We set tp->retrans_stamp upon the first retransmission of a loss + * recovery episode, so normally if tp->retrans_stamp is 0 then no + * retransmission has happened yet (likely due to TSQ, which can cause + * fast retransmits to be delayed). So if snd_una advanced while + * (tp->retrans_stamp is 0 then apparently a packet was merely delayed, + * not lost. But there are exceptions where we retransmit but then + * clear tp->retrans_stamp, so we check for those exceptions. */ - if (!tp->retrans_stamp && /* no record of a retransmit/SYN? */ - sk->sk_state != TCP_SYN_SENT) /* not the FLAG_SYN_ACKED case? */ - return true; /* nothing was retransmitted */ - return false; + /* (1) For non-SACK connections, tcp_is_non_sack_preventing_reopen() + * clears tp->retrans_stamp when snd_una == high_seq. + */ + if (!tcp_is_sack(tp) && !before(tp->snd_una, tp->high_seq)) + return false; + + /* (2) In TCP_SYN_SENT tcp_clean_rtx_queue() clears tp->retrans_stamp + * when setting FLAG_SYN_ACKED is set, even if the SYN was + * retransmitted. + */ + if (sk->sk_state == TCP_SYN_SENT) + return false; + + return true; /* tp->retrans_stamp is zero; no retransmit yet */ } /* Undo procedures. */ From 327e28664307d49ce3fa71ba30dcc0007c270974 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 18 Jun 2025 07:38:01 -0400 Subject: [PATCH 220/242] fgraph: Do not enable function_graph tracer when setting funcgraph-args When setting the funcgraph-args option when function graph tracer is net enabled, it incorrectly enables it. Worse, it unregisters itself when it was never registered. Then when it gets enabled again, it will register itself a second time causing a WARNing. ~# echo 1 > /sys/kernel/tracing/options/funcgraph-args ~# head -20 /sys/kernel/tracing/trace # tracer: nop # # entries-in-buffer/entries-written: 813/26317372 #P:8 # # _-----=> irqs-off/BH-disabled # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / _-=> migrate-disable # |||| / delay # TASK-PID CPU# ||||| TIMESTAMP FUNCTION # | | | ||||| | | -0 [007] d..4. 358.966010: 7) 1.692 us | fetch_next_timer_interrupt(basej=4294981640, basem=357956000000, base_local=0xffff88823c3ae040, base_global=0xffff88823c3af300, tevt=0xffff888100e47cb8); -0 [007] d..4. 358.966012: 7) | tmigr_cpu_deactivate(nextexp=357988000000) { -0 [007] d..4. 358.966013: 7) | _raw_spin_lock(lock=0xffff88823c3b2320) { -0 [007] d..4. 358.966014: 7) 0.981 us | preempt_count_add(val=1); -0 [007] d..5. 358.966017: 7) 1.058 us | do_raw_spin_lock(lock=0xffff88823c3b2320); -0 [007] d..4. 358.966019: 7) 5.824 us | } -0 [007] d..5. 358.966021: 7) | tmigr_inactive_up(group=0xffff888100cb9000, child=0x0, data=0xffff888100e47bc0) { -0 [007] d..5. 358.966022: 7) | tmigr_update_events(group=0xffff888100cb9000, child=0x0, data=0xffff888100e47bc0) { Notice the "tracer: nop" at the top there. The current tracer is the "nop" tracer, but the content is obviously the function graph tracer. Enabling function graph tracing will cause it to register again and trigger a warning in the accounting: ~# echo function_graph > /sys/kernel/tracing/current_tracer -bash: echo: write error: Device or resource busy With the dmesg of: ------------[ cut here ]------------ WARNING: CPU: 7 PID: 1095 at kernel/trace/ftrace.c:3509 ftrace_startup_subops+0xc1e/0x1000 Modules linked in: kvm_intel kvm irqbypass CPU: 7 UID: 0 PID: 1095 Comm: bash Not tainted 6.16.0-rc2-test-00006-gea03de4105d3 #24 PREEMPT Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:ftrace_startup_subops+0xc1e/0x1000 Code: 48 b8 22 01 00 00 00 00 ad de 49 89 84 24 88 01 00 00 8b 44 24 08 89 04 24 e9 c3 f7 ff ff c7 04 24 ed ff ff ff e9 b7 f7 ff ff <0f> 0b c7 04 24 f0 ff ff ff e9 a9 f7 ff ff c7 04 24 f4 ff ff ff e9 RSP: 0018:ffff888133cff948 EFLAGS: 00010202 RAX: 0000000000000001 RBX: 1ffff1102679ff31 RCX: 0000000000000000 RDX: 1ffffffff0b27a60 RSI: ffffffff8593d2f0 RDI: ffffffff85941140 RBP: 00000000000c2041 R08: ffffffffffffffff R09: ffffed1020240221 R10: ffff88810120110f R11: ffffed1020240214 R12: ffffffff8593d2f0 R13: ffffffff8593d300 R14: ffffffff85941140 R15: ffffffff85631100 FS: 00007f7ec6f28740(0000) GS:ffff8882b5251000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7ec6f181c0 CR3: 000000012f1d0005 CR4: 0000000000172ef0 Call Trace: ? __pfx_ftrace_startup_subops+0x10/0x10 ? find_held_lock+0x2b/0x80 ? ftrace_stub_direct_tramp+0x10/0x10 ? ftrace_stub_direct_tramp+0x10/0x10 ? trace_preempt_on+0xd0/0x110 ? __pfx_trace_graph_entry_args+0x10/0x10 register_ftrace_graph+0x4d2/0x1020 ? tracing_reset_online_cpus+0x14b/0x1e0 ? __pfx_register_ftrace_graph+0x10/0x10 ? ring_buffer_record_enable+0x16/0x20 ? tracing_reset_online_cpus+0x153/0x1e0 ? __pfx_tracing_reset_online_cpus+0x10/0x10 ? __pfx_trace_graph_return+0x10/0x10 graph_trace_init+0xfd/0x160 tracing_set_tracer+0x500/0xa80 ? __pfx_tracing_set_tracer+0x10/0x10 ? lock_release+0x181/0x2d0 ? _copy_from_user+0x26/0xa0 tracing_set_trace_write+0x132/0x1e0 ? __pfx_tracing_set_trace_write+0x10/0x10 ? ftrace_graph_func+0xcc/0x140 ? ftrace_stub_direct_tramp+0x10/0x10 ? ftrace_stub_direct_tramp+0x10/0x10 ? ftrace_stub_direct_tramp+0x10/0x10 vfs_write+0x1d0/0xe90 ? __pfx_vfs_write+0x10/0x10 Have the setting of the funcgraph-args check if function_graph tracer is the current tracer of the instance, and if not, do nothing, as there's nothing to do (the option is checked when function_graph tracing starts). Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Cc: Mark Rutland Link: https://lore.kernel.org/20250618073801.057ea636@gandalf.local.home Fixes: c7a60a733c373 ("ftrace: Have funcgraph-args take affect during tracing") Closes: https://lore.kernel.org/all/4ab1a7bdd0174ab09c7b0d68cdbff9a4@huawei.com/ Reported-by: Changbin Du Tested-by: Changbin Du Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 9234e2c39abf..14d74a7491b8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -455,10 +455,16 @@ static int graph_trace_init(struct trace_array *tr) return 0; } +static struct tracer graph_trace; + static int ftrace_graph_trace_args(struct trace_array *tr, int set) { trace_func_graph_ent_t entry; + /* Do nothing if the current tracer is not this tracer */ + if (tr->current_trace != &graph_trace) + return 0; + if (set) entry = trace_graph_entry_args; else From eab9dcb76b9fca47402c9e93afca243e745a0f02 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Sat, 14 Jun 2025 20:59:24 +0530 Subject: [PATCH 221/242] Documentation: embargoed-hardware-issues.rst: Add myself for Power Adding myself as the contact for Power Signed-off-by: Madhavan Srinivasan Link: https://lore.kernel.org/r/20250614152925.82831-1-maddy@linux.ibm.com Signed-off-by: Greg Kroah-Hartman --- Documentation/process/embargoed-hardware-issues.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst index da6bf0f6d01e..34e00848e0da 100644 --- a/Documentation/process/embargoed-hardware-issues.rst +++ b/Documentation/process/embargoed-hardware-issues.rst @@ -290,6 +290,7 @@ an involved disclosed party. The current ambassadors list: AMD Tom Lendacky Ampere Darren Hart ARM Catalin Marinas + IBM Power Madhavan Srinivasan IBM Z Christian Borntraeger Intel Tony Luck Qualcomm Trilok Soni From 37fb58a7273726e59f9429c89ade5116083a213d Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Wed, 18 Jun 2025 07:32:17 +0000 Subject: [PATCH 222/242] cgroup,freezer: fix incomplete freezing when attaching tasks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An issue was found: # cd /sys/fs/cgroup/freezer/ # mkdir test # echo FROZEN > test/freezer.state # cat test/freezer.state FROZEN # sleep 1000 & [1] 863 # echo 863 > test/cgroup.procs # cat test/freezer.state FREEZING When tasks are migrated to a frozen cgroup, the freezer fails to immediately freeze the tasks, causing the cgroup to remain in the "FREEZING". The freeze_task() function is called before clearing the CGROUP_FROZEN flag. This causes the freezing() check to incorrectly return false, preventing __freeze_task() from being invoked for the migrated task. To fix this issue, clear the CGROUP_FROZEN state before calling freeze_task(). Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") Cc: stable@vger.kernel.org # v6.1+ Reported-by: Zhong Jiawei Signed-off-by: Chen Ridong Acked-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/legacy_freezer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 039d1eb2f215..507b8f19a262 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -188,13 +188,12 @@ static void freezer_attach(struct cgroup_taskset *tset) if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { - freeze_task(task); - /* clear FROZEN and propagate upwards */ while (freezer && (freezer->state & CGROUP_FROZEN)) { freezer->state &= ~CGROUP_FROZEN; freezer = parent_freezer(freezer); } + freeze_task(task); } } From c6d732c38f93c4aebd204a5656583142289c3a2e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jun 2025 13:22:40 -0700 Subject: [PATCH 223/242] net: ethtool: remove duplicate defines for family info Commit under fixes switched to uAPI generation from the YAML spec. A number of custom defines were left behind, mostly for commands very hard to express in YAML spec. Among what was left behind was the name and version of the generic netlink family. Problem is that the codegen always outputs those values so we ended up with a duplicated, differently named set of defines. Provide naming info in YAML and remove the incorrect defines. Fixes: 8d0580c6ebdd ("ethtool: regenerate uapi header from the spec") Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250617202240.811179-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 3 +++ include/uapi/linux/ethtool_netlink.h | 4 ---- include/uapi/linux/ethtool_netlink_generated.h | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 9f98715a6512..72a076b0e1b5 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -7,6 +7,9 @@ protocol: genetlink-legacy doc: Partial family for Ethtool Netlink. uapi-header: linux/ethtool_netlink_generated.h +c-family-name: ethtool-genl-name +c-version-name: ethtool-genl-version + definitions: - name: udp-tunnel-type diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 9ff72cfb2e98..09a75bdb6560 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -208,10 +208,6 @@ enum { ETHTOOL_A_STATS_PHY_MAX = (__ETHTOOL_A_STATS_PHY_CNT - 1) }; -/* generic netlink info */ -#define ETHTOOL_GENL_NAME "ethtool" -#define ETHTOOL_GENL_VERSION 1 - #define ETHTOOL_MCGRP_MONITOR_NAME "monitor" #endif /* _UAPI_LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 9a02f579de22..aa8ab5227c1e 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -6,8 +6,8 @@ #ifndef _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H #define _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H -#define ETHTOOL_FAMILY_NAME "ethtool" -#define ETHTOOL_FAMILY_VERSION 1 +#define ETHTOOL_GENL_NAME "ethtool" +#define ETHTOOL_GENL_VERSION 1 enum { ETHTOOL_UDP_TUNNEL_TYPE_VXLAN, From ae409629e022fbebbc6d31a1bfeccdbbeee20fd6 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 17 Jun 2025 20:20:17 +0200 Subject: [PATCH 224/242] net: ftgmac100: select FIXED_PHY Depending on e.g. DT configuration this driver uses a fixed link. So we shouldn't rely on the user to enable FIXED_PHY, select it in Kconfig instead. We may end up with a non-functional driver otherwise. Fixes: 38561ded50d0 ("net: ftgmac100: support fixed link") Cc: stable@vger.kernel.org Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/476bb33b-5584-40f0-826a-7294980f2895@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/faraday/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/faraday/Kconfig b/drivers/net/ethernet/faraday/Kconfig index c699bd6bcbb9..474073c7f94d 100644 --- a/drivers/net/ethernet/faraday/Kconfig +++ b/drivers/net/ethernet/faraday/Kconfig @@ -31,6 +31,7 @@ config FTGMAC100 depends on ARM || COMPILE_TEST depends on !64BIT || BROKEN select PHYLIB + select FIXED_PHY select MDIO_ASPEED if MACH_ASPEED_G6 select CRC32 help From 9ac8d0c640a161486eaf71d1999ee990fad62947 Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Tue, 17 Jun 2025 12:04:02 +0530 Subject: [PATCH 225/242] Octeontx2-pf: Fix Backpresure configuration NIX block can receive packets from multiple links such as MAC (RPM), LBK and CPT. ----------------- RPM --| NIX | ----------------- | | LBK Each link supports multiple channels for example RPM link supports 16 channels. In case of link oversubsribe, NIX will assert backpressure on receive channels. The previous patch considered a single channel per link, resulting in backpressure not being enabled on the remaining channels Fixes: a7ef63dbd588 ("octeontx2-af: Disable backpressure between CPT and NIX") Signed-off-by: Hariprasad Kelam Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250617063403.3582210-1-hkelam@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index 6f572589f1e5..6b5c9536d26d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -1822,7 +1822,7 @@ int otx2_nix_config_bp(struct otx2_nic *pfvf, bool enable) req->chan_cnt = IEEE_8021QAZ_MAX_TCS; req->bpid_per_chan = 1; } else { - req->chan_cnt = 1; + req->chan_cnt = pfvf->hw.rx_chan_cnt; req->bpid_per_chan = 0; } @@ -1847,7 +1847,7 @@ int otx2_nix_cpt_config_bp(struct otx2_nic *pfvf, bool enable) req->chan_cnt = IEEE_8021QAZ_MAX_TCS; req->bpid_per_chan = 1; } else { - req->chan_cnt = 1; + req->chan_cnt = pfvf->hw.rx_chan_cnt; req->bpid_per_chan = 0; } From f82727adcf2992822e12198792af450a76ebd5ef Mon Sep 17 00:00:00 2001 From: Haixia Qu Date: Tue, 17 Jun 2025 05:56:24 +0000 Subject: [PATCH 226/242] tipc: fix null-ptr-deref when acquiring remote ip of ethernet bearer The reproduction steps: 1. create a tun interface 2. enable l2 bearer 3. TIPC_NL_UDP_GET_REMOTEIP with media name set to tun tipc: Started in network mode tipc: Node identity 8af312d38a21, cluster identity 4711 tipc: Enabled bearer , priority 1 Oops: general protection fault KASAN: null-ptr-deref in range CPU: 1 UID: 1000 PID: 559 Comm: poc Not tainted 6.16.0-rc1+ #117 PREEMPT Hardware name: QEMU Ubuntu 24.04 PC RIP: 0010:tipc_udp_nl_dump_remoteip+0x4a4/0x8f0 the ub was in fact a struct dev. when bid != 0 && skip_cnt != 0, bearer_list[bid] may be NULL or other media when other thread changes it. fix this by checking media_id. Fixes: 832629ca5c313 ("tipc: add UDP remoteip dump to netlink API") Signed-off-by: Haixia Qu Reviewed-by: Tung Nguyen Link: https://patch.msgid.link/20250617055624.2680-1-hxqu@hillstonenet.com Signed-off-by: Jakub Kicinski --- net/tipc/udp_media.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 108a4cc2e001..258d6aa4f21a 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -489,7 +489,7 @@ int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) rtnl_lock(); b = tipc_bearer_find(net, bname); - if (!b) { + if (!b || b->bcast_addr.media_id != TIPC_MEDIA_TYPE_UDP) { rtnl_unlock(); return -EINVAL; } @@ -500,7 +500,7 @@ int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) rtnl_lock(); b = rtnl_dereference(tn->bearer_list[bid]); - if (!b) { + if (!b || b->bcast_addr.media_id != TIPC_MEDIA_TYPE_UDP) { rtnl_unlock(); return -EINVAL; } From 3168276591210d147ed9e6dc267f8a04007593c7 Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 17 Jun 2025 14:20:59 -0700 Subject: [PATCH 227/242] selftests: netdevsim: improve lib.sh include in peer.sh Fix the peer.sh test to run from INSTALL_PATH. Signed-off-by: David Wei Link: https://patch.msgid.link/20250617212102.175711-2-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/netdevsim/peer.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/netdevsim/peer.sh b/tools/testing/selftests/drivers/net/netdevsim/peer.sh index 1bb46ec435d4..7f32b5600925 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/peer.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/peer.sh @@ -1,7 +1,8 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0-only -source ../../../net/lib.sh +lib_dir=$(dirname $0)/../../../net +source $lib_dir/lib.sh NSIM_DEV_1_ID=$((256 + RANDOM % 256)) NSIM_DEV_1_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_DEV_1_ID From c65b5bb2329e36340eba76f05752f8f1eb15bee0 Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 17 Jun 2025 14:21:00 -0700 Subject: [PATCH 228/242] selftests: net: add passive TFO test binary Add a simple passive TFO server and client test binary. This will be used to test the SO_INCOMING_NAPI_ID of passive TFO accepted sockets. Signed-off-by: David Wei Link: https://patch.msgid.link/20250617212102.175711-3-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/tfo.c | 171 +++++++++++++++++++++++++ 3 files changed, 173 insertions(+) create mode 100644 tools/testing/selftests/net/tfo.c diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 532bb732bc6d..c6dd2a335cf4 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -50,6 +50,7 @@ tap tcp_fastopen_backup_key tcp_inq tcp_mmap +tfo timestamping tls toeplitz diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index ab996bd22a5f..ab8da438fd78 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -110,6 +110,7 @@ TEST_GEN_PROGS += proc_net_pktgen TEST_PROGS += lwt_dst_cache_ref_loop.sh TEST_PROGS += skf_net_off.sh TEST_GEN_FILES += skf_net_off +TEST_GEN_FILES += tfo # YNL files, must be before "include ..lib.mk" YNL_GEN_FILES := busy_poller netlink-dumps diff --git a/tools/testing/selftests/net/tfo.c b/tools/testing/selftests/net/tfo.c new file mode 100644 index 000000000000..eb3cac5e583c --- /dev/null +++ b/tools/testing/selftests/net/tfo.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int cfg_server; +static int cfg_client; +static int cfg_port = 8000; +static struct sockaddr_in6 cfg_addr; +static char *cfg_outfile; + +static int parse_address(const char *str, int port, struct sockaddr_in6 *sin6) +{ + int ret; + + sin6->sin6_family = AF_INET6; + sin6->sin6_port = htons(port); + + ret = inet_pton(sin6->sin6_family, str, &sin6->sin6_addr); + if (ret != 1) { + /* fallback to plain IPv4 */ + ret = inet_pton(AF_INET, str, &sin6->sin6_addr.s6_addr32[3]); + if (ret != 1) + return -1; + + /* add ::ffff prefix */ + sin6->sin6_addr.s6_addr32[0] = 0; + sin6->sin6_addr.s6_addr32[1] = 0; + sin6->sin6_addr.s6_addr16[4] = 0; + sin6->sin6_addr.s6_addr16[5] = 0xffff; + } + + return 0; +} + +static void run_server(void) +{ + unsigned long qlen = 32; + int fd, opt, connfd; + socklen_t len; + char buf[64]; + FILE *outfile; + + outfile = fopen(cfg_outfile, "w"); + if (!outfile) + error(1, errno, "fopen() outfile"); + + fd = socket(AF_INET6, SOCK_STREAM, 0); + if (fd == -1) + error(1, errno, "socket()"); + + opt = 1; + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) + error(1, errno, "setsockopt(SO_REUSEADDR)"); + + if (setsockopt(fd, SOL_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0) + error(1, errno, "setsockopt(TCP_FASTOPEN)"); + + if (bind(fd, (struct sockaddr *)&cfg_addr, sizeof(cfg_addr)) < 0) + error(1, errno, "bind()"); + + if (listen(fd, 5) < 0) + error(1, errno, "listen()"); + + len = sizeof(cfg_addr); + connfd = accept(fd, (struct sockaddr *)&cfg_addr, &len); + if (connfd < 0) + error(1, errno, "accept()"); + + len = sizeof(opt); + if (getsockopt(connfd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &opt, &len) < 0) + error(1, errno, "getsockopt(SO_INCOMING_NAPI_ID)"); + + read(connfd, buf, 64); + fprintf(outfile, "%d\n", opt); + + fclose(outfile); + close(connfd); + close(fd); +} + +static void run_client(void) +{ + int fd; + char *msg = "Hello, world!"; + + fd = socket(AF_INET6, SOCK_STREAM, 0); + if (fd == -1) + error(1, errno, "socket()"); + + sendto(fd, msg, strlen(msg), MSG_FASTOPEN, (struct sockaddr *)&cfg_addr, sizeof(cfg_addr)); + + close(fd); +} + +static void usage(const char *filepath) +{ + error(1, 0, "Usage: %s (-s|-c) -h -p -o ", filepath); +} + +static void parse_opts(int argc, char **argv) +{ + struct sockaddr_in6 *addr6 = (void *) &cfg_addr; + char *addr = NULL; + int ret; + int c; + + if (argc <= 1) + usage(argv[0]); + + while ((c = getopt(argc, argv, "sch:p:o:")) != -1) { + switch (c) { + case 's': + if (cfg_client) + error(1, 0, "Pass one of -s or -c"); + cfg_server = 1; + break; + case 'c': + if (cfg_server) + error(1, 0, "Pass one of -s or -c"); + cfg_client = 1; + break; + case 'h': + addr = optarg; + break; + case 'p': + cfg_port = strtoul(optarg, NULL, 0); + break; + case 'o': + cfg_outfile = strdup(optarg); + if (!cfg_outfile) + error(1, 0, "outfile invalid"); + break; + } + } + + if (cfg_server && addr) + error(1, 0, "Server cannot have -h specified"); + + memset(addr6, 0, sizeof(*addr6)); + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(cfg_port); + addr6->sin6_addr = in6addr_any; + if (addr) { + ret = parse_address(addr, cfg_port, addr6); + if (ret) + error(1, 0, "Client address parse error: %s", addr); + } +} + +int main(int argc, char **argv) +{ + parse_opts(argc, argv); + + if (cfg_server) + run_server(); + else if (cfg_client) + run_client(); + + return 0; +} From 137e7b5cceda2fd634ff47fde6c4226092d09442 Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 17 Jun 2025 14:21:01 -0700 Subject: [PATCH 229/242] selftests: net: add test for passive TFO socket NAPI ID Add a test that checks that the NAPI ID of a passive TFO socket is valid i.e. not zero. Signed-off-by: David Wei Link: https://patch.msgid.link/20250617212102.175711-4-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/tfo_passive.sh | 112 +++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100755 tools/testing/selftests/net/tfo_passive.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index ab8da438fd78..332f387615d7 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -111,6 +111,7 @@ TEST_PROGS += lwt_dst_cache_ref_loop.sh TEST_PROGS += skf_net_off.sh TEST_GEN_FILES += skf_net_off TEST_GEN_FILES += tfo +TEST_PROGS += tfo_passive.sh # YNL files, must be before "include ..lib.mk" YNL_GEN_FILES := busy_poller netlink-dumps diff --git a/tools/testing/selftests/net/tfo_passive.sh b/tools/testing/selftests/net/tfo_passive.sh new file mode 100755 index 000000000000..80bf11fdc046 --- /dev/null +++ b/tools/testing/selftests/net/tfo_passive.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +source lib.sh + +NSIM_SV_ID=$((256 + RANDOM % 256)) +NSIM_SV_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_SV_ID +NSIM_CL_ID=$((512 + RANDOM % 256)) +NSIM_CL_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_CL_ID + +NSIM_DEV_SYS_NEW=/sys/bus/netdevsim/new_device +NSIM_DEV_SYS_DEL=/sys/bus/netdevsim/del_device +NSIM_DEV_SYS_LINK=/sys/bus/netdevsim/link_device +NSIM_DEV_SYS_UNLINK=/sys/bus/netdevsim/unlink_device + +SERVER_IP=192.168.1.1 +CLIENT_IP=192.168.1.2 +SERVER_PORT=48675 + +setup_ns() +{ + set -e + ip netns add nssv + ip netns add nscl + + NSIM_SV_NAME=$(find $NSIM_SV_SYS/net -maxdepth 1 -type d ! \ + -path $NSIM_SV_SYS/net -exec basename {} \;) + NSIM_CL_NAME=$(find $NSIM_CL_SYS/net -maxdepth 1 -type d ! \ + -path $NSIM_CL_SYS/net -exec basename {} \;) + + ip link set $NSIM_SV_NAME netns nssv + ip link set $NSIM_CL_NAME netns nscl + + ip netns exec nssv ip addr add "${SERVER_IP}/24" dev $NSIM_SV_NAME + ip netns exec nscl ip addr add "${CLIENT_IP}/24" dev $NSIM_CL_NAME + + ip netns exec nssv ip link set dev $NSIM_SV_NAME up + ip netns exec nscl ip link set dev $NSIM_CL_NAME up + + # Enable passive TFO + ip netns exec nssv sysctl -w net.ipv4.tcp_fastopen=519 > /dev/null + + set +e +} + +cleanup_ns() +{ + ip netns del nscl + ip netns del nssv +} + +### +### Code start +### + +modprobe netdevsim + +# linking + +echo $NSIM_SV_ID > $NSIM_DEV_SYS_NEW +echo $NSIM_CL_ID > $NSIM_DEV_SYS_NEW +udevadm settle + +setup_ns + +NSIM_SV_FD=$((256 + RANDOM % 256)) +exec {NSIM_SV_FD} \ + $NSIM_DEV_SYS_LINK + +if [ $? -ne 0 ]; then + echo "linking netdevsim1 with netdevsim2 should succeed" + cleanup_ns + exit 1 +fi + +out_file=$(mktemp) + +timeout -k 1s 30s ip netns exec nssv ./tfo \ + -s \ + -p ${SERVER_PORT} \ + -o ${out_file}& + +wait_local_port_listen nssv ${SERVER_PORT} tcp + +ip netns exec nscl ./tfo -c -h ${SERVER_IP} -p ${SERVER_PORT} + +wait + +res=$(cat $out_file) +rm $out_file + +if [ $res -eq 0 ]; then + echo "got invalid NAPI ID from passive TFO socket" + cleanup_ns + exit 1 +fi + +echo "$NSIM_SV_FD:$NSIM_SV_IFIDX" > $NSIM_DEV_SYS_UNLINK + +echo $NSIM_CL_ID > $NSIM_DEV_SYS_DEL + +cleanup_ns + +modprobe -r netdevsim + +exit 0 From dbe0ca8da1f62b6252e7be6337209f4d86d4a914 Mon Sep 17 00:00:00 2001 From: David Wei Date: Tue, 17 Jun 2025 14:21:02 -0700 Subject: [PATCH 230/242] tcp: fix passive TFO socket having invalid NAPI ID There is a bug with passive TFO sockets returning an invalid NAPI ID 0 from SO_INCOMING_NAPI_ID. Normally this is not an issue, but zero copy receive relies on a correct NAPI ID to process sockets on the right queue. Fix by adding a sk_mark_napi_id_set(). Fixes: e5907459ce7e ("tcp: Record Rx hash and NAPI ID in tcp_child_process") Signed-off-by: David Wei Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250617212102.175711-5-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_fastopen.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 9b83d639b5ac..5107121c5e37 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -3,6 +3,7 @@ #include #include #include +#include void tcp_fastopen_init_key_once(struct net *net) { @@ -279,6 +280,8 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, refcount_set(&req->rsk_refcnt, 2); + sk_mark_napi_id_set(child, skb); + /* Now finish processing the fastopen child socket. */ tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb); From 5bd1bafd4474ee26f504b41aba11f3e2a1175b88 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Jun 2025 12:55:10 -0700 Subject: [PATCH 231/242] eth: fbnic: avoid double free when failing to DMA-map FW msg The semantics are that caller of fbnic_mbx_map_msg() retains the ownership of the message on error. All existing callers dutifully free the page. Fixes: da3cde08209e ("eth: fbnic: Add FW communication mechanism") Reviewed-by: Alexander Duyck Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250616195510.225819-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/meta/fbnic/fbnic_fw.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c index e2368075ab8c..4521d0483d18 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c @@ -127,11 +127,8 @@ static int fbnic_mbx_map_msg(struct fbnic_dev *fbd, int mbx_idx, return -EBUSY; addr = dma_map_single(fbd->dev, msg, PAGE_SIZE, direction); - if (dma_mapping_error(fbd->dev, addr)) { - free_page((unsigned long)msg); - + if (dma_mapping_error(fbd->dev, addr)) return -ENOSPC; - } mbx->buf_info[tail].msg = msg; mbx->buf_info[tail].addr = addr; From e353b0854d3a1a31cb061df8d022fbfea53a0f24 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Mon, 16 Jun 2025 11:37:43 +0000 Subject: [PATCH 232/242] net: lan743x: fix potential out-of-bounds write in lan743x_ptp_io_event_clock_get() Before calling lan743x_ptp_io_event_clock_get(), the 'channel' value is checked against the maximum value of PCI11X1X_PTP_IO_MAX_CHANNELS(8). This seems correct and aligns with the PTP interrupt status register (PTP_INT_STS) specifications. However, lan743x_ptp_io_event_clock_get() writes to ptp->extts[] with only LAN743X_PTP_N_EXTTS(4) elements, using channel as an index: lan743x_ptp_io_event_clock_get(..., u8 channel,...) { ... /* Update Local timestamp */ extts = &ptp->extts[channel]; extts->ts.tv_sec = sec; ... } To avoid an out-of-bounds write and utilize all the supported GPIO inputs, set LAN743X_PTP_N_EXTTS to 8. Detected using the static analysis tool - Svace. Fixes: 60942c397af6 ("net: lan743x: Add support for PTP-IO Event Input External Timestamp (extts)") Signed-off-by: Alexey Kodanev Reviewed-by: Jacob Keller Acked-by: Rengarajan S Link: https://patch.msgid.link/20250616113743.36284-1-aleksei.kodanev@bell-sw.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microchip/lan743x_ptp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_ptp.h b/drivers/net/ethernet/microchip/lan743x_ptp.h index e8d073bfa2ca..f33dc83c5700 100644 --- a/drivers/net/ethernet/microchip/lan743x_ptp.h +++ b/drivers/net/ethernet/microchip/lan743x_ptp.h @@ -18,9 +18,9 @@ */ #define LAN743X_PTP_N_EVENT_CHAN 2 #define LAN743X_PTP_N_PEROUT LAN743X_PTP_N_EVENT_CHAN -#define LAN743X_PTP_N_EXTTS 4 -#define LAN743X_PTP_N_PPS 0 #define PCI11X1X_PTP_IO_MAX_CHANNELS 8 +#define LAN743X_PTP_N_EXTTS PCI11X1X_PTP_IO_MAX_CHANNELS +#define LAN743X_PTP_N_PPS 0 #define PTP_CMD_CTL_TIMEOUT_CNT 50 struct lan743x_adapter; From a1113cefd7d62e20735edda79507dc9f6ce103ff Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 16 Jun 2025 15:44:37 -0700 Subject: [PATCH 233/242] MAINTAINERS: Remove Shannon Nelson from MAINTAINERS file Brett Creeley is taking ownership of AMD/Pensando drivers while I wander off into the sunset with my retirement this month. I'll still keep an eye out on a few topics for awhile, and maybe do some free-lance work in the future. Meanwhile, thank you all for the fun and support and the many learning opportunities :-). Special thanks go to DaveM for merging my first patch long ago, the big ionic patchset a few years ago, and my last patchset last week. Redirect things to a non-corporate account. Signed-off-by: Shannon Nelson Reviewed-by: Simon Horman Signed-off-by: Brett Creeley Link: https://patch.msgid.link/20250616224437.56581-1-shannon.nelson@amd.com [Jakub: squash in the .mailmap update] Signed-off-by: Shannon Nelson Link: https://patch.msgid.link/20250619010603.1173141-1-sln@onemain.com Signed-off-by: Jakub Kicinski --- .mailmap | 7 ++++--- MAINTAINERS | 5 +---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/.mailmap b/.mailmap index b77cd34cf852..7a3ffabb3434 100644 --- a/.mailmap +++ b/.mailmap @@ -691,9 +691,10 @@ Serge Hallyn Serge Hallyn Seth Forshee Shakeel Butt -Shannon Nelson -Shannon Nelson -Shannon Nelson +Shannon Nelson +Shannon Nelson +Shannon Nelson +Shannon Nelson Sharath Chandra Vurukala Shiraz Hashim Shuah Khan diff --git a/MAINTAINERS b/MAINTAINERS index 99fbde007792..437381aeaa71 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1157,7 +1157,6 @@ F: arch/x86/include/asm/amd/node.h F: arch/x86/kernel/amd_node.c AMD PDS CORE DRIVER -M: Shannon Nelson M: Brett Creeley L: netdev@vger.kernel.org S: Maintained @@ -9941,7 +9940,6 @@ F: drivers/fwctl/mlx5/ FWCTL PDS DRIVER M: Brett Creeley -R: Shannon Nelson L: linux-kernel@vger.kernel.org S: Maintained F: drivers/fwctl/pds/ @@ -19377,7 +19375,7 @@ F: crypto/pcrypt.c F: include/crypto/pcrypt.h PDS DSC VIRTIO DATA PATH ACCELERATOR -R: Shannon Nelson +R: Brett Creeley F: drivers/vdpa/pds/ PECI HARDWARE MONITORING DRIVERS @@ -19399,7 +19397,6 @@ F: include/linux/peci-cpu.h F: include/linux/peci.h PENSANDO ETHERNET DRIVERS -M: Shannon Nelson M: Brett Creeley L: netdev@vger.kernel.org S: Maintained From 10876da918fa1aec0227fb4c67647513447f53a9 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 17 Jun 2025 15:40:42 -0700 Subject: [PATCH 234/242] calipso: Fix null-ptr-deref in calipso_req_{set,del}attr(). syzkaller reported a null-ptr-deref in sock_omalloc() while allocating a CALIPSO option. [0] The NULL is of struct sock, which was fetched by sk_to_full_sk() in calipso_req_setattr(). Since commit a1a5344ddbe8 ("tcp: avoid two atomic ops for syncookies"), reqsk->rsk_listener could be NULL when SYN Cookie is returned to its client, as hinted by the leading SYN Cookie log. Here are 3 options to fix the bug: 1) Return 0 in calipso_req_setattr() 2) Return an error in calipso_req_setattr() 3) Alaways set rsk_listener 1) is no go as it bypasses LSM, but 2) effectively disables SYN Cookie for CALIPSO. 3) is also no go as there have been many efforts to reduce atomic ops and make TCP robust against DDoS. See also commit 3b24d854cb35 ("tcp/dccp: do not touch listener sk_refcnt under synflood"). As of the blamed commit, SYN Cookie already did not need refcounting, and no one has stumbled on the bug for 9 years, so no CALIPSO user will care about SYN Cookie. Let's return an error in calipso_req_setattr() and calipso_req_delattr() in the SYN Cookie case. This can be reproduced by [1] on Fedora and now connect() of nc times out. [0]: TCP: request_sock_TCPv6: Possible SYN flooding on port [::]:20002. Sending cookies. Oops: general protection fault, probably for non-canonical address 0xdffffc0000000006: 0000 [#1] PREEMPT SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000030-0x0000000000000037] CPU: 3 UID: 0 PID: 12262 Comm: syz.1.2611 Not tainted 6.14.0 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 RIP: 0010:read_pnet include/net/net_namespace.h:406 [inline] RIP: 0010:sock_net include/net/sock.h:655 [inline] RIP: 0010:sock_kmalloc+0x35/0x170 net/core/sock.c:2806 Code: 89 d5 41 54 55 89 f5 53 48 89 fb e8 25 e3 c6 fd e8 f0 91 e3 00 48 8d 7b 30 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 26 01 00 00 48 b8 00 00 00 00 00 fc ff df 4c 8b RSP: 0018:ffff88811af89038 EFLAGS: 00010216 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffff888105266400 RDX: 0000000000000006 RSI: ffff88800c890000 RDI: 0000000000000030 RBP: 0000000000000050 R08: 0000000000000000 R09: ffff88810526640e R10: ffffed1020a4cc81 R11: ffff88810526640f R12: 0000000000000000 R13: 0000000000000820 R14: ffff888105266400 R15: 0000000000000050 FS: 00007f0653a07640(0000) GS:ffff88811af80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f863ba096f4 CR3: 00000000163c0005 CR4: 0000000000770ef0 PKRU: 80000000 Call Trace: ipv6_renew_options+0x279/0x950 net/ipv6/exthdrs.c:1288 calipso_req_setattr+0x181/0x340 net/ipv6/calipso.c:1204 calipso_req_setattr+0x56/0x80 net/netlabel/netlabel_calipso.c:597 netlbl_req_setattr+0x18a/0x440 net/netlabel/netlabel_kapi.c:1249 selinux_netlbl_inet_conn_request+0x1fb/0x320 security/selinux/netlabel.c:342 selinux_inet_conn_request+0x1eb/0x2c0 security/selinux/hooks.c:5551 security_inet_conn_request+0x50/0xa0 security/security.c:4945 tcp_v6_route_req+0x22c/0x550 net/ipv6/tcp_ipv6.c:825 tcp_conn_request+0xec8/0x2b70 net/ipv4/tcp_input.c:7275 tcp_v6_conn_request+0x1e3/0x440 net/ipv6/tcp_ipv6.c:1328 tcp_rcv_state_process+0xafa/0x52b0 net/ipv4/tcp_input.c:6781 tcp_v6_do_rcv+0x8a6/0x1a40 net/ipv6/tcp_ipv6.c:1667 tcp_v6_rcv+0x505e/0x5b50 net/ipv6/tcp_ipv6.c:1904 ip6_protocol_deliver_rcu+0x17c/0x1da0 net/ipv6/ip6_input.c:436 ip6_input_finish+0x103/0x180 net/ipv6/ip6_input.c:480 NF_HOOK include/linux/netfilter.h:314 [inline] NF_HOOK include/linux/netfilter.h:308 [inline] ip6_input+0x13c/0x6b0 net/ipv6/ip6_input.c:491 dst_input include/net/dst.h:469 [inline] ip6_rcv_finish net/ipv6/ip6_input.c:79 [inline] ip6_rcv_finish+0xb6/0x490 net/ipv6/ip6_input.c:69 NF_HOOK include/linux/netfilter.h:314 [inline] NF_HOOK include/linux/netfilter.h:308 [inline] ipv6_rcv+0xf9/0x490 net/ipv6/ip6_input.c:309 __netif_receive_skb_one_core+0x12e/0x1f0 net/core/dev.c:5896 __netif_receive_skb+0x1d/0x170 net/core/dev.c:6009 process_backlog+0x41e/0x13b0 net/core/dev.c:6357 __napi_poll+0xbd/0x710 net/core/dev.c:7191 napi_poll net/core/dev.c:7260 [inline] net_rx_action+0x9de/0xde0 net/core/dev.c:7382 handle_softirqs+0x19a/0x770 kernel/softirq.c:561 do_softirq.part.0+0x36/0x70 kernel/softirq.c:462 do_softirq arch/x86/include/asm/preempt.h:26 [inline] __local_bh_enable_ip+0xf1/0x110 kernel/softirq.c:389 local_bh_enable include/linux/bottom_half.h:33 [inline] rcu_read_unlock_bh include/linux/rcupdate.h:919 [inline] __dev_queue_xmit+0xc2a/0x3c40 net/core/dev.c:4679 dev_queue_xmit include/linux/netdevice.h:3313 [inline] neigh_hh_output include/net/neighbour.h:523 [inline] neigh_output include/net/neighbour.h:537 [inline] ip6_finish_output2+0xd69/0x1f80 net/ipv6/ip6_output.c:141 __ip6_finish_output net/ipv6/ip6_output.c:215 [inline] ip6_finish_output+0x5dc/0xd60 net/ipv6/ip6_output.c:226 NF_HOOK_COND include/linux/netfilter.h:303 [inline] ip6_output+0x24b/0x8d0 net/ipv6/ip6_output.c:247 dst_output include/net/dst.h:459 [inline] NF_HOOK include/linux/netfilter.h:314 [inline] NF_HOOK include/linux/netfilter.h:308 [inline] ip6_xmit+0xbbc/0x20d0 net/ipv6/ip6_output.c:366 inet6_csk_xmit+0x39a/0x720 net/ipv6/inet6_connection_sock.c:135 __tcp_transmit_skb+0x1a7b/0x3b40 net/ipv4/tcp_output.c:1471 tcp_transmit_skb net/ipv4/tcp_output.c:1489 [inline] tcp_send_syn_data net/ipv4/tcp_output.c:4059 [inline] tcp_connect+0x1c0c/0x4510 net/ipv4/tcp_output.c:4148 tcp_v6_connect+0x156c/0x2080 net/ipv6/tcp_ipv6.c:333 __inet_stream_connect+0x3a7/0xed0 net/ipv4/af_inet.c:677 tcp_sendmsg_fastopen+0x3e2/0x710 net/ipv4/tcp.c:1039 tcp_sendmsg_locked+0x1e82/0x3570 net/ipv4/tcp.c:1091 tcp_sendmsg+0x2f/0x50 net/ipv4/tcp.c:1358 inet6_sendmsg+0xb9/0x150 net/ipv6/af_inet6.c:659 sock_sendmsg_nosec net/socket.c:718 [inline] __sock_sendmsg+0xf4/0x2a0 net/socket.c:733 __sys_sendto+0x29a/0x390 net/socket.c:2187 __do_sys_sendto net/socket.c:2194 [inline] __se_sys_sendto net/socket.c:2190 [inline] __x64_sys_sendto+0xe1/0x1c0 net/socket.c:2190 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xc3/0x1d0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f06553c47ed Code: 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f0653a06fc8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00007f0655605fa0 RCX: 00007f06553c47ed RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000000000b RBP: 00007f065545db38 R08: 0000200000000140 R09: 000000000000001c R10: f7384d4ea84b01bd R11: 0000000000000246 R12: 0000000000000000 R13: 00007f0655605fac R14: 00007f0655606038 R15: 00007f06539e7000 Modules linked in: [1]: dnf install -y selinux-policy-targeted policycoreutils netlabel_tools procps-ng nmap-ncat mount -t selinuxfs none /sys/fs/selinux load_policy netlabelctl calipso add pass doi:1 netlabelctl map del default netlabelctl map add default address:::1 protocol:calipso,1 sysctl net.ipv4.tcp_syncookies=2 nc -l ::1 80 & nc ::1 80 Fixes: e1adea927080 ("calipso: Allow request sockets to be relabelled by the lsm.") Reported-by: syzkaller Reported-by: John Cheung Closes: https://lore.kernel.org/netdev/CAP=Rh=MvfhrGADy+-WJiftV2_WzMH4VEhEFmeT28qY+4yxNu4w@mail.gmail.com/ Signed-off-by: Kuniyuki Iwashima Acked-by: Paul Moore Link: https://patch.msgid.link/20250617224125.17299-1-kuni1840@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/calipso.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index 62618a058b8f..a247bb93908b 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -1207,6 +1207,10 @@ static int calipso_req_setattr(struct request_sock *req, struct ipv6_opt_hdr *old, *new; struct sock *sk = sk_to_full_sk(req_to_sk(req)); + /* sk is NULL for SYN+ACK w/ SYN Cookie */ + if (!sk) + return -ENOMEM; + if (req_inet->ipv6_opt && req_inet->ipv6_opt->hopopt) old = req_inet->ipv6_opt->hopopt; else @@ -1247,6 +1251,10 @@ static void calipso_req_delattr(struct request_sock *req) struct ipv6_txoptions *txopts; struct sock *sk = sk_to_full_sk(req_to_sk(req)); + /* sk is NULL for SYN+ACK w/ SYN Cookie */ + if (!sk) + return; + if (!req_inet->ipv6_opt || !req_inet->ipv6_opt->hopopt) return; From fc27ab48904ceb7e4792f0c400f1ef175edf16fe Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 18 Jun 2025 09:36:50 +0200 Subject: [PATCH 235/242] NFC: nci: uart: Set tty->disc_data only in success path Setting tty->disc_data before opening the NCI device means we need to clean it up on error paths. This also opens some short window if device starts sending data, even before NCIUARTSETDRIVER IOCTL succeeded (broken hardware?). Close the window by exposing tty->disc_data only on the success path, when opening of the NCI device and try_module_get() succeeds. The code differs in error path in one aspect: tty->disc_data won't be ever assigned thus NULL-ified. This however should not be relevant difference, because of "tty->disc_data=NULL" in nci_uart_tty_open(). Cc: Linus Torvalds Fixes: 9961127d4bce ("NFC: nci: add generic uart support") Cc: Signed-off-by: Krzysztof Kozlowski Reviewed-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20250618073649.25049-2-krzysztof.kozlowski@linaro.org Signed-off-by: Jakub Kicinski --- net/nfc/nci/uart.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c index ed1508a9e093..aab107727f18 100644 --- a/net/nfc/nci/uart.c +++ b/net/nfc/nci/uart.c @@ -119,22 +119,22 @@ static int nci_uart_set_driver(struct tty_struct *tty, unsigned int driver) memcpy(nu, nci_uart_drivers[driver], sizeof(struct nci_uart)); nu->tty = tty; - tty->disc_data = nu; skb_queue_head_init(&nu->tx_q); INIT_WORK(&nu->write_work, nci_uart_write_work); spin_lock_init(&nu->rx_lock); ret = nu->ops.open(nu); if (ret) { - tty->disc_data = NULL; kfree(nu); + return ret; } else if (!try_module_get(nu->owner)) { nu->ops.close(nu); - tty->disc_data = NULL; kfree(nu); return -ENOENT; } - return ret; + tty->disc_data = nu; + + return 0; } /* ------ LDISC part ------ */ From 78bd03ee1f20a267d2c218884b66041b3508ac9c Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 18 Jun 2025 09:37:40 +0200 Subject: [PATCH 236/242] net: airoha: Always check return value from airoha_ppe_foe_get_entry() airoha_ppe_foe_get_entry routine can return NULL, so check the returned pointer is not NULL in airoha_ppe_foe_flow_l2_entry_update() Fixes: b81e0f2b58be3 ("net: airoha: Add FLOW_CLS_STATS callback support") Reviewed-by: Simon Horman Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250618-check-ret-from-airoha_ppe_foe_get_entry-v2-1-068dcea3cc66@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_ppe.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index 9067d2fc7706..0e217acfc5ef 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -809,8 +809,10 @@ airoha_ppe_foe_flow_l2_entry_update(struct airoha_ppe *ppe, int idle; hwe = airoha_ppe_foe_get_entry(ppe, iter->hash); - ib1 = READ_ONCE(hwe->ib1); + if (!hwe) + continue; + ib1 = READ_ONCE(hwe->ib1); state = FIELD_GET(AIROHA_FOE_IB1_BIND_STATE, ib1); if (state != AIROHA_FOE_STATE_BIND) { iter->hash = 0xffff; From e7ea5f5b1858ddb96b152584d5fe06e6fc623e89 Mon Sep 17 00:00:00 2001 From: David Thompson Date: Wed, 18 Jun 2025 13:59:02 +0000 Subject: [PATCH 237/242] mlxbf_gige: return EPROBE_DEFER if PHY IRQ is not available The message "Error getting PHY irq. Use polling instead" is emitted when the mlxbf_gige driver is loaded by the kernel before the associated gpio-mlxbf driver, and thus the call to get the PHY IRQ fails since it is not yet available. The driver probe() must return -EPROBE_DEFER if acpi_dev_gpio_irq_get_by() returns the same. Fixes: 6c2a6ddca763 ("net: mellanox: mlxbf_gige: Replace non-standard interrupt handling") Signed-off-by: David Thompson Reviewed-by: Asmaa Mnebhi Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250618135902.346-1-davthompson@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c index fb2e5b844c15..d76d7a945899 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c @@ -447,8 +447,10 @@ static int mlxbf_gige_probe(struct platform_device *pdev) priv->llu_plu_irq = platform_get_irq(pdev, MLXBF_GIGE_LLU_PLU_INTR_IDX); phy_irq = acpi_dev_gpio_irq_get_by(ACPI_COMPANION(&pdev->dev), "phy", 0); - if (phy_irq < 0) { - dev_err(&pdev->dev, "Error getting PHY irq. Use polling instead"); + if (phy_irq == -EPROBE_DEFER) { + err = -EPROBE_DEFER; + goto out; + } else if (phy_irq < 0) { phy_irq = PHY_POLL; } From d13a3824bfd2b4774b671a75cf766a16637a0e67 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Jun 2025 14:08:43 +0000 Subject: [PATCH 238/242] net: atm: add lec_mutex syzbot found its way in net/atm/lec.c, and found an error path in lecd_attach() could leave a dangling pointer in dev_lec[]. Add a mutex to protect dev_lecp[] uses from lecd_attach(), lec_vcc_attach() and lec_mcast_attach(). Following patch will use this mutex for /proc/net/atm/lec. BUG: KASAN: slab-use-after-free in lecd_attach net/atm/lec.c:751 [inline] BUG: KASAN: slab-use-after-free in lane_ioctl+0x2224/0x23e0 net/atm/lec.c:1008 Read of size 8 at addr ffff88807c7b8e68 by task syz.1.17/6142 CPU: 1 UID: 0 PID: 6142 Comm: syz.1.17 Not tainted 6.16.0-rc1-syzkaller-00239-g08215f5486ec #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:408 [inline] print_report+0xcd/0x680 mm/kasan/report.c:521 kasan_report+0xe0/0x110 mm/kasan/report.c:634 lecd_attach net/atm/lec.c:751 [inline] lane_ioctl+0x2224/0x23e0 net/atm/lec.c:1008 do_vcc_ioctl+0x12c/0x930 net/atm/ioctl.c:159 sock_do_ioctl+0x118/0x280 net/socket.c:1190 sock_ioctl+0x227/0x6b0 net/socket.c:1311 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl fs/ioctl.c:893 [inline] __x64_sys_ioctl+0x18e/0x210 fs/ioctl.c:893 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xcd/0x4c0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Allocated by task 6132: kasan_save_stack+0x33/0x60 mm/kasan/common.c:47 kasan_save_track+0x14/0x30 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0xaa/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __do_kmalloc_node mm/slub.c:4328 [inline] __kvmalloc_node_noprof+0x27b/0x620 mm/slub.c:5015 alloc_netdev_mqs+0xd2/0x1570 net/core/dev.c:11711 lecd_attach net/atm/lec.c:737 [inline] lane_ioctl+0x17db/0x23e0 net/atm/lec.c:1008 do_vcc_ioctl+0x12c/0x930 net/atm/ioctl.c:159 sock_do_ioctl+0x118/0x280 net/socket.c:1190 sock_ioctl+0x227/0x6b0 net/socket.c:1311 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl fs/ioctl.c:893 [inline] __x64_sys_ioctl+0x18e/0x210 fs/ioctl.c:893 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xcd/0x4c0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 6132: kasan_save_stack+0x33/0x60 mm/kasan/common.c:47 kasan_save_track+0x14/0x30 mm/kasan/common.c:68 kasan_save_free_info+0x3b/0x60 mm/kasan/generic.c:576 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x51/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2381 [inline] slab_free mm/slub.c:4643 [inline] kfree+0x2b4/0x4d0 mm/slub.c:4842 free_netdev+0x6c5/0x910 net/core/dev.c:11892 lecd_attach net/atm/lec.c:744 [inline] lane_ioctl+0x1ce8/0x23e0 net/atm/lec.c:1008 do_vcc_ioctl+0x12c/0x930 net/atm/ioctl.c:159 sock_do_ioctl+0x118/0x280 net/socket.c:1190 sock_ioctl+0x227/0x6b0 net/socket.c:1311 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl fs/ioctl.c:893 [inline] __x64_sys_ioctl+0x18e/0x210 fs/ioctl.c:893 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+8b64dec3affaed7b3af5@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6852c6f6.050a0220.216029.0018.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250618140844.1686882-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/atm/lec.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/atm/lec.c b/net/atm/lec.c index acef984f3367..1e1f3eb0e2ba 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -124,6 +124,7 @@ static unsigned char bus_mac[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; /* Device structures */ static struct net_device *dev_lec[MAX_LEC_ITF]; +static DEFINE_MUTEX(lec_mutex); #if IS_ENABLED(CONFIG_BRIDGE) static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev) @@ -685,6 +686,7 @@ static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg) int bytes_left; struct atmlec_ioc ioc_data; + lockdep_assert_held(&lec_mutex); /* Lecd must be up in this case */ bytes_left = copy_from_user(&ioc_data, arg, sizeof(struct atmlec_ioc)); if (bytes_left != 0) @@ -710,6 +712,7 @@ static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg) static int lec_mcast_attach(struct atm_vcc *vcc, int arg) { + lockdep_assert_held(&lec_mutex); if (arg < 0 || arg >= MAX_LEC_ITF) return -EINVAL; arg = array_index_nospec(arg, MAX_LEC_ITF); @@ -725,6 +728,7 @@ static int lecd_attach(struct atm_vcc *vcc, int arg) int i; struct lec_priv *priv; + lockdep_assert_held(&lec_mutex); if (arg < 0) arg = 0; if (arg >= MAX_LEC_ITF) @@ -742,6 +746,7 @@ static int lecd_attach(struct atm_vcc *vcc, int arg) snprintf(dev_lec[i]->name, IFNAMSIZ, "lec%d", i); if (register_netdev(dev_lec[i])) { free_netdev(dev_lec[i]); + dev_lec[i] = NULL; return -EINVAL; } @@ -1003,6 +1008,7 @@ static int lane_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return -ENOIOCTLCMD; } + mutex_lock(&lec_mutex); switch (cmd) { case ATMLEC_CTRL: err = lecd_attach(vcc, (int)arg); @@ -1017,6 +1023,7 @@ static int lane_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; } + mutex_unlock(&lec_mutex); return err; } From d03b79f459c7935cff830d98373474f440bd03ae Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Jun 2025 14:08:44 +0000 Subject: [PATCH 239/242] net: atm: fix /proc/net/atm/lec handling /proc/net/atm/lec must ensure safety against dev_lec[] changes. It appears it had dev_put() calls without prior dev_hold(), leading to imbalance and UAF. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Acked-by: Francois Romieu # Minor atm contributor Link: https://patch.msgid.link/20250618140844.1686882-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/atm/lec.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/atm/lec.c b/net/atm/lec.c index 1e1f3eb0e2ba..afb8d3eb2185 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -909,7 +909,6 @@ static void *lec_itf_walk(struct lec_state *state, loff_t *l) v = (dev && netdev_priv(dev)) ? lec_priv_walk(state, l, netdev_priv(dev)) : NULL; if (!v && dev) { - dev_put(dev); /* Partial state reset for the next time we get called */ dev = NULL; } @@ -933,6 +932,7 @@ static void *lec_seq_start(struct seq_file *seq, loff_t *pos) { struct lec_state *state = seq->private; + mutex_lock(&lec_mutex); state->itf = 0; state->dev = NULL; state->locked = NULL; @@ -950,8 +950,9 @@ static void lec_seq_stop(struct seq_file *seq, void *v) if (state->dev) { spin_unlock_irqrestore(&state->locked->lec_arp_lock, state->flags); - dev_put(state->dev); + state->dev = NULL; } + mutex_unlock(&lec_mutex); } static void *lec_seq_next(struct seq_file *seq, void *v, loff_t *pos) From 9738280aae592b579a25b5b1b6584c894827d3c7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 18 Jun 2025 10:17:46 -0700 Subject: [PATCH 240/242] tools: ynl: fix mixing ops and notifications on one socket The multi message support loosened the connection between the request and response handling, as we can now submit multiple requests before we start processing responses. Passing the attr set to NlMsgs decoding no longer makes sense (if it ever did), attr set may differ message by messsage. Isolate the part of decoding responsible for attr-set specific interpretation and call it once we identified the correct op. Without this fix performing SET operation on an ethtool socket, while being subscribed to notifications causes: # File "tools/net/ynl/pyynl/lib/ynl.py", line 1096, in _op # Exception| return self._ops(ops)[0] # Exception| ~~~~~~~~~^^^^^ # File "tools/net/ynl/pyynl/lib/ynl.py", line 1040, in _ops # Exception| nms = NlMsgs(reply, attr_space=op.attr_set) # Exception| ^^^^^^^^^^^ The value of op we use on line 1040 is stale, it comes form the previous loop. If a notification comes before a response we will update op to None and the next iteration thru the loop will break with the trace above. Fixes: 6fda63c45fe8 ("tools/net/ynl: fix cli.py --subscribe feature") Fixes: ba8be00f68f5 ("tools/net/ynl: Add multi message support to ynl") Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250618171746.1201403-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/lib/ynl.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/tools/net/ynl/pyynl/lib/ynl.py b/tools/net/ynl/pyynl/lib/ynl.py index 55b59f6c79b8..61deb5923067 100644 --- a/tools/net/ynl/pyynl/lib/ynl.py +++ b/tools/net/ynl/pyynl/lib/ynl.py @@ -231,14 +231,7 @@ class NlMsg: self.extack['unknown'].append(extack) if attr_space: - # We don't have the ability to parse nests yet, so only do global - if 'miss-type' in self.extack and 'miss-nest' not in self.extack: - miss_type = self.extack['miss-type'] - if miss_type in attr_space.attrs_by_val: - spec = attr_space.attrs_by_val[miss_type] - self.extack['miss-type'] = spec['name'] - if 'doc' in spec: - self.extack['miss-type-doc'] = spec['doc'] + self.annotate_extack(attr_space) def _decode_policy(self, raw): policy = {} @@ -264,6 +257,18 @@ class NlMsg: policy['mask'] = attr.as_scalar('u64') return policy + def annotate_extack(self, attr_space): + """ Make extack more human friendly with attribute information """ + + # We don't have the ability to parse nests yet, so only do global + if 'miss-type' in self.extack and 'miss-nest' not in self.extack: + miss_type = self.extack['miss-type'] + if miss_type in attr_space.attrs_by_val: + spec = attr_space.attrs_by_val[miss_type] + self.extack['miss-type'] = spec['name'] + if 'doc' in spec: + self.extack['miss-type-doc'] = spec['doc'] + def cmd(self): return self.nl_type @@ -277,12 +282,12 @@ class NlMsg: class NlMsgs: - def __init__(self, data, attr_space=None): + def __init__(self, data): self.msgs = [] offset = 0 while offset < len(data): - msg = NlMsg(data, offset, attr_space=attr_space) + msg = NlMsg(data, offset) offset += msg.nl_len self.msgs.append(msg) @@ -1034,12 +1039,13 @@ class YnlFamily(SpecFamily): op_rsp = [] while not done: reply = self.sock.recv(self._recv_size) - nms = NlMsgs(reply, attr_space=op.attr_set) + nms = NlMsgs(reply) self._recv_dbg_print(reply, nms) for nl_msg in nms: if nl_msg.nl_seq in reqs_by_seq: (op, vals, req_msg, req_flags) = reqs_by_seq[nl_msg.nl_seq] if nl_msg.extack: + nl_msg.annotate_extack(op.attr_set) self._decode_extack(req_msg, op, nl_msg.extack, vals) else: op = None From edf8afeecfbb0b8c1a2edb8c8892d2f759d35321 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 19 Jun 2025 09:07:24 +0200 Subject: [PATCH 241/242] net: airoha: Compute number of descriptors according to reserved memory size In order to not exceed the reserved memory size for hwfd buffers, compute the number of hwfd buffers/descriptors according to the reserved memory size and the size of each hwfd buffer (2KB). Fixes: 3a1ce9e3d01b ("net: airoha: Add the capability to allocate hwfd buffers via reserved-memory") Reviewed-by: Simon Horman Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250619-airoha-hw-num-desc-v4-1-49600a9b319a@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index a7ec609d64de..1b7fd7ee0cbf 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -1065,19 +1065,13 @@ static void airoha_qdma_cleanup_tx_queue(struct airoha_queue *q) static int airoha_qdma_init_hfwd_queues(struct airoha_qdma *qdma) { + int size, index, num_desc = HW_DSCP_NUM; struct airoha_eth *eth = qdma->eth; int id = qdma - ð->qdma[0]; dma_addr_t dma_addr; const char *name; - int size, index; u32 status; - size = HW_DSCP_NUM * sizeof(struct airoha_qdma_fwd_desc); - if (!dmam_alloc_coherent(eth->dev, size, &dma_addr, GFP_KERNEL)) - return -ENOMEM; - - airoha_qdma_wr(qdma, REG_FWD_DSCP_BASE, dma_addr); - name = devm_kasprintf(eth->dev, GFP_KERNEL, "qdma%d-buf", id); if (!name) return -ENOMEM; @@ -1099,8 +1093,12 @@ static int airoha_qdma_init_hfwd_queues(struct airoha_qdma *qdma) rmem = of_reserved_mem_lookup(np); of_node_put(np); dma_addr = rmem->base; + /* Compute the number of hw descriptors according to the + * reserved memory size and the payload buffer size + */ + num_desc = rmem->size / AIROHA_MAX_PACKET_SIZE; } else { - size = AIROHA_MAX_PACKET_SIZE * HW_DSCP_NUM; + size = AIROHA_MAX_PACKET_SIZE * num_desc; if (!dmam_alloc_coherent(eth->dev, size, &dma_addr, GFP_KERNEL)) return -ENOMEM; @@ -1108,6 +1106,11 @@ static int airoha_qdma_init_hfwd_queues(struct airoha_qdma *qdma) airoha_qdma_wr(qdma, REG_FWD_BUF_BASE, dma_addr); + size = num_desc * sizeof(struct airoha_qdma_fwd_desc); + if (!dmam_alloc_coherent(eth->dev, size, &dma_addr, GFP_KERNEL)) + return -ENOMEM; + + airoha_qdma_wr(qdma, REG_FWD_DSCP_BASE, dma_addr); airoha_qdma_rmw(qdma, REG_HW_FWD_DSCP_CFG, HW_FWD_DSCP_PAYLOAD_SIZE_MASK, FIELD_PREP(HW_FWD_DSCP_PAYLOAD_SIZE_MASK, 0)); @@ -1116,7 +1119,7 @@ static int airoha_qdma_init_hfwd_queues(struct airoha_qdma *qdma) airoha_qdma_rmw(qdma, REG_LMGR_INIT_CFG, LMGR_INIT_START | LMGR_SRAM_MODE_MASK | HW_FWD_DESC_NUM_MASK, - FIELD_PREP(HW_FWD_DESC_NUM_MASK, HW_DSCP_NUM) | + FIELD_PREP(HW_FWD_DESC_NUM_MASK, num_desc) | LMGR_INIT_START | LMGR_SRAM_MODE_MASK); return read_poll_timeout(airoha_qdma_rr, status, From 7b46bdaec00a675f6fac9d0b01a2105b5746ebe9 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 19 Jun 2025 09:07:25 +0200 Subject: [PATCH 242/242] net: airoha: Differentiate hwfd buffer size for QDMA0 and QDMA1 EN7581 SoC allows configuring the size and the number of buffers in hwfd payload queue for both QDMA0 and QDMA1. In order to reduce the required DRAM used for hwfd buffers queues and decrease the memory footprint, differentiate hwfd buffer size for QDMA0 and QDMA1 and reduce hwfd buffer size to 1KB for QDMA1 (WAN) while maintaining 2KB for QDMA0 (LAN). Signed-off-by: Lorenzo Bianconi Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250619-airoha-hw-num-desc-v4-2-49600a9b319a@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index 1b7fd7ee0cbf..06dea3a13e77 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -1068,14 +1068,15 @@ static int airoha_qdma_init_hfwd_queues(struct airoha_qdma *qdma) int size, index, num_desc = HW_DSCP_NUM; struct airoha_eth *eth = qdma->eth; int id = qdma - ð->qdma[0]; + u32 status, buf_size; dma_addr_t dma_addr; const char *name; - u32 status; name = devm_kasprintf(eth->dev, GFP_KERNEL, "qdma%d-buf", id); if (!name) return -ENOMEM; + buf_size = id ? AIROHA_MAX_PACKET_SIZE / 2 : AIROHA_MAX_PACKET_SIZE; index = of_property_match_string(eth->dev->of_node, "memory-region-names", name); if (index >= 0) { @@ -1096,9 +1097,9 @@ static int airoha_qdma_init_hfwd_queues(struct airoha_qdma *qdma) /* Compute the number of hw descriptors according to the * reserved memory size and the payload buffer size */ - num_desc = rmem->size / AIROHA_MAX_PACKET_SIZE; + num_desc = div_u64(rmem->size, buf_size); } else { - size = AIROHA_MAX_PACKET_SIZE * num_desc; + size = buf_size * num_desc; if (!dmam_alloc_coherent(eth->dev, size, &dma_addr, GFP_KERNEL)) return -ENOMEM; @@ -1111,9 +1112,10 @@ static int airoha_qdma_init_hfwd_queues(struct airoha_qdma *qdma) return -ENOMEM; airoha_qdma_wr(qdma, REG_FWD_DSCP_BASE, dma_addr); + /* QDMA0: 2KB. QDMA1: 1KB */ airoha_qdma_rmw(qdma, REG_HW_FWD_DSCP_CFG, HW_FWD_DSCP_PAYLOAD_SIZE_MASK, - FIELD_PREP(HW_FWD_DSCP_PAYLOAD_SIZE_MASK, 0)); + FIELD_PREP(HW_FWD_DSCP_PAYLOAD_SIZE_MASK, !!id)); airoha_qdma_rmw(qdma, REG_FWD_DSCP_LOW_THR, FWD_DSCP_LOW_THR_MASK, FIELD_PREP(FWD_DSCP_LOW_THR_MASK, 128)); airoha_qdma_rmw(qdma, REG_LMGR_INIT_CFG,