From a424b598e6a6c1e69a2bb801d6fd16e805ab2c38 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 22 May 2025 18:21:07 -0500 Subject: [PATCH 01/21] PCI/DPC: Initialize aer_err_info before using it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the struct aer_err_info "info" was allocated on the stack without being initialized, so it contained junk except for the fields we explicitly set later. Initialize "info" at declaration so it starts as all zeros. Fixes: 8aefa9b0d910 ("PCI/DPC: Print AER status in DPC event handling") Signed-off-by: Bjorn Helgaas Tested-by: Krzysztof Wilczyński Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Ilpo Järvinen Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250522232339.1525671-2-helgaas@kernel.org --- drivers/pci/pcie/dpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index df42f15c9829..3daaf61c79c9 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -258,7 +258,7 @@ static int dpc_get_aer_uncorrect_severity(struct pci_dev *dev, void dpc_process_error(struct pci_dev *pdev) { u16 cap = pdev->dpc_cap, status, source, reason, ext_reason; - struct aer_err_info info; + struct aer_err_info info = {}; pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status); pci_read_config_word(pdev, cap + PCI_EXP_DPC_SOURCE_ID, &source); From a0b62cc310239c7f1323fb20bd3789f21bdd8615 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 22 May 2025 18:21:08 -0500 Subject: [PATCH 02/21] PCI/DPC: Log Error Source ID only when valid MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DPC Error Source ID is only valid when the DPC Trigger Reason indicates that DPC was triggered due to reception of an ERR_NONFATAL or ERR_FATAL Message (PCIe r6.0, sec 7.9.14.5). When DPC was triggered by ERR_NONFATAL (PCI_EXP_DPC_STATUS_TRIGGER_RSN_NFE) or ERR_FATAL (PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE) from a downstream device, log the Error Source ID (decoded into domain/bus/device/function). Don't print the source otherwise, since it's not valid. For DPC trigger due to reception of ERR_NONFATAL or ERR_FATAL, the dmesg logging changes: - pci 0000:00:01.0: DPC: containment event, status:0x000d source:0x0200 - pci 0000:00:01.0: DPC: ERR_FATAL detected + pci 0000:00:01.0: DPC: containment event, status:0x000d, ERR_FATAL received from 0000:02:00.0 and when DPC triggered for other reasons, where DPC Error Source ID is undefined, e.g., unmasked uncorrectable error: - pci 0000:00:01.0: DPC: containment event, status:0x0009 source:0x0200 - pci 0000:00:01.0: DPC: unmasked uncorrectable error detected + pci 0000:00:01.0: DPC: containment event, status:0x0009: unmasked uncorrectable error detected Previously the "containment event" message was at KERN_INFO and the "%s detected" message was at KERN_WARNING. Now the single message is at KERN_WARNING. Fixes: 26e515713342 ("PCI: Add Downstream Port Containment driver") Signed-off-by: Bjorn Helgaas Tested-by: Krzysztof Wilczyński Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Jonathan Cameron Reviewed-by: Ilpo Järvinen Link: https://patch.msgid.link/20250522232339.1525671-3-helgaas@kernel.org --- drivers/pci/pcie/dpc.c | 64 ++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index 3daaf61c79c9..9d85f1b3b761 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -261,37 +261,45 @@ void dpc_process_error(struct pci_dev *pdev) struct aer_err_info info = {}; pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status); - pci_read_config_word(pdev, cap + PCI_EXP_DPC_SOURCE_ID, &source); - - pci_info(pdev, "containment event, status:%#06x source:%#06x\n", - status, source); reason = status & PCI_EXP_DPC_STATUS_TRIGGER_RSN; - ext_reason = status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT; - pci_warn(pdev, "%s detected\n", - (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR) ? - "unmasked uncorrectable error" : - (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_NFE) ? - "ERR_NONFATAL" : - (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE) ? - "ERR_FATAL" : - (ext_reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_RP_PIO) ? - "RP PIO error" : - (ext_reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_SW_TRIGGER) ? - "software trigger" : - "reserved error"); - /* show RP PIO error detail information */ - if (pdev->dpc_rp_extensions && - reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_IN_EXT && - ext_reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_RP_PIO) - dpc_process_rp_pio_error(pdev); - else if (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR && - dpc_get_aer_uncorrect_severity(pdev, &info) && - aer_get_device_error_info(pdev, &info)) { - aer_print_error(pdev, &info); - pci_aer_clear_nonfatal_status(pdev); - pci_aer_clear_fatal_status(pdev); + switch (reason) { + case PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR: + pci_warn(pdev, "containment event, status:%#06x: unmasked uncorrectable error detected\n", + status); + if (dpc_get_aer_uncorrect_severity(pdev, &info) && + aer_get_device_error_info(pdev, &info)) { + aer_print_error(pdev, &info); + pci_aer_clear_nonfatal_status(pdev); + pci_aer_clear_fatal_status(pdev); + } + break; + case PCI_EXP_DPC_STATUS_TRIGGER_RSN_NFE: + case PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE: + pci_read_config_word(pdev, cap + PCI_EXP_DPC_SOURCE_ID, + &source); + pci_warn(pdev, "containment event, status:%#06x, %s received from %04x:%02x:%02x.%d\n", + status, + (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE) ? + "ERR_FATAL" : "ERR_NONFATAL", + pci_domain_nr(pdev->bus), PCI_BUS_NUM(source), + PCI_SLOT(source), PCI_FUNC(source)); + break; + case PCI_EXP_DPC_STATUS_TRIGGER_RSN_IN_EXT: + ext_reason = status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT; + pci_warn(pdev, "containment event, status:%#06x: %s detected\n", + status, + (ext_reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_RP_PIO) ? + "RP PIO error" : + (ext_reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_SW_TRIGGER) ? + "software trigger" : + "reserved error"); + /* show RP PIO error detail information */ + if (ext_reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_RP_PIO && + pdev->dpc_rp_extensions) + dpc_process_rp_pio_error(pdev); + break; } } From 6fc4dae74afcf29ef82afbaaa9b082893871eda4 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 22 May 2025 18:21:09 -0500 Subject: [PATCH 03/21] PCI/AER: Factor COR/UNCOR error handling out from aer_isr_one_error() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit aer_isr_one_error() duplicates the Error Source ID logging and AER error processing for Correctable Errors and Uncorrectable Errors. Factor out the duplicated code to aer_isr_one_error_type(). aer_isr_one_error() doesn't need the struct aer_rpc pointer, so pass it the Root Port or RCEC pci_dev pointer instead. Signed-off-by: Bjorn Helgaas Reviewed-by: Ilpo Järvinen Reviewed-by: Jonathan Cameron Reviewed-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20250522232339.1525671-4-helgaas@kernel.org --- drivers/pci/pcie/aer.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index a1cf8c7ef628..568229288ca3 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1273,17 +1273,32 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info) } /** - * aer_isr_one_error - consume an error detected by Root Port - * @rpc: pointer to the Root Port which holds an error + * aer_isr_one_error_type - consume a Correctable or Uncorrectable Error + * detected by Root Port or RCEC + * @root: pointer to Root Port or RCEC that signaled AER interrupt + * @info: pointer to AER error info + */ +static void aer_isr_one_error_type(struct pci_dev *root, + struct aer_err_info *info) +{ + aer_print_port_info(root, info); + + if (find_source_device(root, info)) + aer_process_err_devices(info); +} + +/** + * aer_isr_one_error - consume error(s) signaled by an AER interrupt from + * Root Port or RCEC + * @root: pointer to Root Port or RCEC that signaled AER interrupt * @e_src: pointer to an error source */ -static void aer_isr_one_error(struct aer_rpc *rpc, +static void aer_isr_one_error(struct pci_dev *root, struct aer_err_source *e_src) { - struct pci_dev *pdev = rpc->rpd; struct aer_err_info e_info; - pci_rootport_aer_stats_incr(pdev, e_src); + pci_rootport_aer_stats_incr(root, e_src); /* * There is a possibility that both correctable error and @@ -1297,10 +1312,8 @@ static void aer_isr_one_error(struct aer_rpc *rpc, e_info.multi_error_valid = 1; else e_info.multi_error_valid = 0; - aer_print_port_info(pdev, &e_info); - if (find_source_device(pdev, &e_info)) - aer_process_err_devices(&e_info); + aer_isr_one_error_type(root, &e_info); } if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) { @@ -1316,10 +1329,7 @@ static void aer_isr_one_error(struct aer_rpc *rpc, else e_info.multi_error_valid = 0; - aer_print_port_info(pdev, &e_info); - - if (find_source_device(pdev, &e_info)) - aer_process_err_devices(&e_info); + aer_isr_one_error_type(root, &e_info); } } @@ -1340,7 +1350,7 @@ static irqreturn_t aer_isr(int irq, void *context) return IRQ_NONE; while (kfifo_get(&rpc->aer_fifo, &e_src)) - aer_isr_one_error(rpc, &e_src); + aer_isr_one_error(rpc->rpd, &e_src); return IRQ_HANDLED; } From 6a1eda745967a1e84f6e4cae14c118c97319891e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 22 May 2025 18:21:10 -0500 Subject: [PATCH 04/21] PCI/AER: Consolidate Error Source ID logging in aer_isr_one_error_type() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously we decoded the AER Error Source ID in aer_isr_one_error_type(), then again in find_source_device() if we didn't find any devices with errors logged in their AER Capabilities. Consolidate this so we only decode and log the Error Source ID once in aer_isr_one_error_type(). Add a "found" parameter so we can add a note when we didn't find any downstream devices with errors logged in their AER Capability. This changes the dmesg logging when we found no devices with errors logged: - pci 0000:00:01.0: AER: Correctable error message received from 0000:02:00.0 - pci 0000:00:01.0: AER: found no error details for 0000:02:00.0 + pci 0000:00:01.0: AER: Correctable error message received from 0000:02:00.0 (no details found) Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Ilpo Järvinen Link: https://patch.msgid.link/20250522232339.1525671-5-helgaas@kernel.org --- drivers/pci/pcie/aer.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 568229288ca3..fe6d323306a0 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -733,16 +733,17 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) info->severity, info->tlp_header_valid, &info->tlp); } -static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info) +static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info, + bool found) { u8 bus = info->id >> 8; u8 devfn = info->id & 0xff; - pci_info(dev, "%s%s error message received from %04x:%02x:%02x.%d\n", + pci_info(dev, "%s%s error message received from %04x:%02x:%02x.%d%s\n", info->multi_error_valid ? "Multiple " : "", aer_error_severity_string[info->severity], pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn), - PCI_FUNC(devfn)); + PCI_FUNC(devfn), found ? "" : " (no details found"); } #ifdef CONFIG_ACPI_APEI_PCIEAER @@ -926,15 +927,8 @@ static bool find_source_device(struct pci_dev *parent, else pci_walk_bus(parent->subordinate, find_device_iter, e_info); - if (!e_info->error_dev_num) { - u8 bus = e_info->id >> 8; - u8 devfn = e_info->id & 0xff; - - pci_info(parent, "found no error details for %04x:%02x:%02x.%d\n", - pci_domain_nr(parent->bus), bus, PCI_SLOT(devfn), - PCI_FUNC(devfn)); + if (!e_info->error_dev_num) return false; - } return true; } @@ -1281,9 +1275,11 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info) static void aer_isr_one_error_type(struct pci_dev *root, struct aer_err_info *info) { - aer_print_port_info(root, info); + bool found; - if (find_source_device(root, info)) + found = find_source_device(root, info); + aer_print_port_info(root, info, found); + if (found) aer_process_err_devices(info); } From f40bd2865501437bf5dfc7167be7b5c8f536b323 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 22 May 2025 18:21:11 -0500 Subject: [PATCH 05/21] PCI/AER: Extract bus/dev/fn in aer_print_port_info() with PCI_BUS_NUM(), etc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use PCI_BUS_NUM(), PCI_SLOT(), PCI_FUNC() to extract the bus number, device, and function number directly from the Error Source ID. There's no need to shift and mask it explicitly. Signed-off-by: Bjorn Helgaas Tested-by: Krzysztof Wilczyński Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Ilpo Järvinen Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250522232339.1525671-6-helgaas@kernel.org --- drivers/pci/pcie/aer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index fe6d323306a0..18005615d376 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -736,14 +736,14 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info, bool found) { - u8 bus = info->id >> 8; - u8 devfn = info->id & 0xff; + u16 source = info->id; pci_info(dev, "%s%s error message received from %04x:%02x:%02x.%d%s\n", info->multi_error_valid ? "Multiple " : "", aer_error_severity_string[info->severity], - pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn), - PCI_FUNC(devfn), found ? "" : " (no details found"); + pci_domain_nr(dev->bus), PCI_BUS_NUM(source), + PCI_SLOT(source), PCI_FUNC(source), + found ? "" : " (no details found"); } #ifdef CONFIG_ACPI_APEI_PCIEAER From 99c3fd0de8eb4e35459aaac5c1aa315a1a6843a6 Mon Sep 17 00:00:00 2001 From: Jon Pan-Doh Date: Thu, 22 May 2025 18:21:12 -0500 Subject: [PATCH 06/21] PCI/AER: Rename aer_print_port_info() to aer_print_source() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename aer_print_port_info() to aer_print_source() to be more descriptive. This logs the Error Source ID logged by a Root Port or Root Complex Event Collector when it receives an ERR_COR, ERR_NONFATAL, or ERR_FATAL Message. [bhelgaas: aer_print_rp_info() -> aer_print_source()] Signed-off-by: Jon Pan-Doh Signed-off-by: Bjorn Helgaas Tested-by: Krzysztof Wilczyński Reviewed-by: Ilpo Järvinen Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250522232339.1525671-7-helgaas@kernel.org --- drivers/pci/pcie/aer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 18005615d376..8b23ef90345b 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -733,8 +733,8 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) info->severity, info->tlp_header_valid, &info->tlp); } -static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info, - bool found) +static void aer_print_source(struct pci_dev *dev, struct aer_err_info *info, + bool found) { u16 source = info->id; @@ -1278,7 +1278,7 @@ static void aer_isr_one_error_type(struct pci_dev *root, bool found; found = find_source_device(root, info); - aer_print_port_info(root, info, found); + aer_print_source(root, info, found); if (found) aer_process_err_devices(info); } From ca2426a570ab4bdf6185aea034ee09184420bd0d Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 22 May 2025 18:21:13 -0500 Subject: [PATCH 07/21] PCI/AER: Move aer_print_source() earlier in file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move aer_print_source() earlier in the file so a future change can use it from aer_print_error(), where it's easier to rate limit it. Signed-off-by: Bjorn Helgaas Tested-by: Krzysztof Wilczyński Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Ilpo Järvinen Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250522232339.1525671-8-helgaas@kernel.org --- drivers/pci/pcie/aer.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 8b23ef90345b..c0481550363b 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -696,6 +696,19 @@ static void __aer_print_error(struct pci_dev *dev, pci_dev_aer_stats_incr(dev, info); } +static void aer_print_source(struct pci_dev *dev, struct aer_err_info *info, + bool found) +{ + u16 source = info->id; + + pci_info(dev, "%s%s error message received from %04x:%02x:%02x.%d%s\n", + info->multi_error_valid ? "Multiple " : "", + aer_error_severity_string[info->severity], + pci_domain_nr(dev->bus), PCI_BUS_NUM(source), + PCI_SLOT(source), PCI_FUNC(source), + found ? "" : " (no details found"); +} + void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) { int layer, agent; @@ -733,19 +746,6 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) info->severity, info->tlp_header_valid, &info->tlp); } -static void aer_print_source(struct pci_dev *dev, struct aer_err_info *info, - bool found) -{ - u16 source = info->id; - - pci_info(dev, "%s%s error message received from %04x:%02x:%02x.%d%s\n", - info->multi_error_valid ? "Multiple " : "", - aer_error_severity_string[info->severity], - pci_domain_nr(dev->bus), PCI_BUS_NUM(source), - PCI_SLOT(source), PCI_FUNC(source), - found ? "" : " (no details found"); -} - #ifdef CONFIG_ACPI_APEI_PCIEAER int cper_severity_to_aer(int cper_severity) { From 57964ba39057a76f3485437347bb4219be9c5b52 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 22 May 2025 18:21:14 -0500 Subject: [PATCH 08/21] PCI/AER: Initialize aer_err_info before using it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the struct aer_err_info "e_info" was allocated on the stack without being initialized, so it contained junk except for the fields we explicitly set later. Initialize "e_info" at declaration with a designated initializer list, which initializes the other members to zero. Signed-off-by: Bjorn Helgaas Tested-by: Krzysztof Wilczyński Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Ilpo Järvinen Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250522232339.1525671-9-helgaas@kernel.org --- drivers/pci/pcie/aer.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index c0481550363b..109f5e0ade8a 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1290,9 +1290,9 @@ static void aer_isr_one_error_type(struct pci_dev *root, * @e_src: pointer to an error source */ static void aer_isr_one_error(struct pci_dev *root, - struct aer_err_source *e_src) + struct aer_err_source *e_src) { - struct aer_err_info e_info; + u32 status = e_src->status; pci_rootport_aer_stats_incr(root, e_src); @@ -1300,30 +1300,25 @@ static void aer_isr_one_error(struct pci_dev *root, * There is a possibility that both correctable error and * uncorrectable error being logged. Report correctable error first. */ - if (e_src->status & PCI_ERR_ROOT_COR_RCV) { - e_info.id = ERR_COR_ID(e_src->id); - e_info.severity = AER_CORRECTABLE; - - if (e_src->status & PCI_ERR_ROOT_MULTI_COR_RCV) - e_info.multi_error_valid = 1; - else - e_info.multi_error_valid = 0; + if (status & PCI_ERR_ROOT_COR_RCV) { + int multi = status & PCI_ERR_ROOT_MULTI_COR_RCV; + struct aer_err_info e_info = { + .id = ERR_COR_ID(e_src->id), + .severity = AER_CORRECTABLE, + .multi_error_valid = multi ? 1 : 0, + }; aer_isr_one_error_type(root, &e_info); } - if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) { - e_info.id = ERR_UNCOR_ID(e_src->id); - - if (e_src->status & PCI_ERR_ROOT_FATAL_RCV) - e_info.severity = AER_FATAL; - else - e_info.severity = AER_NONFATAL; - - if (e_src->status & PCI_ERR_ROOT_MULTI_UNCOR_RCV) - e_info.multi_error_valid = 1; - else - e_info.multi_error_valid = 0; + if (status & PCI_ERR_ROOT_UNCOR_RCV) { + int fatal = status & PCI_ERR_ROOT_FATAL_RCV; + int multi = status & PCI_ERR_ROOT_MULTI_UNCOR_RCV; + struct aer_err_info e_info = { + .id = ERR_UNCOR_ID(e_src->id), + .severity = fatal ? AER_FATAL : AER_NONFATAL, + .multi_error_valid = multi ? 1 : 0, + }; aer_isr_one_error_type(root, &e_info); } From ad9839137cf9fb0f0c2d531bd04bc4382e6f2de9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 22 May 2025 18:21:15 -0500 Subject: [PATCH 09/21] PCI/AER: Simplify pci_print_aer() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify pci_print_aer() by initializing the struct aer_err_info "info" with a designated initializer list (it was previously initialized with memset()) and using pci_name(). Signed-off-by: Bjorn Helgaas Tested-by: Krzysztof Wilczyński Reviewed-by: Ilpo Järvinen Reviewed-by: Jonathan Cameron Reviewed-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20250522232339.1525671-10-helgaas@kernel.org --- drivers/pci/pcie/aer.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 109f5e0ade8a..dc5f4bebd613 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -766,7 +766,10 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, { int layer, agent, tlp_header_valid = 0; u32 status, mask; - struct aer_err_info info; + struct aer_err_info info = { + .severity = aer_severity, + .first_error = PCI_ERR_CAP_FEP(aer->cap_control), + }; if (aer_severity == AER_CORRECTABLE) { status = aer->cor_status; @@ -777,14 +780,11 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, tlp_header_valid = status & AER_LOG_TLP_MASKS; } - layer = AER_GET_LAYER_ERROR(aer_severity, status); - agent = AER_GET_AGENT(aer_severity, status); - - memset(&info, 0, sizeof(info)); - info.severity = aer_severity; info.status = status; info.mask = mask; - info.first_error = PCI_ERR_CAP_FEP(aer->cap_control); + + layer = AER_GET_LAYER_ERROR(aer_severity, status); + agent = AER_GET_AGENT(aer_severity, status); pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask); __aer_print_error(dev, &info); @@ -798,7 +798,7 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, if (tlp_header_valid) pcie_print_tlp_log(dev, &aer->header_log, dev_fmt(" ")); - trace_aer_event(dev_name(&dev->dev), (status & ~mask), + trace_aer_event(pci_name(dev), (status & ~mask), aer_severity, tlp_header_valid, &aer->header_log); } EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL"); From 88a7765e62b9e4c79c7ca2c7b749ae04f54a5668 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 22 May 2025 18:21:16 -0500 Subject: [PATCH 10/21] PCI/AER: Update statistics before ratelimiting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are two AER logging entry points: - aer_print_error() is used by DPC (dpc_process_error()) and native AER handling (aer_process_err_devices()). - pci_print_aer() is used by GHES (aer_recover_work_func()) and CXL (cxl_handle_rdport_errors()) Both use __aer_print_error() to print the AER error bits. Previously __aer_print_error() also incremented the AER statistics via pci_dev_aer_stats_incr(). Call pci_dev_aer_stats_incr() early in the entry points instead of in __aer_print_error() so we update the statistics even if the actual printing of error bits is rate limited by a future change. Signed-off-by: Bjorn Helgaas Tested-by: Krzysztof Wilczyński Reviewed-by: Ilpo Järvinen Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250522232339.1525671-11-helgaas@kernel.org --- drivers/pci/pcie/aer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index dc5f4bebd613..53f460bb7e6c 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -693,7 +693,6 @@ static void __aer_print_error(struct pci_dev *dev, aer_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg, info->first_error == i ? " (First)" : ""); } - pci_dev_aer_stats_incr(dev, info); } static void aer_print_source(struct pci_dev *dev, struct aer_err_info *info, @@ -715,6 +714,8 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) int id = pci_dev_id(dev); const char *level; + pci_dev_aer_stats_incr(dev, info); + if (!info->status) { pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", aer_error_severity_string[info->severity]); @@ -783,6 +784,8 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, info.status = status; info.mask = mask; + pci_dev_aer_stats_incr(dev, &info); + layer = AER_GET_LAYER_ERROR(aer_severity, status); agent = AER_GET_AGENT(aer_severity, status); From 6bb4befbd65fa7f99688fb707e376637e5acfe36 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 22 May 2025 18:21:17 -0500 Subject: [PATCH 11/21] PCI/AER: Trace error event before ratelimiting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As with the AER statistics, we always want to emit trace events, even if the actual dmesg logging is rate limited. Call trace_aer_event() immediately after pci_dev_aer_stats_incr() so both happen before ratelimiting. Signed-off-by: Bjorn Helgaas Tested-by: Krzysztof Wilczyński Reviewed-by: Ilpo Järvinen Reviewed-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20250522232339.1525671-12-helgaas@kernel.org --- drivers/pci/pcie/aer.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 53f460bb7e6c..636bcd92afa1 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -715,6 +715,8 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) const char *level; pci_dev_aer_stats_incr(dev, info); + trace_aer_event(pci_name(dev), (info->status & ~info->mask), + info->severity, info->tlp_header_valid, &info->tlp); if (!info->status) { pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", @@ -742,9 +744,6 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) out: if (info->id && info->error_dev_num > 1 && info->id == id) pci_err(dev, " Error of this Agent is reported first\n"); - - trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask), - info->severity, info->tlp_header_valid, &info->tlp); } #ifdef CONFIG_ACPI_APEI_PCIEAER @@ -785,6 +784,8 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, info.mask = mask; pci_dev_aer_stats_incr(dev, &info); + trace_aer_event(pci_name(dev), (status & ~mask), + aer_severity, tlp_header_valid, &aer->header_log); layer = AER_GET_LAYER_ERROR(aer_severity, status); agent = AER_GET_AGENT(aer_severity, status); @@ -800,9 +801,6 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, if (tlp_header_valid) pcie_print_tlp_log(dev, &aer->header_log, dev_fmt(" ")); - - trace_aer_event(pci_name(dev), (status & ~mask), - aer_severity, tlp_header_valid, &aer->header_log); } EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL"); From c8f6791e33a7757025285db26f3b382cdcb7f7cd Mon Sep 17 00:00:00 2001 From: Karolina Stolarek Date: Thu, 22 May 2025 18:21:18 -0500 Subject: [PATCH 12/21] PCI/AER: Check log level once and remember it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When reporting an AER error, we check its type multiple times to determine the log level for each message. Do this check only in the top-level functions (aer_isr_one_error(), pci_print_aer()) and save the level in struct aer_err_info. [bhelgaas: save log level in struct aer_err_info instead of passing it as a parameter] Signed-off-by: Karolina Stolarek Signed-off-by: Bjorn Helgaas Tested-by: Krzysztof Wilczyński Reviewed-by: Ilpo Järvinen Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250522232339.1525671-13-helgaas@kernel.org --- drivers/pci/pci.h | 1 + drivers/pci/pcie/aer.c | 21 ++++++++++----------- drivers/pci/pcie/dpc.c | 1 + 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index b81e99cd4b62..705f9ef58acc 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -588,6 +588,7 @@ static inline bool pci_dev_test_and_set_removed(struct pci_dev *dev) struct aer_err_info { struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; int error_dev_num; + const char *level; /* printk level */ unsigned int id:16; diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 636bcd92afa1..f80c78846a14 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -669,21 +669,18 @@ static void pci_rootport_aer_stats_incr(struct pci_dev *pdev, } } -static void __aer_print_error(struct pci_dev *dev, - struct aer_err_info *info) +static void __aer_print_error(struct pci_dev *dev, struct aer_err_info *info) { const char **strings; unsigned long status = info->status & ~info->mask; - const char *level, *errmsg; + const char *level = info->level; + const char *errmsg; int i; - if (info->severity == AER_CORRECTABLE) { + if (info->severity == AER_CORRECTABLE) strings = aer_correctable_error_string; - level = KERN_WARNING; - } else { + else strings = aer_uncorrectable_error_string; - level = KERN_ERR; - } for_each_set_bit(i, &status, 32) { errmsg = strings[i]; @@ -712,7 +709,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) { int layer, agent; int id = pci_dev_id(dev); - const char *level; + const char *level = info->level; pci_dev_aer_stats_incr(dev, info); trace_aer_event(pci_name(dev), (info->status & ~info->mask), @@ -727,8 +724,6 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) layer = AER_GET_LAYER_ERROR(info->severity, info->status); agent = AER_GET_AGENT(info->severity, info->status); - level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR; - aer_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n", aer_error_severity_string[info->severity], aer_error_layer[layer], aer_agent_string[agent]); @@ -774,9 +769,11 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, if (aer_severity == AER_CORRECTABLE) { status = aer->cor_status; mask = aer->cor_mask; + info.level = KERN_WARNING; } else { status = aer->uncor_status; mask = aer->uncor_mask; + info.level = KERN_ERR; tlp_header_valid = status & AER_LOG_TLP_MASKS; } @@ -1306,6 +1303,7 @@ static void aer_isr_one_error(struct pci_dev *root, struct aer_err_info e_info = { .id = ERR_COR_ID(e_src->id), .severity = AER_CORRECTABLE, + .level = KERN_WARNING, .multi_error_valid = multi ? 1 : 0, }; @@ -1318,6 +1316,7 @@ static void aer_isr_one_error(struct pci_dev *root, struct aer_err_info e_info = { .id = ERR_UNCOR_ID(e_src->id), .severity = fatal ? AER_FATAL : AER_NONFATAL, + .level = KERN_ERR, .multi_error_valid = multi ? 1 : 0, }; diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index 9d85f1b3b761..6c98fabdba57 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -252,6 +252,7 @@ static int dpc_get_aer_uncorrect_severity(struct pci_dev *dev, else info->severity = AER_NONFATAL; + info->level = KERN_ERR; return 1; } From 82013ff394ea58fced098a1fdc001b46cc3dd5ee Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 22 May 2025 18:21:19 -0500 Subject: [PATCH 13/21] PCI/ERR: Add printk level to pcie_print_tlp_log() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit aer_print_error() produces output at a printk level (KERN_ERR/KERN_WARNING/ etc) that depends on the kind of error, and it calls pcie_print_tlp_log(), which previously always produced output at KERN_ERR. Add a "level" parameter so aer_print_error() can control the level of the pcie_print_tlp_log() output to match. Signed-off-by: Bjorn Helgaas Reviewed-by: Ilpo Järvinen Reviewed-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20250522232339.1525671-14-helgaas@kernel.org --- drivers/pci/pci.h | 3 ++- drivers/pci/pcie/aer.c | 5 +++-- drivers/pci/pcie/dpc.c | 2 +- drivers/pci/pcie/tlp.c | 6 ++++-- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 705f9ef58acc..1a9bfc708757 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -613,7 +613,8 @@ int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2, struct pcie_tlp_log *log); unsigned int aer_tlp_log_len(struct pci_dev *dev, u32 aercc); void pcie_print_tlp_log(const struct pci_dev *dev, - const struct pcie_tlp_log *log, const char *pfx); + const struct pcie_tlp_log *log, const char *level, + const char *pfx); #endif /* CONFIG_PCIEAER */ #ifdef CONFIG_PCIEPORTBUS diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index f80c78846a14..f0936759ba8b 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -734,7 +734,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) __aer_print_error(dev, info); if (info->tlp_header_valid) - pcie_print_tlp_log(dev, &info->tlp, dev_fmt(" ")); + pcie_print_tlp_log(dev, &info->tlp, level, dev_fmt(" ")); out: if (info->id && info->error_dev_num > 1 && info->id == id) @@ -797,7 +797,8 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, aer->uncor_severity); if (tlp_header_valid) - pcie_print_tlp_log(dev, &aer->header_log, dev_fmt(" ")); + pcie_print_tlp_log(dev, &aer->header_log, info.level, + dev_fmt(" ")); } EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL"); diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index 6c98fabdba57..7ae1590ea1da 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -222,7 +222,7 @@ static void dpc_process_rp_pio_error(struct pci_dev *pdev) dpc_tlp_log_len(pdev), pdev->subordinate->flit_mode, &tlp_log); - pcie_print_tlp_log(pdev, &tlp_log, dev_fmt("")); + pcie_print_tlp_log(pdev, &tlp_log, KERN_ERR, dev_fmt("")); if (pdev->dpc_rp_log_size < PCIE_STD_NUM_TLP_HEADERLOG + 1) goto clear_status; diff --git a/drivers/pci/pcie/tlp.c b/drivers/pci/pcie/tlp.c index 890d5391d7f5..71f8fc9ea2ed 100644 --- a/drivers/pci/pcie/tlp.c +++ b/drivers/pci/pcie/tlp.c @@ -98,12 +98,14 @@ int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2, * pcie_print_tlp_log - Print TLP Header / Prefix Log contents * @dev: PCIe device * @log: TLP Log structure + * @level: Printk log level * @pfx: String prefix * * Prints TLP Header and Prefix Log information held by @log. */ void pcie_print_tlp_log(const struct pci_dev *dev, - const struct pcie_tlp_log *log, const char *pfx) + const struct pcie_tlp_log *log, const char *level, + const char *pfx) { /* EE_PREFIX_STR fits the extended DW space needed for the Flit mode */ char buf[11 * PCIE_STD_MAX_TLP_HEADERLOG + 1]; @@ -130,6 +132,6 @@ void pcie_print_tlp_log(const struct pci_dev *dev, } } - pci_err(dev, "%sTLP Header%s: %s\n", pfx, + dev_printk(level, &dev->dev, "%sTLP Header%s: %s\n", pfx, log->flit ? " (Flit)" : "", buf); } From 36c5932074aaedebf905374dba2399a529215aa1 Mon Sep 17 00:00:00 2001 From: Karolina Stolarek Date: Thu, 22 May 2025 18:21:20 -0500 Subject: [PATCH 14/21] PCI/AER: Reduce pci_print_aer() correctable error level to KERN_WARNING MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some existing logs in pci_print_aer() log with error severity by default. Convert them to use KERN_WARNING for correctable errors and KERN_ERR for uncorrectable errors. [bhelgaas: commit log] Signed-off-by: Karolina Stolarek Signed-off-by: Bjorn Helgaas Tested-by: Krzysztof Wilczyński Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Ilpo Järvinen Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250522232339.1525671-15-helgaas@kernel.org --- drivers/pci/pcie/aer.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index f0936759ba8b..16779f281b2f 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -787,14 +787,15 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, layer = AER_GET_LAYER_ERROR(aer_severity, status); agent = AER_GET_AGENT(aer_severity, status); - pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask); + aer_printk(info.level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", + status, mask); __aer_print_error(dev, &info); - pci_err(dev, "aer_layer=%s, aer_agent=%s\n", - aer_error_layer[layer], aer_agent_string[agent]); + aer_printk(info.level, dev, "aer_layer=%s, aer_agent=%s\n", + aer_error_layer[layer], aer_agent_string[agent]); if (aer_severity != AER_CORRECTABLE) - pci_err(dev, "aer_uncor_severity: 0x%08x\n", - aer->uncor_severity); + aer_printk(info.level, dev, "aer_uncor_severity: 0x%08x\n", + aer->uncor_severity); if (tlp_header_valid) pcie_print_tlp_log(dev, &aer->header_log, info.level, From 09683a6184ad6fd635831402316651392b56cc3a Mon Sep 17 00:00:00 2001 From: Karolina Stolarek Date: Thu, 22 May 2025 18:21:21 -0500 Subject: [PATCH 15/21] PCI/AER: Rename struct aer_stats to aer_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update name to reflect the broader definition of structs/variables that are stored (e.g. ratelimits). This is a preparatory patch for adding rate limit support. [bhelgaas: "aer_report" -> "aer_info"] Signed-off-by: Karolina Stolarek Signed-off-by: Bjorn Helgaas Tested-by: Krzysztof Wilczyński Reviewed-by: Ilpo Järvinen Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250522232339.1525671-16-helgaas@kernel.org --- drivers/pci/pcie/aer.c | 46 +++++++++++++++++++++--------------------- include/linux/pci.h | 2 +- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 16779f281b2f..787a953fb331 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -54,8 +54,8 @@ struct aer_rpc { DECLARE_KFIFO(aer_fifo, struct aer_err_source, AER_ERROR_SOURCES_MAX); }; -/* AER stats for the device */ -struct aer_stats { +/* AER info for the device */ +struct aer_info { /* * Fields for all AER capable devices. They indicate the errors @@ -377,7 +377,7 @@ void pci_aer_init(struct pci_dev *dev) if (!dev->aer_cap) return; - dev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL); + dev->aer_info = kzalloc(sizeof(*dev->aer_info), GFP_KERNEL); /* * We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER, @@ -398,8 +398,8 @@ void pci_aer_init(struct pci_dev *dev) void pci_aer_exit(struct pci_dev *dev) { - kfree(dev->aer_stats); - dev->aer_stats = NULL; + kfree(dev->aer_info); + dev->aer_info = NULL; } #define AER_AGENT_RECEIVER 0 @@ -537,10 +537,10 @@ static const char *aer_agent_string[] = { { \ unsigned int i; \ struct pci_dev *pdev = to_pci_dev(dev); \ - u64 *stats = pdev->aer_stats->stats_array; \ + u64 *stats = pdev->aer_info->stats_array; \ size_t len = 0; \ \ - for (i = 0; i < ARRAY_SIZE(pdev->aer_stats->stats_array); i++) {\ + for (i = 0; i < ARRAY_SIZE(pdev->aer_info->stats_array); i++) { \ if (strings_array[i]) \ len += sysfs_emit_at(buf, len, "%s %llu\n", \ strings_array[i], \ @@ -551,7 +551,7 @@ static const char *aer_agent_string[] = { i, stats[i]); \ } \ len += sysfs_emit_at(buf, len, "TOTAL_%s %llu\n", total_string, \ - pdev->aer_stats->total_field); \ + pdev->aer_info->total_field); \ return len; \ } \ static DEVICE_ATTR_RO(name) @@ -572,7 +572,7 @@ aer_stats_dev_attr(aer_dev_nonfatal, dev_nonfatal_errs, char *buf) \ { \ struct pci_dev *pdev = to_pci_dev(dev); \ - return sysfs_emit(buf, "%llu\n", pdev->aer_stats->field); \ + return sysfs_emit(buf, "%llu\n", pdev->aer_info->field); \ } \ static DEVICE_ATTR_RO(name) @@ -599,7 +599,7 @@ static umode_t aer_stats_attrs_are_visible(struct kobject *kobj, struct device *dev = kobj_to_dev(kobj); struct pci_dev *pdev = to_pci_dev(dev); - if (!pdev->aer_stats) + if (!pdev->aer_info) return 0; if ((a == &dev_attr_aer_rootport_total_err_cor.attr || @@ -623,25 +623,25 @@ static void pci_dev_aer_stats_incr(struct pci_dev *pdev, unsigned long status = info->status & ~info->mask; int i, max = -1; u64 *counter = NULL; - struct aer_stats *aer_stats = pdev->aer_stats; + struct aer_info *aer_info = pdev->aer_info; - if (!aer_stats) + if (!aer_info) return; switch (info->severity) { case AER_CORRECTABLE: - aer_stats->dev_total_cor_errs++; - counter = &aer_stats->dev_cor_errs[0]; + aer_info->dev_total_cor_errs++; + counter = &aer_info->dev_cor_errs[0]; max = AER_MAX_TYPEOF_COR_ERRS; break; case AER_NONFATAL: - aer_stats->dev_total_nonfatal_errs++; - counter = &aer_stats->dev_nonfatal_errs[0]; + aer_info->dev_total_nonfatal_errs++; + counter = &aer_info->dev_nonfatal_errs[0]; max = AER_MAX_TYPEOF_UNCOR_ERRS; break; case AER_FATAL: - aer_stats->dev_total_fatal_errs++; - counter = &aer_stats->dev_fatal_errs[0]; + aer_info->dev_total_fatal_errs++; + counter = &aer_info->dev_fatal_errs[0]; max = AER_MAX_TYPEOF_UNCOR_ERRS; break; } @@ -653,19 +653,19 @@ static void pci_dev_aer_stats_incr(struct pci_dev *pdev, static void pci_rootport_aer_stats_incr(struct pci_dev *pdev, struct aer_err_source *e_src) { - struct aer_stats *aer_stats = pdev->aer_stats; + struct aer_info *aer_info = pdev->aer_info; - if (!aer_stats) + if (!aer_info) return; if (e_src->status & PCI_ERR_ROOT_COR_RCV) - aer_stats->rootport_total_cor_errs++; + aer_info->rootport_total_cor_errs++; if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) { if (e_src->status & PCI_ERR_ROOT_FATAL_RCV) - aer_stats->rootport_total_fatal_errs++; + aer_info->rootport_total_fatal_errs++; else - aer_stats->rootport_total_nonfatal_errs++; + aer_info->rootport_total_nonfatal_errs++; } } diff --git a/include/linux/pci.h b/include/linux/pci.h index 0e8e3fd77e96..81a81dbfc873 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -346,7 +346,7 @@ struct pci_dev { u8 hdr_type; /* PCI header type (`multi' flag masked out) */ #ifdef CONFIG_PCIEAER u16 aer_cap; /* AER capability offset */ - struct aer_stats *aer_stats; /* AER stats for this device */ + struct aer_info *aer_info; /* AER info for this device */ #endif #ifdef CONFIG_PCIEPORTBUS struct rcec_ea *rcec_ea; /* RCEC cached endpoint association */ From 94bc15c3484aa1dbbd01aee8f9eaa5dc347a01a8 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 22 May 2025 18:21:22 -0500 Subject: [PATCH 16/21] PCI/AER: Convert aer_get_device_error_info(), aer_print_error() to index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously aer_get_device_error_info() and aer_print_error() took a pointer to struct aer_err_info and a pointer to a pci_dev. Typically the pci_dev was one of the elements of the aer_err_info.dev[] array (DPC was an exception, where the dev[] array was unused). Convert aer_get_device_error_info() and aer_print_error() to take an index into the aer_err_info.dev[] array instead. A future patch will add per-device ratelimit information, so the index makes it convenient to find the ratelimit associated with the device. To accommodate DPC, set info->dev[0] to the DPC port before using these interfaces. Signed-off-by: Bjorn Helgaas Reviewed-by: Ilpo Järvinen Reviewed-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20250522232339.1525671-17-helgaas@kernel.org --- drivers/pci/pci.h | 4 ++-- drivers/pci/pcie/aer.c | 33 +++++++++++++++++++++++---------- drivers/pci/pcie/dpc.c | 8 ++++++-- 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 1a9bfc708757..e1a28215967f 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -605,8 +605,8 @@ struct aer_err_info { struct pcie_tlp_log tlp; /* TLP Header */ }; -int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info); -void aer_print_error(struct pci_dev *dev, struct aer_err_info *info); +int aer_get_device_error_info(struct aer_err_info *info, int i); +void aer_print_error(struct aer_err_info *info, int i); int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2, unsigned int tlp_len, bool flit, diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 787a953fb331..ab99bc1b8542 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -705,12 +705,18 @@ static void aer_print_source(struct pci_dev *dev, struct aer_err_info *info, found ? "" : " (no details found"); } -void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) +void aer_print_error(struct aer_err_info *info, int i) { - int layer, agent; - int id = pci_dev_id(dev); + struct pci_dev *dev; + int layer, agent, id; const char *level = info->level; + if (WARN_ON_ONCE(i >= AER_MAX_MULTI_ERR_DEVICES)) + return; + + dev = info->dev[i]; + id = pci_dev_id(dev); + pci_dev_aer_stats_incr(dev, info); trace_aer_event(pci_name(dev), (info->status & ~info->mask), info->severity, info->tlp_header_valid, &info->tlp); @@ -1193,19 +1199,26 @@ EXPORT_SYMBOL_GPL(aer_recover_queue); /** * aer_get_device_error_info - read error status from dev and store it to info - * @dev: pointer to the device expected to have an error record * @info: pointer to structure to store the error record + * @i: index into info->dev[] * * Return: 1 on success, 0 on error. * * Note that @info is reused among all error devices. Clear fields properly. */ -int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) +int aer_get_device_error_info(struct aer_err_info *info, int i) { - int type = pci_pcie_type(dev); - int aer = dev->aer_cap; + struct pci_dev *dev; + int type, aer; u32 aercc; + if (i >= AER_MAX_MULTI_ERR_DEVICES) + return 0; + + dev = info->dev[i]; + aer = dev->aer_cap; + type = pci_pcie_type(dev); + /* Must reset in this function */ info->status = 0; info->tlp_header_valid = 0; @@ -1257,11 +1270,11 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info) /* Report all before handling them, to not lose records by reset etc. */ for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) { - if (aer_get_device_error_info(e_info->dev[i], e_info)) - aer_print_error(e_info->dev[i], e_info); + if (aer_get_device_error_info(e_info, i)) + aer_print_error(e_info, i); } for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) { - if (aer_get_device_error_info(e_info->dev[i], e_info)) + if (aer_get_device_error_info(e_info, i)) handle_error_source(e_info->dev[i], e_info); } } diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index 7ae1590ea1da..fc18349614d7 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -253,6 +253,10 @@ static int dpc_get_aer_uncorrect_severity(struct pci_dev *dev, info->severity = AER_NONFATAL; info->level = KERN_ERR; + + info->dev[0] = dev; + info->error_dev_num = 1; + return 1; } @@ -270,8 +274,8 @@ void dpc_process_error(struct pci_dev *pdev) pci_warn(pdev, "containment event, status:%#06x: unmasked uncorrectable error detected\n", status); if (dpc_get_aer_uncorrect_severity(pdev, &info) && - aer_get_device_error_info(pdev, &info)) { - aer_print_error(pdev, &info); + aer_get_device_error_info(&info, 0)) { + aer_print_error(&info, 0); pci_aer_clear_nonfatal_status(pdev); pci_aer_clear_fatal_status(pdev); } From d72bae423004aa7b4d94c34a7fd0b48b64305a08 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 22 May 2025 18:21:23 -0500 Subject: [PATCH 17/21] PCI/AER: Simplify add_error_device() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Return -ENOSPC error early so the usual path through add_error_device() is the straightline code. Signed-off-by: Bjorn Helgaas Reviewed-by: Ilpo Järvinen Reviewed-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20250522232339.1525671-18-helgaas@kernel.org --- drivers/pci/pcie/aer.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index ab99bc1b8542..5d62b82b5bc1 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -816,12 +816,15 @@ EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL"); */ static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev) { - if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) { - e_info->dev[e_info->error_dev_num] = pci_dev_get(dev); - e_info->error_dev_num++; - return 0; - } - return -ENOSPC; + int i = e_info->error_dev_num; + + if (i >= AER_MAX_MULTI_ERR_DEVICES) + return -ENOSPC; + + e_info->dev[i] = pci_dev_get(dev); + e_info->error_dev_num++; + + return 0; } /** From a57f2bfb4a5863f83087867c0e671f2418212d23 Mon Sep 17 00:00:00 2001 From: Jon Pan-Doh Date: Thu, 22 May 2025 18:21:24 -0500 Subject: [PATCH 18/21] PCI/AER: Ratelimit correctable and non-fatal error logging Spammy devices can flood kernel logs with AER errors and slow/stall execution. Add per-device ratelimits for AER correctable and non-fatal uncorrectable errors that use the kernel defaults (10 per 5s). Logging of fatal errors is not ratelimited. There are two AER logging entry points: - aer_print_error() is used by DPC and native AER - pci_print_aer() is used by GHES and CXL The native AER aer_print_error() case includes a loop that may log details from multiple devices, which are ratelimited individually. If we log details for any device, we also log the Error Source ID from the Root Port or RCEC. If no such device details are found, we still log the Error Source from the ERR_* Message, ratelimited by the Root Port or RCEC that received it. The DPC aer_print_error() case is not ratelimited, since this only happens for fatal errors. The CXL pci_print_aer() case is ratelimited by the Error Source device. The GHES pci_print_aer() case is via aer_recover_work_func(), which searches for the Error Source device. If the device is not found, there's no per-device ratelimit, so we use a system-wide ratelimit that covers all error types (correctable, non-fatal, and fatal). Sargun at Meta reported internally that a flood of AER errors causes RCU CPU stall warnings and CSD-lock warnings. Tested using aer-inject[1]. Sent 11 AER errors. Observed 10 errors logged while AER stats (cat /sys/bus/pci/devices//aer_dev_correctable) show true count of 11. [1] https://git.kernel.org/pub/scm/linux/kernel/git/gong.chen/aer-inject.git [bhelgaas: commit log, factor out trace_aer_event() and aer_print_rp_info() changes to previous patches, enable Error Source logging if any downstream detail will be printed, don't ratelimit fatal errors, "aer_report" -> "aer_info", "cor_log_ratelimit" -> "correctable_ratelimit", "uncor_log_ratelimit" -> "nonfatal_ratelimit"] Reported-by: Sargun Dhillon Signed-off-by: Jon Pan-Doh Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20250522232339.1525671-19-helgaas@kernel.org --- drivers/pci/pci.h | 4 ++- drivers/pci/pcie/aer.c | 66 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 64 insertions(+), 6 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index e1a28215967f..3023c68fe485 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -587,13 +587,15 @@ static inline bool pci_dev_test_and_set_removed(struct pci_dev *dev) struct aer_err_info { struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; + int ratelimit_print[AER_MAX_MULTI_ERR_DEVICES]; int error_dev_num; const char *level; /* printk level */ unsigned int id:16; unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */ - unsigned int __pad1:5; + unsigned int root_ratelimit_print:1; /* 0=skip, 1=print */ + unsigned int __pad1:4; unsigned int multi_error_valid:1; unsigned int first_error:5; diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 5d62b82b5bc1..7886a9ad126c 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -88,6 +89,10 @@ struct aer_info { u64 rootport_total_cor_errs; u64 rootport_total_fatal_errs; u64 rootport_total_nonfatal_errs; + + /* Ratelimits for errors */ + struct ratelimit_state correctable_ratelimit; + struct ratelimit_state nonfatal_ratelimit; }; #define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \ @@ -379,6 +384,11 @@ void pci_aer_init(struct pci_dev *dev) dev->aer_info = kzalloc(sizeof(*dev->aer_info), GFP_KERNEL); + ratelimit_state_init(&dev->aer_info->correctable_ratelimit, + DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + ratelimit_state_init(&dev->aer_info->nonfatal_ratelimit, + DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + /* * We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER, * PCI_ERR_COR_MASK, and PCI_ERR_CAP. Root and Root Complex Event @@ -669,6 +679,18 @@ static void pci_rootport_aer_stats_incr(struct pci_dev *pdev, } } +static int aer_ratelimit(struct pci_dev *dev, unsigned int severity) +{ + switch (severity) { + case AER_NONFATAL: + return __ratelimit(&dev->aer_info->nonfatal_ratelimit); + case AER_CORRECTABLE: + return __ratelimit(&dev->aer_info->correctable_ratelimit); + default: + return 1; /* Don't ratelimit fatal errors */ + } +} + static void __aer_print_error(struct pci_dev *dev, struct aer_err_info *info) { const char **strings; @@ -721,6 +743,9 @@ void aer_print_error(struct aer_err_info *info, int i) trace_aer_event(pci_name(dev), (info->status & ~info->mask), info->severity, info->tlp_header_valid, &info->tlp); + if (!info->ratelimit_print[i]) + return; + if (!info->status) { pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", aer_error_severity_string[info->severity]); @@ -790,6 +815,9 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, trace_aer_event(pci_name(dev), (status & ~mask), aer_severity, tlp_header_valid, &aer->header_log); + if (!aer_ratelimit(dev, info.severity)) + return; + layer = AER_GET_LAYER_ERROR(aer_severity, status); agent = AER_GET_AGENT(aer_severity, status); @@ -824,6 +852,18 @@ static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev) e_info->dev[i] = pci_dev_get(dev); e_info->error_dev_num++; + /* + * Ratelimit AER log messages. "dev" is either the source + * identified by the root's Error Source ID or it has an unmasked + * error logged in its own AER Capability. Messages are emitted + * when "ratelimit_print[i]" is non-zero. If we will print detail + * for a downstream device, make sure we print the Error Source ID + * from the root as well. + */ + if (aer_ratelimit(dev, e_info->severity)) { + e_info->ratelimit_print[i] = 1; + e_info->root_ratelimit_print = 1; + } return 0; } @@ -918,7 +958,7 @@ static int find_device_iter(struct pci_dev *dev, void *data) * e_info->error_dev_num and e_info->dev[], based on the given information. */ static bool find_source_device(struct pci_dev *parent, - struct aer_err_info *e_info) + struct aer_err_info *e_info) { struct pci_dev *dev = parent; int result; @@ -1144,9 +1184,10 @@ static void aer_recover_work_func(struct work_struct *work) pdev = pci_get_domain_bus_and_slot(entry.domain, entry.bus, entry.devfn); if (!pdev) { - pr_err("no pci_dev for %04x:%02x:%02x.%x\n", - entry.domain, entry.bus, - PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn)); + pr_err_ratelimited("%04x:%02x:%02x.%x: no pci_dev found\n", + entry.domain, entry.bus, + PCI_SLOT(entry.devfn), + PCI_FUNC(entry.devfn)); continue; } pci_print_aer(pdev, entry.severity, entry.regs); @@ -1294,7 +1335,22 @@ static void aer_isr_one_error_type(struct pci_dev *root, bool found; found = find_source_device(root, info); - aer_print_source(root, info, found); + + /* + * If we're going to log error messages, we've already set + * "info->root_ratelimit_print" and "info->ratelimit_print[i]" to + * non-zero (which enables printing) because this is either an + * ERR_FATAL or we found a device with an error logged in its AER + * Capability. + * + * If we didn't find the Error Source device, at least log the + * Requester ID from the ERR_* Message received by the Root Port or + * RCEC, ratelimited by the RP or RCEC. + */ + if (info->root_ratelimit_print || + (!found && aer_ratelimit(root, info->severity))) + aer_print_source(root, info, found); + if (found) aer_process_err_devices(info); } From 24816cc298ee9cc01003b10b888427faada4403d Mon Sep 17 00:00:00 2001 From: Jon Pan-Doh Date: Thu, 22 May 2025 18:21:25 -0500 Subject: [PATCH 19/21] PCI/AER: Add ratelimits to PCI AER Documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ratelimits section for rationale and defaults. [bhelgaas: note fatal errors are not ratelimited] Signed-off-by: Karolina Stolarek Signed-off-by: Jon Pan-Doh Signed-off-by: Bjorn Helgaas Tested-by: Krzysztof Wilczyński Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Paul E. McKenney Link: https://patch.msgid.link/20250522232339.1525671-20-helgaas@kernel.org --- Documentation/PCI/pcieaer-howto.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Documentation/PCI/pcieaer-howto.rst b/Documentation/PCI/pcieaer-howto.rst index f013f3b27c82..6fb31516fff1 100644 --- a/Documentation/PCI/pcieaer-howto.rst +++ b/Documentation/PCI/pcieaer-howto.rst @@ -85,6 +85,18 @@ In the example, 'Requester ID' means the ID of the device that sent the error message to the Root Port. Please refer to PCIe specs for other fields. +AER Ratelimits +-------------- + +Since error messages can be generated for each transaction, we may see +large volumes of errors reported. To prevent spammy devices from flooding +the console/stalling execution, messages are throttled by device and error +type (correctable vs. non-fatal uncorrectable). Fatal errors, including +DPC errors, are not ratelimited. + +AER uses the default ratelimit of DEFAULT_RATELIMIT_BURST (10 events) over +DEFAULT_RATELIMIT_INTERVAL (5 seconds). + AER Statistics / Counters ------------------------- From b4fe7398def6df344442b884d2288f80205cbd2d Mon Sep 17 00:00:00 2001 From: Jon Pan-Doh Date: Thu, 22 May 2025 18:21:26 -0500 Subject: [PATCH 20/21] PCI/AER: Add sysfs attributes for log ratelimits Allow userspace to read/write log ratelimits per device (including enable/disable). Create aer/ sysfs directory to store them and any future AER configs. The new sysfs files are: /sys/bus/pci/devices/*/aer/correctable_ratelimit_burst /sys/bus/pci/devices/*/aer/correctable_ratelimit_interval_ms /sys/bus/pci/devices/*/aer/nonfatal_ratelimit_burst /sys/bus/pci/devices/*/aer/nonfatal_ratelimit_interval_ms The default values are ratelimit_burst=10, ratelimit_interval_ms=5000, so if we try to emit more than 10 messages in a 5 second period, some are suppressed. Update AER sysfs ABI filename to reflect the broader scope of AER sysfs attributes (e.g. stats and ratelimits). Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats -> sysfs-bus-pci-devices-aer Tested using aer-inject[1]. Configured correctable log ratelimit to 5. Sent 6 AER errors. Observed 5 errors logged while AER stats (cat /sys/bus/pci/devices//aer_dev_correctable) shows 6. Disabled ratelimiting and sent 6 more AER errors. Observed all 6 errors logged and accounted in AER stats (12 total errors). [1] https://git.kernel.org/pub/scm/linux/kernel/git/gong.chen/aer-inject.git [bhelgaas: note fatal errors are not ratelimited, "aer_report" -> "aer_info", replace ratelimit_log_enable toggle with *_ratelimit_interval_ms] Signed-off-by: Karolina Stolarek Signed-off-by: Jon Pan-Doh Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20250522232339.1525671-21-helgaas@kernel.org --- ...es-aer_stats => sysfs-bus-pci-devices-aer} | 44 ++++++++ Documentation/PCI/pcieaer-howto.rst | 5 +- drivers/pci/pci-sysfs.c | 1 + drivers/pci/pci.h | 1 + drivers/pci/pcie/aer.c | 105 ++++++++++++++++++ 5 files changed, 155 insertions(+), 1 deletion(-) rename Documentation/ABI/testing/{sysfs-bus-pci-devices-aer_stats => sysfs-bus-pci-devices-aer} (72%) diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer similarity index 72% rename from Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats rename to Documentation/ABI/testing/sysfs-bus-pci-devices-aer index d1f67bb81d5d..5ed284523956 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats +++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer @@ -117,3 +117,47 @@ Date: July 2018 KernelVersion: 4.19.0 Contact: linux-pci@vger.kernel.org, rajatja@google.com Description: Total number of ERR_NONFATAL messages reported to rootport. + +PCIe AER ratelimits +------------------- + +These attributes show up under all the devices that are AER capable. +They represent configurable ratelimits of logs per error type. + +See Documentation/PCI/pcieaer-howto.rst for more info on ratelimits. + +What: /sys/bus/pci/devices//aer/correctable_ratelimit_interval_ms +Date: May 2025 +KernelVersion: 6.16.0 +Contact: linux-pci@vger.kernel.org +Description: Writing 0 disables AER correctable error log ratelimiting. + Writing a positive value sets the ratelimit interval in ms. + Default is DEFAULT_RATELIMIT_INTERVAL (5000 ms). + +What: /sys/bus/pci/devices//aer/correctable_ratelimit_burst +Date: May 2025 +KernelVersion: 6.16.0 +Contact: linux-pci@vger.kernel.org +Description: Ratelimit burst for correctable error logs. Writing a value + changes the number of errors (burst) allowed per interval + before ratelimiting. Reading gets the current ratelimit + burst. Default is DEFAULT_RATELIMIT_BURST (10). + +What: /sys/bus/pci/devices//aer/nonfatal_ratelimit_interval_ms +Date: May 2025 +KernelVersion: 6.16.0 +Contact: linux-pci@vger.kernel.org +Description: Writing 0 disables AER non-fatal uncorrectable error log + ratelimiting. Writing a positive value sets the ratelimit + interval in ms. Default is DEFAULT_RATELIMIT_INTERVAL + (5000 ms). + +What: /sys/bus/pci/devices//aer/nonfatal_ratelimit_burst +Date: May 2025 +KernelVersion: 6.16.0 +Contact: linux-pci@vger.kernel.org +Description: Ratelimit burst for non-fatal uncorrectable error logs. + Writing a value changes the number of errors (burst) + allowed per interval before ratelimiting. Reading gets the + current ratelimit burst. Default is DEFAULT_RATELIMIT_BURST + (10). diff --git a/Documentation/PCI/pcieaer-howto.rst b/Documentation/PCI/pcieaer-howto.rst index 6fb31516fff1..4b71e2f43ca7 100644 --- a/Documentation/PCI/pcieaer-howto.rst +++ b/Documentation/PCI/pcieaer-howto.rst @@ -97,12 +97,15 @@ DPC errors, are not ratelimited. AER uses the default ratelimit of DEFAULT_RATELIMIT_BURST (10 events) over DEFAULT_RATELIMIT_INTERVAL (5 seconds). +Ratelimits are exposed in the form of sysfs attributes and configurable. +See Documentation/ABI/testing/sysfs-bus-pci-devices-aer. + AER Statistics / Counters ------------------------- When PCIe AER errors are captured, the counters / statistics are also exposed in the form of sysfs attributes which are documented at -Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats +Documentation/ABI/testing/sysfs-bus-pci-devices-aer. Developer Guide =============== diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index c6cda56ca52c..278de99b00ce 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1805,6 +1805,7 @@ const struct attribute_group *pci_dev_attr_groups[] = { &pcie_dev_attr_group, #ifdef CONFIG_PCIEAER &aer_stats_attr_group, + &aer_attr_group, #endif #ifdef CONFIG_PCIEASPM &aspm_ctrl_attr_group, diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 3023c68fe485..eca2812cfd25 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -965,6 +965,7 @@ void pci_no_aer(void); void pci_aer_init(struct pci_dev *dev); void pci_aer_exit(struct pci_dev *dev); extern const struct attribute_group aer_stats_attr_group; +extern const struct attribute_group aer_attr_group; void pci_aer_clear_fatal_status(struct pci_dev *dev); int pci_aer_clear_status(struct pci_dev *dev); int pci_aer_raw_clear_status(struct pci_dev *dev); diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 7886a9ad126c..70ac66188367 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -627,6 +627,111 @@ const struct attribute_group aer_stats_attr_group = { .is_visible = aer_stats_attrs_are_visible, }; +/* + * Ratelimit interval + * <=0: disabled with ratelimit.interval = 0 + * >0: enabled with ratelimit.interval in ms + */ +#define aer_ratelimit_interval_attr(name, ratelimit) \ + static ssize_t \ + name##_show(struct device *dev, struct device_attribute *attr, \ + char *buf) \ + { \ + struct pci_dev *pdev = to_pci_dev(dev); \ + \ + return sysfs_emit(buf, "%d\n", \ + pdev->aer_info->ratelimit.interval); \ + } \ + \ + static ssize_t \ + name##_store(struct device *dev, struct device_attribute *attr, \ + const char *buf, size_t count) \ + { \ + struct pci_dev *pdev = to_pci_dev(dev); \ + int interval; \ + \ + if (!capable(CAP_SYS_ADMIN)) \ + return -EPERM; \ + \ + if (kstrtoint(buf, 0, &interval) < 0) \ + return -EINVAL; \ + \ + if (interval <= 0) \ + interval = 0; \ + else \ + interval = msecs_to_jiffies(interval); \ + \ + pdev->aer_info->ratelimit.interval = interval; \ + \ + return count; \ + } \ + static DEVICE_ATTR_RW(name); + +#define aer_ratelimit_burst_attr(name, ratelimit) \ + static ssize_t \ + name##_show(struct device *dev, struct device_attribute *attr, \ + char *buf) \ + { \ + struct pci_dev *pdev = to_pci_dev(dev); \ + \ + return sysfs_emit(buf, "%d\n", \ + pdev->aer_info->ratelimit.burst); \ + } \ + \ + static ssize_t \ + name##_store(struct device *dev, struct device_attribute *attr, \ + const char *buf, size_t count) \ + { \ + struct pci_dev *pdev = to_pci_dev(dev); \ + int burst; \ + \ + if (!capable(CAP_SYS_ADMIN)) \ + return -EPERM; \ + \ + if (kstrtoint(buf, 0, &burst) < 0) \ + return -EINVAL; \ + \ + pdev->aer_info->ratelimit.burst = burst; \ + \ + return count; \ + } \ + static DEVICE_ATTR_RW(name); + +#define aer_ratelimit_attrs(name) \ + aer_ratelimit_interval_attr(name##_ratelimit_interval_ms, \ + name##_ratelimit) \ + aer_ratelimit_burst_attr(name##_ratelimit_burst, \ + name##_ratelimit) + +aer_ratelimit_attrs(correctable) +aer_ratelimit_attrs(nonfatal) + +static struct attribute *aer_attrs[] = { + &dev_attr_correctable_ratelimit_interval_ms.attr, + &dev_attr_correctable_ratelimit_burst.attr, + &dev_attr_nonfatal_ratelimit_interval_ms.attr, + &dev_attr_nonfatal_ratelimit_burst.attr, + NULL +}; + +static umode_t aer_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct pci_dev *pdev = to_pci_dev(dev); + + if (!pdev->aer_info) + return 0; + + return a->mode; +} + +const struct attribute_group aer_attr_group = { + .name = "aer", + .attrs = aer_attrs, + .is_visible = aer_attrs_are_visible, +}; + static void pci_dev_aer_stats_incr(struct pci_dev *pdev, struct aer_err_info *info) { From b06d125e6280603a34d9064cd9c12748ca2edb04 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 8 May 2025 12:40:30 +0530 Subject: [PATCH 21/21] PCI/ERR: Remove misleading TODO regarding kernel panic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A PCI device is just another peripheral in a system. So failure to recover it, must not result in a kernel panic. So remove the TODO which is quite misleading. Signed-off-by: Manivannan Sadhasivam Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas Reviewed-by: Wilfred Mallawa Link: https://patch.msgid.link/20250508-pcie-reset-slot-v4-1-7050093e2b50@linaro.org --- drivers/pci/pcie/err.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 31090770fffc..de6381c690f5 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -271,7 +271,6 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, pci_uevent_ers(bridge, PCI_ERS_RESULT_DISCONNECT); - /* TODO: Should kernel panic here? */ pci_info(bridge, "device recovery failed\n"); return status;