From 248ed9e227e6cf59acb1aaf3aa30d530a0232c1a Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Thu, 23 Mar 2023 16:35:49 +0800 Subject: [PATCH 01/22] accel/habanalabs: Remove redundant pci_clear_master Remove pci_clear_master to simplify the code, the bus-mastering is also cleared in do_pci_disable_device, like this: ./drivers/pci/pci.c:2197 static void do_pci_disable_device(struct pci_dev *dev) { u16 pci_command; pci_read_config_word(dev, PCI_COMMAND, &pci_command); if (pci_command & PCI_COMMAND_MASTER) { pci_command &= ~PCI_COMMAND_MASTER; pci_write_config_word(dev, PCI_COMMAND, pci_command); } pcibios_disable_device(dev); }. And dev->is_busmaster is set to 0 in pci_disable_device. Signed-off-by: Cai Huoqing Reviewed-by: Stanislaw Gruszka Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/device.c | 1 - drivers/accel/habanalabs/common/pci/pci.c | 2 -- 2 files changed, 3 deletions(-) diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 713005998cbc..2fb1e2ec3a83 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -1271,7 +1271,6 @@ int hl_device_resume(struct hl_device *hdev) return 0; disable_device: - pci_clear_master(hdev->pdev); pci_disable_device(hdev->pdev); return rc; diff --git a/drivers/accel/habanalabs/common/pci/pci.c b/drivers/accel/habanalabs/common/pci/pci.c index d1f4c695baf2..191e0e3cf3a5 100644 --- a/drivers/accel/habanalabs/common/pci/pci.c +++ b/drivers/accel/habanalabs/common/pci/pci.c @@ -420,7 +420,6 @@ int hl_pci_init(struct hl_device *hdev) unmap_pci_bars: hl_pci_bars_unmap(hdev); disable_device: - pci_clear_master(pdev); pci_disable_device(pdev); return rc; @@ -436,6 +435,5 @@ void hl_pci_fini(struct hl_device *hdev) { hl_pci_bars_unmap(hdev); - pci_clear_master(hdev->pdev); pci_disable_device(hdev->pdev); } From 6c31c13759272818108a329f166d86846d0e3f7a Mon Sep 17 00:00:00 2001 From: Koby Elbaz Date: 
Wed, 8 Mar 2023 17:53:39 +0200 Subject: [PATCH 02/22] accel/habanalabs: unmap mapped memory when TLB inv fails Once a memory mapping is added to the page tables, it's followed by a TLB invalidation request which could potentially fail (HW failure). Removing the mapping is simply a part of this failure handling routine. TLB invalidation failure prints were updated to be more accurate. Signed-off-by: Koby Elbaz Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay Reviewed-by: Stanislaw Gruszka --- drivers/accel/habanalabs/common/command_buffer.c | 15 ++++++++++++--- drivers/accel/habanalabs/common/mmu/mmu.c | 8 ++++++-- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/accel/habanalabs/common/command_buffer.c b/drivers/accel/habanalabs/common/command_buffer.c index 3a0535ac28b1..6e09f48750a0 100644 --- a/drivers/accel/habanalabs/common/command_buffer.c +++ b/drivers/accel/habanalabs/common/command_buffer.c @@ -45,20 +45,29 @@ static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb) } mutex_lock(&hdev->mmu_lock); + rc = hl_mmu_map_contiguous(ctx, cb->virtual_addr, cb->bus_address, cb->roundup_size); if (rc) { dev_err(hdev->dev, "Failed to map VA %#llx to CB\n", cb->virtual_addr); - goto err_va_umap; + goto err_va_pool_free; } + rc = hl_mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR | MMU_OP_SKIP_LOW_CACHE_INV); + if (rc) + goto err_mmu_unmap; + mutex_unlock(&hdev->mmu_lock); cb->is_mmu_mapped = true; - return rc; -err_va_umap: + return 0; + +err_mmu_unmap: + hl_mmu_unmap_contiguous(ctx, cb->virtual_addr, cb->roundup_size); +err_va_pool_free: mutex_unlock(&hdev->mmu_lock); gen_pool_free(ctx->cb_va_pool, cb->virtual_addr, cb->roundup_size); + return rc; } diff --git a/drivers/accel/habanalabs/common/mmu/mmu.c b/drivers/accel/habanalabs/common/mmu/mmu.c index 17581b1bcc77..f379e5b461a6 100644 --- a/drivers/accel/habanalabs/common/mmu/mmu.c +++ b/drivers/accel/habanalabs/common/mmu/mmu.c @@ -679,7 +679,9 @@ int hl_mmu_invalidate_cache(struct 
hl_device *hdev, bool is_hard, u32 flags) rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, is_hard, flags); if (rc) - dev_err_ratelimited(hdev->dev, "MMU cache invalidation failed\n"); + dev_err_ratelimited(hdev->dev, + "%s cache invalidation failed, rc=%d\n", + flags == VM_TYPE_USERPTR ? "PMMU" : "HMMU", rc); return rc; } @@ -692,7 +694,9 @@ int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard, rc = hdev->asic_funcs->mmu_invalidate_cache_range(hdev, is_hard, flags, asid, va, size); if (rc) - dev_err_ratelimited(hdev->dev, "MMU cache range invalidation failed\n"); + dev_err_ratelimited(hdev->dev, + "%s cache range invalidation failed: va=%#llx, size=%llu, rc=%d", + flags == VM_TYPE_USERPTR ? "PMMU" : "HMMU", va, size, rc); return rc; } From 957b247bca4309b07df991ce114ea7a617b2c9fb Mon Sep 17 00:00:00 2001 From: Tal Cohen Date: Thu, 16 Mar 2023 17:30:46 +0200 Subject: [PATCH 03/22] accel/habanalabs: print event type when device is disabled When the device is in disabled state, the driver isn't supposed to receive any events from FW. Printing the event type, as part of the message that was already printed, shall help to get more info if this unexpected message is received. 
Signed-off-by: Tal Cohen Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay Reviewed-by: Stanislaw Gruszka --- drivers/accel/habanalabs/common/irq.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/accel/habanalabs/common/irq.c b/drivers/accel/habanalabs/common/irq.c index fab1abc5c910..0d59bb7c9063 100644 --- a/drivers/accel/habanalabs/common/irq.c +++ b/drivers/accel/habanalabs/common/irq.c @@ -415,8 +415,8 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg) struct hl_eq_entry *eq_base; struct hl_eqe_work *handle_eqe_work; bool entry_ready; - u32 cur_eqe; - u16 cur_eqe_index; + u32 cur_eqe, ctl; + u16 cur_eqe_index, event_type; eq_base = eq->kernel_address; @@ -449,7 +449,10 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg) dma_rmb(); if (hdev->disabled && !hdev->reset_info.in_compute_reset) { - dev_warn(hdev->dev, "Device disabled but received an EQ event\n"); + ctl = le32_to_cpu(eq_entry->hdr.ctl); + event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK) >> EQ_CTL_EVENT_TYPE_SHIFT); + dev_warn(hdev->dev, + "Device disabled but received an EQ event (%u)\n", event_type); goto skip_irq; } From fb10da9337105d51fcc9f64801ede35e098b6ec5 Mon Sep 17 00:00:00 2001 From: Dafna Hirschfeld Date: Tue, 21 Mar 2023 16:17:37 +0200 Subject: [PATCH 04/22] accel/habanalabs: check return value of add_va_block_locked since the function might fail and we should propagate the failure. 
Signed-off-by: Dafna Hirschfeld Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay Reviewed-by: Stanislaw Gruszka --- drivers/accel/habanalabs/common/memory.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c index 17b79d717896..a7b6a273ce21 100644 --- a/drivers/accel/habanalabs/common/memory.c +++ b/drivers/accel/habanalabs/common/memory.c @@ -605,6 +605,7 @@ static u64 get_va_block(struct hl_device *hdev, bool is_align_pow_2 = is_power_of_2(va_range->page_size); bool is_hint_dram_addr = hl_is_dram_va(hdev, hint_addr); bool force_hint = flags & HL_MEM_FORCE_HINT; + int rc; if (is_align_pow_2) align_mask = ~((u64)va_block_align - 1); @@ -722,9 +723,13 @@ static u64 get_va_block(struct hl_device *hdev, kfree(new_va_block); } - if (add_prev) - add_va_block_locked(hdev, &va_range->list, prev_start, - prev_end); + if (add_prev) { + rc = add_va_block_locked(hdev, &va_range->list, prev_start, prev_end); + if (rc) { + reserved_valid_start = 0; + goto out; + } + } print_va_list_locked(hdev, &va_range->list); out: From 9d7fef7c5963b90160099b568f84188f394ebd11 Mon Sep 17 00:00:00 2001 From: Koby Elbaz Date: Tue, 21 Mar 2023 16:03:07 +0200 Subject: [PATCH 05/22] accel/habanalabs: change COMMS warning messages to error level COMMS protocol is used for LKD <--> FW communication, and any communication failure between the two might turn out to be destructive, hence, it should be well emphasized. 
Signed-off-by: Koby Elbaz Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay Reviewed-by: Stanislaw Gruszka --- drivers/accel/habanalabs/common/firmware_if.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index 7ea611392f8c..96027a1c124d 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -1263,7 +1263,7 @@ void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev) COMMS_RST_DEV, 0, false, hdev->fw_loader.cpu_timeout); if (rc) - dev_warn(hdev->dev, "Failed sending COMMS_RST_DEV\n"); + dev_err(hdev->dev, "Failed sending COMMS_RST_DEV\n"); } else { WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_RST_DEV); } @@ -1284,7 +1284,7 @@ void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev) COMMS_GOTO_WFE, 0, true, hdev->fw_loader.cpu_timeout); if (rc) - dev_warn(hdev->dev, "Failed sending COMMS_GOTO_WFE\n"); + dev_err(hdev->dev, "Failed sending COMMS_GOTO_WFE\n"); } else { WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_GOTO_WFE); msleep(static_loader->cpu_reset_wait_msec); From a855f710f5d40537bc2bf5f924827416653db39f Mon Sep 17 00:00:00 2001 From: Tal Cohen Date: Tue, 21 Mar 2023 17:27:24 +0200 Subject: [PATCH 06/22] accel/habanalabs: remove duplicated disable pci msg The disable pci message is sent in reset device. It informs the FW not to raise more EQs. The Driver may ignore received EQs, when the device is in disabled mode. The duplication happens when hard reset is scheduled during compute reset and also performs 'escalate_reset_flow'. 
Signed-off-by: Tal Cohen Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay Reviewed-by: Stanislaw Gruszka --- drivers/accel/habanalabs/common/device.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 2fb1e2ec3a83..c36de13d6729 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -1822,9 +1822,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n"); flags = hdev->reset_info.hard_reset_schedule_flags; hdev->reset_info.hard_reset_schedule_flags = 0; - hdev->disabled = true; hard_reset = true; - handle_reset_trigger(hdev, flags); goto escalate_reset_flow; } } From 3a8d7c3a7d1bf7d1f2121c1f467d6b349b7bf807 Mon Sep 17 00:00:00 2001 From: Tal Cohen Date: Wed, 22 Mar 2023 11:20:05 +0200 Subject: [PATCH 07/22] accel/habanalabs: send disable pci when compute ctx is active Fix an issue in hard reset flow in which the driver didn't send a disable pci message if there was an active compute context. In hard reset, disable pci message should be sent no matter if a compute context exists or not. 
Signed-off-by: Tal Cohen Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay Reviewed-by: Stanislaw Gruszka --- drivers/accel/habanalabs/common/device.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index c36de13d6729..3c1af9d43b65 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -1386,7 +1386,7 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags) /* No consecutive mechanism when user context exists */ if (hdev->is_compute_ctx_active) - return; + goto disable_pci; /* * 'reset cause' is being updated here, because getting here @@ -1425,6 +1425,8 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags) * If F/W is performing the reset, no need to send it a message to disable * PCI access */ + +disable_pci: if ((flags & HL_DRV_RESET_HARD) && !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) { /* Disable PCI access from device F/W so he won't send From 6306e815836c753c8de5f075fd14f4a2783e882f Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Thu, 23 Mar 2023 19:40:22 +0200 Subject: [PATCH 08/22] accel/habanalabs: fix access error clear event The register which needs to be cleared is the valid register instead of the address. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index edcbda3d9b40..bace4ac998e0 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -8884,7 +8884,7 @@ static void gaudi2_handle_access_error(struct hl_device *hdev, u64 mmu_base, boo dev_err_ratelimited(hdev->dev, "%s access error on va 0x%llx\n", is_pmmu ? 
"PMMU" : "HMMU", addr); - WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_ERROR_CAPTURE), 0); + WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID), 0); } static int gaudi2_handle_mmu_spi_sei_generic(struct hl_device *hdev, u16 event_type, From 12f7701138846249fb09dd22e85b88563c708a41 Mon Sep 17 00:00:00 2001 From: Dafna Hirschfeld Date: Thu, 16 Mar 2023 10:45:47 +0200 Subject: [PATCH 09/22] accel/habanalabs: improvements to FW ver extraction 1. Rename the func to hl_get_preboot_major_minor because we also set the extracted values in hdev fields. 2. Free the allocated string in the calling function which makes more sense Signed-off-by: Dafna Hirschfeld Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/firmware_if.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index 96027a1c124d..781256dd49ad 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -71,7 +71,7 @@ static char *extract_fw_ver_from_str(const char *fw_str) return NULL; } -static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver) +static int hl_get_preboot_major_minor(struct hl_device *hdev, char *preboot_ver) { char major[8], minor[8], *first_dot, *second_dot; int rc; @@ -86,7 +86,7 @@ static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver) if (rc) { dev_err(hdev->dev, "Error %d parsing preboot major version\n", rc); - goto out; + return rc; } /* skip the first dot */ @@ -102,9 +102,6 @@ static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver) if (rc) dev_err(hdev->dev, "Error %d parsing preboot minor version\n", rc); - -out: - kfree(preboot_ver); return rc; } @@ -2181,8 +2178,8 @@ static int hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev, dev_info(hdev->dev, "preboot version %s\n", 
preboot_ver); - /* This function takes care of freeing preboot_ver */ - rc = extract_fw_sub_versions(hdev, preboot_ver); + rc = hl_get_preboot_major_minor(hdev, preboot_ver); + kfree(preboot_ver); if (rc) return rc; } From d1943f1b97790f79474c1b55f37d24f714313bb9 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Wed, 15 Mar 2023 10:36:41 +0200 Subject: [PATCH 10/22] accel/habanalabs: fix HBM MMU interrupt handling Current mapping between HMMU event and HMMU block is wrong. In addition the captured address in case of a page fault or an access error is scrambled, Hence we must call the descramble function. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 145 +++++++++++++++++------ 1 file changed, 108 insertions(+), 37 deletions(-) diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index bace4ac998e0..ad491fb2c39d 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -2112,6 +2112,7 @@ static bool gaudi2_get_mme_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 static bool gaudi2_get_edma_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len, struct engines_data *e); static u64 gaudi2_mmu_scramble_addr(struct hl_device *hdev, u64 raw_addr); +static u64 gaudi2_mmu_descramble_addr(struct hl_device *hdev, u64 scrambled_addr); static void gaudi2_init_scrambler_hbm(struct hl_device *hdev) { @@ -8844,7 +8845,7 @@ static int gaudi2_handle_hif_fatal(struct hl_device *hdev, u16 event_type, u64 i static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool is_pmmu, u64 *event_mask) { - u32 valid, val, axid_l, axid_h; + u32 valid, val; u64 addr; valid = RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID)); @@ -8857,11 +8858,11 @@ static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool addr <<= 32; addr |= RREG32(mmu_base + 
MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_ERROR_CAPTURE_VA)); - axid_l = RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_FAULT_ID_LSB)); - axid_h = RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_FAULT_ID_MSB)); + if (!is_pmmu) + addr = gaudi2_mmu_descramble_addr(hdev, addr); - dev_err_ratelimited(hdev->dev, "%s page fault on va 0x%llx, transaction id 0x%llX\n", - is_pmmu ? "PMMU" : "HMMU", addr, ((u64)axid_h << 32) + axid_l); + dev_err_ratelimited(hdev->dev, "%s page fault on va 0x%llx\n", + is_pmmu ? "PMMU" : "HMMU", addr); hl_handle_page_fault(hdev, addr, 0, is_pmmu, event_mask); WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID), 0); @@ -8882,6 +8883,9 @@ static void gaudi2_handle_access_error(struct hl_device *hdev, u64 mmu_base, boo addr <<= 32; addr |= RREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_ERROR_CAPTURE_VA)); + if (!is_pmmu) + addr = gaudi2_mmu_descramble_addr(hdev, addr); + dev_err_ratelimited(hdev->dev, "%s access error on va 0x%llx\n", is_pmmu ? "PMMU" : "HMMU", addr); WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_ACCESS_PAGE_ERROR_VALID), 0); @@ -8976,46 +8980,110 @@ static int gaudi2_handle_sm_err(struct hl_device *hdev, u16 event_type, u8 sm_in return error_count; } +static u64 get_hmmu_base(u16 event_type) +{ + u8 dcore, index_in_dcore; + + switch (event_type) { + case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU0_SPI_BASE ... GAUDI2_EVENT_HMMU0_SECURITY_ERROR: + dcore = 0; + index_in_dcore = 0; + break; + case GAUDI2_EVENT_HMMU_1_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU1_SPI_BASE ... GAUDI2_EVENT_HMMU1_SECURITY_ERROR: + dcore = 1; + index_in_dcore = 0; + break; + case GAUDI2_EVENT_HMMU_2_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU2_SPI_BASE ... GAUDI2_EVENT_HMMU2_SECURITY_ERROR: + dcore = 0; + index_in_dcore = 1; + break; + case GAUDI2_EVENT_HMMU_3_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU3_SPI_BASE ... 
GAUDI2_EVENT_HMMU3_SECURITY_ERROR: + dcore = 1; + index_in_dcore = 1; + break; + case GAUDI2_EVENT_HMMU_4_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU4_SPI_BASE ... GAUDI2_EVENT_HMMU4_SECURITY_ERROR: + dcore = 3; + index_in_dcore = 2; + break; + case GAUDI2_EVENT_HMMU_5_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU5_SPI_BASE ... GAUDI2_EVENT_HMMU5_SECURITY_ERROR: + dcore = 2; + index_in_dcore = 2; + break; + case GAUDI2_EVENT_HMMU_6_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU6_SPI_BASE ... GAUDI2_EVENT_HMMU6_SECURITY_ERROR: + dcore = 3; + index_in_dcore = 3; + break; + case GAUDI2_EVENT_HMMU_7_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU7_SPI_BASE ... GAUDI2_EVENT_HMMU7_SECURITY_ERROR: + dcore = 2; + index_in_dcore = 3; + break; + case GAUDI2_EVENT_HMMU_8_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU8_SPI_BASE ... GAUDI2_EVENT_HMMU8_SECURITY_ERROR: + dcore = 0; + index_in_dcore = 2; + break; + case GAUDI2_EVENT_HMMU_9_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU9_SPI_BASE ... GAUDI2_EVENT_HMMU9_SECURITY_ERROR: + dcore = 1; + index_in_dcore = 2; + break; + case GAUDI2_EVENT_HMMU_10_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU10_SPI_BASE ... GAUDI2_EVENT_HMMU10_SECURITY_ERROR: + dcore = 0; + index_in_dcore = 3; + break; + case GAUDI2_EVENT_HMMU_11_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU11_SPI_BASE ... GAUDI2_EVENT_HMMU11_SECURITY_ERROR: + dcore = 1; + index_in_dcore = 3; + break; + case GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU12_SPI_BASE ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR: + dcore = 3; + index_in_dcore = 0; + break; + case GAUDI2_EVENT_HMMU_13_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU13_SPI_BASE ... GAUDI2_EVENT_HMMU13_SECURITY_ERROR: + dcore = 2; + index_in_dcore = 0; + break; + case GAUDI2_EVENT_HMMU_14_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU14_SPI_BASE ... GAUDI2_EVENT_HMMU14_SECURITY_ERROR: + dcore = 3; + index_in_dcore = 1; + break; + case GAUDI2_EVENT_HMMU_15_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU15_SPI_BASE ... 
GAUDI2_EVENT_HMMU15_SECURITY_ERROR: + dcore = 2; + index_in_dcore = 1; + break; + default: + return ULONG_MAX; + } + + return mmDCORE0_HMMU0_MMU_BASE + dcore * DCORE_OFFSET + index_in_dcore * DCORE_HMMU_OFFSET; +} + static int gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type, u64 *event_mask) { bool is_pmmu = false; u32 error_count = 0; u64 mmu_base; - u8 index; switch (event_type) { - case GAUDI2_EVENT_HMMU0_PAGE_FAULT_OR_WR_PERM ... GAUDI2_EVENT_HMMU3_SECURITY_ERROR: - index = (event_type - GAUDI2_EVENT_HMMU0_PAGE_FAULT_OR_WR_PERM) / 3; - mmu_base = mmDCORE0_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - break; - case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_3_AXI_ERR_RSP: - index = (event_type - GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP); - mmu_base = mmDCORE0_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - break; - case GAUDI2_EVENT_HMMU8_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_HMMU11_SECURITY_ERROR: - index = (event_type - GAUDI2_EVENT_HMMU8_PAGE_FAULT_WR_PERM) / 3; - mmu_base = mmDCORE1_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - break; - case GAUDI2_EVENT_HMMU_8_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_11_AXI_ERR_RSP: - index = (event_type - GAUDI2_EVENT_HMMU_8_AXI_ERR_RSP); - mmu_base = mmDCORE1_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - break; - case GAUDI2_EVENT_HMMU7_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_HMMU4_SECURITY_ERROR: - index = (event_type - GAUDI2_EVENT_HMMU7_PAGE_FAULT_WR_PERM) / 3; - mmu_base = mmDCORE2_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - break; - case GAUDI2_EVENT_HMMU_7_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_4_AXI_ERR_RSP: - index = (event_type - GAUDI2_EVENT_HMMU_7_AXI_ERR_RSP); - mmu_base = mmDCORE2_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - break; - case GAUDI2_EVENT_HMMU15_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR: - index = (event_type - GAUDI2_EVENT_HMMU15_PAGE_FAULT_WR_PERM) / 3; - mmu_base = mmDCORE3_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; - break; - case GAUDI2_EVENT_HMMU_15_AXI_ERR_RSP ... 
GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP: - index = (event_type - GAUDI2_EVENT_HMMU_15_AXI_ERR_RSP); - mmu_base = mmDCORE3_HMMU0_MMU_BASE + index * DCORE_HMMU_OFFSET; + case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP: + case GAUDI2_EVENT_HMMU0_SPI_BASE ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR: + mmu_base = get_hmmu_base(event_type); break; + case GAUDI2_EVENT_PMMU0_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_PMMU0_SECURITY_ERROR: case GAUDI2_EVENT_PMMU_AXI_ERR_RSP_0: is_pmmu = true; @@ -9025,6 +9093,9 @@ static int gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type, return 0; } + if (mmu_base == ULONG_MAX) + return 0; + error_count = gaudi2_handle_mmu_spi_sei_generic(hdev, event_type, mmu_base, is_pmmu, event_mask); hl_check_for_glbl_errors(hdev); From 49fd071d1572dc8ae824146394dcda883dac3a1f Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Sun, 26 Mar 2023 14:01:54 +0300 Subject: [PATCH 11/22] accel/habanalabs: print raw binning masks in debug level There are rare cases of failures when cards are initialized due to wrong values in efuse mappings that are parsed by firmware. To help debug those cases, print (in debug level) the raw binning masks as fetched from the firmware during device initialization. 
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index ad491fb2c39d..ea9fdc616de4 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -2888,6 +2888,10 @@ static int gaudi2_cpucp_info_get(struct hl_device *hdev) hdev->tpc_binning = le64_to_cpu(prop->cpucp_info.tpc_binning_mask); hdev->decoder_binning = lower_32_bits(le64_to_cpu(prop->cpucp_info.decoder_binning_mask)); + dev_dbg(hdev->dev, "Read binning masks: tpc: 0x%llx, dram: 0x%llx, edma: 0x%x, dec: 0x%x\n", + hdev->tpc_binning, hdev->dram_binning, hdev->edma_binning, + hdev->decoder_binning); + /* * at this point the DRAM parameters need to be updated according to data obtained * from the FW From 9cf56f0d97806eaff68c4c4facd7ef94e2a072bc Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Sun, 26 Mar 2023 23:51:25 +0300 Subject: [PATCH 12/22] accel/habanalabs: remove completion from abnormal interrupt work name Decoder abnormal interrupts are for errors and not for completion, so rename the relevant work and work function to not include 'completion'. 
Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/decoder.c | 22 +++++++------------- drivers/accel/habanalabs/common/habanalabs.h | 10 ++++----- drivers/accel/habanalabs/common/irq.c | 2 +- 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/drivers/accel/habanalabs/common/decoder.c b/drivers/accel/habanalabs/common/decoder.c index 69c78c1784b4..59a1ecb20c04 100644 --- a/drivers/accel/habanalabs/common/decoder.c +++ b/drivers/accel/habanalabs/common/decoder.c @@ -43,22 +43,24 @@ static void dec_print_abnrm_intr_source(struct hl_device *hdev, u32 irq_status) intr_source[2], intr_source[3], intr_source[4], intr_source[5]); } -static void dec_error_intr_work(struct hl_device *hdev, u32 base_addr, u32 core_id) +static void dec_abnrm_intr_work(struct work_struct *work) { + struct hl_dec *dec = container_of(work, struct hl_dec, abnrm_intr_work); + struct hl_device *hdev = dec->hdev; bool reset_required = false; u32 irq_status, event_mask; - irq_status = RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET); + irq_status = RREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET); - dev_err(hdev->dev, "Decoder abnormal interrupt %#x, core %d\n", irq_status, core_id); + dev_err(hdev->dev, "Decoder abnormal interrupt %#x, core %d\n", irq_status, dec->core_id); dec_print_abnrm_intr_source(hdev, irq_status); /* Clear the interrupt */ - WREG32(base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status); + WREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status); /* Flush the interrupt clear */ - RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET); + RREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET); if (irq_status & VCMD_IRQ_STATUS_TIMEOUT_MASK) { reset_required = true; @@ -77,14 +79,6 @@ static void dec_error_intr_work(struct hl_device *hdev, u32 base_addr, u32 core_ } } -static void dec_completion_abnrm(struct work_struct *work) -{ - struct hl_dec *dec = container_of(work, struct hl_dec, completion_abnrm_work); - struct hl_device *hdev = 
dec->hdev; - - dec_error_intr_work(hdev, dec->base_addr, dec->core_id); -} - void hl_dec_fini(struct hl_device *hdev) { kfree(hdev->dec); @@ -108,7 +102,7 @@ int hl_dec_init(struct hl_device *hdev) dec = hdev->dec + j; dec->hdev = hdev; - INIT_WORK(&dec->completion_abnrm_work, dec_completion_abnrm); + INIT_WORK(&dec->abnrm_intr_work, dec_abnrm_intr_work); dec->core_id = j; dec->base_addr = hdev->asic_funcs->get_dec_base_addr(hdev, j); if (!dec->base_addr) { diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index a6f5c2152b0a..7b6ad3d7dbaa 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -1211,15 +1211,15 @@ struct hl_eq { /** * struct hl_dec - describes a decoder sw instance. * @hdev: pointer to the device structure. - * @completion_abnrm_work: workqueue object to run when decoder generates an error interrupt + * @abnrm_intr_work: workqueue work item to run when decoder generates an error interrupt. * @core_id: ID of the decoder. * @base_addr: base address of the decoder. 
*/ struct hl_dec { - struct hl_device *hdev; - struct work_struct completion_abnrm_work; - u32 core_id; - u32 base_addr; + struct hl_device *hdev; + struct work_struct abnrm_intr_work; + u32 core_id; + u32 base_addr; }; /** diff --git a/drivers/accel/habanalabs/common/irq.c b/drivers/accel/habanalabs/common/irq.c index 0d59bb7c9063..c67895b1cdeb 100644 --- a/drivers/accel/habanalabs/common/irq.c +++ b/drivers/accel/habanalabs/common/irq.c @@ -489,7 +489,7 @@ irqreturn_t hl_irq_handler_dec_abnrm(int irq, void *arg) { struct hl_dec *dec = arg; - schedule_work(&dec->completion_abnrm_work); + schedule_work(&dec->abnrm_intr_work); return IRQ_HANDLED; } From d4801c048543115f9eddbecd6f72a3f68d562bdb Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Mon, 27 Mar 2023 00:08:45 +0300 Subject: [PATCH 13/22] accel/habanalabs: fix events mask of decoder abnormal interrupts The decoder IRQ status register may have several set bits upon an abnormal interrupt. Therefore, when setting the events mask, all bits need to be checked instead of using an if-else chain. 
Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/decoder.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/accel/habanalabs/common/decoder.c b/drivers/accel/habanalabs/common/decoder.c index 59a1ecb20c04..c03a6da45d00 100644 --- a/drivers/accel/habanalabs/common/decoder.c +++ b/drivers/accel/habanalabs/common/decoder.c @@ -47,8 +47,8 @@ static void dec_abnrm_intr_work(struct work_struct *work) { struct hl_dec *dec = container_of(work, struct hl_dec, abnrm_intr_work); struct hl_device *hdev = dec->hdev; + u32 irq_status, event_mask = 0; bool reset_required = false; - u32 irq_status, event_mask; irq_status = RREG32(dec->base_addr + VCMD_IRQ_STATUS_OFFSET); @@ -64,17 +64,21 @@ static void dec_abnrm_intr_work(struct work_struct *work) if (irq_status & VCMD_IRQ_STATUS_TIMEOUT_MASK) { reset_required = true; - event_mask = HL_NOTIFIER_EVENT_GENERAL_HW_ERR; - } else if (irq_status & VCMD_IRQ_STATUS_CMDERR_MASK) { - event_mask = HL_NOTIFIER_EVENT_UNDEFINED_OPCODE; - } else { - event_mask = HL_NOTIFIER_EVENT_USER_ENGINE_ERR; + event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; } + if (irq_status & VCMD_IRQ_STATUS_CMDERR_MASK) + event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE; + + if (irq_status & (VCMD_IRQ_STATUS_ENDCMD_MASK | + VCMD_IRQ_STATUS_BUSERR_MASK | + VCMD_IRQ_STATUS_ABORT_MASK)) + event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; + if (reset_required) { event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET; hl_device_cond_reset(hdev, 0, event_mask); - } else { + } else if (event_mask) { hl_notifier_event_send_all(hdev, event_mask); } } From 82a1b48a4e3e8fdb945af63b6fff5304ff5c3c17 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Sun, 26 Mar 2023 11:59:44 +0300 Subject: [PATCH 14/22] accel/habanalabs: fix wrong reset and event flags During event handling, the driver sets the relevant reset and user event notifier flags. Fix a few wrong flag settings. 
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index ea9fdc616de4..ce85308d03e9 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -9510,19 +9510,18 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent break; case GAUDI2_EVENT_ARC_AXI_ERROR_RESPONSE_0: - reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; error_count = gaudi2_handle_arc_farm_sei_err(hdev, event_type); - event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; case GAUDI2_EVENT_CPU_AXI_ERR_RSP: error_count = gaudi2_handle_cpu_sei_err(hdev, event_type); - event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; + event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR; break; case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP: case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP: - reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; error_count = gaudi2_handle_qm_sei_err(hdev, event_type, true, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -9709,12 +9708,14 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_PCIE_DRAIN_COMPLETE: error_count = gaudi2_handle_pcie_drain(hdev, &eq_entry->pcie_drain_ind_data); + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; case GAUDI2_EVENT_PSOC59_RPM_ERROR_OR_DRAIN: error_count = gaudi2_handle_psoc_drain(hdev, le64_to_cpu(eq_entry->intr_cause.intr_cause_data)); + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; @@ -9743,6 +9744,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent break; case 
GAUDI2_EVENT_PSOC_AXI_ERR_RSP: error_count = GAUDI2_NA_EVENT_CAUSE; + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; case GAUDI2_EVENT_PSOC_PRSTN_FALL: @@ -9756,6 +9758,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent break; case GAUDI2_EVENT_PCIE_FATAL_ERR: error_count = GAUDI2_NA_EVENT_CAUSE; + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; case GAUDI2_EVENT_TPC0_BMON_SPMU: @@ -9823,6 +9826,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_CPU_PKT_QUEUE_OUT_SYNC: gaudi2_print_out_of_sync_info(hdev, event_type, &eq_entry->pkt_sync_err); error_count = GAUDI2_NA_EVENT_CAUSE; + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; @@ -9864,6 +9868,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_CPU_PKT_SANITY_FAILED: gaudi2_print_cpu_pkt_failure_info(hdev, event_type, &eq_entry->pkt_sync_err); error_count = GAUDI2_NA_EVENT_CAUSE; + reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; From 802f25b6c2c0377c681dd1e4f799a648c3df50dd Mon Sep 17 00:00:00 2001 From: Tal Cohen Date: Tue, 21 Mar 2023 10:59:28 +0200 Subject: [PATCH 15/22] accel/habanalabs: sync f/w events interrupt in hard reset Receiving events from FW while the device is in hard reset causes a warning message in the driver log. The message may point to a problem in the driver or FW, but it can also appear as a result of events that were sent from FW just before the hard reset. In order to avoid receiving events from FW while the device is in reset and is already in 'disabled' mode, sync the f/w events interrupt right before setting the device to 'disabled'.
Signed-off-by: Tal Cohen Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/device.c | 55 +++++++++++--------- drivers/accel/habanalabs/common/habanalabs.h | 2 + drivers/accel/habanalabs/gaudi/gaudi.c | 3 ++ drivers/accel/habanalabs/gaudi2/gaudi2.c | 1 + drivers/accel/habanalabs/goya/goya.c | 1 + 5 files changed, 37 insertions(+), 25 deletions(-) diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 3c1af9d43b65..fabfc501ef54 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -1380,13 +1380,41 @@ static void device_disable_open_processes(struct hl_device *hdev, bool control_d mutex_unlock(fd_lock); } +static void send_disable_pci_access(struct hl_device *hdev, u32 flags) +{ + /* If reset is due to heartbeat, device CPU is no responsive in + * which case no point sending PCI disable message to it. + */ + if ((flags & HL_DRV_RESET_HARD) && + !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) { + /* Disable PCI access from device F/W so he won't send + * us additional interrupts. We disable MSI/MSI-X at + * the halt_engines function and we can't have the F/W + * sending us interrupts after that. We need to disable + * the access here because if the device is marked + * disable, the message won't be send. 
Also, in case + * of heartbeat, the device CPU is marked as disable + * so this message won't be sent + */ + if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) { + dev_warn(hdev->dev, "Failed to disable FW's PCI access\n"); + return; + } + + /* verify that last EQs are handled before disabled is set */ + if (hdev->cpu_queues_enable) + synchronize_irq(pci_irq_vector(hdev->pdev, + hdev->asic_prop.eq_interrupt_id)); + } +} + static void handle_reset_trigger(struct hl_device *hdev, u32 flags) { u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT; /* No consecutive mechanism when user context exists */ if (hdev->is_compute_ctx_active) - goto disable_pci; + return; /* * 'reset cause' is being updated here, because getting here @@ -1418,30 +1446,6 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags) } else { hdev->reset_info.reset_trigger_repeated = 1; } - - /* If reset is due to heartbeat, device CPU is no responsive in - * which case no point sending PCI disable message to it. - * - * If F/W is performing the reset, no need to send it a message to disable - * PCI access - */ - -disable_pci: - if ((flags & HL_DRV_RESET_HARD) && - !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) { - /* Disable PCI access from device F/W so he won't send - * us additional interrupts. We disable MSI/MSI-X at - * the halt_engines function and we can't have the F/W - * sending us interrupts after that. We need to disable - * the access here because if the device is marked - * disable, the message won't be send. 
Also, in case - * of heartbeat, the device CPU is marked as disable - * so this message won't be sent - */ - if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) - dev_warn(hdev->dev, - "Failed to disable FW's PCI access\n"); - } } /* @@ -1562,6 +1566,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) escalate_reset_flow: handle_reset_trigger(hdev, flags); + send_disable_pci_access(hdev, flags); /* This also blocks future CS/VM/JOB completion operations */ hdev->disabled = true; diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 7b6ad3d7dbaa..8c3bcc50e560 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -662,6 +662,7 @@ struct hl_hints_range { * @user_interrupt_count: number of user interrupts. * @user_dec_intr_count: number of decoder interrupts exposed to user. * @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host. + * @eq_interrupt_id: interrupt id for EQ, uses to synchronize EQ interrupts in hard-reset. * @unexpected_user_error_interrupt_id: interrupt id used to indicate an unexpected user error. * @cache_line_size: device cache line size. * @server_type: Server type that the ASIC is currently installed in. 
@@ -793,6 +794,7 @@ struct asic_fixed_properties { u16 user_interrupt_count; u16 user_dec_intr_count; u16 tpc_interrupt_id; + u16 eq_interrupt_id; u16 unexpected_user_error_interrupt_id; u16 cache_line_size; u16 server_type; diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c index 08a4b1cf2b42..2ad8e4efce7f 100644 --- a/drivers/accel/habanalabs/gaudi/gaudi.c +++ b/drivers/accel/habanalabs/gaudi/gaudi.c @@ -682,6 +682,9 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) prop->first_available_user_interrupt = USHRT_MAX; prop->tpc_interrupt_id = USHRT_MAX; + /* single msi */ + prop->eq_interrupt_id = 0; + for (i = 0 ; i < HL_MAX_DCORES ; i++) prop->first_available_cq[i] = USHRT_MAX; diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index ce85308d03e9..554020026da8 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -2439,6 +2439,7 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->first_available_user_interrupt = GAUDI2_IRQ_NUM_USER_FIRST; prop->tpc_interrupt_id = GAUDI2_IRQ_NUM_TPC_ASSERT; + prop->eq_interrupt_id = GAUDI2_IRQ_NUM_EVENT_QUEUE; prop->unexpected_user_error_interrupt_id = GAUDI2_IRQ_NUM_UNEXPECTED_ERROR; prop->first_available_cq[0] = GAUDI2_RESERVED_CQ_NUMBER; diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c index 07d67878eac5..fb0ac9df841a 100644 --- a/drivers/accel/habanalabs/goya/goya.c +++ b/drivers/accel/habanalabs/goya/goya.c @@ -473,6 +473,7 @@ int goya_set_fixed_properties(struct hl_device *hdev) prop->first_available_user_interrupt = USHRT_MAX; prop->tpc_interrupt_id = USHRT_MAX; + prop->eq_interrupt_id = GOYA_EVENT_QUEUE_MSIX_IDX; for (i = 0 ; i < HL_MAX_DCORES ; i++) prop->first_available_cq[i] = USHRT_MAX; From c19350efa9dae7f4474d7847ab7d6e667082fd18 Mon Sep 17 00:00:00 2001 From: Koby Elbaz Date: Sun, 26 Mar 2023 18:22:57 +0300 
Subject: [PATCH 16/22] accel/habanalabs: don't wait for STS_OK after sending COMMS WFE Sending COMMS_GOTO_WFE instructs the FW's CPU to halt (WFE state). Once sent, FW's CPU isn't expected to continue communicating with LKD. Therefore, the stage of waiting for COMMS_STS_OK should be skipped; otherwise, waiting for COMMS_STS_OK will simply time out, which will trigger unexpected behavior. Signed-off-by: Koby Elbaz Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/firmware_if.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index 781256dd49ad..59f61ec66445 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -1278,7 +1278,7 @@ void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev) /* Stop device CPU to make sure nothing bad happens */ if (hdev->asic_prop.dynamic_fw_load) { rc = hl_fw_dynamic_send_protocol_cmd(hdev, &hdev->fw_loader, - COMMS_GOTO_WFE, 0, true, + COMMS_GOTO_WFE, 0, false, hdev->fw_loader.cpu_timeout); if (rc) dev_err(hdev->dev, "Failed sending COMMS_GOTO_WFE\n"); From 38f3c732fce6c57ab174800e2e5456498207d440 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Tue, 28 Mar 2023 10:59:43 +0300 Subject: [PATCH 17/22] accel/habanalabs: fixes for unexpected error interrupt Remove the redundant asic prop variable, as we don't need to expose it to common code. In addition, fix some typos.
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/habanalabs.h | 2 -- drivers/accel/habanalabs/gaudi2/gaudi2.c | 5 ++--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 8c3bcc50e560..eaae69a9f817 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -663,7 +663,6 @@ struct hl_hints_range { * @user_dec_intr_count: number of decoder interrupts exposed to user. * @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host. * @eq_interrupt_id: interrupt id for EQ, uses to synchronize EQ interrupts in hard-reset. - * @unexpected_user_error_interrupt_id: interrupt id used to indicate an unexpected user error. * @cache_line_size: device cache line size. * @server_type: Server type that the ASIC is currently installed in. * The value is according to enum hl_server_type in uapi file. 
@@ -795,7 +794,6 @@ struct asic_fixed_properties { u16 user_dec_intr_count; u16 tpc_interrupt_id; u16 eq_interrupt_id; - u16 unexpected_user_error_interrupt_id; u16 cache_line_size; u16 server_type; u8 completion_queues_count; diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 554020026da8..da1b2e6dd683 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -2440,7 +2440,6 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->first_available_user_interrupt = GAUDI2_IRQ_NUM_USER_FIRST; prop->tpc_interrupt_id = GAUDI2_IRQ_NUM_TPC_ASSERT; prop->eq_interrupt_id = GAUDI2_IRQ_NUM_EVENT_QUEUE; - prop->unexpected_user_error_interrupt_id = GAUDI2_IRQ_NUM_UNEXPECTED_ERROR; prop->first_available_cq[0] = GAUDI2_RESERVED_CQ_NUMBER; @@ -3351,7 +3350,7 @@ static void gaudi2_user_interrupt_setup(struct hl_device *hdev) /* Initialize TPC interrupt */ HL_USR_INTR_STRUCT_INIT(hdev->tpc_interrupt, hdev, 0, HL_USR_INTERRUPT_TPC); - /* Initialize general purpose interrupt */ + /* Initialize unexpected error interrupt */ HL_USR_INTR_STRUCT_INIT(hdev->unexpected_error_interrupt, hdev, 0, HL_USR_INTERRUPT_UNEXPECTED); @@ -4015,7 +4014,7 @@ static const char *gaudi2_irq_name(u16 irq_number) case GAUDI2_IRQ_NUM_TPC_ASSERT: return "gaudi2 tpc assert"; case GAUDI2_IRQ_NUM_UNEXPECTED_ERROR: - return "gaudi2 tpc assert"; + return "gaudi2 unexpected error"; case GAUDI2_IRQ_NUM_USER_FIRST ... GAUDI2_IRQ_NUM_USER_LAST: return "gaudi2 user completion"; default: From a25c2f7a467265fa24d63fb6dd46fa7ba4e3b108 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Thu, 30 Mar 2023 12:30:56 +0300 Subject: [PATCH 18/22] accel/habanalabs/uapi: new Gaudi2 server type Add definition of a new Gaudi2 server type. This represents the connectivity between the cards in that server type. 
Signed-off-by: Oded Gabbay Reviewed-by: Stanislaw Gruszka --- include/uapi/drm/habanalabs_accel.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/uapi/drm/habanalabs_accel.h b/include/uapi/drm/habanalabs_accel.h index c139aab17c8a..d9ef1b151d04 100644 --- a/include/uapi/drm/habanalabs_accel.h +++ b/include/uapi/drm/habanalabs_accel.h @@ -708,7 +708,8 @@ enum hl_server_type { HL_SERVER_GAUDI_HLS1H = 2, HL_SERVER_GAUDI_TYPE1 = 3, HL_SERVER_GAUDI_TYPE2 = 4, - HL_SERVER_GAUDI2_HLS2 = 5 + HL_SERVER_GAUDI2_HLS2 = 5, + HL_SERVER_GAUDI2_TYPE1 = 7 }; /* From b207e166dbadc88f38d9550c592cc4f8413b7a15 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Mon, 27 Mar 2023 13:40:56 +0300 Subject: [PATCH 19/22] accel/habanalabs: remove Gaudi1 multi MSI code Multi MSI interrupts aren't working in Gaudi1 and because of that, we are only using a single MSI interrupt. Therefore, let's remove this dead code in order to avoid confusion. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi/gaudi.c | 83 ++----------------------- drivers/accel/habanalabs/gaudi/gaudiP.h | 15 ----- 2 files changed, 5 insertions(+), 93 deletions(-) diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c index 2ad8e4efce7f..a29aa8f7b6f3 100644 --- a/drivers/accel/habanalabs/gaudi/gaudi.c +++ b/drivers/accel/habanalabs/gaudi/gaudi.c @@ -2020,38 +2020,6 @@ static int gaudi_enable_msi_single(struct hl_device *hdev) return rc; } -static int gaudi_enable_msi_multi(struct hl_device *hdev) -{ - int cq_cnt = hdev->asic_prop.completion_queues_count; - int rc, i, irq_cnt_init, irq; - - for (i = 0, irq_cnt_init = 0 ; i < cq_cnt ; i++, irq_cnt_init++) { - irq = gaudi_pci_irq_vector(hdev, i, false); - rc = request_irq(irq, hl_irq_handler_cq, 0, gaudi_irq_name[i], - &hdev->completion_queue[i]); - if (rc) { - dev_err(hdev->dev, "Failed to request IRQ %d", irq); - goto free_irqs; - } - } - - irq = 
gaudi_pci_irq_vector(hdev, GAUDI_EVENT_QUEUE_MSI_IDX, true); - rc = request_irq(irq, hl_irq_handler_eq, 0, gaudi_irq_name[cq_cnt], - &hdev->event_queue); - if (rc) { - dev_err(hdev->dev, "Failed to request IRQ %d", irq); - goto free_irqs; - } - - return 0; - -free_irqs: - for (i = 0 ; i < irq_cnt_init ; i++) - free_irq(gaudi_pci_irq_vector(hdev, i, false), - &hdev->completion_queue[i]); - return rc; -} - static int gaudi_enable_msi(struct hl_device *hdev) { struct gaudi_device *gaudi = hdev->asic_specific; @@ -2066,14 +2034,7 @@ static int gaudi_enable_msi(struct hl_device *hdev) return rc; } - if (rc < NUMBER_OF_INTERRUPTS) { - gaudi->multi_msi_mode = false; - rc = gaudi_enable_msi_single(hdev); - } else { - gaudi->multi_msi_mode = true; - rc = gaudi_enable_msi_multi(hdev); - } - + rc = gaudi_enable_msi_single(hdev); if (rc) goto free_pci_irq_vectors; @@ -2089,47 +2050,23 @@ static int gaudi_enable_msi(struct hl_device *hdev) static void gaudi_sync_irqs(struct hl_device *hdev) { struct gaudi_device *gaudi = hdev->asic_specific; - int i, cq_cnt = hdev->asic_prop.completion_queues_count; if (!(gaudi->hw_cap_initialized & HW_CAP_MSI)) return; /* Wait for all pending IRQs to be finished */ - if (gaudi->multi_msi_mode) { - for (i = 0 ; i < cq_cnt ; i++) - synchronize_irq(gaudi_pci_irq_vector(hdev, i, false)); - - synchronize_irq(gaudi_pci_irq_vector(hdev, - GAUDI_EVENT_QUEUE_MSI_IDX, - true)); - } else { - synchronize_irq(gaudi_pci_irq_vector(hdev, 0, false)); - } + synchronize_irq(gaudi_pci_irq_vector(hdev, 0, false)); } static void gaudi_disable_msi(struct hl_device *hdev) { struct gaudi_device *gaudi = hdev->asic_specific; - int i, irq, cq_cnt = hdev->asic_prop.completion_queues_count; if (!(gaudi->hw_cap_initialized & HW_CAP_MSI)) return; gaudi_sync_irqs(hdev); - - if (gaudi->multi_msi_mode) { - irq = gaudi_pci_irq_vector(hdev, GAUDI_EVENT_QUEUE_MSI_IDX, - true); - free_irq(irq, &hdev->event_queue); - - for (i = 0 ; i < cq_cnt ; i++) { - irq = 
gaudi_pci_irq_vector(hdev, i, false); - free_irq(irq, &hdev->completion_queue[i]); - } - } else { - free_irq(gaudi_pci_irq_vector(hdev, 0, false), hdev); - } - + free_irq(gaudi_pci_irq_vector(hdev, 0, false), hdev); pci_free_irq_vectors(hdev->pdev); gaudi->hw_cap_initialized &= ~HW_CAP_MSI; @@ -3924,11 +3861,7 @@ static int gaudi_init_cpu_queues(struct hl_device *hdev, u32 cpu_timeout) WREG32(mmCPU_IF_PF_PQ_PI, 0); - if (gaudi->multi_msi_mode) - WREG32(mmCPU_IF_QUEUE_INIT, PQ_INIT_STATUS_READY_FOR_CP); - else - WREG32(mmCPU_IF_QUEUE_INIT, - PQ_INIT_STATUS_READY_FOR_CP_SINGLE_MSI); + WREG32(mmCPU_IF_QUEUE_INIT, PQ_INIT_STATUS_READY_FOR_CP_SINGLE_MSI); irq_handler_offset = prop->gic_interrupts_enable ? mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR : @@ -5605,7 +5538,6 @@ static void gaudi_add_end_of_cb_packets(struct hl_device *hdev, void *kernel_add u32 len, u32 original_len, u64 cq_addr, u32 cq_val, u32 msi_vec, bool eb) { - struct gaudi_device *gaudi = hdev->asic_specific; struct packet_msg_prot *cq_pkt; struct packet_nop *cq_padding; u64 msi_addr; @@ -5635,12 +5567,7 @@ static void gaudi_add_end_of_cb_packets(struct hl_device *hdev, void *kernel_add tmp |= FIELD_PREP(GAUDI_PKT_CTL_MB_MASK, 1); cq_pkt->ctl = cpu_to_le32(tmp); cq_pkt->value = cpu_to_le32(1); - - if (gaudi->multi_msi_mode) - msi_addr = mmPCIE_MSI_INTR_0 + msi_vec * 4; - else - msi_addr = mmPCIE_CORE_MSI_REQ; - + msi_addr = hdev->pdev ? 
mmPCIE_CORE_MSI_REQ : mmPCIE_MSI_INTR_0 + msi_vec * 4; cq_pkt->addr = cpu_to_le64(CFG_BASE + msi_addr); } diff --git a/drivers/accel/habanalabs/gaudi/gaudiP.h b/drivers/accel/habanalabs/gaudi/gaudiP.h index 3d88d56c8eb3..b8fa724be5a1 100644 --- a/drivers/accel/habanalabs/gaudi/gaudiP.h +++ b/drivers/accel/habanalabs/gaudi/gaudiP.h @@ -28,20 +28,8 @@ #define NUMBER_OF_COLLECTIVE_QUEUES 12 #define NUMBER_OF_SOBS_IN_GRP 11 -/* - * Number of MSI interrupts IDS: - * Each completion queue has 1 ID - * The event queue has 1 ID - */ -#define NUMBER_OF_INTERRUPTS (NUMBER_OF_CMPLT_QUEUES + \ - NUMBER_OF_CPU_HW_QUEUES) - #define GAUDI_STREAM_MASTER_ARR_SIZE 8 -#if (NUMBER_OF_INTERRUPTS > GAUDI_MSI_ENTRIES) -#error "Number of MSI interrupts must be smaller or equal to GAUDI_MSI_ENTRIES" -#endif - #define CORESIGHT_TIMEOUT_USEC 100000 /* 100 ms */ #define GAUDI_MAX_CLK_FREQ 2200000000ull /* 2200 MHz */ @@ -324,8 +312,6 @@ struct gaudi_internal_qman_info { * signal we can use this engine in later code paths. * Each bit is cleared upon reset of its corresponding H/W * engine. - * @multi_msi_mode: whether we are working in multi MSI single MSI mode. - * Multi MSI is possible only with IOMMU enabled. * @mmu_cache_inv_pi: PI for MMU cache invalidation flow. The H/W expects an * 8-bit value so use u8. */ @@ -345,7 +331,6 @@ struct gaudi_device { u32 events_stat[GAUDI_EVENT_SIZE]; u32 events_stat_aggregate[GAUDI_EVENT_SIZE]; u32 hw_cap_initialized; - u8 multi_msi_mode; u8 mmu_cache_inv_pi; }; From 91204e4703aef7bcdd045126b889d7e1aab63dd5 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Tue, 28 Mar 2023 20:41:35 +0300 Subject: [PATCH 20/22] accel/habanalabs: fix handling of arc farm sei event There is only single eq entry for arc farm sei event which aggregates events from the four arc farms. Fix the code to handle this event according to this behavior. 
Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 25 +++++++++++-------- .../include/gaudi2/asic_reg/gaudi2_regs.h | 4 ++- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index da1b2e6dd683..b318c67dae13 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -8490,23 +8490,28 @@ static int gaudi2_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e static int gaudi2_handle_arc_farm_sei_err(struct hl_device *hdev, u16 event_type) { - u32 i, sts_val, sts_clr_val = 0, error_count = 0; + u32 i, sts_val, sts_clr_val, error_count = 0, arc_farm; - sts_val = RREG32(mmARC_FARM_ARC0_AUX_ARC_SEI_INTR_STS); + for (arc_farm = 0 ; arc_farm < NUM_OF_ARC_FARMS_ARC ; arc_farm++) { + sts_clr_val = 0; + sts_val = RREG32(mmARC_FARM_ARC0_AUX_ARC_SEI_INTR_STS + + (arc_farm * ARC_FARM_OFFSET)); - for (i = 0 ; i < GAUDI2_NUM_OF_ARC_SEI_ERR_CAUSE ; i++) { - if (sts_val & BIT(i)) { - gaudi2_print_event(hdev, event_type, true, - "err cause: %s", gaudi2_arc_sei_error_cause[i]); - sts_clr_val |= BIT(i); - error_count++; + for (i = 0 ; i < GAUDI2_NUM_OF_ARC_SEI_ERR_CAUSE ; i++) { + if (sts_val & BIT(i)) { + gaudi2_print_event(hdev, event_type, true, + "ARC FARM ARC %u err cause: %s", + arc_farm, gaudi2_arc_sei_error_cause[i]); + sts_clr_val |= BIT(i); + error_count++; + } } + WREG32(mmARC_FARM_ARC0_AUX_ARC_SEI_INTR_CLR + (arc_farm * ARC_FARM_OFFSET), + sts_clr_val); } hl_check_for_glbl_errors(hdev); - WREG32(mmARC_FARM_ARC0_AUX_ARC_SEI_INTR_CLR, sts_clr_val); - return error_count; } diff --git a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h index 452b379f39f6..6c58af614236 100644 --- a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h +++ 
b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 * - * Copyright 2020-2022 HabanaLabs, Ltd. + * Copyright 2020-2023 HabanaLabs, Ltd. * All Rights Reserved. * */ @@ -543,6 +543,8 @@ #define HBM_MC_SPI_IEEE1500_COMP_MASK BIT(3) #define HBM_MC_SPI_IEEE1500_PAUSED_MASK BIT(4) +#define ARC_FARM_OFFSET (mmARC_FARM_ARC1_AUX_BASE - mmARC_FARM_ARC0_AUX_BASE) + #include "nic0_qpc0_regs.h" #include "nic0_qm0_regs.h" #include "nic0_qm_arc_aux0_regs.h" From 31420f93b5c15746759cd87eaa0f572a7316ea46 Mon Sep 17 00:00:00 2001 From: Moti Haimovski Date: Mon, 20 Mar 2023 22:59:11 +0200 Subject: [PATCH 21/22] accel/habanalabs: speedup h/w queues test in Gaudi2 HW queues testing at driver load and after reset takes a substantial amount of time. This commit reduces the queues test time in Gaudi2 devices by running all the tests in parallel instead of one after the other. Time measurements of the tests' duration show that the new method is almost 100x faster than the serial approach.
Signed-off-by: Moti Haimovski Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 154 ++++++++++++++++------ drivers/accel/habanalabs/gaudi2/gaudi2P.h | 17 +++ 2 files changed, 129 insertions(+), 42 deletions(-) diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index b318c67dae13..b778cf764a68 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -3480,6 +3480,48 @@ static int gaudi2_special_blocks_iterator_config(struct hl_device *hdev) return gaudi2_special_blocks_config(hdev); } +static void gaudi2_test_queues_msgs_free(struct hl_device *hdev) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + struct gaudi2_queues_test_info *msg_info = gaudi2->queues_test_info; + int i; + + for (i = 0 ; i < GAUDI2_NUM_TESTED_QS ; i++) { + /* bail-out if this is an allocation failure point */ + if (!msg_info[i].kern_addr) + break; + + hl_asic_dma_pool_free(hdev, msg_info[i].kern_addr, msg_info[i].dma_addr); + msg_info[i].kern_addr = NULL; + } +} + +static int gaudi2_test_queues_msgs_alloc(struct hl_device *hdev) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + struct gaudi2_queues_test_info *msg_info = gaudi2->queues_test_info; + int i, rc; + + /* allocate a message-short buf for each Q we intend to test */ + for (i = 0 ; i < GAUDI2_NUM_TESTED_QS ; i++) { + msg_info[i].kern_addr = + (void *)hl_asic_dma_pool_zalloc(hdev, sizeof(struct packet_msg_short), + GFP_KERNEL, &msg_info[i].dma_addr); + if (!msg_info[i].kern_addr) { + dev_err(hdev->dev, + "Failed to allocate dma memory for H/W queue %d testing\n", i); + rc = -ENOMEM; + goto err_exit; + } + } + + return 0; + +err_exit: + gaudi2_test_queues_msgs_free(hdev); + return rc; +} + static int gaudi2_sw_init(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; @@ -3579,8 +3621,14 @@ static int gaudi2_sw_init(struct hl_device *hdev) if (rc) goto 
free_scratchpad_mem; + rc = gaudi2_test_queues_msgs_alloc(hdev); + if (rc) + goto special_blocks_free; + return 0; +special_blocks_free: + gaudi2_special_blocks_iterator_free(hdev); free_scratchpad_mem: hl_asic_dma_pool_free(hdev, gaudi2->scratchpad_kernel_address, gaudi2->scratchpad_bus_address); @@ -3603,6 +3651,8 @@ static int gaudi2_sw_fini(struct hl_device *hdev) struct asic_fixed_properties *prop = &hdev->asic_prop; struct gaudi2_device *gaudi2 = hdev->asic_specific; + gaudi2_test_queues_msgs_free(hdev); + gaudi2_special_blocks_iterator_free(hdev); hl_cpu_accessible_dma_pool_free(hdev, prop->pmmu.page_size, gaudi2->virt_msix_db_cpu_addr); @@ -6797,29 +6847,30 @@ static void gaudi2_qman_set_test_mode(struct hl_device *hdev, u32 hw_queue_id, b } } -static int gaudi2_test_queue(struct hl_device *hdev, u32 hw_queue_id) +static inline u32 gaudi2_test_queue_hw_queue_id_to_sob_id(struct hl_device *hdev, u32 hw_queue_id) { - u32 sob_offset = hdev->asic_prop.first_available_user_sob[0] * 4; + return hdev->asic_prop.first_available_user_sob[0] + + hw_queue_id - GAUDI2_QUEUE_ID_PDMA_0_0; +} + +static void gaudi2_test_queue_clear(struct hl_device *hdev, u32 hw_queue_id) +{ + u32 sob_offset = gaudi2_test_queue_hw_queue_id_to_sob_id(hdev, hw_queue_id) * 4; u32 sob_addr = mmDCORE0_SYNC_MNGR_OBJS_SOB_OBJ_0 + sob_offset; - u32 timeout_usec, tmp, sob_base = 1, sob_val = 0x5a5a; - struct packet_msg_short *msg_short_pkt; - dma_addr_t pkt_dma_addr; - size_t pkt_size; + + /* Reset the SOB value */ + WREG32(sob_addr, 0); +} + +static int gaudi2_test_queue_send_msg_short(struct hl_device *hdev, u32 hw_queue_id, u32 sob_val, + struct gaudi2_queues_test_info *msg_info) +{ + u32 sob_offset = gaudi2_test_queue_hw_queue_id_to_sob_id(hdev, hw_queue_id) * 4; + u32 tmp, sob_base = 1; + struct packet_msg_short *msg_short_pkt = msg_info->kern_addr; + size_t pkt_size = sizeof(struct packet_msg_short); int rc; - if (hdev->pldm) - timeout_usec = GAUDI2_PLDM_TEST_QUEUE_WAIT_USEC; - else - 
timeout_usec = GAUDI2_TEST_QUEUE_WAIT_USEC; - - pkt_size = sizeof(*msg_short_pkt); - msg_short_pkt = hl_asic_dma_pool_zalloc(hdev, pkt_size, GFP_KERNEL, &pkt_dma_addr); - if (!msg_short_pkt) { - dev_err(hdev->dev, "Failed to allocate packet for H/W queue %d testing\n", - hw_queue_id); - return -ENOMEM; - } - tmp = (PACKET_MSG_SHORT << GAUDI2_PKT_CTL_OPCODE_SHIFT) | (1 << GAUDI2_PKT_CTL_EB_SHIFT) | (1 << GAUDI2_PKT_CTL_MB_SHIFT) | @@ -6829,15 +6880,25 @@ static int gaudi2_test_queue(struct hl_device *hdev, u32 hw_queue_id) msg_short_pkt->value = cpu_to_le32(sob_val); msg_short_pkt->ctl = cpu_to_le32(tmp); - /* Reset the SOB value */ - WREG32(sob_addr, 0); + rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, msg_info->dma_addr); + if (rc) + dev_err(hdev->dev, + "Failed to send msg_short packet to H/W queue %d\n", hw_queue_id); - rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, pkt_dma_addr); - if (rc) { - dev_err(hdev->dev, "Failed to send msg_short packet to H/W queue %d\n", - hw_queue_id); - goto free_pkt; - } + return rc; +} + +static int gaudi2_test_queue_wait_completion(struct hl_device *hdev, u32 hw_queue_id, u32 sob_val) +{ + u32 sob_offset = gaudi2_test_queue_hw_queue_id_to_sob_id(hdev, hw_queue_id) * 4; + u32 sob_addr = mmDCORE0_SYNC_MNGR_OBJS_SOB_OBJ_0 + sob_offset; + u32 timeout_usec, tmp; + int rc; + + if (hdev->pldm) + timeout_usec = GAUDI2_PLDM_TEST_QUEUE_WAIT_USEC; + else + timeout_usec = GAUDI2_TEST_QUEUE_WAIT_USEC; rc = hl_poll_timeout( hdev, @@ -6853,11 +6914,6 @@ static int gaudi2_test_queue(struct hl_device *hdev, u32 hw_queue_id) rc = -EIO; } - /* Reset the SOB value */ - WREG32(sob_addr, 0); - -free_pkt: - hl_asic_dma_pool_free(hdev, (void *) msg_short_pkt, pkt_dma_addr); return rc; } @@ -6877,30 +6933,44 @@ static int gaudi2_test_cpu_queue(struct hl_device *hdev) static int gaudi2_test_queues(struct hl_device *hdev) { - int i, rc, ret_val = 0; + struct gaudi2_device *gaudi2 = hdev->asic_specific; + struct 
gaudi2_queues_test_info *msg_info; + u32 sob_val = 0x5a5a; + int i, rc; + /* send test message on all enabled Qs */ for (i = GAUDI2_QUEUE_ID_PDMA_0_0 ; i < GAUDI2_QUEUE_ID_CPU_PQ; i++) { if (!gaudi2_is_queue_enabled(hdev, i)) continue; + msg_info = &gaudi2->queues_test_info[i - GAUDI2_QUEUE_ID_PDMA_0_0]; gaudi2_qman_set_test_mode(hdev, i, true); - rc = gaudi2_test_queue(hdev, i); - gaudi2_qman_set_test_mode(hdev, i, false); - - if (rc) { - ret_val = -EINVAL; + gaudi2_test_queue_clear(hdev, i); + rc = gaudi2_test_queue_send_msg_short(hdev, i, sob_val, msg_info); + if (rc) goto done; - } } rc = gaudi2_test_cpu_queue(hdev); - if (rc) { - ret_val = -EINVAL; + if (rc) goto done; + + /* verify that all messages were processed */ + for (i = GAUDI2_QUEUE_ID_PDMA_0_0 ; i < GAUDI2_QUEUE_ID_CPU_PQ; i++) { + if (!gaudi2_is_queue_enabled(hdev, i)) + continue; + + rc = gaudi2_test_queue_wait_completion(hdev, i, sob_val); + if (rc) + /* chip is not usable, no need for cleanups, just bail-out with error */ + goto done; + + gaudi2_test_queue_clear(hdev, i); + gaudi2_qman_set_test_mode(hdev, i, false); } done: - return ret_val; + return rc; } static int gaudi2_compute_reset_late_init(struct hl_device *hdev) diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2P.h b/drivers/accel/habanalabs/gaudi2/gaudi2P.h index 0742046810f9..1cebe707772e 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2P.h +++ b/drivers/accel/habanalabs/gaudi2/gaudi2P.h @@ -240,6 +240,8 @@ #define GAUDI2_SOB_INCREMENT_BY_ONE (FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_VAL_MASK, 1) | \ FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_INC_MASK, 1)) +#define GAUDI2_NUM_TESTED_QS (GAUDI2_QUEUE_ID_CPU_PQ - GAUDI2_QUEUE_ID_PDMA_0_0) + #define GAUDI2_NUM_OF_GLBL_ERR_CAUSE 8 enum gaudi2_reserved_sob_id { @@ -452,6 +454,17 @@ struct dup_block_ctx { unsigned int instances; }; +/** + * struct gaudi2_queues_test_info - Holds the address of a the messages used for testing the + * device queues. 
+ * @dma_addr: the address used by the HW for accessing the message. + * @kern_addr: The address used by the driver for accessing the message. + */ +struct gaudi2_queues_test_info { + dma_addr_t dma_addr; + void *kern_addr; +}; + /** * struct gaudi2_device - ASIC specific manage structure. * @cpucp_info_get: get information on device from CPU-CP @@ -510,6 +523,7 @@ struct dup_block_ctx { * @flush_db_fifo: flag to force flush DB FIFO after a write. * @hbm_cfg: HBM subsystem settings * @hw_queues_lock_mutex: used by simulator instead of hw_queues_lock. + * @queues_test_info: information used by the driver when testing the HW queues. */ struct gaudi2_device { int (*cpucp_info_get)(struct hl_device *hdev); @@ -537,6 +551,9 @@ struct gaudi2_device { u32 events_stat[GAUDI2_EVENT_SIZE]; u32 events_stat_aggregate[GAUDI2_EVENT_SIZE]; u32 num_of_valid_hw_events; + + /* Queue testing */ + struct gaudi2_queues_test_info queues_test_info[GAUDI2_NUM_TESTED_QS]; }; /* From 56499c461589634f2c89ffbd9cfb78268191d349 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Sun, 2 Apr 2023 13:42:35 +0300 Subject: [PATCH 22/22] accel/habanalabs: add missing error flow in hl_sysfs_init() hl_sysfs_fini() is called only if hl_sysfs_init() completes successfully. Therefore, if hl_sysfs_init() fails, it needs to remove any sysfs group that was added up to that point.
Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/sysfs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/accel/habanalabs/common/sysfs.c b/drivers/accel/habanalabs/common/sysfs.c index 735d8bed0066..01f89f029355 100644 --- a/drivers/accel/habanalabs/common/sysfs.c +++ b/drivers/accel/habanalabs/common/sysfs.c @@ -497,10 +497,14 @@ int hl_sysfs_init(struct hl_device *hdev) if (rc) { dev_err(hdev->dev, "Failed to add groups to device, error %d\n", rc); - return rc; + goto remove_groups; } return 0; + +remove_groups: + device_remove_groups(hdev->dev, hl_dev_attr_groups); + return rc; } void hl_sysfs_fini(struct hl_device *hdev)