amd/amdgpu: Reduce unnecessary repetitive GPU resets

In the multi-GPU case, once one GPU has started
resetting all GPUs in the hive, the other GPUs do
not need to trigger a GPU reset again.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
YiPeng Chai
2024-09-20 15:22:24 +08:00
committed by Alex Deucher
parent 14f2fe34f5
commit 9e0feb7946

View File

@@ -4294,8 +4294,27 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
}
if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) {
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
int hive_ras_recovery = 0;
if (hive) {
hive_ras_recovery = atomic_read(&hive->ras_recovery);
amdgpu_put_xgmi_hive(hive);
}
/* In the case of multiple GPUs, after a GPU has started
* resetting all GPUs on hive, other GPUs do not need to
* trigger GPU reset again.
*/
if (!hive_ras_recovery)
amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
else
atomic_set(&ras->in_recovery, 0);
} else {
flush_work(&ras->recovery_work);
amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
}
return 0;
}