drm/sched: add optional errno to drm_sched_start()

The current implementation of drm_sched_start uses a hardcoded -ECANCELED to dispose of a job when the parent/hw fence is NULL. This results in drm_sched_job_done being called with -ECANCELED for each job with a NULL parent in the pending list, making it difficult to distinguish between recovery methods, whether a queue reset or a full GPU reset was used. To improve this, we first try a soft recovery for timeout jobs and use the error code -ENODATA. If soft recovery fails, we proceed with a queue reset, where the error code remains -ENODATA for the job. Finally, for a full GPU reset, we use error codes -ECANCELED or -ETIME. This patch adds an error code parameter to drm_sched_start, allowing us to differentiate between queue reset and GPU reset failures. This enables user mode and test applications to validate the expected correctness of the requested operation. After a successful queue reset, the only way to continue normal operation is to call drm_sched_job_done with the specific error code -ENODATA. v1: Initial implementation by Jesse utilized amdgpu_device_lock_reset_domain and amdgpu_device_unlock_reset_domain to allow user mode to track the queue reset status and distinguish between queue reset and GPU reset. v2: Christian suggested using the error codes -ENODATA for queue reset and -ECANCELED or -ETIME for GPU reset, returned to amdgpu_cs_wait_ioctl. v3: To meet the requirements, we introduce a new function drm_sched_start_ex with an additional parameter to set dma_fence_set_error, allowing us to handle the specific error codes appropriately and dispose of bad jobs with the selected error code depending on whether it was a queue reset or GPU reset. v4: Alex suggested using a new name, drm_sched_start_with_recovery_error, which more accurately describes the function's purpose. Additionally, it was recommended to add documentation details about the new method. v5: Fixed declaration of new function drm_sched_start_with_recovery_error.(Alex) v6 (chk): rebase on upstream changes, cleanup the commit message, drop the new function again and update all callers, apply the errno also to scheduler fences with hw fences v7 (chk): rebased Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com> Signed-off-by: Vitaly Prosyak <vitaly.prosyak@amd.com> Signed-off-by: Christian König <christian.koenig@amd.com> Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20240826122541.85663-1-christian.koenig@amd.com
2026-07-22 02:17:36 -04:00 · 2024-08-26 14:25:38 +02:00
parent 498ba74654
commit b2ef808786
13 changed files with 18 additions and 17 deletions
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -300,7 +300,7 @@ static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool sus
 			if (r)
 				goto out;
 		} else {
-			drm_sched_start(&ring->sched);
+			drm_sched_start(&ring->sched, 0);
 		}
 	}

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5907,7 +5907,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			if (!amdgpu_ring_sched_ready(ring))
 				continue;

-			drm_sched_start(&ring->sched);
+			drm_sched_start(&ring->sched, 0);
 		}

 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
@@ -6414,7 +6414,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
 		if (!amdgpu_ring_sched_ready(ring))
 			continue;

-		drm_sched_start(&ring->sched);
+		drm_sched_start(&ring->sched, 0);
 	}

 	amdgpu_device_unset_mp1_state(adev);
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -87,7 +87,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 			atomic_inc(&ring->adev->gpu_reset_counter);
 			amdgpu_fence_driver_force_completion(ring);
 			if (amdgpu_ring_sched_ready(ring))
-				drm_sched_start(&ring->sched);
+				drm_sched_start(&ring->sched, 0);
 			goto exit;
 		}
 	}
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -72,7 +72,7 @@ static enum drm_gpu_sched_stat etnaviv_sched_timedout_job(struct drm_sched_job

 	drm_sched_resubmit_jobs(&gpu->sched);

-	drm_sched_start(&gpu->sched);
+	drm_sched_start(&gpu->sched, 0);
 	return DRM_GPU_SCHED_STAT_NOMINAL;

 out_no_timeout:
--- a/drivers/gpu/drm/imagination/pvr_queue.c
+++ b/drivers/gpu/drm/imagination/pvr_queue.c
@@ -782,7 +782,7 @@ static void pvr_queue_start(struct pvr_queue *queue)
 		}
 	}

-	drm_sched_start(&queue->scheduler);
+	drm_sched_start(&queue->scheduler, 0);
 }

 /**
@@ -842,7 +842,7 @@ pvr_queue_timedout_job(struct drm_sched_job *s_job)
 	}
 	mutex_unlock(&pvr_dev->queues.lock);

-	drm_sched_start(sched);
+	drm_sched_start(sched, 0);

 	return DRM_GPU_SCHED_STAT_NOMINAL;
 }
--- a/drivers/gpu/drm/lima/lima_sched.c
+++ b/drivers/gpu/drm/lima/lima_sched.c
@@ -463,7 +463,7 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job
 	lima_pm_idle(ldev);

 	drm_sched_resubmit_jobs(&pipe->base);
-	drm_sched_start(&pipe->base);
+	drm_sched_start(&pipe->base, 0);

 	return DRM_GPU_SCHED_STAT_NOMINAL;
 }
--- a/drivers/gpu/drm/nouveau/nouveau_sched.c
+++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
@@ -379,7 +379,7 @@ nouveau_sched_timedout_job(struct drm_sched_job *sched_job)
 	else
 		NV_PRINTK(warn, job->cli, "Generic job timeout.\n");

-	drm_sched_start(sched);
+	drm_sched_start(sched, 0);

 	return stat;
 }
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -733,7 +733,7 @@ panfrost_reset(struct panfrost_device *pfdev,

 	/* Restart the schedulers */
 	for (i = 0; i < NUM_JOB_SLOTS; i++)
-		drm_sched_start(&pfdev->js->queue[i].sched);
+		drm_sched_start(&pfdev->js->queue[i].sched, 0);

 	/* Re-enable job interrupts now that everything has been restarted. */
 	job_write(pfdev, JOB_INT_MASK,
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -827,7 +827,7 @@ static void panthor_vm_stop(struct panthor_vm *vm)

 static void panthor_vm_start(struct panthor_vm *vm)
 {
-	drm_sched_start(&vm->sched);
+	drm_sched_start(&vm->sched, 0);
 }

 /**
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -2538,7 +2538,7 @@ static void queue_start(struct panthor_queue *queue)
 	list_for_each_entry(job, &queue->scheduler.pending_list, base.list)
 		job->base.s_fence->parent = dma_fence_get(job->done_fence);

-	drm_sched_start(&queue->scheduler);
+	drm_sched_start(&queue->scheduler, 0);
 }

 static void panthor_group_stop(struct panthor_group *group)
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -674,9 +674,10 @@ EXPORT_SYMBOL(drm_sched_stop);
 * drm_sched_start - recover jobs after a reset
 *
 * @sched: scheduler instance
+ * @errno: error to set on the pending fences
 *
 */
-void drm_sched_start(struct drm_gpu_scheduler *sched)
+void drm_sched_start(struct drm_gpu_scheduler *sched, int errno)
 {
 	struct drm_sched_job *s_job, *tmp;

@@ -691,13 +692,13 @@ void drm_sched_start(struct drm_gpu_scheduler *sched)
 		atomic_add(s_job->credits, &sched->credit_count);

 		if (!fence) {
-			drm_sched_job_done(s_job, -ECANCELED);
+			drm_sched_job_done(s_job, errno ?: -ECANCELED);
 			continue;
 		}

 		if (dma_fence_add_callback(fence, &s_job->cb,
 					   drm_sched_job_done_cb))
-			drm_sched_job_done(s_job, fence->error);
+			drm_sched_job_done(s_job, fence->error ?: errno);
 	}

 	drm_sched_start_timeout_unlocked(sched);
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -661,7 +661,7 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)

 	/* Unblock schedulers and restart their jobs. */
 	for (q = 0; q < V3D_MAX_QUEUES; q++) {
-		drm_sched_start(&v3d->queue[q].sched);
+		drm_sched_start(&v3d->queue[q].sched, 0);
 	}

 	mutex_unlock(&v3d->reset_lock);
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -579,7 +579,7 @@ bool drm_sched_wqueue_ready(struct drm_gpu_scheduler *sched);
 void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched);
 void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched);
 void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
-void drm_sched_start(struct drm_gpu_scheduler *sched);
+void drm_sched_start(struct drm_gpu_scheduler *sched, int errno);
 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
 void drm_sched_increase_karma(struct drm_sched_job *bad);
 void drm_sched_reset_karma(struct drm_sched_job *bad);