drm/amdgpu: refactor amdgpu_device_gpu_recover
Split amdgpu_device_gpu_recover into the following stages: halt activities,
asic reset, sched resume and gpu resume. The reason is that the subsequent
addition of DPC recovery code will be highly similar to the GPU reset path.

Signed-off-by: Ce Sun <cesun102@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
commit 11bb33766f
parent 921c040efe
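After the refactor, amdgpu_device_gpu_recover() is a thin sequencer over the four new stage helpers. The condensed C sketch below is not the verbatim kernel code: the RAS bail-out, the emergency-restart path, hive locking and the guilty-job check are trimmed, and the stage inputs are shown as plain locals; the full diff that follows is authoritative.

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                              struct amdgpu_job *job,
                              struct amdgpu_reset_context *reset_context)
{
        struct list_head device_list;           /* devices to reset: whole XGMI hive or just adev */
        struct amdgpu_hive_info *hive = NULL;   /* looked up via amdgpu_get_xgmi_hive() in the real code */
        bool need_emergency_restart = false;    /* from amdgpu_ras_need_emergency_restart() */
        bool job_signaled = false;              /* set when the guilty job already signaled */
        int r;

        INIT_LIST_HEAD(&device_list);

        /* Stage 1: quiesce schedulers and audio, build the reset list. */
        r = amdgpu_device_halt_activities(adev, job, reset_context,
                                          &device_list, hive,
                                          need_emergency_restart);
        if (r)
                goto end_reset;

        /* Stage 2: soft- or full-reset every device on the list. */
        r = amdgpu_device_asic_reset(adev, &device_list, reset_context);
        if (r)
                goto end_reset;

        /* Stage 3: restart the rings and DRM schedulers. */
        r = amdgpu_device_sched_resume(&device_list, reset_context,
                                       job_signaled);
        if (r)
                goto end_reset;

        /* Stage 4: resume KFD, audio and MP1 state, unlock the reset domain. */
        amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
end_reset:
        return r;
}

Because every stage operates on the same device_list, the upcoming DPC (PCIe link) recovery code can reuse the individual stages without duplicating the reset sequencing.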
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -829,6 +829,10 @@ struct amdgpu_mqd {
                         struct amdgpu_mqd_prop *p);
 };
 
+struct amdgpu_pcie_reset_ctx {
+        bool audio_suspended;
+};
+
 /*
  * Custom Init levels could be defined for different situations where a full
  * initialization of all hardware blocks are not expected. Sample cases are
@@ -1159,6 +1163,8 @@ struct amdgpu_device {
         struct pci_saved_state *pci_state;
         pci_channel_state_t pci_channel_state;
 
+        struct amdgpu_pcie_reset_ctx pcie_reset_ctx;
+
         /* Track auto wait count on s_barrier settings */
         bool barrier_has_auto_waitcnt;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5930,94 +5930,40 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle)
         return ret;
 }
 
-/**
- * amdgpu_device_gpu_recover - reset the asic and recover scheduler
- *
- * @adev: amdgpu_device pointer
- * @job: which job trigger hang
- * @reset_context: amdgpu reset context pointer
- *
- * Attempt to reset the GPU if it has hung (all asics).
- * Attempt to do soft-reset or full-reset and reinitialize Asic
- * Returns 0 for success or an error on failure.
- */
-
-int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
                               struct amdgpu_job *job,
-                              struct amdgpu_reset_context *reset_context)
+                              struct amdgpu_reset_context *reset_context,
+                              struct list_head *device_list,
+                              struct amdgpu_hive_info *hive,
+                              bool need_emergency_restart)
 {
-        struct list_head device_list, *device_list_handle = NULL;
-        bool job_signaled = false;
-        struct amdgpu_hive_info *hive = NULL;
+        struct list_head *device_list_handle = NULL;
         struct amdgpu_device *tmp_adev = NULL;
         int i, r = 0;
-        bool need_emergency_restart = false;
-        bool audio_suspended = false;
-        int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
 
-        /*
-         * If it reaches here because of hang/timeout and a RAS error is
-         * detected at the same time, let RAS recovery take care of it.
-         */
-        if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) &&
-            !amdgpu_sriov_vf(adev) &&
-            reset_context->src != AMDGPU_RESET_SRC_RAS) {
-                dev_dbg(adev->dev,
-                        "Gpu recovery from source: %d yielding to RAS error recovery handling",
-                        reset_context->src);
-                return 0;
-        }
-        /*
-         * Special case: RAS triggered and full reset isn't supported
-         */
-        need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
-
-        /*
-         * Flush RAM to disk so that after reboot
-         * the user can read log and see why the system rebooted.
-         */
-        if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
-                amdgpu_ras_get_context(adev)->reboot) {
-                DRM_WARN("Emergency reboot.");
-
-                ksys_sync_helper();
-                emergency_restart();
-        }
-
-        dev_info(adev->dev, "GPU %s begin!\n",
-                 need_emergency_restart ? "jobs stop":"reset");
-
-        if (!amdgpu_sriov_vf(adev))
-                hive = amdgpu_get_xgmi_hive(adev);
-        if (hive)
-                mutex_lock(&hive->hive_lock);
-
-        reset_context->job = job;
-        reset_context->hive = hive;
         /*
          * Build list of devices to reset.
          * In case we are in XGMI hive mode, resort the device list
          * to put adev in the 1st position.
          */
-        INIT_LIST_HEAD(&device_list);
         if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
-                        list_add_tail(&tmp_adev->reset_list, &device_list);
+                        list_add_tail(&tmp_adev->reset_list, device_list);
                         if (adev->shutdown)
                                 tmp_adev->shutdown = true;
                 }
-                if (!list_is_first(&adev->reset_list, &device_list))
-                        list_rotate_to_front(&adev->reset_list, &device_list);
-                device_list_handle = &device_list;
+                if (!list_is_first(&adev->reset_list, device_list))
+                        list_rotate_to_front(&adev->reset_list, device_list);
+                device_list_handle = device_list;
         } else {
-                list_add_tail(&adev->reset_list, &device_list);
-                device_list_handle = &device_list;
+                list_add_tail(&adev->reset_list, device_list);
+                device_list_handle = device_list;
         }
 
         if (!amdgpu_sriov_vf(adev)) {
                 r = amdgpu_device_health_check(device_list_handle);
                 if (r)
-                        goto end_reset;
+                        return r;
         }
 
         /* We need to lock reset domain only once both for XGMI and single device */
@@ -6041,7 +5987,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                  * some audio codec errors.
                  */
                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
-                        audio_suspended = true;
+                        tmp_adev->pcie_reset_ctx.audio_suspended = true;
 
                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
 
@@ -6076,23 +6022,19 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                 atomic_inc(&tmp_adev->gpu_reset_counter);
         }
 
-        if (need_emergency_restart)
-                goto skip_sched_resume;
+        return r;
+}
 
-        /*
-         * Must check guilty signal here since after this point all old
-         * HW fences are force signaled.
-         *
-         * job->base holds a reference to parent fence
-         */
-        if (job && dma_fence_is_signaled(&job->hw_fence)) {
-                job_signaled = true;
-                dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
-                goto skip_hw_reset;
-        }
+static int amdgpu_device_asic_reset(struct amdgpu_device *adev,
+                                    struct list_head *device_list,
+                                    struct amdgpu_reset_context *reset_context)
+{
+        struct amdgpu_device *tmp_adev = NULL;
+        int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
+        int r = 0;
 
 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
-        list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+        list_for_each_entry(tmp_adev, device_list, reset_list) {
                 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
                 /*TODO Should we stop ?*/
                 if (r) {
@@ -6119,12 +6061,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                 if (r)
                         adev->asic_reset_res = r;
         } else {
-                r = amdgpu_do_asic_reset(device_list_handle, reset_context);
+                r = amdgpu_do_asic_reset(device_list, reset_context);
                 if (r && r == -EAGAIN)
                         goto retry;
         }
 
-        list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+        list_for_each_entry(tmp_adev, device_list, reset_list) {
                 /*
                  * Drop any pending non scheduler resets queued before reset is done.
                  * Any reset scheduled after this point would be valid. Scheduler resets
@@ -6134,10 +6076,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                 amdgpu_device_stop_pending_resets(tmp_adev);
         }
 
-skip_hw_reset:
+        return r;
+}
 
+static int amdgpu_device_sched_resume(struct list_head *device_list,
+                                      struct amdgpu_reset_context *reset_context,
+                                      bool job_signaled)
+{
+        struct amdgpu_device *tmp_adev = NULL;
+        int i, r = 0;
+
         /* Post ASIC reset for all devs .*/
-        list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+        list_for_each_entry(tmp_adev, device_list, reset_list) {
 
                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                         struct amdgpu_ring *ring = tmp_adev->rings[i];
@@ -6173,8 +6123,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                 }
         }
 
-skip_sched_resume:
-        list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+        return r;
+}
+
+static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
+                                     struct list_head *device_list,
+                                     bool need_emergency_restart)
+{
+        struct amdgpu_device *tmp_adev = NULL;
+
+        list_for_each_entry(tmp_adev, device_list, reset_list) {
                 /* unlock kfd: SRIOV would do it separately */
                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
                         amdgpu_amdkfd_post_reset(tmp_adev);
@@ -6185,18 +6143,114 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                 if (!adev->kfd.init_complete)
                         amdgpu_amdkfd_device_init(adev);
 
-                if (audio_suspended)
+                if (tmp_adev->pcie_reset_ctx.audio_suspended)
                         amdgpu_device_resume_display_audio(tmp_adev);
 
                 amdgpu_device_unset_mp1_state(tmp_adev);
 
                 amdgpu_ras_set_error_query_ready(tmp_adev, true);
 
         }
 
-        tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
+        tmp_adev = list_first_entry(device_list, struct amdgpu_device,
                                     reset_list);
         amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
 
+}
+
+/**
+ * amdgpu_device_gpu_recover - reset the asic and recover scheduler
+ *
+ * @adev: amdgpu_device pointer
+ * @job: which job trigger hang
+ * @reset_context: amdgpu reset context pointer
+ *
+ * Attempt to reset the GPU if it has hung (all asics).
+ * Attempt to do soft-reset or full-reset and reinitialize Asic
+ * Returns 0 for success or an error on failure.
+ */
+
+int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+                              struct amdgpu_job *job,
+                              struct amdgpu_reset_context *reset_context)
+{
+        struct list_head device_list;
+        bool job_signaled = false;
+        struct amdgpu_hive_info *hive = NULL;
+        int r = 0;
+        bool need_emergency_restart = false;
+
+        /*
+         * If it reaches here because of hang/timeout and a RAS error is
+         * detected at the same time, let RAS recovery take care of it.
+         */
+        if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) &&
+            !amdgpu_sriov_vf(adev) &&
+            reset_context->src != AMDGPU_RESET_SRC_RAS) {
+                dev_dbg(adev->dev,
+                        "Gpu recovery from source: %d yielding to RAS error recovery handling",
+                        reset_context->src);
+                return 0;
+        }
+
+        /*
+         * Special case: RAS triggered and full reset isn't supported
+         */
+        need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
+
+        /*
+         * Flush RAM to disk so that after reboot
+         * the user can read log and see why the system rebooted.
+         */
+        if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
+                amdgpu_ras_get_context(adev)->reboot) {
+                DRM_WARN("Emergency reboot.");
+
+                ksys_sync_helper();
+                emergency_restart();
+        }
+
+        dev_info(adev->dev, "GPU %s begin!\n",
+                 need_emergency_restart ? "jobs stop":"reset");
+
+        if (!amdgpu_sriov_vf(adev))
+                hive = amdgpu_get_xgmi_hive(adev);
+        if (hive)
+                mutex_lock(&hive->hive_lock);
+
+        reset_context->job = job;
+        reset_context->hive = hive;
+        INIT_LIST_HEAD(&device_list);
+
+        r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
+                                          hive, need_emergency_restart);
+        if (r)
+                goto end_reset;
+
+        if (need_emergency_restart)
+                goto skip_sched_resume;
+        /*
+         * Must check guilty signal here since after this point all old
+         * HW fences are force signaled.
+         *
+         * job->base holds a reference to parent fence
+         */
+        if (job && dma_fence_is_signaled(&job->hw_fence)) {
+                job_signaled = true;
+                dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
+                goto skip_hw_reset;
+        }
+
+        r = amdgpu_device_asic_reset(adev, &device_list, reset_context);
+        if (r)
+                goto end_reset;
+skip_hw_reset:
+        r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled);
+        if (r)
+                goto end_reset;
+skip_sched_resume:
+        amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
 end_reset:
         if (hive) {
                 mutex_unlock(&hive->hive_lock);