aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c76
1 files changed, 25 insertions, 51 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 4be5187391c6..e75deb789e56 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4152,16 +4152,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
}
need_full_reset = job_signaled = false;
- INIT_LIST_HEAD(&device_list);
-
- amdgpu_ras_set_error_query_ready(adev, false);
dev_info(adev->dev, "GPU %s begin!\n",
(in_ras_intr && !use_baco) ? "jobs stop":"reset");
- cancel_delayed_work_sync(&adev->delayed_init_work);
-
- hive = amdgpu_get_xgmi_hive(adev, false);
+ hive = amdgpu_get_xgmi_hive(adev, true);
/*
* Here we trylock to avoid chain of resets executing from
@@ -4174,35 +4169,21 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
if (hive && !mutex_trylock(&hive->reset_lock)) {
DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
job ? job->base.id : -1, hive->hive_id);
+ mutex_unlock(&hive->hive_lock);
return 0;
}
- /* Start with adev pre asic reset first for soft reset check.*/
- if (!amdgpu_device_lock_adev(adev, !hive)) {
- DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
- job ? job->base.id : -1);
- return 0;
- }
-
- /* Block kfd: SRIOV would do it separately */
- if (!amdgpu_sriov_vf(adev))
- amdgpu_amdkfd_pre_reset(adev);
-
- /* Build list of devices to reset */
- if (adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!hive) {
- /*unlock kfd: SRIOV would do it separately */
- if (!amdgpu_sriov_vf(adev))
- amdgpu_amdkfd_post_reset(adev);
- amdgpu_device_unlock_adev(adev);
+ /*
+ * Build list of devices to reset.
+ * In case we are in XGMI hive mode, resort the device list
+ * to put adev in the 1st position.
+ */
+ INIT_LIST_HEAD(&device_list);
+ if (adev->gmc.xgmi.num_physical_nodes > 1) {
+ if (!hive)
return -ENODEV;
- }
-
- /*
- * In case we are in XGMI hive mode device reset is done for all the
- * nodes in the hive to retrain all XGMI links and hence the reset
- * sequence is executed in loop on all nodes.
- */
+ if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
+ list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
device_list_handle = &hive->device_list;
} else {
list_add_tail(&adev->gmc.xgmi.head, &device_list);
@@ -4211,15 +4192,20 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
- if (tmp_adev != adev) {
- amdgpu_ras_set_error_query_ready(tmp_adev, false);
- amdgpu_device_lock_adev(tmp_adev, false);
- if (!amdgpu_sriov_vf(tmp_adev))
- amdgpu_amdkfd_pre_reset(tmp_adev);
+ if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
+ DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
+ job ? job->base.id : -1);
+ mutex_unlock(&hive->hive_lock);
+ return 0;
}
+ amdgpu_ras_set_error_query_ready(tmp_adev, false);
+
cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
+ if (!amdgpu_sriov_vf(tmp_adev))
+ amdgpu_amdkfd_pre_reset(tmp_adev);
+
/*
* Mark these ASICs to be reseted as untracked first
* And add them back after reset completed
@@ -4265,22 +4251,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
goto skip_hw_reset;
}
-
- /* Guilty job will be freed after this*/
- r = amdgpu_device_pre_asic_reset(adev, job, &need_full_reset);
- if (r) {
- /*TODO Should we stop ?*/
- DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
- r, adev->ddev->unique);
- adev->asic_reset_res = r;
- }
-
retry: /* Rest of adevs pre asic reset from XGMI hive. */
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-
- if (tmp_adev == adev)
- continue;
-
r = amdgpu_device_pre_asic_reset(tmp_adev,
NULL,
&need_full_reset);
@@ -4345,8 +4317,10 @@ skip_sched_resume:
amdgpu_device_unlock_adev(tmp_adev);
}
- if (hive)
+ if (hive) {
mutex_unlock(&hive->reset_lock);
+ mutex_unlock(&hive->hive_lock);
+ }
if (r)
dev_info(adev->dev, "GPU reset end with ret = %d\n", r);