From c9a6b82f45e261d247b980a7949aaa6a9bfffe01 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Wed, 29 Jul 2020 12:59:45 -0400 Subject: drm/amdgpu: Implement DPC recovery Add PCI Downstream Port Containment (DPC) with basic recovery functionality v2: remove pci_save_state to avoid breaking suspend/resume v3: Fix style comments v4: Improve description. Signed-off-by: Andrey Grodzovsky Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 161 +++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f7307af76452..99c0e6e53e84 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2999,6 +2999,7 @@ static const struct attribute *amdgpu_dev_attributes[] = { NULL }; + /** * amdgpu_device_init - initialize the driver * @@ -3217,6 +3218,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, } } + pci_enable_pcie_error_reporting(adev->ddev.pdev); + /* Post card if necessary */ if (amdgpu_device_need_post(adev)) { if (!adev->bios) { @@ -4705,3 +4708,161 @@ int amdgpu_device_baco_exit(struct drm_device *dev) return 0; } + +/** + * amdgpu_pci_error_detected - Called when a PCI error is detected. + * @pdev: PCI device struct + * @state: PCI channel state + * + * Description: Called when a PCI error is detected. + * + * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. + */ +pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + + DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); + + switch (state) { + case pci_channel_io_normal: + return PCI_ERS_RESULT_CAN_RECOVER; + case pci_channel_io_frozen: + /* Fatal error, prepare for slot reset */ + amdgpu_device_lock_adev(adev); + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + /* Permanent error, prepare for device removal */ + return PCI_ERS_RESULT_DISCONNECT; + } + + return PCI_ERS_RESULT_NEED_RESET; +} + +/** + * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers + * @pdev: pointer to PCI device + */ +pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) +{ + + DRM_INFO("PCI error: mmio enabled callback!!\n"); + + /* TODO - dump whatever for debugging purposes */ + + /* This called only if amdgpu_pci_error_detected returns + * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still + * works, no need to reset slot. + */ + + return PCI_ERS_RESULT_RECOVERED; +} + +/** + * amdgpu_pci_slot_reset - Called when PCI slot has been reset. + * @pdev: PCI device struct + * + * Description: This routine is called by the pci error recovery + * code after the PCI slot has been reset, just before we + * should resume normal operations. + */ +pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int r; + bool vram_lost; + + DRM_INFO("PCI error: slot reset callback!!\n"); + + pci_restore_state(pdev); + + r = amdgpu_device_ip_suspend(adev); + if (r) + goto out; + + + /* post card */ + r = amdgpu_atom_asic_init(adev->mode_info.atom_context); + if (r) + goto out; + + r = amdgpu_device_ip_resume_phase1(adev); + if (r) + goto out; + + vram_lost = amdgpu_device_check_vram_lost(adev); + if (vram_lost) { + DRM_INFO("VRAM is lost due to GPU reset!\n"); + amdgpu_inc_vram_lost(adev); + } + + r = amdgpu_gtt_mgr_recover( + &adev->mman.bdev.man[TTM_PL_TT]); + if (r) + goto out; + + r = amdgpu_device_fw_loading(adev); + if (r) + return r; + + r = amdgpu_device_ip_resume_phase2(adev); + if (r) + goto out; + + if (vram_lost) + amdgpu_device_fill_reset_magic(adev); + + /* + * Add this ASIC as tracked as reset was already + * complete successfully. + */ + amdgpu_register_gpu_instance(adev); + + r = amdgpu_device_ip_late_init(adev); + if (r) + goto out; + + amdgpu_fbdev_set_suspend(adev, 0); + + /* must succeed. */ + amdgpu_ras_resume(adev); + + + amdgpu_irq_gpu_reset_resume_helper(adev); + r = amdgpu_ib_ring_tests(adev); + if (r) + goto out; + + r = amdgpu_device_recover_vram(adev); + +out: + + if (!r) { + DRM_INFO("PCIe error recovery succeeded\n"); + } else { + DRM_ERROR("PCIe error recovery failed, err:%d", r); + amdgpu_device_unlock_adev(adev); + } + + return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; +} + +/** + * amdgpu_pci_resume() - resume normal ops after PCI reset + * @pdev: pointer to PCI device + * + * Called when the error recovery driver tells us that its + * OK to resume normal operation. Use completion to allow + * halted scsi ops to resume. + */ +void amdgpu_pci_resume(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + + amdgpu_device_unlock_adev(adev); + + DRM_INFO("PCI error: resume callback!!\n"); +} -- cgit v1.2.3 From bf36b52e781d7412c3fce826f74ba6a73b9be4d0 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Wed, 29 Jul 2020 13:10:29 -0400 Subject: drm/amdgpu: Avoid accessing HW when suspending SW state At this point the ASIC is already post reset by the HW/PSP so the HW not in proper state to be configured for suspension, some blocks might be even gated and so best is to avoid touching it. v2: Rename in_dpc to more meaningful name Signed-off-by: Andrey Grodzovsky Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 99c0e6e53e84..4d9a8734ea20 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -319,6 +319,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, { uint32_t ret; + if (adev->in_pci_err_recovery) + return 0; + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) && down_read_trylock(&adev->reset_sem)) { ret = amdgpu_kiq_rreg(adev, reg); @@ -356,6 +359,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, * Returns the 8 bit value from the offset specified. */ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { + if (adev->in_pci_err_recovery) + return 0; + if (offset < adev->rmmio_size) return (readb(adev->rmmio + offset)); BUG(); @@ -377,6 +383,9 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { * Writes the value specified to the offset specified. */ void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) { + if (adev->in_pci_err_recovery) + return; + if (offset < adev->rmmio_size) writeb(value, adev->rmmio + offset); else @@ -387,6 +396,9 @@ static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { + if (adev->in_pci_err_recovery) + return; + trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); if ((reg * 4) < adev->rmmio_size) @@ -414,6 +426,9 @@ static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { + if (adev->in_pci_err_recovery) + return; + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) && down_read_trylock(&adev->reset_sem)) { amdgpu_kiq_wreg(adev, reg, v); @@ -432,6 +447,9 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { + if (adev->in_pci_err_recovery) + return; + if (amdgpu_sriov_fullaccess(adev) && adev->gfx.rlc.funcs && adev->gfx.rlc.funcs->is_rlcg_access_range) { @@ -453,6 +471,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t */ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) { + if (adev->in_pci_err_recovery) + return 0; + if ((reg * 4) < adev->rio_mem_size) return ioread32(adev->rio_mem + (reg * 4)); else { @@ -472,6 +493,9 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) */ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) { + if (adev->in_pci_err_recovery) + return; + if ((reg * 4) < adev->rio_mem_size) iowrite32(v, adev->rio_mem + (reg * 4)); else { @@ -491,6 +515,9 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) */ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) { + if (adev->in_pci_err_recovery) + return 0; + if (index < adev->doorbell.num_doorbells) { return readl(adev->doorbell.ptr + index); } else { @@ -511,6 +538,9 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) */ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) { + if (adev->in_pci_err_recovery) + return; + if (index < adev->doorbell.num_doorbells) { writel(v, adev->doorbell.ptr + index); } else { @@ -529,6 +559,9 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) */ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) { + if (adev->in_pci_err_recovery) + return 0; + if (index < adev->doorbell.num_doorbells) { return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); } else { @@ -549,6 +582,9 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) */ void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) { + if (adev->in_pci_err_recovery) + return; + if (index < adev->doorbell.num_doorbells) { atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); } else { @@ -4778,7 +4814,9 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) pci_restore_state(pdev); + adev->in_pci_err_recovery = true; r = amdgpu_device_ip_suspend(adev); + adev->in_pci_err_recovery = false; if (r) goto out; -- cgit v1.2.3 From acd89fca672dd2a7c3a83038cce7601498f21105 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Wed, 29 Jul 2020 14:14:36 -0400 Subject: drm/amdgpu: Block all job scheduling activity during DPC recovery DPC recovery involves ASIC reset just as normal GPU recovery so block SW GPU schedulers and wait on all concurrent GPU resets. Signed-off-by: Andrey Grodzovsky Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 57 +++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 4d9a8734ea20..2d155355a05f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4745,6 +4745,20 @@ int amdgpu_device_baco_exit(struct drm_device *dev) return 0; } +static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) +{ + int i; + + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + if (!ring || !ring->sched.thread) + continue; + + cancel_delayed_work_sync(&ring->sched.work_tdr); + } +} + /** * amdgpu_pci_error_detected - Called when a PCI error is detected. * @pdev: PCI device struct @@ -4758,15 +4772,37 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta { struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); + int i; DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); switch (state) { case pci_channel_io_normal: return PCI_ERS_RESULT_CAN_RECOVER; - case pci_channel_io_frozen: - /* Fatal error, prepare for slot reset */ - amdgpu_device_lock_adev(adev); + /* Fatal error, prepare for slot reset */ + case pci_channel_io_frozen: + /* + * Cancel and wait for all TDRs in progress if failing to + * set adev->in_gpu_reset in amdgpu_device_lock_adev + * + * Locking adev->reset_sem will prevent any external access + * to GPU during PCI error recovery + */ + while (!amdgpu_device_lock_adev(adev, NULL)) + amdgpu_cancel_all_tdr(adev); + + /* + * Block any work scheduling as we do for regular GPU reset + * for the duration of the recovery + */ + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + if (!ring || !ring->sched.thread) + continue; + + drm_sched_stop(&ring->sched, NULL); + } return PCI_ERS_RESULT_NEED_RESET; case pci_channel_io_perm_failure: /* Permanent error, prepare for device removal */ @@ -4899,8 +4935,21 @@ void amdgpu_pci_resume(struct pci_dev *pdev) { struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); + int i; - amdgpu_device_unlock_adev(adev); DRM_INFO("PCI error: resume callback!!\n"); + + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + if (!ring || !ring->sched.thread) + continue; + + + drm_sched_resubmit_jobs(&ring->sched); + drm_sched_start(&ring->sched, true); + } + + amdgpu_device_unlock_adev(adev); } -- cgit v1.2.3 From 362c7b91c1b00e31ebd219af1e493e7ef50ccfb4 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Mon, 24 Aug 2020 11:02:18 -0400 Subject: drm/amdgpu: Fix SMU error failure Wait for HW/PSP initiated ASIC reset to complete before starting the recovery operations. v2: Remove typo Signed-off-by: Andrey Grodzovsky Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 2d155355a05f..36a8f128fb6b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4843,14 +4843,32 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) { struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); - int r; + int r, i; bool vram_lost; + u32 memsize; DRM_INFO("PCI error: slot reset callback!!\n"); + /* wait for asic to come out of reset */ + msleep(500); + pci_restore_state(pdev); - adev->in_pci_err_recovery = true; + /* confirm ASIC came out of reset */ + for (i = 0; i < adev->usec_timeout; i++) { + memsize = amdgpu_asic_get_config_memsize(adev); + + if (memsize != 0xffffffff) + break; + udelay(1); + } + if (memsize == 0xffffffff) { + r = -ETIME; + goto out; + } + + /* TODO Call amdgpu_pre_asic_reset instead */ + adev->in_pci_err_recovery = true; r = amdgpu_device_ip_suspend(adev); adev->in_pci_err_recovery = false; if (r) -- cgit v1.2.3 From c1dd4aa624076cb6d4724fad2d9e9e71e46bbc9f Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Mon, 24 Aug 2020 12:30:47 -0400 Subject: drm/amdgpu: Fix consecutive DPC recovery failures. Cache the PCI state on boot and before each case where we might loose it. v2: Add pci_restore_state while caching the PCI state to avoid breaking PCI core logic for stuff like suspend/resume. v3: Extract pci_restore_state from amdgpu_device_cache_pci_state to avoid superflous restores during GPU resets and suspend/resumes. v4: Style fixes. Signed-off-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 62 ++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 36a8f128fb6b..9be9355bb0e0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1292,7 +1292,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; pci_set_power_state(dev->pdev, PCI_D0); - pci_restore_state(dev->pdev); + amdgpu_device_load_pci_state(dev->pdev); r = pci_enable_device(dev->pdev); if (r) DRM_WARN("pci_enable_device failed (%d)\n", r); @@ -1305,7 +1305,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, drm_kms_helper_poll_disable(dev); dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; amdgpu_device_suspend(dev, true); - pci_save_state(dev->pdev); + amdgpu_device_cache_pci_state(dev->pdev); /* Shut down the device */ pci_disable_device(dev->pdev); pci_set_power_state(dev->pdev, PCI_D3cold); @@ -3408,6 +3408,10 @@ fence_driver_init: if (r) dev_err(adev->dev, "amdgpu_pmu_init failed\n"); + /* Have stored pci confspace at hand for restore in sudden PCI error */ + if (amdgpu_device_cache_pci_state(adev->pdev)) + pci_restore_state(pdev); + return 0; failed: @@ -3432,6 +3436,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev) flush_delayed_work(&adev->delayed_init_work); adev->shutdown = true; + kfree(adev->pci_state); + /* make sure IB test finished before entering exclusive mode * to avoid preemption on IB test * */ @@ -4852,7 +4858,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) /* wait for asic to come out of reset */ msleep(500); - pci_restore_state(pdev); + amdgpu_device_load_pci_state(pdev); /* confirm ASIC came out of reset */ for (i = 0; i < adev->usec_timeout; i++) { @@ -4932,6 +4938,9 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) out: if (!r) { + if (amdgpu_device_cache_pci_state(adev->pdev)) + pci_restore_state(adev->pdev); + DRM_INFO("PCIe error recovery succeeded\n"); } else { DRM_ERROR("PCIe error recovery failed, err:%d", r); @@ -4971,3 +4980,50 @@ void amdgpu_pci_resume(struct pci_dev *pdev) amdgpu_device_unlock_adev(adev); } + +bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int r; + + r = pci_save_state(pdev); + if (!r) { + kfree(adev->pci_state); + + adev->pci_state = pci_store_saved_state(pdev); + + if (!adev->pci_state) { + DRM_ERROR("Failed to store PCI saved state"); + return false; + } + } else { + DRM_WARN("Failed to save PCI state, err:%d\n", r); + return false; + } + + return true; +} + +bool amdgpu_device_load_pci_state(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int r; + + if (!adev->pci_state) + return false; + + r = pci_load_saved_state(pdev, adev->pci_state); + + if (!r) { + pci_restore_state(pdev); + } else { + DRM_WARN("Failed to load PCI state, err:%d\n", r); + return false; + } + + return true; +} + + -- cgit v1.2.3 From 7ac71382e90aba1f58e304e386acdd02af963027 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Mon, 24 Aug 2020 14:27:40 -0400 Subject: drm/amdgpu: Trim amdgpu_pci_slot_reset by reusing code. Reuse exsisting functions from GPU recovery to avoid code duplications. Signed-off-by: Andrey Grodzovsky Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 73 +++++------------------------- 1 file changed, 12 insertions(+), 61 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9be9355bb0e0..ab42d0088f57 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4117,7 +4117,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, struct list_head *device_list_handle, - bool *need_full_reset_arg) + bool *need_full_reset_arg, + bool skip_hw_reset) { struct amdgpu_device *tmp_adev = NULL; bool need_full_reset = *need_full_reset_arg, vram_lost = false; @@ -4127,7 +4128,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, * ASIC reset has to be done on all HGMI hive nodes ASAP * to allow proper links negotiation in FW (within 1 sec) */ - if (need_full_reset) { + if (!skip_hw_reset && need_full_reset) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { /* For XGMI run all resets in parallel to speed up the process */ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { @@ -4522,7 +4523,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ if (r) adev->asic_reset_res = r; } else { - r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset); + r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false); if (r && r == -EAGAIN) goto retry; } @@ -4850,14 +4851,19 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); int r, i; - bool vram_lost; + bool need_full_reset = true; u32 memsize; + struct list_head device_list; DRM_INFO("PCI error: slot reset callback!!\n"); + INIT_LIST_HEAD(&device_list); + list_add_tail(&adev->gmc.xgmi.head, &device_list); + /* wait for asic to come out of reset */ msleep(500); + /* Restore PCI confspace */ amdgpu_device_load_pci_state(pdev); /* confirm ASIC came out of reset */ @@ -4873,70 +4879,15 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) goto out; } - /* TODO Call amdgpu_pre_asic_reset instead */ adev->in_pci_err_recovery = true; - r = amdgpu_device_ip_suspend(adev); + r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); adev->in_pci_err_recovery = false; if (r) goto out; - - /* post card */ - r = amdgpu_atom_asic_init(adev->mode_info.atom_context); - if (r) - goto out; - - r = amdgpu_device_ip_resume_phase1(adev); - if (r) - goto out; - - vram_lost = amdgpu_device_check_vram_lost(adev); - if (vram_lost) { - DRM_INFO("VRAM is lost due to GPU reset!\n"); - amdgpu_inc_vram_lost(adev); - } - - r = amdgpu_gtt_mgr_recover( - &adev->mman.bdev.man[TTM_PL_TT]); - if (r) - goto out; - - r = amdgpu_device_fw_loading(adev); - if (r) - return r; - - r = amdgpu_device_ip_resume_phase2(adev); - if (r) - goto out; - - if (vram_lost) - amdgpu_device_fill_reset_magic(adev); - - /* - * Add this ASIC as tracked as reset was already - * complete successfully. - */ - amdgpu_register_gpu_instance(adev); - - r = amdgpu_device_ip_late_init(adev); - if (r) - goto out; - - amdgpu_fbdev_set_suspend(adev, 0); - - /* must succeed. */ - amdgpu_ras_resume(adev); - - - amdgpu_irq_gpu_reset_resume_helper(adev); - r = amdgpu_ib_ring_tests(adev); - if (r) - goto out; - - r = amdgpu_device_recover_vram(adev); + r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); out: - if (!r) { if (amdgpu_device_cache_pci_state(adev->pdev)) pci_restore_state(adev->pdev); -- cgit v1.2.3 From 6894305c97ceb56c0caa6f701516546a732af95f Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Mon, 24 Aug 2020 14:41:56 -0400 Subject: drm/amdgpu: Disable DPC for XGMI for now. XGMI support is more complicated than single device support as questions of synchronization between the device recovering from PCI error and other members of the hive are required. Leaving this for next round. Signed-off-by: Andrey Grodzovsky Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ab42d0088f57..36231fe9e4e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4783,6 +4783,11 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); + if (adev->gmc.xgmi.num_physical_nodes > 1) { + DRM_WARN("No support for XGMI hive yet..."); + return PCI_ERS_RESULT_DISCONNECT; + } + switch (state) { case pci_channel_io_normal: return PCI_ERS_RESULT_CAN_RECOVER; -- cgit v1.2.3 From 7cbbc745dc11a75e11df532ca1566949709d6695 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Mon, 31 Aug 2020 11:34:33 -0400 Subject: drm/amdgpu: Minor checkpatch fix Signed-off-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 36231fe9e4e5..d633e5448476 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -358,7 +358,8 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, * * Returns the 8 bit value from the offset specified. */ -uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { +uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) +{ if (adev->in_pci_err_recovery) return 0; @@ -382,7 +383,8 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { * * Writes the value specified to the offset specified. */ -void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) { +void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) +{ if (adev->in_pci_err_recovery) return; -- cgit v1.2.3 From 5aea5327ea2ddf544cbeff096f45fc2319b0714e Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Wed, 16 Sep 2020 13:03:50 -0400 Subject: drm/amdgpu: No sysfs, not an error condition Not being able to create amdgpu sysfs attributes is not a fatal error warranting not to continue to try to bring up the display. Thus, if we get an error trying to create amdgpu sysfs attrs, report it and continue on to try to bring up a display. Signed-off-by: Luben Tuikov Acked-by: Slava Abramov Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d633e5448476..53b9d86da596 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3400,10 +3400,8 @@ fence_driver_init: flush_delayed_work(&adev->delayed_init_work); r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); - if (r) { + if (r) dev_err(adev->dev, "Could not create amdgpu device attr\n"); - return r; - } if (IS_ENABLED(CONFIG_PERF_EVENTS)) r = amdgpu_pmu_init(adev); -- cgit v1.2.3 From 4192f7b5768912ceda82be2f83c87ea7181f9980 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 16 Sep 2020 10:28:55 -0400 Subject: drm/amdgpu: unmap register bar on device init failure We never unmapped the regiser BAR on failure. Reported-by: kernel test robot Reported-by: Dan Carpenter Reviewed-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 53b9d86da596..2ff43a3d52fc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3209,13 +3209,13 @@ int amdgpu_device_init(struct amdgpu_device *adev, r = amdgpu_device_get_job_timeout_settings(adev); if (r) { dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); - return r; + goto failed_unmap; } /* early init functions */ r = amdgpu_device_ip_early_init(adev); if (r) - return r; + goto failed_unmap; /* doorbell bar mapping and doorbell index init*/ amdgpu_device_doorbell_init(adev); @@ -3419,6 +3419,10 @@ failed: if (boco) vga_switcheroo_fini_domain_pm_ops(adev->dev); +failed_unmap: + iounmap(adev->rmmio); + adev->rmmio = NULL; + return r; } -- cgit v1.2.3