diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 83 |
1 files changed, 58 insertions, 25 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 1a1395c5fff1..1d9eda883bb8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3146,7 +3146,42 @@ static int amdgpu_ras_page_retirement_thread(void *param) return 0; } -int amdgpu_ras_recovery_init(struct amdgpu_device *adev) +int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + int ret; + + if (!con || amdgpu_sriov_vf(adev)) + return 0; + + ret = amdgpu_ras_eeprom_init(&con->eeprom_control); + + if (ret) + return ret; + + /* HW not usable */ + if (amdgpu_ras_is_rma(adev)) + return -EHWPOISON; + + if (con->eeprom_control.ras_num_recs) { + ret = amdgpu_ras_load_bad_pages(adev); + if (ret) + return ret; + + amdgpu_dpm_send_hbm_bad_pages_num( + adev, con->eeprom_control.ras_num_recs); + + if (con->update_channel_flag == true) { + amdgpu_dpm_send_hbm_bad_channel_flag( + adev, con->eeprom_control.bad_channel_bitmap); + con->update_channel_flag = false; + } + } + + return ret; +} + +int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; @@ -3181,31 +3216,10 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control); amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); - /* Todo: During test the SMU might fail to read the eeprom through I2C - * when the GPU is pending on XGMI reset during probe time - * (Mostly after second bus reset), skip it now - */ - if (adev->gmc.xgmi.pending_reset) - return 0; - ret = amdgpu_ras_eeprom_init(&con->eeprom_control); - /* - * This calling fails when is_rma is true or - * ret != 0. - */ - if (amdgpu_ras_is_rma(adev) || ret) - goto free; - - if (con->eeprom_control.ras_num_recs) { - ret = amdgpu_ras_load_bad_pages(adev); + if (init_bp_info) { + ret = amdgpu_ras_init_badpage_info(adev); if (ret) goto free; - - amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); - - if (con->update_channel_flag == true) { - amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); - con->update_channel_flag = false; - } } mutex_init(&con->page_rsv_lock); @@ -4294,8 +4308,27 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; } - if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) + if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) { + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + int hive_ras_recovery = 0; + + if (hive) { + hive_ras_recovery = atomic_read(&hive->ras_recovery); + amdgpu_put_xgmi_hive(hive); + } + /* In the case of multiple GPUs, after a GPU has started + * resetting all GPUs on hive, other GPUs do not need to + * trigger GPU reset again. + */ + if (!hive_ras_recovery) + amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); + else + atomic_set(&ras->in_recovery, 0); + } else { + flush_work(&ras->recovery_work); amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); + } + return 0; } |