diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 86 |
1 files changed, 54 insertions, 32 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 443409d4f4b0..dc07936d2fcb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1498,6 +1498,9 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, !amdgpu_ras_get_aca_debug_mode(adev)) return -EOPNOTSUPP; + if (amdgpu_sriov_vf(adev)) + return -EOPNOTSUPP; + /* skip ras error reset in gpu reset */ if ((amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) && ((smu_funcs && smu_funcs->set_debug_mode) || @@ -2161,7 +2164,7 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev) /* Fatal error events are handled on host side */ if (amdgpu_sriov_vf(adev)) return; - /** + /* * If the current interrupt is caused by a non-fatal RAS error, skip * check for fatal error. For fatal errors, FED status of all devices * in XGMI hive gets set when the first device gets fatal error @@ -2886,6 +2889,7 @@ static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev, if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data)) return -EINVAL; } + return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev->umc.retire_unit); } @@ -2900,7 +2904,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, &adev->psp.ras_context.ras->eeprom_control; enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE; int ret = 0; - uint32_t i; + uint32_t i = 0; if (!con || !con->eh_data || !bps || pages <= 0) return 0; @@ -2921,34 +2925,36 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, mutex_lock(&con->recovery_lock); if (from_rom) { - for (i = 0; i < pages; i++) { - if (control->ras_num_recs - i >= adev->umc.retire_unit) { - if ((bps[i].address == bps[i + 1].address) && - (bps[i].mem_channel == bps[i + 1].mem_channel)) { - //deal with retire_unit records a time - ret = __amdgpu_ras_convert_rec_array_from_rom(adev, - &bps[i], &err_data, nps); - if (ret) - goto free; - i += (adev->umc.retire_unit - 1); + /* there is no pa recs in V3, so skip pa recs processing */ + if (control->tbl_hdr.version < RAS_TABLE_VER_V3) { + for (i = 0; i < pages; i++) { + if (control->ras_num_recs - i >= adev->umc.retire_unit) { + if ((bps[i].address == bps[i + 1].address) && + (bps[i].mem_channel == bps[i + 1].mem_channel)) { + /* deal with retire_unit records a time */ + ret = __amdgpu_ras_convert_rec_array_from_rom(adev, + &bps[i], &err_data, nps); + if (ret) + control->ras_num_bad_pages -= adev->umc.retire_unit; + i += (adev->umc.retire_unit - 1); + } else { + break; + } } else { break; } - } else { - break; } } for (; i < pages; i++) { ret = __amdgpu_ras_convert_rec_from_rom(adev, &bps[i], &err_data, nps); if (ret) - goto free; + control->ras_num_bad_pages -= adev->umc.retire_unit; } } else { ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages); } -free: if (from_rom) kfree(err_data.err_addr); mutex_unlock(&con->recovery_lock); @@ -3037,21 +3043,28 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) dev_err(adev->dev, "Failed to load EEPROM table records!"); } else { if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { - for (i = 0; i < control->ras_num_recs; i++) { - if ((control->ras_num_recs - i) >= adev->umc.retire_unit) { - if ((bps[i].address == bps[i + 1].address) && - (bps[i].mem_channel == bps[i + 1].mem_channel)) { - control->ras_num_pa_recs += adev->umc.retire_unit; - i += (adev->umc.retire_unit - 1); + /*In V3, there is no pa recs, and some cases(when address==0) may be parsed + as pa recs, so add verion check to avoid it. + */ + if (control->tbl_hdr.version < RAS_TABLE_VER_V3) { + for (i = 0; i < control->ras_num_recs; i++) { + if ((control->ras_num_recs - i) >= adev->umc.retire_unit) { + if ((bps[i].address == bps[i + 1].address) && + (bps[i].mem_channel == bps[i + 1].mem_channel)) { + control->ras_num_pa_recs += adev->umc.retire_unit; + i += (adev->umc.retire_unit - 1); + } else { + control->ras_num_mca_recs += + (control->ras_num_recs - i); + break; + } } else { - control->ras_num_mca_recs += - (control->ras_num_recs - i); + control->ras_num_mca_recs += (control->ras_num_recs - i); break; } - } else { - control->ras_num_mca_recs += (control->ras_num_recs - i); - break; } + } else { + control->ras_num_mca_recs = control->ras_num_recs; } } @@ -3460,6 +3473,10 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) control->ras_num_pa_recs = control->ras_num_recs; + if (adev->umc.ras && + adev->umc.ras->get_retire_flip_bits) + adev->umc.ras->get_retire_flip_bits(adev); + if (control->ras_num_recs) { ret = amdgpu_ras_load_bad_pages(adev); if (ret) @@ -3793,10 +3810,12 @@ init_ras_enabled_flag: adev->ras_hw_enabled & amdgpu_ras_mask; /* aca is disabled by default except for psp v13_0_6/v13_0_12/v13_0_14 */ - adev->aca.is_enabled = - (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) || - amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) || - amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14)); + if (!amdgpu_sriov_vf(adev)) { + adev->aca.is_enabled = + (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) || + amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) || + amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14)); + } /* bad page feature is not applicable to specific app platform */ if (adev->gmc.is_app_apu && @@ -4479,8 +4498,11 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) enum ras_event_type type = RAS_EVENT_TYPE_FATAL; u64 event_id; - if (amdgpu_ras_mark_ras_event(adev, type)) + if (amdgpu_ras_mark_ras_event(adev, type)) { + dev_err(adev->dev, + "uncorrectable hardware error (ERREVENT_ATHUB_INTERRUPT) detected!\n"); return; + } event_id = amdgpu_ras_acquire_event_id(adev, type); |