Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 114
1 file changed, 79 insertions(+), 35 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 443409d4f4b0..de0944947eaf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1498,6 +1498,9 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
 	    !amdgpu_ras_get_aca_debug_mode(adev))
 		return -EOPNOTSUPP;
 
+	if (amdgpu_sriov_vf(adev))
+		return -EOPNOTSUPP;
+
 	/* skip ras error reset in gpu reset */
 	if ((amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) &&
 	    ((smu_funcs && smu_funcs->set_debug_mode) ||
@@ -2161,7 +2164,7 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
 	/* Fatal error events are handled on host side */
 	if (amdgpu_sriov_vf(adev))
 		return;
-	/**
+	/*
 	 * If the current interrupt is caused by a non-fatal RAS error, skip
 	 * check for fatal error. For fatal errors, FED status of all devices
 	 * in XGMI hive gets set when the first device gets fatal error
@@ -2856,6 +2859,15 @@ static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev,
 				return -EINVAL;
 		}
 	} else {
+		if (bps[0].address == 0) {
+			/* for specific old eeprom data, mca address is not stored,
+			 * calc it from pa
+			 */
+			if (amdgpu_umc_pa2mca(adev, bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT,
+					      &(bps[0].address), AMDGPU_NPS1_PARTITION_MODE))
+				return -EINVAL;
+		}
+
 		if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) {
 			if (nps == AMDGPU_NPS1_PARTITION_MODE)
 				memcpy(err_data->err_addr, bps,
@@ -2883,9 +2895,22 @@ static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
 			      bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))
 			return -EINVAL;
 	} else {
-		if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
-			return -EINVAL;
+		if (bps->address) {
+			if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
+				return -EINVAL;
+		} else {
+			/* for specific old eeprom data, mca address is not stored,
+			 * calc it from pa
+			 */
+			if (amdgpu_umc_pa2mca(adev, bps->retired_page << AMDGPU_GPU_PAGE_SHIFT,
+					      &(bps->address), AMDGPU_NPS1_PARTITION_MODE))
+				return -EINVAL;
+
+			if (amdgpu_ras_mca2pa(adev, bps, err_data))
+				return -EOPNOTSUPP;
+		}
 	}
+
 	return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
 					      adev->umc.retire_unit);
 }
@@ -2900,7 +2925,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 		&adev->psp.ras_context.ras->eeprom_control;
 	enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
 	int ret = 0;
-	uint32_t i;
+	uint32_t i = 0;
 
 	if (!con || !con->eh_data || !bps || pages <= 0)
 		return 0;
@@ -2921,34 +2946,36 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 	mutex_lock(&con->recovery_lock);
 
 	if (from_rom) {
-		for (i = 0; i < pages; i++) {
-			if (control->ras_num_recs - i >= adev->umc.retire_unit) {
-				if ((bps[i].address == bps[i + 1].address) &&
-				    (bps[i].mem_channel == bps[i + 1].mem_channel)) {
-					//deal with retire_unit records a time
-					ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
-									&bps[i], &err_data, nps);
-					if (ret)
-						goto free;
-					i += (adev->umc.retire_unit - 1);
+		/* there is no pa recs in V3, so skip pa recs processing */
+		if (control->tbl_hdr.version < RAS_TABLE_VER_V3) {
+			for (i = 0; i < pages; i++) {
+				if (control->ras_num_recs - i >= adev->umc.retire_unit) {
+					if ((bps[i].address == bps[i + 1].address) &&
+					    (bps[i].mem_channel == bps[i + 1].mem_channel)) {
+						/* deal with retire_unit records a time */
+						ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
+										&bps[i], &err_data, nps);
+						if (ret)
+							control->ras_num_bad_pages -= adev->umc.retire_unit;
+						i += (adev->umc.retire_unit - 1);
+					} else {
+						break;
+					}
 				} else {
 					break;
 				}
-			} else {
-				break;
 			}
 		}
 		for (; i < pages; i++) {
 			ret = __amdgpu_ras_convert_rec_from_rom(adev,
 				&bps[i], &err_data, nps);
 			if (ret)
-				goto free;
+				control->ras_num_bad_pages -= adev->umc.retire_unit;
 		}
 	} else {
 		ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
 	}
 
-free:
 	if (from_rom)
 		kfree(err_data.err_addr);
 	mutex_unlock(&con->recovery_lock);
@@ -3037,21 +3064,28 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
 		dev_err(adev->dev, "Failed to load EEPROM table records!");
 	} else {
 		if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
-			for (i = 0; i < control->ras_num_recs; i++) {
-				if ((control->ras_num_recs - i) >= adev->umc.retire_unit) {
-					if ((bps[i].address == bps[i + 1].address) &&
-					    (bps[i].mem_channel == bps[i + 1].mem_channel)) {
-						control->ras_num_pa_recs += adev->umc.retire_unit;
-						i += (adev->umc.retire_unit - 1);
+			/*In V3, there is no pa recs, and some cases(when address==0) may be parsed
+			  as pa recs, so add verion check to avoid it.
+			 */
+			if (control->tbl_hdr.version < RAS_TABLE_VER_V3) {
+				for (i = 0; i < control->ras_num_recs; i++) {
+					if ((control->ras_num_recs - i) >= adev->umc.retire_unit) {
+						if ((bps[i].address == bps[i + 1].address) &&
+						    (bps[i].mem_channel == bps[i + 1].mem_channel)) {
+							control->ras_num_pa_recs += adev->umc.retire_unit;
+							i += (adev->umc.retire_unit - 1);
+						} else {
+							control->ras_num_mca_recs +=
+								(control->ras_num_recs - i);
+							break;
+						}
 					} else {
-						control->ras_num_mca_recs +=
-							(control->ras_num_recs - i);
+						control->ras_num_mca_recs += (control->ras_num_recs - i);
 						break;
 					}
-				} else {
-					control->ras_num_mca_recs += (control->ras_num_recs - i);
-					break;
 				}
+			} else {
+				control->ras_num_mca_recs = control->ras_num_recs;
 			}
 		}
 
@@ -3460,6 +3494,10 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
 	if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
 		control->ras_num_pa_recs = control->ras_num_recs;
 
+	if (adev->umc.ras &&
+	    adev->umc.ras->get_retire_flip_bits)
+		adev->umc.ras->get_retire_flip_bits(adev);
+
 	if (control->ras_num_recs) {
 		ret = amdgpu_ras_load_bad_pages(adev);
 		if (ret)
@@ -3691,7 +3729,8 @@ static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev)
 	 */
 	if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) ||
 	    amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) ||
-	    amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3))
+	    amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3) ||
+	    amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(5, 0, 1))
 		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
 					 1 << AMDGPU_RAS_BLOCK__JPEG);
 	else
@@ -3793,10 +3832,12 @@ init_ras_enabled_flag:
 		adev->ras_hw_enabled & amdgpu_ras_mask;
 
 	/* aca is disabled by default except for psp v13_0_6/v13_0_12/v13_0_14 */
-	adev->aca.is_enabled =
-		(amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) ||
-		 amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) ||
-		 amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14));
+	if (!amdgpu_sriov_vf(adev)) {
+		adev->aca.is_enabled =
+			(amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) ||
+			 amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) ||
+			 amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14));
+	}
 
 	/* bad page feature is not applicable to specific app platform */
 	if (adev->gmc.is_app_apu &&
@@ -4479,8 +4520,11 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
 	enum ras_event_type type = RAS_EVENT_TYPE_FATAL;
 	u64 event_id;
 
-	if (amdgpu_ras_mark_ras_event(adev, type))
+	if (amdgpu_ras_mark_ras_event(adev, type)) {
+		dev_err(adev->dev,
+			"uncorrectable hardware error (ERREVENT_ATHUB_INTERRUPT) detected!\n");
 		return;
+	}
 
 	event_id = amdgpu_ras_acquire_event_id(adev, type);
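For context, the record-classification rule that the new RAS_TABLE_VER_V3 gating bypasses can be summarized in a standalone sketch. This is not part of the patch: the struct and helper below are simplified stand-ins for the driver's own types, and only the pairing rule is taken from the hunks above (consecutive records sharing an address and memory channel form one physical-address retire unit; the first record that breaks the pattern makes the remainder MCA records; V3 tables carry no pa records at all).

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the driver's eeprom record type (illustration only). */
struct rec {
	uint64_t address;     /* MCA address; may be 0 in some old tables */
	uint32_t mem_channel;
};

/*
 * Mirror of the pre-V3 counting loop in amdgpu_ras_load_bad_pages():
 * a run of retire_unit consecutive matching records counts as pa
 * records, otherwise the rest are MCA records. Assumes retire_unit >= 2,
 * since the pairing test reads bps[i + 1].
 */
static void classify(const struct rec *bps, uint32_t num_recs,
		     uint32_t retire_unit, int is_v3,
		     uint32_t *pa_recs, uint32_t *mca_recs)
{
	uint32_t i;

	*pa_recs = *mca_recs = 0;
	if (is_v3) {
		/* V3: every record is an MCA record. */
		*mca_recs = num_recs;
		return;
	}
	for (i = 0; i < num_recs; i++) {
		if (num_recs - i >= retire_unit &&
		    bps[i].address == bps[i + 1].address &&
		    bps[i].mem_channel == bps[i + 1].mem_channel) {
			*pa_recs += retire_unit;
			i += retire_unit - 1;
		} else {
			*mca_recs += num_recs - i;
			break;
		}
	}
}

int main(void)
{
	/* Two pa retire units (pairs) followed by one lone MCA record. */
	struct rec bps[] = {
		{ 0x10, 0 }, { 0x10, 0 },
		{ 0x20, 1 }, { 0x20, 1 },
		{ 0x30, 2 },
	};
	uint32_t pa, mca;

	classify(bps, 5, 2, 0, &pa, &mca);
	printf("pa=%u mca=%u\n", pa, mca); /* prints pa=4 mca=1 */
	return 0;
}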