summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c114
1 files changed, 79 insertions, 35 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 443409d4f4b0..de0944947eaf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1498,6 +1498,9 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
!amdgpu_ras_get_aca_debug_mode(adev))
return -EOPNOTSUPP;
+ if (amdgpu_sriov_vf(adev))
+ return -EOPNOTSUPP;
+
/* skip ras error reset in gpu reset */
if ((amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) &&
((smu_funcs && smu_funcs->set_debug_mode) ||
@@ -2161,7 +2164,7 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
/* Fatal error events are handled on host side */
if (amdgpu_sriov_vf(adev))
return;
- /**
+ /*
* If the current interrupt is caused by a non-fatal RAS error, skip
* check for fatal error. For fatal errors, FED status of all devices
* in XGMI hive gets set when the first device gets fatal error
@@ -2856,6 +2859,15 @@ static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev,
return -EINVAL;
}
} else {
+ if (bps[0].address == 0) {
+ /* for specific old eeprom data, mca address is not stored,
+ * calc it from pa
+ */
+ if (amdgpu_umc_pa2mca(adev, bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT,
+ &(bps[0].address), AMDGPU_NPS1_PARTITION_MODE))
+ return -EINVAL;
+ }
+
if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) {
if (nps == AMDGPU_NPS1_PARTITION_MODE)
memcpy(err_data->err_addr, bps,
@@ -2883,9 +2895,22 @@ static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,
bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))
return -EINVAL;
} else {
- if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
- return -EINVAL;
+ if (bps->address) {
+ if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
+ return -EINVAL;
+ } else {
+ /* for specific old eeprom data, mca address is not stored,
+ * calc it from pa
+ */
+ if (amdgpu_umc_pa2mca(adev, bps->retired_page << AMDGPU_GPU_PAGE_SHIFT,
+ &(bps->address), AMDGPU_NPS1_PARTITION_MODE))
+ return -EINVAL;
+
+ if (amdgpu_ras_mca2pa(adev, bps, err_data))
+ return -EOPNOTSUPP;
+ }
}
+
return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
adev->umc.retire_unit);
}
@@ -2900,7 +2925,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
&adev->psp.ras_context.ras->eeprom_control;
enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
int ret = 0;
- uint32_t i;
+ uint32_t i = 0;
if (!con || !con->eh_data || !bps || pages <= 0)
return 0;
@@ -2921,34 +2946,36 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
mutex_lock(&con->recovery_lock);
if (from_rom) {
- for (i = 0; i < pages; i++) {
- if (control->ras_num_recs - i >= adev->umc.retire_unit) {
- if ((bps[i].address == bps[i + 1].address) &&
- (bps[i].mem_channel == bps[i + 1].mem_channel)) {
- //deal with retire_unit records a time
- ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
- &bps[i], &err_data, nps);
- if (ret)
- goto free;
- i += (adev->umc.retire_unit - 1);
+ /* there is no pa recs in V3, so skip pa recs processing */
+ if (control->tbl_hdr.version < RAS_TABLE_VER_V3) {
+ for (i = 0; i < pages; i++) {
+ if (control->ras_num_recs - i >= adev->umc.retire_unit) {
+ if ((bps[i].address == bps[i + 1].address) &&
+ (bps[i].mem_channel == bps[i + 1].mem_channel)) {
+ /* deal with retire_unit records a time */
+ ret = __amdgpu_ras_convert_rec_array_from_rom(adev,
+ &bps[i], &err_data, nps);
+ if (ret)
+ control->ras_num_bad_pages -= adev->umc.retire_unit;
+ i += (adev->umc.retire_unit - 1);
+ } else {
+ break;
+ }
} else {
break;
}
- } else {
- break;
}
}
for (; i < pages; i++) {
ret = __amdgpu_ras_convert_rec_from_rom(adev,
&bps[i], &err_data, nps);
if (ret)
- goto free;
+ control->ras_num_bad_pages -= adev->umc.retire_unit;
}
} else {
ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
}
-free:
if (from_rom)
kfree(err_data.err_addr);
mutex_unlock(&con->recovery_lock);
@@ -3037,21 +3064,28 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
dev_err(adev->dev, "Failed to load EEPROM table records!");
} else {
if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
- for (i = 0; i < control->ras_num_recs; i++) {
- if ((control->ras_num_recs - i) >= adev->umc.retire_unit) {
- if ((bps[i].address == bps[i + 1].address) &&
- (bps[i].mem_channel == bps[i + 1].mem_channel)) {
- control->ras_num_pa_recs += adev->umc.retire_unit;
- i += (adev->umc.retire_unit - 1);
+ /*In V3, there is no pa recs, and some cases(when address==0) may be parsed
+ as pa recs, so add verion check to avoid it.
+ */
+ if (control->tbl_hdr.version < RAS_TABLE_VER_V3) {
+ for (i = 0; i < control->ras_num_recs; i++) {
+ if ((control->ras_num_recs - i) >= adev->umc.retire_unit) {
+ if ((bps[i].address == bps[i + 1].address) &&
+ (bps[i].mem_channel == bps[i + 1].mem_channel)) {
+ control->ras_num_pa_recs += adev->umc.retire_unit;
+ i += (adev->umc.retire_unit - 1);
+ } else {
+ control->ras_num_mca_recs +=
+ (control->ras_num_recs - i);
+ break;
+ }
} else {
- control->ras_num_mca_recs +=
- (control->ras_num_recs - i);
+ control->ras_num_mca_recs += (control->ras_num_recs - i);
break;
}
- } else {
- control->ras_num_mca_recs += (control->ras_num_recs - i);
- break;
}
+ } else {
+ control->ras_num_mca_recs = control->ras_num_recs;
}
}
@@ -3460,6 +3494,10 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
control->ras_num_pa_recs = control->ras_num_recs;
+ if (adev->umc.ras &&
+ adev->umc.ras->get_retire_flip_bits)
+ adev->umc.ras->get_retire_flip_bits(adev);
+
if (control->ras_num_recs) {
ret = amdgpu_ras_load_bad_pages(adev);
if (ret)
@@ -3691,7 +3729,8 @@ static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev
*/
if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) ||
amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) ||
- amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3))
+ amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3) ||
+ amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(5, 0, 1))
adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
1 << AMDGPU_RAS_BLOCK__JPEG);
else
@@ -3793,10 +3832,12 @@ init_ras_enabled_flag:
adev->ras_hw_enabled & amdgpu_ras_mask;
/* aca is disabled by default except for psp v13_0_6/v13_0_12/v13_0_14 */
- adev->aca.is_enabled =
- (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) ||
- amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) ||
- amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14));
+ if (!amdgpu_sriov_vf(adev)) {
+ adev->aca.is_enabled =
+ (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) ||
+ amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12) ||
+ amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 14));
+ }
/* bad page feature is not applicable to specific app platform */
if (adev->gmc.is_app_apu &&
@@ -4479,8 +4520,11 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
enum ras_event_type type = RAS_EVENT_TYPE_FATAL;
u64 event_id;
- if (amdgpu_ras_mark_ras_event(adev, type))
+ if (amdgpu_ras_mark_ras_event(adev, type)) {
+ dev_err(adev->dev,
+ "uncorrectable hardware error (ERREVENT_ATHUB_INTERRUPT) detected!\n");
return;
+ }
event_id = amdgpu_ras_acquire_event_id(adev, type);