From 201761b5eb57c3fad810cde555795c3b5721a031 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Thu, 9 Nov 2023 10:50:38 +0530 Subject: drm/amdgpu: Move mca debug mode decision to ras Refactor code such that ras block decides the default mca debug mode, and not swsmu block. By default mca debug mode is set to false. v2: squash in uninitialized value fix (Alex) Signed-off-by: Lijo Lazar Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index cf33eb219e25..54f2f346579e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -377,7 +377,7 @@ static int amdgpu_mca_smu_debug_mode_set(void *data, u64 val) struct amdgpu_device *adev = (struct amdgpu_device *)data; int ret; - ret = amdgpu_mca_smu_set_debug_mode(adev, val ? true : false); + ret = amdgpu_ras_set_mca_debug_mode(adev, val ? true : false); if (ret) return ret; -- cgit From 00f9d49bce844e8196e0c2ea298f9a41a11129d9 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Thu, 30 Nov 2023 12:58:14 +0800 Subject: drm/amdgpu: Fix missing mca debugfs node Use amdgpu_ip_version() helper function to check ip version. The ip version contains other information, use the helper function to avoid reading wrong value. Reviewed-by: Lijo Lazar Signed-off-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index 54f2f346579e..210aea590a52 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -485,7 +485,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(mca_debug_mode_fops, NULL, amdgpu_mca_smu_debug_mode_se void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root) { #if defined(CONFIG_DEBUG_FS) - if (!root || adev->ip_versions[MP1_HWIP][0] != IP_VERSION(13, 0, 6)) + if (!root || amdgpu_ip_version(adev, MP1_HWIP, 0) != IP_VERSION(13, 0, 6)) return; debugfs_create_file("mca_debug_mode", 0200, root, adev, &mca_debug_mode_fops); -- cgit From 9f91e983ee82d3b6f6d713e1c84ebb8d53180b3d Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Tue, 12 Dec 2023 17:26:58 +0800 Subject: drm/amdgpu: MCA supports recording umc address information MCA supports recording umc address information. V2: Move err_addr variable from struct ras_err_node to struct ras_err_info. Signed-off-by: YiPeng Chai Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index 210aea590a52..8911310f98df 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -218,6 +218,7 @@ static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, st int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data) { struct amdgpu_smuio_mcm_config_info mcm_info; + struct ras_err_addr err_addr = {0}; struct mca_bank_set mca_set; struct mca_bank_node *node; struct mca_bank_entry *entry; @@ -246,10 +247,18 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo mcm_info.socket_id = entry->info.socket_id; mcm_info.die_id = entry->info.aid; + if (blk == AMDGPU_RAS_BLOCK__UMC) { + err_addr.err_status = entry->regs[MCA_REG_IDX_STATUS]; + err_addr.err_ipid = entry->regs[MCA_REG_IDX_IPID]; + err_addr.err_addr = entry->regs[MCA_REG_IDX_ADDR]; + } + if (type == AMDGPU_MCA_ERROR_TYPE_UE) - amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, (uint64_t)count); + amdgpu_ras_error_statistic_ue_count(err_data, + &mcm_info, &err_addr, (uint64_t)count); else - amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, (uint64_t)count); + amdgpu_ras_error_statistic_ce_count(err_data, + &mcm_info, &err_addr, (uint64_t)count); } out_mca_release: -- cgit From 4f32504a2f85a7b40fe149436881381f48e9c0c0 Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Wed, 3 Jan 2024 22:05:16 +0530 Subject: drm/amdgpu: Fix variable 'mca_funcs' dereferenced before NULL check in 'amdgpu_mca_smu_get_mca_entry()' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the below: drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c:377 amdgpu_mca_smu_get_mca_entry() warn: variable dereferenced before check 'mca_funcs' (see line 368) 357 int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, 358 int idx, struct mca_bank_entry *entry) 359 { 360 const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; 361 int count; 362 363 switch (type) { 364 case AMDGPU_MCA_ERROR_TYPE_UE: 365 count = mca_funcs->max_ue_count; mca_funcs is dereferenced here. 366 break; 367 case AMDGPU_MCA_ERROR_TYPE_CE: 368 count = mca_funcs->max_ce_count; mca_funcs is dereferenced here. 369 break; 370 default: 371 return -EINVAL; 372 } 373 374 if (idx >= count) 375 return -EINVAL; 376 377 if (mca_funcs && mca_funcs->mca_get_mca_entry) ^^^^^^^^^ Checked too late! Cc: Yang Wang Cc: Hawking Zhang Cc: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index 8911310f98df..59fafb8392e0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -360,6 +360,9 @@ int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_err const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; int count; + if (!mca_funcs || !mca_funcs->mca_get_mca_entry) + return -EOPNOTSUPP; + switch (type) { case AMDGPU_MCA_ERROR_TYPE_UE: count = mca_funcs->max_ue_count; @@ -374,10 +377,7 @@ int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_err if (idx >= count) return -EINVAL; - if (mca_funcs && mca_funcs->mca_get_mca_entry) - return mca_funcs->mca_get_mca_entry(adev, type, idx, entry); - - return -EOPNOTSUPP; + return mca_funcs->mca_get_mca_entry(adev, type, idx, entry); } #if defined(CONFIG_DEBUG_FS) -- cgit