From 19d0dfda4c75a23012dd714e57b4f569df61089d Mon Sep 17 00:00:00 2001 From: "Stanley.Yang" Date: Mon, 19 Apr 2021 11:33:10 +0800 Subject: drm/amdgpu: optimize gfx ras features flag clean Signed-off-by: Stanley.Yang Reviewed-by: Feifei Xu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index b0d2fc9454ca..bb0d02755f11 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -658,11 +658,7 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, con->features |= BIT(head->block); } else { if (obj && amdgpu_ras_is_feature_enabled(adev, head)) { - /* skip clean gfx ras context feature for VEGA20 Gaming. - * will clean later - */ - if (!(!adev->ras_features && con->features & BIT(AMDGPU_RAS_BLOCK__GFX))) - con->features &= ~BIT(head->block); + con->features &= ~BIT(head->block); put_obj(obj); } } @@ -770,6 +766,10 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, con->features |= BIT(head->block); ret = amdgpu_ras_feature_enable(adev, head, 0); + + /* clean gfx block ras features flag */ + if (adev->ras_features && head->block == AMDGPU_RAS_BLOCK__GFX) + con->features &= ~BIT(head->block); } } else ret = amdgpu_ras_feature_enable(adev, head, enable); -- cgit From 502f0e28042b4ef271c70f54dc2e4feb230fdb64 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Thu, 22 Apr 2021 21:58:08 +0800 Subject: drm/amdgpu: disable gfx ras by default in aldebaran aldebaran gfx ras is still under development Signed-off-by: Hawking Zhang Reviewed-by: Dennis Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index bb0d02755f11..f62873fbf249 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2066,8 +2066,7 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, } else { /* driver only manages a few IP blocks RAS feature * when GPU is connected cpu through XGMI */ - *hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX | - 1 << AMDGPU_RAS_BLOCK__SDMA | + *hw_supported |= (1 << AMDGPU_RAS_BLOCK__SDMA | 1 << AMDGPU_RAS_BLOCK__MMHUB); } -- cgit From a30f128602001c986bb2b3ede074b6193d43905f Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Sun, 25 Apr 2021 14:34:25 +0800 Subject: drm/amdgpu: provide socket/die id info in RAS message Add socket/die information in RAS messages for platforms that support query those information Signed-off-by: Hawking Zhang Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index f62873fbf249..ae9fb2025259 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -901,17 +901,42 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, info->ce_count = obj->err_data.ce_count; if (err_data.ce_count) { - dev_info(adev->dev, "%ld correctable hardware errors " + if (adev->smuio.funcs && + adev->smuio.funcs->get_socket_id && + adev->smuio.funcs->get_die_id) { + dev_info(adev->dev, "socket: %d, die: %d " + "%ld correctable hardware errors " "detected in %s block, no user " "action is needed.\n", + adev->smuio.funcs->get_socket_id(adev), + adev->smuio.funcs->get_die_id(adev), obj->err_data.ce_count, ras_block_str(info->head.block)); + } else { + dev_info(adev->dev, "%ld correctable hardware errors " + "detected in %s block, no user " + "action is needed.\n", + obj->err_data.ce_count, + ras_block_str(info->head.block)); + } } if (err_data.ue_count) { - dev_info(adev->dev, "%ld uncorrectable hardware errors " + if (adev->smuio.funcs && + adev->smuio.funcs->get_socket_id && + adev->smuio.funcs->get_die_id) { + dev_info(adev->dev, "socket: %d, die: %d " + "%ld uncorrectable hardware errors " "detected in %s block\n", + adev->smuio.funcs->get_socket_id(adev), + adev->smuio.funcs->get_die_id(adev), obj->err_data.ue_count, ras_block_str(info->head.block)); + } else { + dev_info(adev->dev, "%ld uncorrectable hardware errors " + "detected in %s block\n", + obj->err_data.ue_count, + ras_block_str(info->head.block)); + } } return 0; -- cgit From 78871b6c8be303a8626bfc85f022b31362495c1b Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Wed, 28 Apr 2021 22:51:22 +0800 Subject: drm/amdgpu: enable ras error count query and reset for HDP add hdp block ras error query and reset support in amdgpu ras error count query and reset interface Signed-off-by: Hawking Zhang Reviewed-by: John Clements Reviewed-by: Dennis Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ae9fb2025259..984e8271e675 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -890,6 +890,11 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, adev->gmc.xgmi.ras_funcs->query_ras_error_count) adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, &err_data); break; + case AMDGPU_RAS_BLOCK__HDP: + if (adev->hdp.ras_funcs && + adev->hdp.ras_funcs->query_ras_error_count) + adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data); + break; default: break; } @@ -967,6 +972,11 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, if (adev->sdma.funcs->reset_ras_error_count) adev->sdma.funcs->reset_ras_error_count(adev); break; + case AMDGPU_RAS_BLOCK__HDP: + if (adev->hdp.ras_funcs && + adev->hdp.ras_funcs->reset_ras_error_count) + adev->hdp.ras_funcs->reset_ras_error_count(adev); + break; default: break; } -- cgit From 1f6e8eb153113c2da2027bea7793d88dc9c299c3 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Thu, 29 Apr 2021 14:28:13 +0800 Subject: drm/amdgpu: enable gfx ras in aldebran by default gfx ras now can be enabled by default in aldebaran Signed-off-by: Hawking Zhang Reviewed-by: John Clements Reviewed-by: Dennis Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 984e8271e675..9306e3925efe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2101,7 +2101,8 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, } else { /* driver only manages a few IP blocks RAS feature * when GPU is connected cpu through XGMI */ - *hw_supported |= (1 << AMDGPU_RAS_BLOCK__SDMA | + *hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX | + 1 << AMDGPU_RAS_BLOCK__SDMA | 1 << AMDGPU_RAS_BLOCK__MMHUB); } -- cgit From f50160cf0f98b5bc3074c219a46af7305a14ffff Mon Sep 17 00:00:00 2001 From: "Stanley.Yang" Date: Thu, 29 Apr 2021 09:32:12 +0800 Subject: drm/amdgpu: force enable gfx ras for vega20 ws Signed-off-by: Stanley.Yang Reviewed-by: Feifei Xu Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 9306e3925efe..ebbe2c5190c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -33,6 +33,7 @@ #include "amdgpu_atomfirmware.h" #include "amdgpu_xgmi.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" +#include "atom.h" static const char *RAS_FS_NAME = "ras"; @@ -2063,6 +2064,24 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) adev->asic_type == CHIP_SIENNA_CICHLID; } +/* + * this is workaround for vega20 workstation sku, + * force enable gfx ras, ignore vbios gfx ras flag + * due to GC EDC can not write + */ +static void amdgpu_ras_get_quirks(struct amdgpu_device *adev, + uint32_t *hw_supported) +{ + struct atom_context *ctx = adev->mode_info.atom_context; + + if (!ctx) + return; + + if (strnstr(ctx->vbios_version, "D16406", + sizeof(ctx->vbios_version))) + *hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX); +} + /* * check hardware's ras ability which will be saved in hw_supported. * if hardware does not support ras, we can skip some ras initializtion and @@ -2106,6 +2125,8 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, 1 << AMDGPU_RAS_BLOCK__MMHUB); } + amdgpu_ras_get_quirks(adev, hw_supported); + /* hw_supported needs to be aligned with RAS block mask. */ *hw_supported &= AMDGPU_RAS_BLOCK_MASK; -- cgit From acdae2169bae6993c61ded36ae0b2301c07c0a9e Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Mon, 3 May 2021 20:02:22 -0400 Subject: drm/amdgpu: Remove redundant ras->supported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove redundant ras->supported, as this value is also stored in adev->ras_features. Use adev->ras_features, as that supercedes "ras", since the latter is its member. The dependency goes like this: ras <== adev->ras_features <== hw_supported, and is read as "ras depends on ras_features, which depends on hw_supported." The arrows show the flow of information, i.e. the dependency update. "hw_supported" should also live in "adev". Cc: Alexander Deucher Cc: John Clements Cc: Hawking Zhang Signed-off-by: Luben Tuikov Acked-by: Christian König Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ebbe2c5190c4..a484ac6a8399 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2130,9 +2130,8 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, /* hw_supported needs to be aligned with RAS block mask. */ *hw_supported &= AMDGPU_RAS_BLOCK_MASK; - *supported = amdgpu_ras_enable == 0 ? - 0 : *hw_supported & amdgpu_ras_mask; - adev->ras_features = *supported; + *supported = amdgpu_ras_enable == 0 ? 0 : + *hw_supported & amdgpu_ras_mask; } int amdgpu_ras_init(struct amdgpu_device *adev) @@ -2154,7 +2153,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) amdgpu_ras_set_context(adev, con); amdgpu_ras_check_supported(adev, &con->hw_supported, - &con->supported); + &adev->ras_features); if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) { /* set gfx block ras context feature for VEGA20 Gaming * send ras disable cmd to ras ta during ras late init. @@ -2210,7 +2209,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) dev_info(adev->dev, "RAS INFO: ras initialized successfully, " "hardware ability[%x] ras_mask[%x]\n", - con->hw_supported, con->supported); + con->hw_supported, adev->ras_features); return 0; release_con: amdgpu_ras_set_context(adev, NULL); -- cgit From e509965e58ab2c96418e1a5e756e25d9a54e0765 Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Mon, 3 May 2021 20:43:00 -0400 Subject: drm/amdgpu: Move up ras_hw_supported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move ras_hw_supported into struct amdgpu_dev. The dependency is: struct amdgpu_ras <== struct amdgpu_dev <== ASIC, read as "struct amdgpu_ras depends on struct amdgpu_dev, which depends on the hardware." This can be loosely understood as, "if RAS is supported, which is property of the ASIC (struct amdgpu_dev), then we can access struct amdgpu_ras." v2: Fix a typo: must binary AND in ternary cond in amdgpu_ras.c Cc: Alexander Deucher Cc: John Clements Cc: Hawking Zhang Signed-off-by: Luben Tuikov Acked-by: Christian König Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 58 +++++++++++++++------------------ 1 file changed, 26 insertions(+), 32 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index a484ac6a8399..e7940e813f32 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -611,11 +611,9 @@ static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev, /* feature ctl begin */ static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, - struct ras_common_if *head) + struct ras_common_if *head) { - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - - return con->hw_supported & BIT(head->block); + return adev->ras_hw_supported & BIT(head->block); } static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev, @@ -2069,8 +2067,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) * force enable gfx ras, ignore vbios gfx ras flag * due to GC EDC can not write */ -static void amdgpu_ras_get_quirks(struct amdgpu_device *adev, - uint32_t *hw_supported) +static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) { struct atom_context *ctx = adev->mode_info.atom_context; @@ -2078,8 +2075,8 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device *adev, return; if (strnstr(ctx->vbios_version, "D16406", - sizeof(ctx->vbios_version))) - *hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX); + sizeof(ctx->vbios_version))) + adev->ras_hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX); } /* @@ -2091,11 +2088,9 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device *adev, * we have to initialize ras as normal. but need check if operation is * allowed or not in each function. */ -static void amdgpu_ras_check_supported(struct amdgpu_device *adev, - uint32_t *hw_supported, uint32_t *supported) +static void amdgpu_ras_check_supported(struct amdgpu_device *adev) { - *hw_supported = 0; - *supported = 0; + adev->ras_hw_supported = adev->ras_features = 0; if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw || !amdgpu_ras_asic_supported(adev)) @@ -2104,34 +2099,34 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, if (!adev->gmc.xgmi.connected_to_cpu) { if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { dev_info(adev->dev, "MEM ECC is active.\n"); - *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC | - 1 << AMDGPU_RAS_BLOCK__DF); + adev->ras_hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC | + 1 << AMDGPU_RAS_BLOCK__DF); } else { dev_info(adev->dev, "MEM ECC is not presented.\n"); } if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { dev_info(adev->dev, "SRAM ECC is active.\n"); - *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC | - 1 << AMDGPU_RAS_BLOCK__DF); + adev->ras_hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC | + 1 << AMDGPU_RAS_BLOCK__DF); } else { dev_info(adev->dev, "SRAM ECC is not presented.\n"); } } else { /* driver only manages a few IP blocks RAS feature * when GPU is connected cpu through XGMI */ - *hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX | - 1 << AMDGPU_RAS_BLOCK__SDMA | - 1 << AMDGPU_RAS_BLOCK__MMHUB); + adev->ras_hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX | + 1 << AMDGPU_RAS_BLOCK__SDMA | + 1 << AMDGPU_RAS_BLOCK__MMHUB); } - amdgpu_ras_get_quirks(adev, hw_supported); + amdgpu_ras_get_quirks(adev); /* hw_supported needs to be aligned with RAS block mask. */ - *hw_supported &= AMDGPU_RAS_BLOCK_MASK; + adev->ras_hw_supported &= AMDGPU_RAS_BLOCK_MASK; - *supported = amdgpu_ras_enable == 0 ? 0 : - *hw_supported & amdgpu_ras_mask; + adev->ras_features = amdgpu_ras_enable == 0 ? 0 : + adev->ras_hw_supported & amdgpu_ras_mask; } int amdgpu_ras_init(struct amdgpu_device *adev) @@ -2152,9 +2147,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev) amdgpu_ras_set_context(adev, con); - amdgpu_ras_check_supported(adev, &con->hw_supported, - &adev->ras_features); - if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) { + amdgpu_ras_check_supported(adev); + + if (!adev->ras_hw_supported || adev->asic_type == CHIP_VEGA10) { /* set gfx block ras context feature for VEGA20 Gaming * send ras disable cmd to ras ta during ras late init. */ @@ -2208,8 +2203,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev) } dev_info(adev->dev, "RAS INFO: ras initialized successfully, " - "hardware ability[%x] ras_mask[%x]\n", - con->hw_supported, adev->ras_features); + "hardware ability[%x] ras_mask[%x]\n", + adev->ras_hw_supported, adev->ras_features); + return 0; release_con: amdgpu_ras_set_context(adev, NULL); @@ -2415,10 +2411,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) { - uint32_t hw_supported, supported; - - amdgpu_ras_check_supported(adev, &hw_supported, &supported); - if (!hw_supported) + amdgpu_ras_check_supported(adev); + if (!adev->ras_hw_supported) return; if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { -- cgit From 8ab0d6f030bab4d8c7310e9894f6fdb59817d241 Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Tue, 4 May 2021 02:25:29 -0400 Subject: drm/amdgpu: Rename to ras_*_enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename, ras_hw_supported --> ras_hw_enabled, and ras_features --> ras_enabled, to show that ras_enabled is a subset of ras_hw_enabled, which itself is a subset of the ASIC capability. Cc: Alexander Deucher Cc: John Clements Cc: Hawking Zhang Signed-off-by: Luben Tuikov Acked-by: Christian König Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 50 ++++++++++++++++----------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index e7940e813f32..444f5325f092 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -532,7 +532,7 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj; - if (!adev->ras_features || !con) + if (!adev->ras_enabled || !con) return NULL; if (head->block >= AMDGPU_RAS_BLOCK_COUNT) @@ -559,7 +559,7 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, struct ras_manager *obj; int i; - if (!adev->ras_features || !con) + if (!adev->ras_enabled || !con) return NULL; if (head) { @@ -613,7 +613,7 @@ static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev, static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, struct ras_common_if *head) { - return adev->ras_hw_supported & BIT(head->block); + return adev->ras_hw_enabled & BIT(head->block); } static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev, @@ -767,7 +767,7 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, ret = amdgpu_ras_feature_enable(adev, head, 0); /* clean gfx block ras features flag */ - if (adev->ras_features && head->block == AMDGPU_RAS_BLOCK__GFX) + if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX) con->features &= ~BIT(head->block); } } else @@ -1072,7 +1072,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev, struct ras_manager *obj; struct ras_err_data data = {0, 0}; - if (!adev->ras_features || !con) + if (!adev->ras_enabled || !con) return 0; list_for_each_entry(obj, &con->head, node) { @@ -1595,7 +1595,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj; - if (!adev->ras_features || !con) + if (!adev->ras_enabled || !con) return; list_for_each_entry(obj, &con->head, node) { @@ -1645,7 +1645,7 @@ static void amdgpu_ras_query_err_status(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj; - if (!adev->ras_features || !con) + if (!adev->ras_enabled || !con) return; list_for_each_entry(obj, &con->head, node) { @@ -1959,7 +1959,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) bool exc_err_limit = false; int ret; - if (adev->ras_features && con) + if (adev->ras_enabled && con) data = &con->eh_data; else return 0; @@ -2076,7 +2076,7 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) if (strnstr(ctx->vbios_version, "D16406", sizeof(ctx->vbios_version))) - adev->ras_hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX); + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); } /* @@ -2090,7 +2090,7 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) */ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) { - adev->ras_hw_supported = adev->ras_features = 0; + adev->ras_hw_enabled = adev->ras_enabled = 0; if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw || !amdgpu_ras_asic_supported(adev)) @@ -2099,7 +2099,7 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) if (!adev->gmc.xgmi.connected_to_cpu) { if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { dev_info(adev->dev, "MEM ECC is active.\n"); - adev->ras_hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC | + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | 1 << AMDGPU_RAS_BLOCK__DF); } else { dev_info(adev->dev, "MEM ECC is not presented.\n"); @@ -2107,7 +2107,7 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { dev_info(adev->dev, "SRAM ECC is active.\n"); - adev->ras_hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC | + adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | 1 << AMDGPU_RAS_BLOCK__DF); } else { dev_info(adev->dev, "SRAM ECC is not presented.\n"); @@ -2115,7 +2115,7 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) } else { /* driver only manages a few IP blocks RAS feature * when GPU is connected cpu through XGMI */ - adev->ras_hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX | + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX | 1 << AMDGPU_RAS_BLOCK__SDMA | 1 << AMDGPU_RAS_BLOCK__MMHUB); } @@ -2123,10 +2123,10 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) amdgpu_ras_get_quirks(adev); /* hw_supported needs to be aligned with RAS block mask. */ - adev->ras_hw_supported &= AMDGPU_RAS_BLOCK_MASK; + adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK; - adev->ras_features = amdgpu_ras_enable == 0 ? 0 : - adev->ras_hw_supported & amdgpu_ras_mask; + adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : + adev->ras_hw_enabled & amdgpu_ras_mask; } int amdgpu_ras_init(struct amdgpu_device *adev) @@ -2149,11 +2149,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev) amdgpu_ras_check_supported(adev); - if (!adev->ras_hw_supported || adev->asic_type == CHIP_VEGA10) { + if (!adev->ras_hw_enabled || adev->asic_type == CHIP_VEGA10) { /* set gfx block ras context feature for VEGA20 Gaming * send ras disable cmd to ras ta during ras late init. */ - if (!adev->ras_features && adev->asic_type == CHIP_VEGA20) { + if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) { con->features |= BIT(AMDGPU_RAS_BLOCK__GFX); return 0; @@ -2204,7 +2204,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) dev_info(adev->dev, "RAS INFO: ras initialized successfully, " "hardware ability[%x] ras_mask[%x]\n", - adev->ras_hw_supported, adev->ras_features); + adev->ras_hw_enabled, adev->ras_enabled); return 0; release_con: @@ -2319,7 +2319,7 @@ void amdgpu_ras_resume(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj, *tmp; - if (!adev->ras_features || !con) { + if (!adev->ras_enabled || !con) { /* clean ras context for VEGA20 Gaming after send ras disable cmd */ amdgpu_release_ras_context(adev); @@ -2365,7 +2365,7 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - if (!adev->ras_features || !con) + if (!adev->ras_enabled || !con) return; amdgpu_ras_disable_all_features(adev, 0); @@ -2379,7 +2379,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - if (!adev->ras_features || !con) + if (!adev->ras_enabled || !con) return 0; /* Need disable ras on all IPs here before ip [hw/sw]fini */ @@ -2392,7 +2392,7 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - if (!adev->ras_features || !con) + if (!adev->ras_enabled || !con) return 0; amdgpu_ras_fs_fini(adev); @@ -2412,7 +2412,7 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) { amdgpu_ras_check_supported(adev); - if (!adev->ras_hw_supported) + if (!adev->ras_hw_enabled) return; if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { @@ -2441,7 +2441,7 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev) if (!con) return; - if (!adev->ras_features && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) { + if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) { con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX); amdgpu_ras_set_context(adev, NULL); kfree(con); -- cgit From ef0d7d2001c99eedb5a8ba71e9f7a055bfb029ea Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Tue, 4 May 2021 02:32:20 -0400 Subject: drm/amdgpu: Export ras_*_enabled to debugfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export the runtime-set "ras_hw_enabled" and "ras_enabled" to debugfs, for debugging. Cc: Alexander Deucher Cc: John Clements Cc: Hawking Zhang Signed-off-by: Luben Tuikov Acked-by: Christian König Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 444f5325f092..609530c4a599 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1299,8 +1299,8 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - struct dentry *dir; - struct drm_minor *minor = adev_to_drm(adev)->primary; + struct drm_minor *minor = adev_to_drm(adev)->primary; + struct dentry *dir; dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root); debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev, @@ -1309,6 +1309,8 @@ static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device * &amdgpu_ras_debugfs_eeprom_ops); debugfs_create_u32("bad_page_cnt_threshold", 0444, dir, &con->bad_page_cnt_threshold); + debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled); + debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled); /* * After one uncorrectable error happens, usually GPU recovery will -- cgit From 7ddd9770857e26e6c284475260afce2fe71b4cb5 Mon Sep 17 00:00:00 2001 From: Oak Zeng Date: Thu, 6 May 2021 11:24:17 -0500 Subject: drm/amdgpu: Quit RAS initialization earlier if RAS is disabled If RAS is disabled through amdgpu_ras_enable kernel parameter, we should quit the RAS initialization eariler to avoid initialization of some RAS data structure such as sysfs etc. Reviewed-by: Hawking Zhang Signed-off-by: Oak Zeng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 609530c4a599..a94be181d066 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2151,7 +2151,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) amdgpu_ras_check_supported(adev); - if (!adev->ras_hw_enabled || adev->asic_type == CHIP_VEGA10) { + if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) { /* set gfx block ras context feature for VEGA20 Gaming * send ras disable cmd to ras ta during ras late init. */ -- cgit From 011907fda3605177f4fc05bf08fcf0fceaabda49 Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Mon, 10 May 2021 11:04:59 +0800 Subject: drm/amdgpu: covert ras status to kernel errno The original codes use ras status and kernl errno together in the same function, which is a wrong code style. Signed-off-by: Dennis Li Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 ++++++--------------------------- 1 file changed, 7 insertions(+), 36 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index a94be181d066..4eebb97994d6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -586,29 +586,6 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, } /* obj end */ -static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev, - const char* invoke_type, - const char* block_name, - enum ta_ras_status ret) -{ - switch (ret) { - case TA_RAS_STATUS__SUCCESS: - return; - case TA_RAS_STATUS__ERROR_RAS_NOT_AVAILABLE: - dev_warn(adev->dev, - "RAS WARN: %s %s currently unavailable\n", - invoke_type, - block_name); - break; - default: - dev_err(adev->dev, - "RAS ERROR: %s %s error failed ret 0x%X\n", - invoke_type, - block_name, - ret); - } -} - /* feature ctl begin */ static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, struct ras_common_if *head) @@ -703,15 +680,10 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, if (!amdgpu_ras_intr_triggered()) { ret = psp_ras_enable_features(&adev->psp, info, enable); if (ret) { - amdgpu_ras_parse_status_code(adev, - enable ? "enable":"disable", - ras_block_str(head->block), - (enum ta_ras_status)ret); - if (ret == TA_RAS_STATUS__RESET_NEEDED) - ret = -EAGAIN; - else - ret = -EINVAL; - + dev_err(adev->dev, "ras %s %s failed %d\n", + enable ? "enable":"disable", + ras_block_str(head->block), + ret); goto out; } } @@ -1056,10 +1028,9 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, ret = -EINVAL; } - amdgpu_ras_parse_status_code(adev, - "inject", - ras_block_str(info->head.block), - (enum ta_ras_status)ret); + if (ret) + dev_err(adev->dev, "ras inject %s failed %d\n", + ras_block_str(info->head.block), ret); return ret; } -- cgit From 7780f50358ee386de3b42ec236755fa72d97bb36 Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Mon, 10 May 2021 19:08:11 +0800 Subject: drm/amdgpu: add function to clear MMEA error status for aldebaran For aldebaran, hardware will not clear error status automatically when reading error status register, insteadly driver should set clear bit of the error status register explicitly to clear error status. Signed-off-by: Dennis Li Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 4eebb97994d6..a324dc2da101 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -938,6 +938,10 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, if (adev->mmhub.ras_funcs && adev->mmhub.ras_funcs->reset_ras_error_count) adev->mmhub.ras_funcs->reset_ras_error_count(adev); + + if (adev->mmhub.ras_funcs && + adev->mmhub.ras_funcs->reset_ras_error_status) + adev->mmhub.ras_funcs->reset_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__SDMA: if (adev->sdma.funcs->reset_ras_error_count) -- cgit From c666bbf0e9b56de479a8453fa5713d694db494ac Mon Sep 17 00:00:00 2001 From: Dwaipayan Ray Date: Sun, 9 May 2021 20:19:23 +0530 Subject: drm/amd/amdgpu: Fix errors in function documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a couple of syntax errors and removed one excess parameter in the function documentations which lead to kernel docs build warning. Reviewed-by: Christian König Signed-off-by: Dwaipayan Ray Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index a324dc2da101..b1c57a5b6e89 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -321,11 +321,14 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, * "disable" requires only the block. * "enable" requires the block and error type. * "inject" requires the block, error type, address, and value. + * * The block is one of: umc, sdma, gfx, etc. * see ras_block_string[] for details + * * The error type is one of: ue, ce, where, * ue is multi-uncorrectable * ce is single-correctable + * * The sub-block is a the sub-block index, pass 0 if there is no sub-block. * The address and value are hexadecimal numbers, leading 0x is optional. * -- cgit