From fdc94d3a8c887e4e06a7ff8dcb51d55cd70e16cf Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Mon, 13 Mar 2023 14:18:34 +0800 Subject: drm/amdgpu: Rework pcie_bif ras sw_init pcie_bif ras blocks needs to be initialized as early as possible to handle fatal error detected in hw_init phase. also align the pcie_bif ras sw_init with other ras blocks Signed-off-by: Hawking Zhang Reviewed-by: Stanley Yang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 63dfcc98152d..b0d050ffc200 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2554,21 +2554,24 @@ int amdgpu_ras_init(struct amdgpu_device *adev) /* initialize nbio ras function ahead of any other * ras functions so hardware fatal error interrupt * can be enabled as early as possible */ - switch (adev->asic_type) { - case CHIP_VEGA20: - case CHIP_ARCTURUS: - case CHIP_ALDEBARAN: - if (!adev->gmc.xgmi.connected_to_cpu) { + switch (adev->ip_versions[NBIO_HWIP][0]) { + case IP_VERSION(7, 4, 0): + case IP_VERSION(7, 4, 1): + case IP_VERSION(7, 4, 4): + if (!adev->gmc.xgmi.connected_to_cpu) adev->nbio.ras = &nbio_v7_4_ras; - amdgpu_ras_register_ras_block(adev, &adev->nbio.ras->ras_block); - adev->nbio.ras_if = &adev->nbio.ras->ras_block.ras_comm; - } break; default: /* nbio ras is not available */ break; } + /* nbio ras block needs to be enabled ahead of other ras blocks + * to handle fatal error */ + r = amdgpu_nbio_ras_sw_init(adev); + if (r) + return r; + if (adev->nbio.ras && adev->nbio.ras->init_ras_controller_interrupt) { r = adev->nbio.ras->init_ras_controller_interrupt(adev); -- cgit From 370808876b5cab365f8fc6dbaf8cae13a2bc6efa Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Sat, 4 Mar 2023 20:22:23 +0800 Subject: drm/amdgpu: drop ras check at asic level for new blocks amdgpu_ras_register_ras_block should always be invoked by ras_sw_init, where driver needs to check ras caps at ip level, instead of asic level. Signed-off-by: Hawking Zhang Reviewed-by: Stanley Yang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index b0d050ffc200..11df6ee052b4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3076,9 +3076,6 @@ int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, if (!adev || !ras_block_obj) return -EINVAL; - if (!amdgpu_ras_asic_supported(adev)) - return 0; - ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL); if (!ras_node) return -ENOMEM; -- cgit From fe120b9f5ce873516a2604e4ff0c19084be94e8c Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Fri, 3 Mar 2023 10:01:12 +0800 Subject: drm/amdgpu: enable ras for mp0 v13_0_10 on SRIOV Enable ras for mp0 v13_0_10 on SRIOV. Signed-off-by: YiPeng Chai Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 11df6ee052b4..c6dc3cd2a9de 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2340,6 +2340,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) if (amdgpu_sriov_vf(adev)) { switch (adev->ip_versions[MP0_HWIP][0]) { case IP_VERSION(13, 0, 2): + case IP_VERSION(13, 0, 10): return true; default: return false; -- cgit From 9af357bc3e05400eb632f3975986e1eac196f159 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Thu, 23 Mar 2023 10:21:49 +0800 Subject: drm/amdgpu: Add fatal error handling in nbio v4_3 GPU will stop working once fatal error is detected. it will inform driver to do reset to recover from the fatal error. v2: squash in logic fix (Srinivasan) v3: squash in logic fix (Dan) Signed-off-by: Hawking Zhang Reviewed-by: Candice Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c6dc3cd2a9de..4069bce9479f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -34,6 +34,7 @@ #include "amdgpu_atomfirmware.h" #include "amdgpu_xgmi.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" +#include "nbio_v4_3.h" #include "atom.h" #include "amdgpu_reset.h" @@ -2562,6 +2563,16 @@ int amdgpu_ras_init(struct amdgpu_device *adev) if (!adev->gmc.xgmi.connected_to_cpu) adev->nbio.ras = &nbio_v7_4_ras; break; + case IP_VERSION(4, 3, 0): + if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF)) + /* unlike other generation of nbio ras, + * nbio v4_3 only support fatal error interrupt + * to inform software that DF is freezed due to + * system fatal error event. driver should not + * enable nbio ras in such case. Instead, + * check DF RAS */ + adev->nbio.ras = &nbio_v4_3_ras; + break; default: /* nbio ras is not available */ break; -- cgit From 58bc2a9cbfdd4abdbfaafd835a0cd78bdad11423 Mon Sep 17 00:00:00 2001 From: "Stanley.Yang" Date: Mon, 10 Apr 2023 19:43:16 +0800 Subject: drm/amdgpu: correct ras enabled flag XGMI RAS should be according to the gmc xgmi physical nodes number, XGMI RAS should not be enabled if xgmi num_physical_nodes is zero. Signed-off-by: Stanley.Yang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 4069bce9479f..ad5d456918f4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2430,6 +2430,13 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) else adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN | 1 << AMDGPU_RAS_BLOCK__JPEG); + + /* + * XGMI RAS is not supported if xgmi num physical nodes + * is zero + */ + if (!adev->gmc.xgmi.num_physical_nodes) + adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL); } else { dev_info(adev->dev, "SRAM ECC is not presented.\n"); } -- cgit From b805d8d785e49cb3ee9279dad1402d5dcf902166 Mon Sep 17 00:00:00 2001 From: Jane Jian Date: Fri, 14 Apr 2023 11:33:19 +0800 Subject: Revert "drm/amdgpu: enable ras for mp0 v13_0_10 on SRIOV" This reverts commit fe120b9f5ce873516a2604e4ff0c19084be94e8c. This patch impacts sriov multi-vf stability Signed-off-by: Jane Jian Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ad5d456918f4..3ab8a88789c8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2341,7 +2341,6 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) if (amdgpu_sriov_vf(adev)) { switch (adev->ip_versions[MP0_HWIP][0]) { case IP_VERSION(13, 0, 2): - case IP_VERSION(13, 0, 10): return true; default: return false; -- cgit