From 16b85a0942c0b0f1611bcaa42cc98f020e34b1cf Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Wed, 22 Jan 2025 19:34:33 +0800 Subject: drm/amdgpu: Update usage for bad page threshold The driver's behavior varies based on the configuration of amdgpu_bad_page_threshold setting Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 40 ++++++++++++++++----------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index f0924aa3f4e4..90394f89aba6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3080,31 +3080,29 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, struct amdgpu_ras *con = amdgpu_ras_get_context(adev); /* - * Justification of value bad_page_cnt_threshold in ras structure - * - * Generally, 0 <= amdgpu_bad_page_threshold <= max record length - * in eeprom or amdgpu_bad_page_threshold == -2, introduce two - * scenarios accordingly. - * - * Bad page retirement enablement: - * - If amdgpu_bad_page_threshold = -2, - * bad_page_cnt_threshold = typical value by formula. - * - * - When the value from user is 0 < amdgpu_bad_page_threshold < - * max record length in eeprom, use it directly. - * - * Bad page retirement disablement: - * - If amdgpu_bad_page_threshold = 0, bad page retirement - * functionality is disabled, and bad_page_cnt_threshold will - * take no effect. + * amdgpu_bad_page_threshold is used to config + * the threshold for the number of bad pages. + * -1: Threshold is set to default value + * Driver will issue a warning message when threshold is reached + * and continue runtime services. + * 0: Disable bad page retirement + * Driver will not retire bad pages + * which is intended for debugging purpose. + * -2: Threshold is determined by a formula + * that assumes 1 bad page per 100M of local memory. + * Driver will continue runtime services when threhold is reached. + * 0 < threshold < max number of bad page records in EEPROM, + * A user-defined threshold is set + * Driver will halt runtime services when this custom threshold is reached. */ - - if (amdgpu_bad_page_threshold < 0) { + if (amdgpu_bad_page_threshold == -2) { u64 val = adev->gmc.mc_vram_size; do_div(val, RAS_BAD_PAGE_COVER); con->bad_page_cnt_threshold = min(lower_32_bits(val), max_count); + } else if (amdgpu_bad_page_threshold == -1) { + con->bad_page_cnt_threshold = ((con->reserved_pages_in_bytes) >> 21) << 4; } else { con->bad_page_cnt_threshold = min_t(int, max_count, amdgpu_bad_page_threshold); @@ -3848,8 +3846,10 @@ static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev) case IP_VERSION(13, 0, 2): case IP_VERSION(13, 0, 6): case IP_VERSION(13, 0, 12): + con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT; + break; case IP_VERSION(13, 0, 14): - con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE; + con->reserved_pages_in_bytes = (AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT << 1); break; default: break; -- cgit From 04893397766a2b2f1bc7fe5c6414e4c0846ed171 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Mon, 20 Jan 2025 22:00:22 -0500 Subject: drm/amdgpu: Skip err_count sysfs creation on VF unsupported RAS blocks VFs are not able to query error counts for all RAS blocks. Rather than returning error for queries on these blocks, skip sysfs the creation all together. Signed-off-by: Victor Skvortsov Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 90394f89aba6..44d13a60588d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1864,6 +1864,9 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, if (!obj || obj->attr_inuse) return -EINVAL; + if (amdgpu_sriov_vf(adev) && !amdgpu_virt_ras_telemetry_block_en(adev, head->block)) + return 0; + get_obj(obj); snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name), -- cgit From 59af05d6a391575c5fe48e8ae693ff9cc5c682c9 Mon Sep 17 00:00:00 2001 From: Candice Li Date: Tue, 11 Feb 2025 16:39:52 +0800 Subject: drm/amdgpu: Enable ACA by default for psp v13_0_12 Enable ACA by default for psp v13_0_12. Signed-off-by: Candice Li Reviewed-by: Hawking Zhang Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 44d13a60588d..3c3312bbfee8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3760,8 +3760,9 @@ init_ras_enabled_flag: adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : adev->ras_hw_enabled & amdgpu_ras_mask; - /* aca is disabled by default */ - adev->aca.is_enabled = false; + /* aca is disabled by default except for psp v13_0_12 */ + adev->aca.is_enabled = + (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12)); /* bad page feature is not applicable to specific app platform */ if (adev->gmc.is_app_apu && -- cgit