From 88293c03c87e4db28890dd4e4ccb3640eadb4a08 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Wed, 10 Feb 2021 14:10:12 +0100 Subject: drm/amdgpu: do not keep debugfs dentry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleanup unnecessary debugfs dentries and surrounding functions. v3: remove return value check for debugfs_create_file() v2: remove ttm_debugfs_entries array. do not init variables. Signed-off-by: Nirmoy Das Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 73 ++++++++++++++------------------- 1 file changed, 30 insertions(+), 43 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 1fb2a91ad30a..b504914519ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1137,16 +1137,17 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) * */ /* debugfs begin */ -static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) +struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct dentry *dir; struct drm_minor *minor = adev_to_drm(adev)->primary; - con->dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root); - debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir, - adev, &amdgpu_ras_debugfs_ctrl_ops); - debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir, - adev, &amdgpu_ras_debugfs_eeprom_ops); + dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root); + debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev, + &amdgpu_ras_debugfs_ctrl_ops); + debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev, + &amdgpu_ras_debugfs_eeprom_ops); /* * After one uncorrectable error happens, usually GPU recovery will @@ -1156,24 +1157,24 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) * ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery routine * will never be called. */ - debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, con->dir, - &con->reboot); + debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot); /* * User could set this not to clean up hardware's error count register * of RAS IPs during ras recovery. */ - debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, - con->dir, &con->disable_ras_err_cnt_harvest); + debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir, + &con->disable_ras_err_cnt_harvest); + return dir; } static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, - struct ras_fs_if *head) + struct ras_fs_if *head, + struct dentry *dir) { - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); - if (!obj || obj->ent) + if (!obj || !dir) return; get_obj(obj); @@ -1182,14 +1183,14 @@ static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, head->debugfs_name, sizeof(obj->fs_data.debugfs_name)); - obj->ent = debugfs_create_file(obj->fs_data.debugfs_name, - S_IWUGO | S_IRUGO, con->dir, obj, - &amdgpu_ras_debugfs_ops); + debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir, + obj, &amdgpu_ras_debugfs_ops); } void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct dentry *dir; struct ras_manager *obj; struct ras_fs_if fs_info; @@ -1200,7 +1201,7 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con) return; - amdgpu_ras_debugfs_create_ctrl_node(adev); + dir = amdgpu_ras_debugfs_create_ctrl_node(adev); list_for_each_entry(obj, &con->head, node) { if (amdgpu_ras_is_supported(adev, obj->head.block) && @@ -1208,34 +1209,11 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) sprintf(fs_info.debugfs_name, "%s_err_inject", ras_block_str(obj->head.block)); fs_info.head = obj->head; - amdgpu_ras_debugfs_create(adev, &fs_info); + amdgpu_ras_debugfs_create(adev, &fs_info, dir); } } } -static void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev, - struct ras_common_if *head) -{ - struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); - - if (!obj || !obj->ent) - return; - - obj->ent = NULL; - put_obj(obj); -} - -static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev) -{ - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - struct ras_manager *obj, *tmp; - - list_for_each_entry_safe(obj, tmp, &con->head, node) { - amdgpu_ras_debugfs_remove(adev, &obj->head); - } - - con->dir = NULL; -} /* debugfs end */ /* ras fs */ @@ -1282,8 +1260,17 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev) static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) { - if (IS_ENABLED(CONFIG_DEBUG_FS)) - amdgpu_ras_debugfs_remove_all(adev); + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_manager *con_obj, *ip_obj, *tmp; + + if (IS_ENABLED(CONFIG_DEBUG_FS)) { + list_for_each_entry_safe(con_obj, tmp, &con->head, node) { + ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head); + if (ip_obj) + put_obj(ip_obj); + } + } + amdgpu_ras_sysfs_remove_all(adev); return 0; } -- cgit From ea1b8c9b837c18e2322d1b91ac3c1af8a4f7a455 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Tue, 16 Feb 2021 15:33:42 +0100 Subject: drm/amdgpu: mark local function as static Mark amdgpu_ras_debugfs_create_ctrl_node() as static. Fixes: eb14235668777b ("drm/amdgpu: do not keep debugfs dentry") Reported-by: kernel test robot Reviewed-by: Alex Deucher Signed-off-by: Nirmoy Das Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index b504914519ce..93699ea4860c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1137,7 +1137,7 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) * */ /* debugfs begin */ -struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) +static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct dentry *dir; -- cgit From f89b881c81d9a6481fc17b46b351ca38f5dd6f3a Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Mon, 22 Feb 2021 18:22:57 +0800 Subject: drm/amdgpu: reserve backup pages for bad page retirment To ensure user has a constant of VRAM accessible in run-time, driver reserves limit backup pages when init, and return ones when bad pages retired, to keep no change of unused memory size. v2: refine codes to calculate badpags threshold Reviewed-by: Hawking Zhang Signed-off-by: Dennis Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 93699ea4860c..09546dec40ff 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1747,13 +1747,14 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, return ret; } -static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, - uint32_t max_length) +static uint32_t +amdgpu_ras_calculate_badpags_threshold(struct amdgpu_device *adev) { - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); int tmp_threshold = amdgpu_bad_page_threshold; u64 val; + uint32_t max_length = 0; + max_length = amdgpu_ras_eeprom_get_record_max_length(); /* * Justification of value bad_page_cnt_threshold in ras structure * @@ -1779,20 +1780,18 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, tmp_threshold = max_length; if (tmp_threshold == -1) { - val = adev->gmc.mc_vram_size; + val = adev->gmc.real_vram_size; do_div(val, RAS_BAD_PAGE_RATE); - con->bad_page_cnt_threshold = min(lower_32_bits(val), - max_length); - } else { - con->bad_page_cnt_threshold = tmp_threshold; + tmp_threshold = min(lower_32_bits(val), max_length); } + + return tmp_threshold; } int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; - uint32_t max_eeprom_records_len = 0; bool exc_err_limit = false; int ret; @@ -1812,8 +1811,16 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) atomic_set(&con->in_recovery, 0); con->adev = adev; - max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); - amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); + if (!con->bad_page_cnt_threshold) { + con->bad_page_cnt_threshold = + amdgpu_ras_calculate_badpags_threshold(adev); + + ret = amdgpu_vram_mgr_reserve_backup_pages( + ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM), + con->bad_page_cnt_threshold); + if (ret) + goto out; + } ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit); /* -- cgit From 11003c68b158c07b95e4ba3630a86aff9c442ee7 Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Fri, 26 Feb 2021 09:17:10 +0800 Subject: drm/amdgpu: remove unnecessary reading for epprom header If the number of badpage records exceed the threshold, driver has updated both epprom header and control->tbl_hdr.header before gpu reset, therefore GPU recovery thread no need to read epprom header directly. v2: merge amdgpu_ras_check_err_threshold into amdgpu_ras_eeprom_check_err_threshold Signed-off-by: Dennis Li Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 09546dec40ff..c669435ccc74 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2189,19 +2189,3 @@ bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev) return false; } - -bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev) -{ - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - bool exc_err_limit = false; - - if (con && (amdgpu_bad_page_threshold != 0)) - amdgpu_ras_eeprom_check_err_threshold(&con->eeprom_control, - &exc_err_limit); - - /* - * We are only interested in variable exc_err_limit, - * as it says if GPU is in bad state or not. - */ - return exc_err_limit; -} -- cgit From 88f8575bca5fc70ba8608cfc49811f9b4d1eb6f9 Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Fri, 5 Mar 2021 16:30:54 -0500 Subject: drm/amdgpu: enable watchdog feature for SQ of aldebaran SQ's watchdog timer monitors forward progress, a mask of which waves caused the watchdog timeout is recorded into ras status registers and then trigger a system fatal error event. v2: 1. change *query_timeout_status to *query_sq_timeout_status. 2. move query_sq_timeout_status into amdgpu_ras_do_recovery. 3. add module parameters to enable/disable fatal error event and modify the watchdog timer. v3: 1. remove unused parameters of *enable_watchdog_timer Signed-off-by: Dennis Li Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c669435ccc74..c1516d871881 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1467,6 +1467,9 @@ static void amdgpu_ras_error_status_query(struct amdgpu_device *adev, case AMDGPU_RAS_BLOCK__GFX: if (adev->gfx.funcs->query_ras_error_status) adev->gfx.funcs->query_ras_error_status(adev); + + if (adev->gfx.funcs->query_sq_timeout_status) + adev->gfx.funcs->query_sq_timeout_status(adev); break; case AMDGPU_RAS_BLOCK__MMHUB: if (adev->mmhub.funcs->query_ras_error_status) -- cgit From 761d86d37f86ebba77e59fa59ccef4dc2f38674f Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Thu, 4 Feb 2021 13:32:05 +0800 Subject: drm/amdgpu: harvest edc status when connected to host via xGMI When connected to a host via xGMI, system fatal errors may trigger warm reset, driver has no change to query edc status before reset. Therefore in this case, driver should harvest previous error loging registers during boot, instead of only resetting them. v2: 1. IP's ras_manager object is created when its ras feature is enabled, so change to query edc status after amdgpu_ras_late_init called 2. change to enable watchdog timer after finishing gfx edc init Signed-off-by: Dennis Li Reivewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 50 +++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 9 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c1516d871881..ed83a32f6f30 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -109,7 +109,7 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, ssize_t s; char val[128]; - if (amdgpu_ras_error_query(obj->adev, &info)) + if (amdgpu_ras_query_error_status(obj->adev, &info)) return -EINVAL; s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n", @@ -434,7 +434,7 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, return snprintf(buf, PAGE_SIZE, "Query currently inaccessible\n"); - if (amdgpu_ras_error_query(obj->adev, &info)) + if (amdgpu_ras_query_error_status(obj->adev, &info)) return -EINVAL; return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n", @@ -757,8 +757,8 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, /* feature ctl end */ /* query/inject/cure begin */ -int amdgpu_ras_error_query(struct amdgpu_device *adev, - struct ras_query_if *info) +int amdgpu_ras_query_error_status(struct amdgpu_device *adev, + struct ras_query_if *info) { struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); struct ras_err_data err_data = {0, 0, 0, NULL}; @@ -787,10 +787,16 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, case AMDGPU_RAS_BLOCK__GFX: if (adev->gfx.funcs->query_ras_error_count) adev->gfx.funcs->query_ras_error_count(adev, &err_data); + + if (adev->gfx.funcs->query_ras_error_status) + adev->gfx.funcs->query_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__MMHUB: if (adev->mmhub.funcs->query_ras_error_count) adev->mmhub.funcs->query_ras_error_count(adev, &err_data); + + if (adev->mmhub.funcs->query_ras_error_status) + adev->mmhub.funcs->query_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__PCIE_BIF: if (adev->nbio.funcs->query_ras_error_count) @@ -826,6 +832,35 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, return 0; } +int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, + enum amdgpu_ras_block block) +{ + if (!amdgpu_ras_is_supported(adev, block)) + return -EINVAL; + + switch (block) { + case AMDGPU_RAS_BLOCK__GFX: + if (adev->gfx.funcs->reset_ras_error_count) + adev->gfx.funcs->reset_ras_error_count(adev); + + if (adev->gfx.funcs->reset_ras_error_status) + adev->gfx.funcs->reset_ras_error_status(adev); + break; + case AMDGPU_RAS_BLOCK__MMHUB: + if (adev->mmhub.funcs->reset_ras_error_count) + adev->mmhub.funcs->reset_ras_error_count(adev); + break; + case AMDGPU_RAS_BLOCK__SDMA: + if (adev->sdma.funcs->reset_ras_error_count) + adev->sdma.funcs->reset_ras_error_count(adev); + break; + default: + break; + } + + return 0; +} + /* Trigger XGMI/WAFL error */ static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, struct ta_ras_trigger_error_input *block_info) @@ -921,7 +956,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev, .head = obj->head, }; - if (amdgpu_ras_error_query(adev, &info)) + if (amdgpu_ras_query_error_status(adev, &info)) return 0; data.ce_count += info.ce_count; @@ -1451,7 +1486,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF) continue; - amdgpu_ras_error_query(adev, &info); + amdgpu_ras_query_error_status(adev, &info); } } @@ -1467,9 +1502,6 @@ static void amdgpu_ras_error_status_query(struct amdgpu_device *adev, case AMDGPU_RAS_BLOCK__GFX: if (adev->gfx.funcs->query_ras_error_status) adev->gfx.funcs->query_ras_error_status(adev); - - if (adev->gfx.funcs->query_sq_timeout_status) - adev->gfx.funcs->query_sq_timeout_status(adev); break; case AMDGPU_RAS_BLOCK__MMHUB: if (adev->mmhub.funcs->query_ras_error_status) -- cgit From e5086659d0fdfa6729758fefcc6522f5f44cbe1b Mon Sep 17 00:00:00 2001 From: shaoyunl Date: Tue, 9 Mar 2021 20:02:42 -0500 Subject: drm/amdgpu: skip read eeprom for device that pending on XGMI reset Read eeprom through SMU doesn't works stable on XGMI reset during test. skip it for now Signed-off-by: shaoyunl Reviewed-by: Feifei Xu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ed83a32f6f30..ea363336bc5e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1857,6 +1857,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) goto out; } + /* Todo: During test the SMU might fail to read the eeprom through I2C + * when the GPU is pending on XGMI reset during probe time + * (Mostly after second bus reset), skip it now + */ + if (adev->gmc.xgmi.pending_reset) + return 0; ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit); /* * This calling fails when exc_err_limit is true or -- cgit From b69d5c7e95023d370056d95e4bcddecaf4b78eda Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Tue, 9 Mar 2021 19:36:19 +0800 Subject: drm/amdgpu: support query ecc cap for SIENNA_CICHLID driver needs to query umc_info_v3_3 for ecc capability in sienna_cichlid Signed-off-by: Hawking Zhang Reviewed-by: Likun Gao Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ea363336bc5e..50f1a76389bc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1963,11 +1963,11 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, return; if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { - dev_info(adev->dev, "HBM ECC is active.\n"); + dev_info(adev->dev, "MEM ECC is active.\n"); *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC | 1 << AMDGPU_RAS_BLOCK__DF); } else - dev_info(adev->dev, "HBM ECC is not presented.\n"); + dev_info(adev->dev, "MEM ECC is not presented.\n"); if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { dev_info(adev->dev, "SRAM ECC is active.\n"); -- cgit From 970fd19764349081d8fcb1ce816f7c75907b9d54 Mon Sep 17 00:00:00 2001 From: "Stanley.Yang" Date: Wed, 10 Mar 2021 19:10:11 +0800 Subject: drm/amdgpu: fix send ras disable cmd when asic not support ras cause: It is necessary to send ras disable command to ras-ta during gfx block ras later init, because the ras capability is disable read from vbios for vega20 gaming, but the ras context is released during ras init process, this will cause send ras disable command to ras-to failed. how: Delay releasing ras context, the ras context will be released after gfx block later init done. Changed from V1: move release_ras_context into ras_resume Changed from V2: check BIT(UMC) is more reasonable before access eeprom table Signed-off-by: Stanley.Yang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 57 ++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 11 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 50f1a76389bc..a90bf33358d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -463,7 +463,7 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj; - if (!con) + if (!adev->ras_features || !con) return NULL; if (head->block >= AMDGPU_RAS_BLOCK_COUNT) @@ -490,7 +490,7 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, struct ras_manager *obj; int i; - if (!con) + if (!adev->ras_features || !con) return NULL; if (head) { @@ -590,7 +590,11 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, con->features |= BIT(head->block); } else { if (obj && amdgpu_ras_is_feature_enabled(adev, head)) { - con->features &= ~BIT(head->block); + /* skip clean gfx ras context feature for VEGA20 Gaming. + * will clean later + */ + if (!(!adev->ras_features && con->features & BIT(AMDGPU_RAS_BLOCK__GFX))) + con->features &= ~BIT(head->block); put_obj(obj); } } @@ -693,6 +697,10 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, if (ret) return ret; + /* gfx block ras dsiable cmd must send to ras-ta */ + if (head->block == AMDGPU_RAS_BLOCK__GFX) + con->features |= BIT(head->block); + ret = amdgpu_ras_feature_enable(adev, head, 0); } } else @@ -948,7 +956,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev, struct ras_manager *obj; struct ras_err_data data = {0, 0}; - if (!con) + if (!adev->ras_features || !con) return 0; list_for_each_entry(obj, &con->head, node) { @@ -1469,7 +1477,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj; - if (!con) + if (!adev->ras_features || !con) return; list_for_each_entry(obj, &con->head, node) { @@ -1517,7 +1525,7 @@ static void amdgpu_ras_query_err_status(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj; - if (!con) + if (!adev->ras_features || !con) return; list_for_each_entry(obj, &con->head, node) { @@ -1830,7 +1838,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) bool exc_err_limit = false; int ret; - if (con) + if (adev->ras_features && con) data = &con->eh_data; else return 0; @@ -2005,6 +2013,15 @@ int amdgpu_ras_init(struct amdgpu_device *adev) amdgpu_ras_check_supported(adev, &con->hw_supported, &con->supported); if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) { + /* set gfx block ras context feature for VEGA20 Gaming + * send ras disable cmd to ras ta during ras late init. + */ + if (!adev->ras_features && adev->asic_type == CHIP_VEGA20) { + con->features |= BIT(AMDGPU_RAS_BLOCK__GFX); + + return 0; + } + r = 0; goto release_con; } @@ -2118,8 +2135,12 @@ void amdgpu_ras_resume(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj, *tmp; - if (!con) + if (!adev->ras_features || !con) { + /* clean ras context for VEGA20 Gaming after send ras disable cmd */ + amdgpu_release_ras_context(adev); + return; + } if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { /* Set up all other IPs which are not implemented. There is a @@ -2160,7 +2181,7 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - if (!con) + if (!adev->ras_features || !con) return; amdgpu_ras_disable_all_features(adev, 0); @@ -2174,7 +2195,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - if (!con) + if (!adev->ras_features || !con) return 0; /* Need disable ras on all IPs here before ip [hw/sw]fini */ @@ -2187,7 +2208,7 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - if (!con) + if (!adev->ras_features || !con) return 0; amdgpu_ras_fs_fini(adev); @@ -2230,3 +2251,17 @@ bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev) return false; } + +void amdgpu_release_ras_context(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + if (!con) + return; + + if (!adev->ras_features && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) { + con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX); + amdgpu_ras_set_context(adev, NULL); + kfree(con); + } +} -- cgit From e5c04edfcde373b093d2a07322873b3ce1c5b88e Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 18 Mar 2021 14:04:06 +0100 Subject: drm/amdgpu: revert "reserve backup pages for bad page retirment" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As noted during the review this approach doesn't make sense at all. We should not apply any limitation on the VRAM applications can use inside the kernel. If an application or end user wants to reserve a certain amount of VRAM for bad pages handling we should do this in the upper layer. This reverts commit f89b881c81d9a6481fc17b46b351ca38f5dd6f3a. Signed-off-by: Christian König Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index a90bf33358d3..0e16683876aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1790,14 +1790,13 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, return ret; } -static uint32_t -amdgpu_ras_calculate_badpags_threshold(struct amdgpu_device *adev) +static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, + uint32_t max_length) { + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); int tmp_threshold = amdgpu_bad_page_threshold; u64 val; - uint32_t max_length = 0; - max_length = amdgpu_ras_eeprom_get_record_max_length(); /* * Justification of value bad_page_cnt_threshold in ras structure * @@ -1823,18 +1822,20 @@ amdgpu_ras_calculate_badpags_threshold(struct amdgpu_device *adev) tmp_threshold = max_length; if (tmp_threshold == -1) { - val = adev->gmc.real_vram_size; + val = adev->gmc.mc_vram_size; do_div(val, RAS_BAD_PAGE_RATE); - tmp_threshold = min(lower_32_bits(val), max_length); + con->bad_page_cnt_threshold = min(lower_32_bits(val), + max_length); + } else { + con->bad_page_cnt_threshold = tmp_threshold; } - - return tmp_threshold; } int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; + uint32_t max_eeprom_records_len = 0; bool exc_err_limit = false; int ret; @@ -1854,16 +1855,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) atomic_set(&con->in_recovery, 0); con->adev = adev; - if (!con->bad_page_cnt_threshold) { - con->bad_page_cnt_threshold = - amdgpu_ras_calculate_badpags_threshold(adev); - - ret = amdgpu_vram_mgr_reserve_backup_pages( - ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM), - con->bad_page_cnt_threshold); - if (ret) - goto out; - } + max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); + amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); /* Todo: During test the SMU might fail to read the eeprom through I2C * when the GPU is pending on XGMI reset during probe time -- cgit From 084e2640e51626f413f85663e3ba7e32d4272477 Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Thu, 11 Mar 2021 19:11:01 -0500 Subject: drm/amdgpu: Fix check for RAS support Use positive logic to check for RAS support. Rename the function to actually indicate what it is testing for. Essentially, make the function a predicate with the correct name. Cc: Stanley Yang Cc: Alexander Deucher Signed-off-by: Luben Tuikov Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 0e16683876aa..17652972fd49 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1933,15 +1933,12 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, return 0; } -static int amdgpu_ras_check_asic_type(struct amdgpu_device *adev) +static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) { - if (adev->asic_type != CHIP_VEGA10 && - adev->asic_type != CHIP_VEGA20 && - adev->asic_type != CHIP_ARCTURUS && - adev->asic_type != CHIP_SIENNA_CICHLID) - return 1; - else - return 0; + return adev->asic_type == CHIP_VEGA10 || + adev->asic_type == CHIP_VEGA20 || + adev->asic_type == CHIP_ARCTURUS || + adev->asic_type == CHIP_SIENNA_CICHLID; } /* @@ -1960,7 +1957,7 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, *supported = 0; if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw || - amdgpu_ras_check_asic_type(adev)) + !amdgpu_ras_asic_supported(adev)) return; if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { -- cgit From 36000c7a51080840902d79e1558851076ecb7a96 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Wed, 24 Mar 2021 17:17:40 +0800 Subject: drm/amdgpu: Convert sysfs sprintf/snprintf family to sysfs_emit Fix the following coccicheck warning: drivers/gpu//drm/amd/amdgpu/amdgpu_ras.c:434:9-17: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_xgmi.c:220:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_xgmi.c:249:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/df_v3_6.c:208:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_psp.c:2973:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_vram_mgr.c:75:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_vram_mgr.c:112:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_vram_mgr.c:58:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_vram_mgr.c:93:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_vram_mgr.c:125:9-17: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_gtt_mgr.c:52:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_gtt_mgr.c:71:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_device.c:140:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_device.c:164:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_device.c:186:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_device.c:208:8-16: WARNING: use scnprintf or sprintf drivers/gpu//drm/amd/amdgpu/amdgpu_atombios.c:1916:8-16: WARNING: use scnprintf or sprintf Signed-off-by: Tian Tao Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 17652972fd49..c5a73b5c26d9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -431,15 +431,13 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, }; if (!amdgpu_ras_get_error_query_ready(obj->adev)) - return snprintf(buf, PAGE_SIZE, - "Query currently inaccessible\n"); + return sysfs_emit(buf, "Query currently inaccessible\n"); if (amdgpu_ras_query_error_status(obj->adev, &info)) return -EINVAL; - return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n", - "ue", info.ue_count, - "ce", info.ce_count); + return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count, + "ce", info.ce_count); } /* obj begin */ -- cgit From 5a4345270474c886dceee48c5b54b5c2c07aa877 Mon Sep 17 00:00:00 2001 From: "Stanley.Yang" Date: Thu, 1 Apr 2021 19:10:54 +0800 Subject: drm/amdgpu: support sdma error injection Signed-off-by: Stanley.Yang Reivewed-by: Dennis Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c5a73b5c26d9..32be6dd1fcd7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -925,6 +925,7 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, ret = -EINVAL; break; case AMDGPU_RAS_BLOCK__UMC: + case AMDGPU_RAS_BLOCK__SDMA: case AMDGPU_RAS_BLOCK__MMHUB: case AMDGPU_RAS_BLOCK__PCIE_BIF: ret = psp_ras_trigger_error(&adev->psp, &block_info); -- cgit From f08726868c7543e0754212dcadae0d6911f721bd Mon Sep 17 00:00:00 2001 From: Bernard Zhao Date: Wed, 31 Mar 2021 06:12:03 -0700 Subject: drm/amd: cleanup coding style a bit Fix patch check warning: WARNING: suspect code indent for conditional statements (8, 17) + if (obj && obj->use < 0) { + DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name); WARNING: braces {} are not necessary for single statement blocks + if (obj && obj->use < 0) { + DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name); + } Signed-off-by: Bernard Zhao Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 32be6dd1fcd7..26458946145c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -447,11 +447,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, static inline void put_obj(struct ras_manager *obj) { - if (obj && --obj->use == 0) + if (obj && (--obj->use == 0)) list_del(&obj->node); - if (obj && obj->use < 0) { - DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name); - } + if (obj && (obj->use < 0)) + DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name); } /* make one obj and return it. */ -- cgit From 75f06251c921baf99c003662c529c25ba9937b2d Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Mon, 8 Mar 2021 16:40:07 +0800 Subject: drm/amdgpu: initialze ras caps per paltform config Driver only manages GFX/SDMA/MMHUB RAS in platforms that gpu node is connected to cpu through XGMI, other than that, it queries VBIOS for RAS capabilities. Signed-off-by: Hawking Zhang Acked-by: Alex Deucher Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 35 ++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 26458946145c..1708045e2a0d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1936,6 +1936,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) return adev->asic_type == CHIP_VEGA10 || adev->asic_type == CHIP_VEGA20 || adev->asic_type == CHIP_ARCTURUS || + adev->asic_type == CHIP_ALDEBARAN || adev->asic_type == CHIP_SIENNA_CICHLID; } @@ -1958,19 +1959,29 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, !amdgpu_ras_asic_supported(adev)) return; - if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { - dev_info(adev->dev, "MEM ECC is active.\n"); - *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC | - 1 << AMDGPU_RAS_BLOCK__DF); - } else - dev_info(adev->dev, "MEM ECC is not presented.\n"); + if (!adev->gmc.xgmi.connected_to_cpu) { + if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { + dev_info(adev->dev, "MEM ECC is active.\n"); + *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC | + 1 << AMDGPU_RAS_BLOCK__DF); + } else { + dev_info(adev->dev, "MEM ECC is not presented.\n"); + } - if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { - dev_info(adev->dev, "SRAM ECC is active.\n"); - *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC | - 1 << AMDGPU_RAS_BLOCK__DF); - } else - dev_info(adev->dev, "SRAM ECC is not presented.\n"); + if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { + dev_info(adev->dev, "SRAM ECC is active.\n"); + *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC | + 1 << AMDGPU_RAS_BLOCK__DF); + } else { + dev_info(adev->dev, "SRAM ECC is not presented.\n"); + } + } else { + /* driver only manages a few IP blocks RAS feature + * when GPU is connected cpu through XGMI */ + *hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX | + 1 << AMDGPU_RAS_BLOCK__SDMA | + 1 << AMDGPU_RAS_BLOCK__MMHUB); + } /* hw_supported needs to be aligned with RAS block mask. */ *hw_supported &= AMDGPU_RAS_BLOCK_MASK; -- cgit From 6e36f23193cc870856a41e87281f62fb2b04bd1f Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 2 Apr 2021 14:39:36 +0800 Subject: drm/amdgpu: split nbio callbacks into ras and non-ras ones nbio ras is not managed by gpu driver when gpu is connected to cpu through xgmi. split nbio callbacks into ras and non-ras ones so gpu driver only initializes nbio ras callbacks when it manages nbio ras. Signed-off-by: Hawking Zhang Reviewed-by: Dennis Li Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 1708045e2a0d..ac3f4c3266bc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -804,8 +804,9 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, adev->mmhub.funcs->query_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__PCIE_BIF: - if (adev->nbio.funcs->query_ras_error_count) - adev->nbio.funcs->query_ras_error_count(adev, &err_data); + if (adev->nbio.ras_funcs && + adev->nbio.ras_funcs->query_ras_error_count) + adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data); break; case AMDGPU_RAS_BLOCK__XGMI_WAFL: amdgpu_xgmi_query_ras_error_count(adev, &err_data); @@ -2030,14 +2031,31 @@ int amdgpu_ras_init(struct amdgpu_device *adev) /* Might need get this flag from vbios. */ con->flags = RAS_DEFAULT_FLAGS; - if (adev->nbio.funcs->init_ras_controller_interrupt) { - r = adev->nbio.funcs->init_ras_controller_interrupt(adev); + /* initialize nbio ras function ahead of any other + * ras functions so hardware fatal error interrupt + * can be enabled as early as possible */ + switch (adev->asic_type) { + case CHIP_VEGA20: + case CHIP_ARCTURUS: + case CHIP_ALDEBARAN: + if (!adev->gmc.xgmi.connected_to_cpu) + adev->nbio.ras_funcs = &nbio_v7_4_ras_funcs; + break; + default: + /* nbio ras is not available */ + break; + } + + if (adev->nbio.ras_funcs && + adev->nbio.ras_funcs->init_ras_controller_interrupt) { + r = adev->nbio.ras_funcs->init_ras_controller_interrupt(adev); if (r) goto release_con; } - if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) { - r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev); + if (adev->nbio.ras_funcs && + adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) { + r = adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt(adev); if (r) goto release_con; } -- cgit From 52137ca8526cdeceab7651c314ce68ac49963512 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Thu, 18 Mar 2021 20:18:19 +0800 Subject: drm/amdgpu: move xgmi ras functions to xgmi_ras_funcs xgmi ras is not managed by gpu driver when gpu is connected to cpu through xgmi. move all xgmi ras functions to xgmi_ras_funcs so gpu driver only initializes xgmi ras functions when it manages xgmi ras. Signed-off-by: Hawking Zhang Reviewed-by: Dennis Li Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ac3f4c3266bc..172738cc99db 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -809,7 +809,9 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data); break; case AMDGPU_RAS_BLOCK__XGMI_WAFL: - amdgpu_xgmi_query_ras_error_count(adev, &err_data); + if (adev->gmc.xgmi.ras_funcs && + adev->gmc.xgmi.ras_funcs->query_ras_error_count) + adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, &err_data); break; default: break; -- cgit From 49070c4ea3d97b76c5666466efb35dcc42c6c8fd Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Wed, 17 Mar 2021 19:17:52 +0800 Subject: drm/amdgpu: split umc callbacks to ras and non-ras ones umc ras is not managed by gpu driver when gpu is connected to cpu through xgmi. split umc callbacks into ras and non-ras ones so gpu driver only initializes umc ras callbacks when it manages umc ras. Signed-off-by: Hawking Zhang Reviewed-by: Dennis Li Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 172738cc99db..459a470744f4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -774,13 +774,15 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, switch (info->head.block) { case AMDGPU_RAS_BLOCK__UMC: - if (adev->umc.funcs->query_ras_error_count) - adev->umc.funcs->query_ras_error_count(adev, &err_data); + if (adev->umc.ras_funcs && + adev->umc.ras_funcs->query_ras_error_count) + adev->umc.ras_funcs->query_ras_error_count(adev, &err_data); /* umc query_ras_error_address is also responsible for clearing * error status */ - if (adev->umc.funcs->query_ras_error_address) - adev->umc.funcs->query_ras_error_address(adev, &err_data); + if (adev->umc.ras_funcs && + adev->umc.ras_funcs->query_ras_error_address) + adev->umc.ras_funcs->query_ras_error_address(adev, &err_data); break; case AMDGPU_RAS_BLOCK__SDMA: if (adev->sdma.funcs->query_ras_error_count) { -- cgit From 8bc7b360ad4b0a090380d7548dbf24a627f0b035 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 19 Mar 2021 15:50:14 +0800 Subject: drm/amdgpu: split mmhub callbacks into ras and non-ras ones mmhub ras is only avaiable in cerntain mmhub ip generation. Signed-off-by: Hawking Zhang Reviewed-by: Dennis Li Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 459a470744f4..b55f470eb747 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -799,11 +799,13 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, adev->gfx.funcs->query_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__MMHUB: - if (adev->mmhub.funcs->query_ras_error_count) - adev->mmhub.funcs->query_ras_error_count(adev, &err_data); + if (adev->mmhub.ras_funcs && + adev->mmhub.ras_funcs->query_ras_error_count) + adev->mmhub.ras_funcs->query_ras_error_count(adev, &err_data); - if (adev->mmhub.funcs->query_ras_error_status) - adev->mmhub.funcs->query_ras_error_status(adev); + if (adev->mmhub.ras_funcs && + adev->mmhub.ras_funcs->query_ras_error_status) + adev->mmhub.ras_funcs->query_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__PCIE_BIF: if (adev->nbio.ras_funcs && @@ -857,8 +859,9 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, adev->gfx.funcs->reset_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__MMHUB: - if (adev->mmhub.funcs->reset_ras_error_count) - adev->mmhub.funcs->reset_ras_error_count(adev); + if (adev->mmhub.ras_funcs && + adev->mmhub.ras_funcs->reset_ras_error_count) + adev->mmhub.ras_funcs->reset_ras_error_count(adev); break; case AMDGPU_RAS_BLOCK__SDMA: if (adev->sdma.funcs->reset_ras_error_count) @@ -1515,8 +1518,9 @@ static void amdgpu_ras_error_status_query(struct amdgpu_device *adev, adev->gfx.funcs->query_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__MMHUB: - if (adev->mmhub.funcs->query_ras_error_status) - adev->mmhub.funcs->query_ras_error_status(adev); + if (adev->mmhub.ras_funcs && + adev->mmhub.ras_funcs->query_ras_error_status) + adev->mmhub.ras_funcs->query_ras_error_status(adev); break; default: break; -- cgit From 719a9b332305b8c4b91805c4bedee27ce82ee916 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 19 Mar 2021 16:59:09 +0800 Subject: drm/amdgpu: split gfx callbacks into ras and non-ras ones gfx ras is only available in cerntain ip generations. Signed-off-by: Hawking Zhang Reviewed-by: Dennis Li Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index b55f470eb747..1d905bcbc1ac 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -792,11 +792,13 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, } break; case AMDGPU_RAS_BLOCK__GFX: - if (adev->gfx.funcs->query_ras_error_count) - adev->gfx.funcs->query_ras_error_count(adev, &err_data); + if (adev->gfx.ras_funcs && + adev->gfx.ras_funcs->query_ras_error_count) + adev->gfx.ras_funcs->query_ras_error_count(adev, &err_data); - if (adev->gfx.funcs->query_ras_error_status) - adev->gfx.funcs->query_ras_error_status(adev); + if (adev->gfx.ras_funcs && + adev->gfx.ras_funcs->query_ras_error_status) + adev->gfx.ras_funcs->query_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__MMHUB: if (adev->mmhub.ras_funcs && @@ -852,11 +854,13 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, switch (block) { case AMDGPU_RAS_BLOCK__GFX: - if (adev->gfx.funcs->reset_ras_error_count) - adev->gfx.funcs->reset_ras_error_count(adev); + if (adev->gfx.ras_funcs && + adev->gfx.ras_funcs->reset_ras_error_count) + adev->gfx.ras_funcs->reset_ras_error_count(adev); - if (adev->gfx.funcs->reset_ras_error_status) - adev->gfx.funcs->reset_ras_error_status(adev); + if (adev->gfx.ras_funcs && + adev->gfx.ras_funcs->reset_ras_error_status) + adev->gfx.ras_funcs->reset_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__MMHUB: if (adev->mmhub.ras_funcs && @@ -926,8 +930,9 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, switch (info->head.block) { case AMDGPU_RAS_BLOCK__GFX: - if (adev->gfx.funcs->ras_error_inject) - ret = adev->gfx.funcs->ras_error_inject(adev, info); + if (adev->gfx.ras_funcs && + adev->gfx.ras_funcs->ras_error_inject) + ret = adev->gfx.ras_funcs->ras_error_inject(adev, info); else ret = -EINVAL; break; @@ -1514,8 +1519,9 @@ static void amdgpu_ras_error_status_query(struct amdgpu_device *adev, */ switch (info->head.block) { case AMDGPU_RAS_BLOCK__GFX: - if (adev->gfx.funcs->query_ras_error_status) - adev->gfx.funcs->query_ras_error_status(adev); + if (adev->gfx.ras_funcs && + adev->gfx.ras_funcs->query_ras_error_status) + adev->gfx.ras_funcs->query_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__MMHUB: if (adev->mmhub.ras_funcs && -- cgit From 134d16d50f0948f00e7172b509e869b6eaecf437 Mon Sep 17 00:00:00 2001 From: John Clements Date: Thu, 25 Mar 2021 17:10:10 +0800 Subject: drm/amdgpu: RAS harvest on driver load In event of RAS UE + warm reset, error counters shall be harvested and cleared on driver load Reviewed-by: Hawking Zhang Signed-off-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 1d905bcbc1ac..b0fe5885e4c6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2090,6 +2090,32 @@ release_con: return r; } +static int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev) +{ + if (adev->gmc.xgmi.connected_to_cpu) + return 1; + return 0; +} + +static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev, + struct ras_common_if *ras_block) +{ + struct ras_query_if info = { + .head = *ras_block, + }; + + if (!amdgpu_persistent_edc_harvesting_supported(adev)) + return 0; + + if (amdgpu_ras_query_error_status(adev, &info) != 0) + DRM_WARN("RAS init harvest failure"); + + if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0) + DRM_WARN("RAS init harvest reset failure"); + + return 0; +} + /* helper function to handle common stuff in ip late init phase */ int amdgpu_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block, @@ -2119,6 +2145,9 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, return r; } + /* check for errors on warm reset edc persisant supported ASIC */ + amdgpu_persistent_edc_harvesting(adev, ras_block); + /* in resume phase, no need to create ras fs node */ if (adev->in_suspend || amdgpu_in_reset(adev)) return 0; -- cgit From cbb8f989d5a07cb3e39e9c149a6f89d6c83432aa Mon Sep 17 00:00:00 2001 From: John Clements Date: Fri, 9 Apr 2021 17:25:29 +0800 Subject: drm/amdgpu: page retire over debugfs mechanism added support in RAS debugfs to add bad page for isolated page retirement testing Reviewed-by: Hawking Zhang Signed-off-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 67 +++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index b0fe5885e4c6..0541196ae1ed 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -99,6 +99,49 @@ static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev) return false; } +static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address) +{ + struct ras_err_data err_data = {0, 0, 0, NULL}; + struct eeprom_table_record err_rec; + + if ((address >= adev->gmc.mc_vram_size) || + (address >= RAS_UMC_INJECT_ADDR_LIMIT)) { + dev_warn(adev->dev, + "RAS WARN: input address 0x%llx is invalid.\n", + address); + return -EINVAL; + } + + if (amdgpu_ras_check_bad_page(adev, address)) { + dev_warn(adev->dev, + "RAS WARN: 0x%llx has been marked as bad page!\n", + address); + return 0; + } + + memset(&err_rec, 0x0, sizeof(struct eeprom_table_record)); + + err_rec.address = address; + err_rec.retired_page = address >> AMDGPU_GPU_PAGE_SHIFT; + err_rec.ts = (uint64_t)ktime_get_real_seconds(); + err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE; + + err_data.err_addr = &err_rec; + err_data.err_addr_cnt = 1; + + if (amdgpu_bad_page_threshold != 0) { + amdgpu_ras_add_bad_pages(adev, err_data.err_addr, + err_data.err_addr_cnt); + amdgpu_ras_save_bad_pages(adev); + } + + dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n"); + dev_warn(adev->dev, "Clear EEPROM:\n"); + dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n"); + + return 0; +} + static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { @@ -178,11 +221,25 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, op = 1; else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) op = 2; + else if (sscanf(str, "retire_page") == 0) + op = 3; else if (str[0] && str[1] && str[2] && str[3]) /* ascii string, but commands are not matched. */ return -EINVAL; if (op != -1) { + + if (op == 3) { + if (sscanf(str, "%*s %llu", &address) != 1) + if (sscanf(str, "%*s 0x%llx", &address) != 1) + return -EINVAL; + + data->op = op; + data->inject.address = address; + + return 0; + } + if (amdgpu_ras_find_block_id_by_name(block_name, &block_id)) return -EINVAL; @@ -310,6 +367,16 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * if (ret) return -EINVAL; + if (data.op == 3) + { + ret = amdgpu_reserve_page_direct(adev, data.inject.address); + + if (ret) + return size; + else + return ret; + } + if (!amdgpu_ras_is_supported(adev, data.head.block)) return -EINVAL; -- cgit