From bd607166af7fe31f8d8e9c575f4561a4b56b9f24 Mon Sep 17 00:00:00 2001 From: Kent Russell Date: Fri, 13 Mar 2020 09:21:55 -0400 Subject: drm/amdgpu: Enable reading FRU chip via I2C v3 Allow for reading of information like manufacturer, product number and serial number from the FRU chip. Report the serial number as the new sysfs file serial_number. Note that this only works on server cards, as consumer cards do not feature the FRU chip, which contains this information. v2: Add documentation to amdgpu.rst, add helper functions, rename functions for consistency, fix bad starting offset v3: Remove testing definitions Signed-off-by: Kent Russell Reviewed-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 90 ++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index faa3e7102156..f422ef58b4d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -64,6 +64,7 @@ #include "amdgpu_xgmi.h" #include "amdgpu_ras.h" #include "amdgpu_pmu.h" +#include "amdgpu_fru_eeprom.h" #include #include @@ -137,6 +138,72 @@ static DEVICE_ATTR(pcie_replay_count, S_IRUGO, static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); +/** + * DOC: product_name + * + * The amdgpu driver provides a sysfs API for reporting the product name + * for the device + * The file serial_number is used for this and returns the product name + * as returned from the FRU. + * NOTE: This is only available for certain server cards + */ + +static ssize_t amdgpu_device_get_product_name(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = ddev->dev_private; + + return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name); +} + +static DEVICE_ATTR(product_name, S_IRUGO, + amdgpu_device_get_product_name, NULL); + +/** + * DOC: product_number + * + * The amdgpu driver provides a sysfs API for reporting the part number + * for the device + * The file serial_number is used for this and returns the part number + * as returned from the FRU. + * NOTE: This is only available for certain server cards + */ + +static ssize_t amdgpu_device_get_product_number(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = ddev->dev_private; + + return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number); +} + +static DEVICE_ATTR(product_number, S_IRUGO, + amdgpu_device_get_product_number, NULL); + +/** + * DOC: serial_number + * + * The amdgpu driver provides a sysfs API for reporting the serial number + * for the device + * The file serial_number is used for this and returns the serial number + * as returned from the FRU. + * NOTE: This is only available for certain server cards + */ + +static ssize_t amdgpu_device_get_serial_number(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = ddev->dev_private; + + return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial); +} + +static DEVICE_ATTR(serial_number, S_IRUGO, + amdgpu_device_get_serial_number, NULL); + /** * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control * @@ -1975,6 +2042,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) amdgpu_xgmi_add_device(adev); amdgpu_amdkfd_device_init(adev); + amdgpu_fru_get_product_info(adev); + init_failed: if (amdgpu_sriov_vf(adev)) amdgpu_virt_release_full_gpu(adev, true); @@ -3189,6 +3258,24 @@ fence_driver_init: return r; } + r = device_create_file(adev->dev, &dev_attr_product_name); + if (r) { + dev_err(adev->dev, "Could not create product_name"); + return r; + } + + r = device_create_file(adev->dev, &dev_attr_product_number); + if (r) { + dev_err(adev->dev, "Could not create product_number"); + return r; + } + + r = device_create_file(adev->dev, &dev_attr_serial_number); + if (r) { + dev_err(adev->dev, "Could not create serial_number"); + return r; + } + if (IS_ENABLED(CONFIG_PERF_EVENTS)) r = amdgpu_pmu_init(adev); if (r) @@ -3271,6 +3358,9 @@ void amdgpu_device_fini(struct amdgpu_device *adev) device_remove_file(adev->dev, &dev_attr_pcie_replay_count); if (adev->ucode_sysfs_en) amdgpu_ucode_sysfs_fini(adev); + device_remove_file(adev->dev, &dev_attr_product_name); + device_remove_file(adev->dev, &dev_attr_product_number); + device_remove_file(adev->dev, &dev_attr_serial_number); if (IS_ENABLED(CONFIG_PERF_EVENTS)) amdgpu_pmu_fini(adev); if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) -- cgit From 3aa0115d238c71423d0e212138678a8cf51d4361 Mon Sep 17 00:00:00 2001 From: Monk Liu Date: Wed, 4 Mar 2020 14:02:55 +0800 Subject: drm/amdgpu: cleanup all virtualization detection routine we need to move virt detection much earlier because: 1) HW team confirms us that RCC_IOV_FUNC_IDENTIFIER will always be at DE5 (dw) mmio offset from vega10, this way there is no need to implement detect_hw_virt() routine in each nbio/chip file. for VI SRIOV chip (tonga & fiji), the BIF_IOV_FUNC_IDENTIFIER is at 0x1503 2) we need to acknowledged we are SRIOV VF before we do IP discovery because the IP discovery content will be updated by host everytime after it recieved a new coming "REQ_GPU_INIT_DATA" request from guest (there will be patches for this new handshake soon). Signed-off-by: Monk Liu Reviewed-by: Emily Deng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f422ef58b4d8..449720086fbc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3055,6 +3055,9 @@ int amdgpu_device_init(struct amdgpu_device *adev, if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) adev->enable_mes = true; + /* detect hw virtualization here */ + amdgpu_detect_virtualization(adev); + if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) { r = amdgpu_discovery_init(adev); if (r) { -- cgit From 61380faa4b4cc577df8a7ff5db5859bac6b351f7 Mon Sep 17 00:00:00 2001 From: John Clements Date: Wed, 25 Mar 2020 16:01:14 +0800 Subject: drm/amdgpu: disable ras query and iject during gpu reset added flag to ras context to indicate if ras query functionality is ready Reviewed-by: Hawking Zhang Signed-off-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 449720086fbc..f88fe7fd78ca 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4168,6 +4168,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, need_full_reset = job_signaled = false; INIT_LIST_HEAD(&device_list); + amdgpu_ras_set_error_query_ready(adev, false); + dev_info(adev->dev, "GPU %s begin!\n", (in_ras_intr && !use_baco) ? "jobs stop":"reset"); @@ -4224,6 +4226,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, /* block all schedulers and reset given job's ring */ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { if (tmp_adev != adev) { + amdgpu_ras_set_error_query_ready(tmp_adev, false); amdgpu_device_lock_adev(tmp_adev, false); if (!amdgpu_sriov_vf(tmp_adev)) amdgpu_amdkfd_pre_reset(tmp_adev); -- cgit From 122078de168b8380e9dde15a5c04a5412e710cb6 Mon Sep 17 00:00:00 2001 From: Monk Liu Date: Wed, 4 Mar 2020 23:51:51 +0800 Subject: drm/amdgpu: equip new req_init_data handshake by this new handshake host side can prepare vbios/ip-discovery and pf&vf exchange data upon recieving this request without stopping world switch. this way the world switch is less impacted by VF's exclusive mode request Signed-off-by: Monk Liu Reviewed-by: Emily Deng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f88fe7fd78ca..8d39ed47d65a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1798,6 +1798,21 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) amdgpu_amdkfd_device_probe(adev); if (amdgpu_sriov_vf(adev)) { + /* handle vbios stuff prior full access mode for new handshake */ + if (adev->virt.req_init_data_ver == 1) { + if (!amdgpu_get_bios(adev)) { + DRM_ERROR("failed to get vbios\n"); + return -EINVAL; + } + + r = amdgpu_atombios_init(adev); + if (r) { + dev_err(adev->dev, "amdgpu_atombios_init failed\n"); + amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); + return r; + } + } + r = amdgpu_virt_request_full_gpu(adev, true); if (r) return -EAGAIN; @@ -1830,6 +1845,10 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) } /* get the vbios after the asic_funcs are set up */ if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { + /* skip vbios handling for new handshake */ + if (amdgpu_sriov_vf(adev) && adev->virt.req_init_data_ver == 1) + continue; + /* Read BIOS */ if (!amdgpu_get_bios(adev)) return -EINVAL; -- cgit From dffa11b4f74b1572341a667ec7a006420dc48626 Mon Sep 17 00:00:00 2001 From: Monk Liu Date: Wed, 4 Mar 2020 21:33:27 +0800 Subject: drm/amdgpu: adjust sequence of ip_discovery init and timeout_setting what: 1)move timtout setting before ip_early_init to reduce exclusive mode cost for SRIOV 2)move ip_discovery_init() to inside of amdgpu_discovery_reg_base_init() it is a prepare for the later upcoming patches. why: in later upcoming patches we would use a new mailbox event -- "req_gpu_init_data", which is a callback hooked in adev->virt.ops and this callback send a new event "REQ_GPU_INIT_DAT" to host to notify host to do some preparation like "IP discovery/vbios on the VF FB" and this callback must be: A) invoked after set_ip_block() because virt.ops is configured during set_ip_block() B) invoked before ip_discovery_init() becausen ip_discovery_init() need host side prepares everything in VF FB first. current place of ip_discovery_init() is before we can invoke callback of adev->virt.ops, thus we must move ip_discovery_init() to a place after the adev->virt.ops all settle done, and the perfect place is in amdgpu_discovery_reg_base_init() Signed-off-by: Monk Liu Reviewed-by: Emily Deng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8d39ed47d65a..3c19ae0b13b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3077,12 +3077,10 @@ int amdgpu_device_init(struct amdgpu_device *adev, /* detect hw virtualization here */ amdgpu_detect_virtualization(adev); - if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) { - r = amdgpu_discovery_init(adev); - if (r) { - dev_err(adev->dev, "amdgpu_discovery_init failed\n"); - return r; - } + r = amdgpu_device_get_job_timeout_settings(adev); + if (r) { + dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); + return r; } /* early init functions */ @@ -3090,12 +3088,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, if (r) return r; - r = amdgpu_device_get_job_timeout_settings(adev); - if (r) { - dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); - return r; - } - /* doorbell bar mapping and doorbell index init*/ amdgpu_device_doorbell_init(adev); -- cgit From 2f2941324c65bf23695038968cecab4e5cde647e Mon Sep 17 00:00:00 2001 From: Monk Liu Date: Tue, 10 Mar 2020 18:12:13 +0800 Subject: drm/amdgpu: postpone entering fullaccess mode if host support new handshake we only need to enter fullaccess_mode in ip_init() part, otherwise we need to do it before reading vbios (becuase host prepares vbios for VF only after received REQ_GPU_INIT event under legacy handshake) Signed-off-by: Monk Liu Reviewed-by: Emily Deng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3c19ae0b13b9..a97492f3bc42 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1812,10 +1812,14 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) return r; } } + } + /* we need to send REQ_GPU here for legacy handshaker otherwise the vbios + * will not be prepared by host for this VF */ + if (amdgpu_sriov_vf(adev) && adev->virt.req_init_data_ver < 1) { r = amdgpu_virt_request_full_gpu(adev, true); if (r) - return -EAGAIN; + return r; } adev->pm.pp_feature = amdgpu_pp_feature_mask; @@ -1975,6 +1979,12 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (r) return r; + if (amdgpu_sriov_vf(adev) && adev->virt.req_init_data_ver > 0) { + r = amdgpu_virt_request_full_gpu(adev, true); + if (r) + return -EAGAIN; + } + for (i = 0; i < adev->num_ip_blocks; i++) { if (!adev->ip_blocks[i].status.valid) continue; -- cgit From b7b2a316b95e09ad51db0fd18c5f291051b06117 Mon Sep 17 00:00:00 2001 From: Jiawei Date: Thu, 26 Mar 2020 15:10:51 +0800 Subject: drm/amdgpu: extend compute job timeout extend compute lockup timeout to 60000 for SR-IOV. Reviewed-by: Emily Deng Signed-off-by: Jiawei Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a97492f3bc42..90601966c524 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2867,12 +2867,12 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) * By default timeout for non compute jobs is 10000. * And there is no timeout enforced on compute jobs. * In SR-IOV or passthrough mode, timeout for compute - * jobs are 10000 by default. + * jobs are 60000 by default. */ adev->gfx_timeout = msecs_to_jiffies(10000); adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) - adev->compute_timeout = adev->gfx_timeout; + adev->compute_timeout = msecs_to_jiffies(60000); else adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; -- cgit From 04bef61e5da18c2b301c629a209ccdba4d4c6fbb Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Thu, 2 Apr 2020 15:10:24 +0800 Subject: drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset kfd_pre_reset will free mem_objs allocated by kfd_gtt_sa_allocate Without this change, sriov tdr code path will never free those allocated memories and get memory leak. v2:add a bugfix for kiq ring test fail Signed-off-by: Jack Zhang Reviewed-by: Monk Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 90601966c524..626b46fa1a63 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3852,6 +3852,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; + amdgpu_amdkfd_pre_reset(adev); + /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) -- cgit From 1c6d567bdf73a207f51ef2e5745854ba7daa22c7 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Wed, 1 Apr 2020 11:46:57 +0200 Subject: drm/amdgpu: rework sched_list generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generate HW IP's sched_list in amdgpu_ring_init() instead of amdgpu_ctx.c. This makes amdgpu_ctx_init_compute_sched(), ring.has_high_prio and amdgpu_ctx_init_sched() unnecessary. This patch also stores sched_list for all HW IPs in one big array in struct amdgpu_device which makes amdgpu_ctx_init_entity() much more leaner. v2: fix a coding style issue do not use drm hw_ip const to populate amdgpu_ring_type enum v3: remove ctx reference and move sched array and num_sched to a struct use num_scheds to detect uninitialized scheduler list v4: use array_index_nospec for user space controlled variables fix possible checkpatch.pl warnings Signed-off-by: Nirmoy Das Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 626b46fa1a63..f7c51fe1bc35 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3210,8 +3210,6 @@ fence_driver_init: adev->gfx.config.max_cu_per_sh, adev->gfx.cu_info.number); - amdgpu_ctx_init_sched(adev); - adev->accel_working = true; amdgpu_vm_check_compute_bug(adev); -- cgit From fe9824d15eff7ef38f63eddd1a06648e49286c7b Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Tue, 7 Apr 2020 13:44:51 +0800 Subject: drm/amdkfd Avoid destroy hqd when GPU is on reset This reverts commit 5161bba4311f in order to split it into two different patches, and this will make it easier to understand. [PATCH 1/2] porting to gfx10 from commit 1b0bfcff463f390c40 ("drm/amdgpu: Avoid destroy hqd when GPU is on reset") Originally, MEC is touched without GPU initialized first. Signed-off-by: Jack Zhang Reviewed-by: Monk Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f7c51fe1bc35..c38777184433 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3850,8 +3850,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; - amdgpu_amdkfd_pre_reset(adev); - /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) -- cgit From b639c22c98ff0ba140799ddc56f639240014f999 Mon Sep 17 00:00:00 2001 From: Jack Zhang Date: Tue, 7 Apr 2020 13:50:05 +0800 Subject: drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset [PATCH 2/2] kfd_pre_reset will free mem_objs allocated by kfd_gtt_sa_allocate Without this change, sriov tdr code path will never free those allocated memories and get memory leak. Signed-off-by: Jack Zhang Reviewed-by: Monk Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c38777184433..f7c51fe1bc35 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3850,6 +3850,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; + amdgpu_amdkfd_pre_reset(adev); + /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) -- cgit From a23ca7f76d558e2275cea6033171a6c47fcd002c Mon Sep 17 00:00:00 2001 From: Prike Liang Date: Tue, 7 Apr 2020 20:21:26 +0800 Subject: drm/amdgpu: fix gfx hang during suspend with video playback (v2) The system will be hang up during S3 suspend because of SMU is pending for GC not respose the register CP_HQD_ACTIVE access request.This issue root cause of accessing the GC register under enter GFX CGGPG and can be fixed by disable GFX CGPG before perform suspend. v2: Use disable the GFX CGPG instead of RLC safe mode guard. Signed-off-by: Prike Liang Tested-by: Mengbing Wang Reviewed-by: Huang Rui Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f7c51fe1bc35..a191f6e48550 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2438,8 +2438,6 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) { int i, r; - amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); - amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); for (i = adev->num_ip_blocks - 1; i >= 0; i--) { if (!adev->ip_blocks[i].status.valid) @@ -3468,6 +3466,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) } } + amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); + amdgpu_amdkfd_suspend(adev, !fbcon); amdgpu_ras_suspend(adev); -- cgit From dec0520aff8df2f1aca8ab9b29818a8002592bec Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 3 Apr 2020 16:40:35 +0800 Subject: drm/amdgpu: remove inproper workaround for vega10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit the workaround is not needed for soc15 ASICs except for vega10. it is even not needed with latest vega10 vbios. Signed-off-by: Hawking Zhang Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a191f6e48550..eddfef191405 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -387,10 +387,6 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); } - - if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) { - udelay(500); - } } /** @@ -406,10 +402,6 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { - if (adev->asic_type >= CHIP_VEGA10 && reg == 0) { - adev->last_mm_index = v; - } - if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))) return amdgpu_kiq_wreg(adev, reg, v); @@ -464,20 +456,12 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) */ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) { - if (adev->asic_type >= CHIP_VEGA10 && reg == 0) { - adev->last_mm_index = v; - } - if ((reg * 4) < adev->rio_mem_size) iowrite32(v, adev->rio_mem + (reg * 4)); else { iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); iowrite32(v, adev->rio_mem + (mmMM_DATA * 4)); } - - if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) { - udelay(500); - } } /** -- cgit From ec59847e741d7d2c9d404bb3d1efcf11b77293ff Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 3 Apr 2020 17:51:42 +0800 Subject: drm/amdgpu: retire RREG32_IDX/WREG32_IDX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit those are not needed anymore Signed-off-by: Hawking Zhang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index eddfef191405..f3e144d226c0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -317,7 +317,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))) return amdgpu_kiq_rreg(adev, reg); - if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX)) + if ((reg * 4) < adev->rmmio_size) ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); else { unsigned long flags; @@ -377,7 +377,7 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, { trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); - if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX)) + if ((reg * 4) < adev->rmmio_size) writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); else { unsigned long flags; -- cgit From f384ff95f652ad743d6e3a901c60ca521da78674 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 3 Apr 2020 17:58:06 +0800 Subject: drm/amdgpu: retire AMDGPU_REGS_KIQ flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit all the register access through kiq is redirected to amdgpu_kiq_rreg/amdgpu_kiq_wreg Signed-off-by: Hawking Zhang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f3e144d226c0..a431de1015a3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -314,7 +314,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, { uint32_t ret; - if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))) + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) return amdgpu_kiq_rreg(adev, reg); if ((reg * 4) < adev->rmmio_size) @@ -402,7 +402,7 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { - if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))) + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) return amdgpu_kiq_wreg(adev, reg, v); amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags); -- cgit From 2eee0229f65e897134566888e5321bcb3af0df7a Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Wed, 8 Apr 2020 16:18:52 +0800 Subject: drm/amdgpu: support access regs outside of mmio bar MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit add indirect access support to registers outside of mmio bar. Signed-off-by: Hawking Zhang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 +++++++++++------------------- 1 file changed, 17 insertions(+), 28 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a431de1015a3..f51a32fb3c03 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -298,10 +298,10 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, } /* - * MMIO register access helper functions. + * device register access helper functions. */ /** - * amdgpu_mm_rreg - read a memory mapped IO register + * amdgpu_device_rreg - read a register * * @adev: amdgpu_device pointer * @reg: dword aligned register offset @@ -309,8 +309,8 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, * * Returns the 32 bit value from the offset specified. */ -uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, - uint32_t acc_flags) +uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, uint32_t reg, + uint32_t acc_flags) { uint32_t ret; @@ -319,15 +319,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, if ((reg * 4) < adev->rmmio_size) ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); - else { - unsigned long flags; - - spin_lock_irqsave(&adev->mmio_idx_lock, flags); - writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); - ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); - spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); - } - trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret); + else + ret = adev->pcie_rreg(adev, (reg * 4)); + trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); return ret; } @@ -373,24 +367,19 @@ void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) BUG(); } -void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) +void static inline amdgpu_device_wreg_no_kiq(struct amdgpu_device *adev, uint32_t reg, + uint32_t v, uint32_t acc_flags) { - trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); + trace_amdgpu_device_wreg(adev->pdev->device, reg, v); if ((reg * 4) < adev->rmmio_size) writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); - else { - unsigned long flags; - - spin_lock_irqsave(&adev->mmio_idx_lock, flags); - writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); - writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); - spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); - } + else + adev->pcie_wreg(adev, (reg * 4), v); } /** - * amdgpu_mm_wreg - write to a memory mapped IO register + * amdgpu_device_wreg - write to a register * * @adev: amdgpu_device pointer * @reg: dword aligned register offset @@ -399,13 +388,13 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, * * Writes the value specified to the offset specified. */ -void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, - uint32_t acc_flags) +void amdgpu_device_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, + uint32_t acc_flags) { if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) return amdgpu_kiq_wreg(adev, reg, v); - amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags); + amdgpu_device_wreg_no_kiq(adev, reg, v, acc_flags); } /* @@ -424,7 +413,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v); } - amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags); + amdgpu_device_wreg_no_kiq(adev, reg, v, acc_flags); } /** -- cgit From dadce777e0947b9b6839f06f360882e54ba2a154 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Fri, 10 Apr 2020 15:38:44 +0800 Subject: drm/amdgpu: fix wrong vram lost counter increment V2 Vram lost counter is wrongly increased by two during baco reset. V2: assumed vram lost for mode1 reset on all ASICs Signed-off-by: Evan Quan Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f51a32fb3c03..9db9ab417dae 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2079,8 +2079,24 @@ static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) */ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) { - return !!memcmp(adev->gart.ptr, adev->reset_magic, - AMDGPU_RESET_MAGIC_NUM); + if (memcmp(adev->gart.ptr, adev->reset_magic, + AMDGPU_RESET_MAGIC_NUM)) + return true; + + if (!adev->in_gpu_reset) + return false; + + /* + * For all ASICs with baco/mode1 reset, the VRAM is + * always assumed to be lost. + */ + switch (amdgpu_asic_reset_method(adev)) { + case AMD_RESET_METHOD_BACO: + case AMD_RESET_METHOD_MODE1: + return true; + default: + return false; + } } /** -- cgit From dd4fa6c1b89a4f4034ec79c65405fdfc920204d1 Mon Sep 17 00:00:00 2001 From: Aurabindo Pillai Date: Wed, 8 Apr 2020 21:28:15 -0400 Subject: drm/amd/amdgpu: remove hardcoded module name in prints Let format prefixes take care of printing the module name through pr_fmt and dev_fmt definitions. Signed-off-by: Aurabindo Pillai Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9db9ab417dae..9f1377c16090 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1187,7 +1187,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switchero return; if (state == VGA_SWITCHEROO_ON) { - pr_info("amdgpu: switched on\n"); + pr_info("switched on\n"); /* don't suspend or resume card normally */ dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; @@ -1201,7 +1201,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switchero dev->switch_power_state = DRM_SWITCH_POWER_ON; drm_kms_helper_poll_enable(dev); } else { - pr_info("amdgpu: switched off\n"); + pr_info("switched off\n"); drm_kms_helper_poll_disable(dev); dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; amdgpu_device_suspend(dev, true); -- cgit From ced1ba9761693ec310c33edf47c63b062a09be4a Mon Sep 17 00:00:00 2001 From: Prike Liang Date: Mon, 13 Apr 2020 21:41:14 +0800 Subject: drm/amdgpu: fix the hw hang during perform system reboot and reset The system reboot failed as some IP blocks enter power gate before perform hw resource destory. Meanwhile use unify interface to set device CGPG to ungate state can simplify the amdgpu poweroff or reset ungate guard. Fixes: 487eca11a321ef ("drm/amdgpu: fix gfx hang during suspend with video playback (v2)") Signed-off-by: Prike Liang Tested-by: Mengbing Wang Tested-by: Paul Menzel Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9f1377c16090..706f93d26136 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2427,6 +2427,8 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) { int i, r; + amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); for (i = adev->num_ip_blocks - 1; i >= 0; i--) { if (!adev->ip_blocks[i].status.valid) -- cgit From fdd21e62b01b0c618a19237344dc96b843647811 Mon Sep 17 00:00:00 2001 From: Kent Russell Date: Mon, 13 Apr 2020 14:15:44 -0400 Subject: Revert "drm/amdgpu: use the BAR if possible in amdgpu_device_vram_access v2" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit c12b84d6e0d70f1185e6daddfd12afb671791b6e. The original patch causes a RAS event and subsequent kernel hard-hang when running the KFDMemoryTest.PtraceAccessInvisibleVram on VG20 and Arcturus dmesg output at hang time: [drm] RAS event of type ERREVENT_ATHUB_INTERRUPT detected! amdgpu 0000:67:00.0: GPU reset begin! Evicting PASID 0x8000 queues Started evicting pasid 0x8000 qcm fence wait loop timeout expired The cp might be in an unrecoverable state due to an unsuccessful queues preemption Failed to evict process queues Failed to suspend process 0x8000 Finished evicting pasid 0x8000 Started restoring pasid 0x8000 Finished restoring pasid 0x8000 [drm] UVD VCPU state may lost due to RAS ERREVENT_ATHUB_INTERRUPT amdgpu: [powerplay] Failed to send message 0x26, response 0x0 amdgpu: [powerplay] Failed to set soft min gfxclk ! amdgpu: [powerplay] Failed to upload DPM Bootup Levels! amdgpu: [powerplay] Failed to send message 0x7, response 0x0 amdgpu: [powerplay] [DisableAllSMUFeatures] Failed to disable all smu features! amdgpu: [powerplay] [DisableDpmTasks] Failed to disable all smu features! amdgpu: [powerplay] [PowerOffAsic] Failed to disable DPM! [drm:amdgpu_device_ip_suspend_phase2 [amdgpu]] *ERROR* suspend of IP block failed -5 Signed-off-by: Kent Russell Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 26 -------------------------- 1 file changed, 26 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 706f93d26136..9da5fc805d00 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -254,32 +254,6 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, uint32_t hi = ~0; uint64_t last; - -#ifdef CONFIG_64BIT - last = min(pos + size, adev->gmc.visible_vram_size); - if (last > pos) { - void __iomem *addr = adev->mman.aper_base_kaddr + pos; - size_t count = last - pos; - - if (write) { - memcpy_toio(addr, buf, count); - mb(); - amdgpu_asic_flush_hdp(adev, NULL); - } else { - amdgpu_asic_invalidate_hdp(adev, NULL); - mb(); - memcpy_fromio(buf, addr, count); - } - - if (count == size) - return; - - pos += count; - buf += count / 4; - size -= count; - } -#endif - spin_lock_irqsave(&adev->mmio_idx_lock, flags); for (last = pos + size; pos < last; pos += 4) { uint32_t tmp = pos >> 31; -- cgit From d84a430d9f7b1ce6baedf1305106d0ae706aca76 Mon Sep 17 00:00:00 2001 From: Jonathan Kim Date: Tue, 17 Mar 2020 15:43:41 -0400 Subject: drm/amdgpu: fix race between pstate and remote buffer map Vega20 arbitrates pstate at hive level and not device level. Last peer to remote buffer unmap could drop P-State while another process is still remote buffer mapped. With this fix, P-States still needs to be disabled for now as SMU bug was discovered on synchronous P2P transfers. This should be fixed in the next FW update. Signed-off-by: Jonathan Kim Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9da5fc805d00..ad74fd88fa76 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2248,7 +2248,8 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) if (gpu_instance->adev->flags & AMD_IS_APU) continue; - r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0); + r = amdgpu_xgmi_set_pstate(gpu_instance->adev, + AMDGPU_XGMI_PSTATE_MIN); if (r) { DRM_ERROR("pstate setting failed (%d).\n", r); break; -- cgit From d69b8971e540ae908a4b7ba44aa11ecb42a2c406 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Fri, 17 Apr 2020 16:11:48 -0400 Subject: drm/amdgpu: Print CU information by default during initialization This is convenient for multiple teams to obtain the information. Also, add device info by using dev_info(). Signed-off-by: Yong Zhao Reviewed-by: Dennis Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ad74fd88fa76..889e68851504 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3168,7 +3168,8 @@ fence_driver_init: goto failed; } - DRM_DEBUG("SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", + dev_info(adev->dev, + "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", adev->gfx.config.max_shader_engines, adev->gfx.config.max_sh_per_se, adev->gfx.config.max_cu_per_sh, -- cgit From e05185b34157ba606bd2200bcc3c335cf19422ae Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 20 Apr 2020 23:08:14 +0800 Subject: drm/amdgpu: clean up unused variable about ring lru MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit clean up unused variable: 1. ring_lru_list 2. ring_lru_list_lock related-commit: drm/amdgpu: remove ring lru handling Signed-off-by: Kevin Wang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 889e68851504..c239035cdf14 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2988,9 +2988,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, INIT_LIST_HEAD(&adev->shadow_list); mutex_init(&adev->shadow_list_lock); - INIT_LIST_HEAD(&adev->ring_lru_list); - spin_lock_init(&adev->ring_lru_list_lock); - INIT_DELAYED_WORK(&adev->delayed_init_work, amdgpu_device_delayed_init_work_handler); INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, -- cgit From a2f63ee8b5eabda8b317425d1e487e51a4d7b21e Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Thu, 16 Apr 2020 12:15:31 +0800 Subject: drm/amdgpu: correct fbdev suspend on gpu reset As for XGMI setup, it needs to be performed on all the devices from the same hive. Signed-off-by: Evan Quan Acked-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c239035cdf14..f66a33410ee8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4224,7 +4224,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, */ amdgpu_unregister_gpu_instance(tmp_adev); - amdgpu_fbdev_set_suspend(adev, 1); + amdgpu_fbdev_set_suspend(tmp_adev, 1); /* disable ras on ALL IPs */ if (!(in_ras_intr && !use_baco) && -- cgit From 52fb44cf30fc6b11a2faf8d8d905e756ea24f919 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Thu, 16 Apr 2020 12:20:38 +0800 Subject: drm/amdgpu: correct cancel_delayed_work_sync on gpu reset As for XGMI setup, it should be performed on other devices from the hive also. Signed-off-by: Evan Quan Reviewed-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f66a33410ee8..4be5187391c6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4218,6 +4218,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, amdgpu_amdkfd_pre_reset(tmp_adev); } + cancel_delayed_work_sync(&tmp_adev->delayed_init_work); + /* * Mark these ASICs to be reseted as untracked first * And add them back after reset completed -- cgit From 9e94d22c008585815f32630ee7d0d28c4ec12bb7 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Thu, 16 Apr 2020 12:27:28 +0800 Subject: drm/amdgpu: optimize the gpu reset for XGMI setup V2 This is basically just some code cosmetic. The current design for XGMI setup gput reset is to operate on current device(adev) first and then on other devices from the hive(by another 'for' loop). But actually we can do some sort to the device list(to put current device 1st position) and handle all the devices in a single 'for' loop. V2: added missing hive->hive_lock protection Signed-off-by: Evan Quan Reviewed-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 76 ++++++++++-------------------- 1 file changed, 25 insertions(+), 51 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 4be5187391c6..e75deb789e56 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4152,16 +4152,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, } need_full_reset = job_signaled = false; - INIT_LIST_HEAD(&device_list); - - amdgpu_ras_set_error_query_ready(adev, false); dev_info(adev->dev, "GPU %s begin!\n", (in_ras_intr && !use_baco) ? "jobs stop":"reset"); - cancel_delayed_work_sync(&adev->delayed_init_work); - - hive = amdgpu_get_xgmi_hive(adev, false); + hive = amdgpu_get_xgmi_hive(adev, true); /* * Here we trylock to avoid chain of resets executing from @@ -4174,35 +4169,21 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, if (hive && !mutex_trylock(&hive->reset_lock)) { DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", job ? job->base.id : -1, hive->hive_id); + mutex_unlock(&hive->hive_lock); return 0; } - /* Start with adev pre asic reset first for soft reset check.*/ - if (!amdgpu_device_lock_adev(adev, !hive)) { - DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress", - job ? job->base.id : -1); - return 0; - } - - /* Block kfd: SRIOV would do it separately */ - if (!amdgpu_sriov_vf(adev)) - amdgpu_amdkfd_pre_reset(adev); - - /* Build list of devices to reset */ - if (adev->gmc.xgmi.num_physical_nodes > 1) { - if (!hive) { - /*unlock kfd: SRIOV would do it separately */ - if (!amdgpu_sriov_vf(adev)) - amdgpu_amdkfd_post_reset(adev); - amdgpu_device_unlock_adev(adev); + /* + * Build list of devices to reset. + * In case we are in XGMI hive mode, resort the device list + * to put adev in the 1st position. + */ + INIT_LIST_HEAD(&device_list); + if (adev->gmc.xgmi.num_physical_nodes > 1) { + if (!hive) return -ENODEV; - } - - /* - * In case we are in XGMI hive mode device reset is done for all the - * nodes in the hive to retrain all XGMI links and hence the reset - * sequence is executed in loop on all nodes. - */ + if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list)) + list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list); device_list_handle = &hive->device_list; } else { list_add_tail(&adev->gmc.xgmi.head, &device_list); @@ -4211,15 +4192,20 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, /* block all schedulers and reset given job's ring */ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - if (tmp_adev != adev) { - amdgpu_ras_set_error_query_ready(tmp_adev, false); - amdgpu_device_lock_adev(tmp_adev, false); - if (!amdgpu_sriov_vf(tmp_adev)) - amdgpu_amdkfd_pre_reset(tmp_adev); + if (!amdgpu_device_lock_adev(tmp_adev, !hive)) { + DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress", + job ? job->base.id : -1); + mutex_unlock(&hive->hive_lock); + return 0; } + amdgpu_ras_set_error_query_ready(tmp_adev, false); + cancel_delayed_work_sync(&tmp_adev->delayed_init_work); + if (!amdgpu_sriov_vf(tmp_adev)) + amdgpu_amdkfd_pre_reset(tmp_adev); + /* * Mark these ASICs to be reseted as untracked first * And add them back after reset completed @@ -4265,22 +4251,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, goto skip_hw_reset; } - - /* Guilty job will be freed after this*/ - r = amdgpu_device_pre_asic_reset(adev, job, &need_full_reset); - if (r) { - /*TODO Should we stop ?*/ - DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ", - r, adev->ddev->unique); - adev->asic_reset_res = r; - } - retry: /* Rest of adevs pre asic reset from XGMI hive. */ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - - if (tmp_adev == adev) - continue; - r = amdgpu_device_pre_asic_reset(tmp_adev, NULL, &need_full_reset); @@ -4345,8 +4317,10 @@ skip_sched_resume: amdgpu_device_unlock_adev(tmp_adev); } - if (hive) + if (hive) { mutex_unlock(&hive->reset_lock); + mutex_unlock(&hive->hive_lock); + } if (r) dev_info(adev->dev, "GPU reset end with ret = %d\n", r); -- cgit From 7dd8c205eaedfa3163c307143aaf29b65190e54a Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Thu, 16 Apr 2020 12:39:04 +0800 Subject: drm/amdgpu: code cleanup around gpu reset Make code more readable. Signed-off-by: Evan Quan Reviewed-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e75deb789e56..1e4527c64279 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4130,7 +4130,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job) { struct list_head device_list, *device_list_handle = NULL; - bool need_full_reset, job_signaled; + bool need_full_reset = false; + bool job_signaled = false; struct amdgpu_hive_info *hive = NULL; struct amdgpu_device *tmp_adev = NULL; int i, r = 0; @@ -4151,13 +4152,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, emergency_restart(); } - need_full_reset = job_signaled = false; - dev_info(adev->dev, "GPU %s begin!\n", (in_ras_intr && !use_baco) ? "jobs stop":"reset"); - hive = amdgpu_get_xgmi_hive(adev, true); - /* * Here we trylock to avoid chain of resets executing from * either trigger by jobs on different adevs in XGMI hive or jobs on @@ -4165,7 +4162,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * We always reset all schedulers for device and all devices for XGMI * hive so that should take care of them too. */ - + hive = amdgpu_get_xgmi_hive(adev, true); if (hive && !mutex_trylock(&hive->reset_lock)) { DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", job ? job->base.id : -1, hive->hive_id); @@ -4232,7 +4229,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, } } - if (in_ras_intr && !use_baco) goto skip_sched_resume; @@ -4243,10 +4239,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * job->base holds a reference to parent fence */ if (job && job->base.s_fence->parent && - dma_fence_is_signaled(job->base.s_fence->parent)) + dma_fence_is_signaled(job->base.s_fence->parent)) { job_signaled = true; - - if (job_signaled) { dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); goto skip_hw_reset; } -- cgit From a891d239f9e036031f9f1c62fe584232662cb7f1 Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Wed, 22 Apr 2020 12:22:54 +0800 Subject: drm/amdgpu: set error query ready after all IPs late init If set error query ready in amdgpu_ras_late_init, which will cause some IP blocks aren't initialized, but their error query is ready. Signed-off-by: Dennis Li Reviewed-by: Guchun Chen Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1e4527c64279..f9b315e7e004 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2216,6 +2216,8 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) adev->ip_blocks[i].status.late_initialized = true; } + amdgpu_ras_set_error_query_ready(adev, true); + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); -- cgit