From 35963cf2cd25eeea8bdb4d02853dac1e66fb13a0 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 30 Oct 2023 20:24:35 -0700 Subject: drm/sched: Add drm_sched_wqueue_* helpers Add scheduler wqueue ready, stop, and start helpers to hide the implementation details of the scheduler from the drivers. v2: - s/sched_wqueue/sched_wqueue (Luben) - Remove the extra white line after the return-statement (Luben) - update drm_sched_wqueue_ready comment (Luben) Cc: Luben Tuikov Signed-off-by: Matthew Brost Reviewed-by: Luben Tuikov Link: https://lore.kernel.org/r/20231031032439.1558703-2-matthew.brost@intel.com Signed-off-by: Luben Tuikov --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 19b539cab7fe..a7f7afcb2bf0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4601,7 +4601,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev) for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = adev->rings[i]; - if (!ring || !ring->sched.thread) + if (!ring || !drm_sched_wqueue_ready(&ring->sched)) continue; spin_lock(&ring->sched.job_list_lock); @@ -4740,7 +4740,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = adev->rings[i]; - if (!ring || !ring->sched.thread) + if (!ring || !drm_sched_wqueue_ready(&ring->sched)) continue; /* Clear job fence from fence drv to avoid force_completion @@ -5282,7 +5282,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = tmp_adev->rings[i]; - if (!ring || !ring->sched.thread) + if (!ring || !drm_sched_wqueue_ready(&ring->sched)) continue; drm_sched_stop(&ring->sched, job ? &job->base : NULL); @@ -5357,7 +5357,7 @@ skip_hw_reset: for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = tmp_adev->rings[i]; - if (!ring || !ring->sched.thread) + if (!ring || !drm_sched_wqueue_ready(&ring->sched)) continue; drm_sched_start(&ring->sched, true); @@ -5683,7 +5683,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = adev->rings[i]; - if (!ring || !ring->sched.thread) + if (!ring || !drm_sched_wqueue_ready(&ring->sched)) continue; drm_sched_stop(&ring->sched, NULL); @@ -5811,7 +5811,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev) for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = adev->rings[i]; - if (!ring || !ring->sched.thread) + if (!ring || !drm_sched_wqueue_ready(&ring->sched)) continue; drm_sched_start(&ring->sched, true); -- cgit From a6149f0393699308fb00149be913044977bceb56 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 30 Oct 2023 20:24:36 -0700 Subject: drm/sched: Convert drm scheduler to use a work queue rather than kthread In Xe, the new Intel GPU driver, a choice has made to have a 1 to 1 mapping between a drm_gpu_scheduler and drm_sched_entity. At first this seems a bit odd but let us explain the reasoning below. 1. In Xe the submission order from multiple drm_sched_entity is not guaranteed to be the same completion even if targeting the same hardware engine. This is because in Xe we have a firmware scheduler, the GuC, which allowed to reorder, timeslice, and preempt submissions. If a using shared drm_gpu_scheduler across multiple drm_sched_entity, the TDR falls apart as the TDR expects submission order == completion order. Using a dedicated drm_gpu_scheduler per drm_sched_entity solve this problem. 2. In Xe submissions are done via programming a ring buffer (circular buffer), a drm_gpu_scheduler provides a limit on number of jobs, if the limit of number jobs is set to RING_SIZE / MAX_SIZE_PER_JOB we get flow control on the ring for free. A problem with this design is currently a drm_gpu_scheduler uses a kthread for submission / job cleanup. This doesn't scale if a large number of drm_gpu_scheduler are used. To work around the scaling issue, use a worker rather than kthread for submission / job cleanup. v2: - (Rob Clark) Fix msm build - Pass in run work queue v3: - (Boris) don't have loop in worker v4: - (Tvrtko) break out submit ready, stop, start helpers into own patch v5: - (Boris) default to ordered work queue v6: - (Luben / checkpatch) fix alignment in msm_ringbuffer.c - (Luben) s/drm_sched_submit_queue/drm_sched_wqueue_enqueue - (Luben) Update comment for drm_sched_wqueue_enqueue - (Luben) Positive check for submit_wq in drm_sched_init - (Luben) s/alloc_submit_wq/own_submit_wq v7: - (Luben) s/drm_sched_wqueue_enqueue/drm_sched_run_job_queue v8: - (Luben) Adjust var names / comments Signed-off-by: Matthew Brost Reviewed-by: Luben Tuikov Link: https://lore.kernel.org/r/20231031032439.1558703-3-matthew.brost@intel.com Signed-off-by: Luben Tuikov --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a7f7afcb2bf0..203dfcea82b1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2279,7 +2279,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) break; } - r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, + r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, DRM_SCHED_PRIORITY_COUNT, ring->num_hw_submission, 0, timeout, adev->reset_domain->wq, -- cgit From c8031019dc95e3ab7cc0b09f1894c5f52dc0c187 Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Tue, 10 Oct 2023 07:35:06 -0700 Subject: drm/amdgpu: Implement a new 64bit sequence memory driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Developed a new driver which allocates a 64bit memory on each request in sequence order. At the moment, user queue fence memory is the main consumer of this seq64 driver. v2: Worked on review comments from Christian for the following modifications - Move driver name from "semaphore" to "seq64" - Remove unnecessary PT/PD mapping - Move enable_mes check into init/fini functions. v3: Worked on review comments from Christian - drop enable_mes check - use DECLARE_BITMAP for bit array - added kerneldoc for seq64 v4: Worked on review comments from Christian - Rename amdgpu_seq64_get name with amdgpu_seq64_alloc v5: Worked on review comments from Christian - Fix seq64 lockdep warning - move fpriv->seq64_va check into amdgpu_seq64_unmap() - make the function amdgpu_seq64_unmap() return as void. - reserve the buffers as not interruptible. v6: port to drm_exec (Alex) v7: disable for now (Arun) Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7eeaf0aa7f81..151f8b950b72 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2676,6 +2676,12 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) goto init_failed; } } + + r = amdgpu_seq64_init(adev); + if (r) { + DRM_ERROR("allocate seq64 failed %d\n", r); + goto init_failed; + } } } @@ -3138,6 +3144,7 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) amdgpu_device_wb_fini(adev); amdgpu_device_mem_scratch_fini(adev); amdgpu_ib_pool_fini(adev); + amdgpu_seq64_fini(adev); } r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); -- cgit From 466a7d115326ece682c2b60d1c77d1d0b9010b4f Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 10 Nov 2023 16:34:51 -0600 Subject: drm/amd: Use the first non-dGPU PCI device for BW limits When bandwidth limits are looked up using pcie_bandwidth_available() virtual links such as USB4 are analyzed which might not represent the real speed. Furthermore devices may change speeds autonomously which may introduce conditional variation to the results reported in the status registers. Instead look at the capabilities of first PCI device outside of dGPU to decide upper limits that the dGPU will work at. For eGPU this effectively means that it will use the speed of the link partner. Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2925#note_2145860 Link: https://www.usb.org/document-library/usb4r-specification-v20 USB4 V2 with Errata and ECN through June 2023 Section 11.2.1 Reviewed-by: Lijo Lazar Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 37 ++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 151f8b950b72..ec3bf949ee2f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5737,6 +5737,39 @@ recover_end: return r; } +/** + * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner + * + * @adev: amdgpu_device pointer + * @speed: pointer to the speed of the link + * @width: pointer to the width of the link + * + * Evaluate the hierarchy to find the speed and bandwidth capabilities of the + * first physical partner to an AMD dGPU. + * This will exclude any virtual switches and links. + */ +static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, + enum pci_bus_speed *speed, + enum pcie_link_width *width) +{ + struct pci_dev *parent = adev->pdev; + + if (!speed || !width) + return; + + *speed = PCI_SPEED_UNKNOWN; + *width = PCIE_LNK_WIDTH_UNKNOWN; + + while ((parent = pci_upstream_bridge(parent))) { + /* skip upstream/downstream switches internal to dGPU*/ + if (parent->vendor == PCI_VENDOR_ID_ATI) + continue; + *speed = pcie_get_speed_cap(parent); + *width = pcie_get_width_cap(parent); + break; + } +} + /** * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot * @@ -5770,8 +5803,8 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) return; - pcie_bandwidth_available(adev->pdev, NULL, - &platform_speed_cap, &platform_link_width); + amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, + &platform_link_width); if (adev->pm.pcie_gen_mask == 0) { /* asic caps */ -- cgit From d9b3a066dfcd3fe50b4dc561d8510c43c0ad8863 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 10 Nov 2023 16:34:52 -0600 Subject: drm/amd: Exclude dGPUs in eGPU enclosures from DPM quirks The PCIe speed capabilities advertised by a USB4 or TBT3 link are limited to PCIe gen 1 per the USB4 spec. In reality the speed will change dynamically based on fabric conditions and other traffic. DPM is disabled when dGPUs are connected directly to Intel hosts since the PCIe root port isn't able to handle dynamic speed switching. As this limitation is specifically for PCIe root ports in the SoC, don't apply it when connected to an eGPU enclosure connected to an Intel host. Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2885 Reviewed-by: Lijo Lazar Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ec3bf949ee2f..b5edf40b5d03 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1551,11 +1551,15 @@ bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 */ -static bool amdgpu_device_pcie_dynamic_switching_supported(void) +static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) { #if IS_ENABLED(CONFIG_X86) struct cpuinfo_x86 *c = &cpu_data(0); + /* eGPU change speeds based on USB4 fabric conditions */ + if (dev_is_removable(adev->dev)) + return true; + if (c->x86_vendor == X86_VENDOR_INTEL) return false; #endif @@ -2395,7 +2399,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) adev->pm.pp_feature &= ~PP_GFXOFF_MASK; if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; - if (!amdgpu_device_pcie_dynamic_switching_supported()) + if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; total = true; -- cgit From 2e9b152325f649923b9324fa8ea5f1a5289145bb Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Tue, 1 Aug 2023 10:37:41 -0400 Subject: drm/amdgpu: optimize RLC powerdown notification on Vangogh The smu needs to get the rlc power down message to sync the rlc state with smu, the rlc state updating message need to be sent at while smu begin suspend sequence , otherwise SMU will crash while RLC state is not notified by driver, and rlc state probally changed after that notification, so it needs to notify rlc state to smu at the end of the suspend sequence in amdgpu_device_suspend() that can make sure the rlc state is correctly set to SMU. [ 101.000590] amdgpu 0000:03:00.0: amdgpu: SMU: I'm not done with your previous command: SMN_C2PMSG_66:0x0000001E SMN_C2PMSG_82:0x00000000 [ 101.000598] amdgpu 0000:03:00.0: amdgpu: Failed to disable gfxoff! [ 110.838026] amdgpu 0000:03:00.0: amdgpu: SMU: I'm not done with your previous command: SMN_C2PMSG_66:0x0000001E SMN_C2PMSG_82:0x00000000 [ 110.838035] amdgpu 0000:03:00.0: amdgpu: Failed to disable smu features. [ 110.838039] amdgpu 0000:03:00.0: amdgpu: Fail to disable dpm features! [ 110.838040] [drm:amdgpu_device_ip_suspend_phase2 [amdgpu]] *ERROR* suspend of IP block failed -62 [ 110.884394] PM: suspend of devices aborted after 21213.620 msecs [ 110.884402] PM: start suspend of devices aborted after 21213.882 msecs [ 110.884405] PM: Some devices failed to suspend, or early wake event detected Reviewed-by: Yifan Zhang Signed-off-by: Perry Yuan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index b5edf40b5d03..5d3a83193115 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4549,6 +4549,10 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) if (amdgpu_sriov_vf(adev)) amdgpu_virt_release_full_gpu(adev, false); + r = amdgpu_dpm_notify_rlc_state(adev, false); + if (r) + return r; + return 0; } -- cgit From af39e6f4d8032b101907cc2ac12a21a778da568d Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Fri, 6 Oct 2023 13:49:03 +0530 Subject: drm/amdgpu: Add reg_state sysfs attribute Add reg_state attribute to fetch the register snapshot of different IPs like XGMI, WAFL,PCIE and USR. To get a snapshot for a particular IP 1) Open the sysfs file 2) Seek to the offset as defined in amdgpu_sysfs_reg_offset 3) Read Signed-off-by: Lijo Lazar Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 62 ++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5d3a83193115..f29d0faf956e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -162,6 +162,65 @@ static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, static DEVICE_ATTR(pcie_replay_count, 0444, amdgpu_device_get_pcie_replay_count, NULL); +static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t ppos, size_t count) +{ + struct device *dev = kobj_to_dev(kobj); + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(ddev); + ssize_t bytes_read; + + switch (ppos) { + case AMDGPU_SYS_REG_STATE_XGMI: + bytes_read = amdgpu_asic_get_reg_state( + adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); + break; + case AMDGPU_SYS_REG_STATE_WAFL: + bytes_read = amdgpu_asic_get_reg_state( + adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); + break; + case AMDGPU_SYS_REG_STATE_PCIE: + bytes_read = amdgpu_asic_get_reg_state( + adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); + break; + case AMDGPU_SYS_REG_STATE_USR: + bytes_read = amdgpu_asic_get_reg_state( + adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); + break; + case AMDGPU_SYS_REG_STATE_USR_1: + bytes_read = amdgpu_asic_get_reg_state( + adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); + break; + default: + return -EINVAL; + } + + return bytes_read; +} + +BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, + AMDGPU_SYS_REG_STATE_END); + +int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) +{ + int ret; + + if (!amdgpu_asic_get_reg_state_supported(adev)) + return 0; + + ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); + + return ret; +} + +void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) +{ + if (!amdgpu_asic_get_reg_state_supported(adev)) + return; + sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); +} + /** * DOC: board_info * @@ -4233,6 +4292,7 @@ fence_driver_init: "Could not create amdgpu board attributes\n"); amdgpu_fru_sysfs_init(adev); + amdgpu_reg_state_sysfs_init(adev); if (IS_ENABLED(CONFIG_PERF_EVENTS)) r = amdgpu_pmu_init(adev); @@ -4355,6 +4415,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); amdgpu_fru_sysfs_fini(adev); + amdgpu_reg_state_sysfs_fini(adev); + /* disable ras feature must before hw fini */ amdgpu_ras_pre_fini(adev); -- cgit From 8c4e9105b2a8ab4ac4e6eeb479951ba6a3b4e897 Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Tue, 1 Aug 2023 10:37:41 -0400 Subject: drm/amdgpu: optimize RLC powerdown notification on Vangogh The smu needs to get the rlc power down message to sync the rlc state with smu, the rlc state updating message need to be sent at while smu begin suspend sequence , otherwise SMU will crash while RLC state is not notified by driver, and rlc state probally changed after that notification, so it needs to notify rlc state to smu at the end of the suspend sequence in amdgpu_device_suspend() that can make sure the rlc state is correctly set to SMU. [ 101.000590] amdgpu 0000:03:00.0: amdgpu: SMU: I'm not done with your previous command: SMN_C2PMSG_66:0x0000001E SMN_C2PMSG_82:0x00000000 [ 101.000598] amdgpu 0000:03:00.0: amdgpu: Failed to disable gfxoff! [ 110.838026] amdgpu 0000:03:00.0: amdgpu: SMU: I'm not done with your previous command: SMN_C2PMSG_66:0x0000001E SMN_C2PMSG_82:0x00000000 [ 110.838035] amdgpu 0000:03:00.0: amdgpu: Failed to disable smu features. [ 110.838039] amdgpu 0000:03:00.0: amdgpu: Fail to disable dpm features! [ 110.838040] [drm:amdgpu_device_ip_suspend_phase2 [amdgpu]] *ERROR* suspend of IP block failed -62 [ 110.884394] PM: suspend of devices aborted after 21213.620 msecs [ 110.884402] PM: start suspend of devices aborted after 21213.882 msecs [ 110.884405] PM: Some devices failed to suspend, or early wake event detected Reviewed-by: Yifan Zhang Signed-off-by: Perry Yuan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7eeaf0aa7f81..5c0817cbc7c2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4538,6 +4538,10 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) if (amdgpu_sriov_vf(adev)) amdgpu_virt_release_full_gpu(adev, false); + r = amdgpu_dpm_notify_rlc_state(adev, false); + if (r) + return r; + return 0; } -- cgit From d6a57588666301acd9d42d3b00d74240964f07f6 Mon Sep 17 00:00:00 2001 From: Jiadong Zhu Date: Fri, 1 Dec 2023 08:38:15 +0800 Subject: drm/amdgpu: disable MCBP by default Disable MCBP(mid command buffer preemption) by default as old Mesa hangs with it. We shall not enable the feature that breaks old usermode driver. Fixes: 50a7c8765ca6 ("drm/amdgpu: enable mcbp by default on gfx9") Signed-off-by: Jiadong Zhu Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5c0817cbc7c2..1cc1ae2743c1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3791,10 +3791,6 @@ static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) adev->gfx.mcbp = true; else if (amdgpu_mcbp == 0) adev->gfx.mcbp = false; - else if ((amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 0, 0)) && - (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(10, 0, 0)) && - adev->gfx.num_gfx_rings) - adev->gfx.mcbp = true; if (amdgpu_sriov_vf(adev)) adev->gfx.mcbp = true; -- cgit From dab96d8b61aab1a4f99d0b86964a6c40e7bb1756 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 29 Nov 2023 15:44:25 -0500 Subject: drm/amdgpu: fix buffer funcs setting order on suspend We need to disable this after the last eviction call, but before we disable the SDMA IP. Fixes: b70438004a14 ("drm/amdgpu: move buffer funcs setting up a level") Link: https://lore.kernel.org/r/87edgv4x3i.fsf@vps.thesusis.net Reviewed-by: Luben Tuikov Tested-by: Phillip Susi Signed-off-by: Alex Deucher Cc: Phillip Susi Cc: Luben Tuikov --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1cc1ae2743c1..1f64d8cbb14d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4527,6 +4527,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) if (r) return r; + amdgpu_ttm_set_buffer_funcs_status(adev, false); + amdgpu_fence_driver_hw_fini(adev); amdgpu_device_ip_suspend_phase2(adev); -- cgit From 51ea405c47f833e55d19401b35b71100197e6d5d Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 11 Dec 2023 11:28:30 -0500 Subject: drm/amdgpu: fix buffer funcs setting order on suspend harder Part of commit c03581986234 ("drm/amdgpu: fix buffer funcs setting order on suspend") got dropped accidently. Add it back. Fixes: c03581986234 ("drm/amdgpu: fix buffer funcs setting order on suspend") Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 138c7b37af69..f58fb719292d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4589,8 +4589,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) amdgpu_ras_suspend(adev); - amdgpu_ttm_set_buffer_funcs_status(adev, false); - amdgpu_device_ip_suspend_phase1(adev); if (!adev->in_s0ix) -- cgit From ed342a2e78c4e4a8d82c2d19c95e8a3eb092c0d0 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Fri, 1 Dec 2023 17:13:46 +0530 Subject: drm/amdgpu: Use the right method to get IP version Replace direct usage of adev->ip_versions with amdgpu_ip_version. Signed-off-by: Lijo Lazar Reviewed-by: Hawking Zhang Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f58fb719292d..85ed0d66a029 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1599,7 +1599,7 @@ bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) if (adev->mman.keep_stolen_vga_memory) return false; - return adev->ip_versions[DCE_HWIP][0] >= IP_VERSION(3, 0, 0); + return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); } /* -- cgit From fb915c87edc2c99bbde148a62bfa97a2c6d991bb Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 20 Dec 2023 12:36:08 -0500 Subject: drm/amdgpu: skip gpu_info fw loading on navi12 It's no longer required. Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2318 Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 85ed0d66a029..5bb444bb36ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2251,15 +2251,8 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) adev->firmware.gpu_info_fw = NULL; - if (adev->mman.discovery_bin) { - /* - * FIXME: The bounding box is still needed by Navi12, so - * temporarily read it from gpu_info firmware. Should be dropped - * when DAL no longer needs it. - */ - if (adev->asic_type != CHIP_NAVI12) - return 0; - } + if (adev->mman.discovery_bin) + return 0; switch (adev->asic_type) { default: -- cgit From 8a44fdd3cf91debbd09b43bd2519ad2b2486ccf4 Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Thu, 21 Dec 2023 18:13:11 +0530 Subject: drm/amdgpu: Release 'adev->pm.fw' before return in 'amdgpu_device_need_post()' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In function 'amdgpu_device_need_post(struct amdgpu_device *adev)' - 'adev->pm.fw' may not be released before return. Using the function release_firmware() to release adev->pm.fw. Thus fixing the below: drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:1571 amdgpu_device_need_post() warn: 'adev->pm.fw' from request_firmware() not released on lines: 1554. Cc: Monk Liu Cc: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Suggested-by: Lijo Lazar Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5bb444bb36ce..ecbc58269951 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1544,6 +1544,7 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev) return true; fw_ver = *((uint32_t *)adev->pm.fw->data + 69); + release_firmware(adev->pm.fw); if (fw_ver < 0x00160e00) return true; } -- cgit From fb1c93c2e9604a884467a773790016199f78ca08 Mon Sep 17 00:00:00 2001 From: Christian König Date: Wed, 10 Jan 2024 15:19:29 +0100 Subject: drm/amdgpu: revert "Adjust removal control flow for smu v13_0_2" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calling amdgpu_device_ip_resume_phase1() during shutdown leaves the HW in an active state and is an unbalanced use of the IP callbacks. Using the IP callbacks like this can lead to memory leaks, double free and imbalanced reference counters. Leaving the HW in an active state can lead to DMA accesses to memory now freed by the driver. Both is a complete no-go for driver unload so completely revert the workaround for now. This reverts commit f5c7e7797060255dbc8160734ccc5ad6183c5e04. Signed-off-by: Christian König Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 32 +----------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ecbc58269951..b158d27d0a71 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5246,7 +5246,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, struct amdgpu_device *tmp_adev = NULL; bool need_full_reset, skip_hw_reset, vram_lost = false; int r = 0; - bool gpu_reset_for_dev_remove = 0; /* Try reset handler method first */ tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, @@ -5266,10 +5265,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); - gpu_reset_for_dev_remove = - test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && - test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); - /* * ASIC reset has to be done on all XGMI hive nodes ASAP * to allow proper links negotiation in FW (within 1 sec) @@ -5312,18 +5307,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, amdgpu_ras_intr_cleared(); } - /* Since the mode1 reset affects base ip blocks, the - * phase1 ip blocks need to be resumed. Otherwise there - * will be a BIOS signature error and the psp bootloader - * can't load kdb on the next amdgpu install. - */ - if (gpu_reset_for_dev_remove) { - list_for_each_entry(tmp_adev, device_list_handle, reset_list) - amdgpu_device_ip_resume_phase1(tmp_adev); - - goto end; - } - list_for_each_entry(tmp_adev, device_list_handle, reset_list) { if (need_full_reset) { /* post card */ @@ -5560,11 +5543,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, int i, r = 0; bool need_emergency_restart = false; bool audio_suspended = false; - bool gpu_reset_for_dev_remove = false; - - gpu_reset_for_dev_remove = - test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && - test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); /* * Special case: RAS triggered and full reset isn't supported @@ -5602,7 +5580,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { list_add_tail(&tmp_adev->reset_list, &device_list); - if (gpu_reset_for_dev_remove && adev->shutdown) + if (adev->shutdown) tmp_adev->shutdown = true; } if (!list_is_first(&adev->reset_list, &device_list)) @@ -5687,10 +5665,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, retry: /* Rest of adevs pre asic reset from XGMI hive. */ list_for_each_entry(tmp_adev, device_list_handle, reset_list) { - if (gpu_reset_for_dev_remove) { - /* Workaroud for ASICs need to disable SMC first */ - amdgpu_device_smu_fini_early(tmp_adev); - } r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); /*TODO Should we stop ?*/ if (r) { @@ -5722,9 +5696,6 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ r = amdgpu_do_asic_reset(device_list_handle, reset_context); if (r && r == -EAGAIN) goto retry; - - if (!r && gpu_reset_for_dev_remove) - goto recover_end; } skip_hw_reset: @@ -5780,7 +5751,6 @@ skip_sched_resume: amdgpu_ras_set_error_query_ready(tmp_adev, true); } -recover_end: tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, reset_list); amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); -- cgit