From 04893397766a2b2f1bc7fe5c6414e4c0846ed171 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Mon, 20 Jan 2025 22:00:22 -0500 Subject: drm/amdgpu: Skip err_count sysfs creation on VF unsupported RAS blocks VFs are not able to query error counts for all RAS blocks. Rather than returning error for queries on these blocks, skip sysfs the creation all together. Signed-off-by: Victor Skvortsov Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index 0af469ec6fcc..2056efaf157d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -1246,7 +1246,8 @@ amdgpu_ras_block_to_sriov(struct amdgpu_device *adev, enum amdgpu_ras_block bloc case AMDGPU_RAS_BLOCK__MPIO: return RAS_TELEMETRY_GPU_BLOCK_MPIO; default: - dev_err(adev->dev, "Unsupported SRIOV RAS telemetry block 0x%x\n", block); + DRM_WARN_ONCE("Unsupported SRIOV RAS telemetry block 0x%x\n", + block); return RAS_TELEMETRY_GPU_BLOCK_COUNT; } } @@ -1331,3 +1332,17 @@ int amdgpu_virt_ras_telemetry_post_reset(struct amdgpu_device *adev) return 0; } + +bool amdgpu_virt_ras_telemetry_block_en(struct amdgpu_device *adev, + enum amdgpu_ras_block block) +{ + enum amd_sriov_ras_telemetry_gpu_block sriov_block; + + sriov_block = amdgpu_ras_block_to_sriov(adev, block); + + if (sriov_block >= RAS_TELEMETRY_GPU_BLOCK_COUNT || + !amdgpu_sriov_ras_telemetry_block_en(adev, sriov_block)) + return false; + + return true; +} -- cgit From dc0297f3198bd60108ccbd167ee5d9fa4af31ed0 Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Fri, 14 Feb 2025 14:22:17 +0530 Subject: drm/amdgpu: Replace Mutex with Spinlock for RLCG register access to avoid Priority Inversion in SRIOV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RLCG Register Access is a way for virtual functions to safely access GPU registers in a virtualized environment., including TLB flushes and register reads. When multiple threads or VFs try to access the same registers simultaneously, it can lead to race conditions. By using the RLCG interface, the driver can serialize access to the registers. This means that only one thread can access the registers at a time, preventing conflicts and ensuring that operations are performed correctly. Additionally, when a low-priority task holds a mutex that a high-priority task needs, ie., If a thread holding a spinlock tries to acquire a mutex, it can lead to priority inversion. register access in amdgpu_virt_rlcg_reg_rw especially in a fast code path is critical. The call stack shows that the function amdgpu_virt_rlcg_reg_rw is being called, which attempts to acquire the mutex. This function is invoked from amdgpu_sriov_wreg, which in turn is called from gmc_v11_0_flush_gpu_tlb. The [ BUG: Invalid wait context ] indicates that a thread is trying to acquire a mutex while it is in a context that does not allow it to sleep (like holding a spinlock). Fixes the below: [ 253.013423] ============================= [ 253.013434] [ BUG: Invalid wait context ] [ 253.013446] 6.12.0-amdstaging-drm-next-lol-050225 #14 Tainted: G U OE [ 253.013464] ----------------------------- [ 253.013475] kworker/0:1/10 is trying to lock: [ 253.013487] ffff9f30542e3cf8 (&adev->virt.rlcg_reg_lock){+.+.}-{3:3}, at: amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu] [ 253.013815] other info that might help us debug this: [ 253.013827] context-{4:4} [ 253.013835] 3 locks held by kworker/0:1/10: [ 253.013847] #0: ffff9f3040050f58 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x3f5/0x680 [ 253.013877] #1: ffffb789c008be40 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_one_work+0x1d6/0x680 [ 253.013905] #2: ffff9f3054281838 (&adev->gmc.invalidate_lock){+.+.}-{2:2}, at: gmc_v11_0_flush_gpu_tlb+0x198/0x4f0 [amdgpu] [ 253.014154] stack backtrace: [ 253.014164] CPU: 0 UID: 0 PID: 10 Comm: kworker/0:1 Tainted: G U OE 6.12.0-amdstaging-drm-next-lol-050225 #14 [ 253.014189] Tainted: [U]=USER, [O]=OOT_MODULE, [E]=UNSIGNED_MODULE [ 253.014203] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 11/18/2024 [ 253.014224] Workqueue: events work_for_cpu_fn [ 253.014241] Call Trace: [ 253.014250] [ 253.014260] dump_stack_lvl+0x9b/0xf0 [ 253.014275] dump_stack+0x10/0x20 [ 253.014287] __lock_acquire+0xa47/0x2810 [ 253.014303] ? srso_alias_return_thunk+0x5/0xfbef5 [ 253.014321] lock_acquire+0xd1/0x300 [ 253.014333] ? amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu] [ 253.014562] ? __lock_acquire+0xa6b/0x2810 [ 253.014578] __mutex_lock+0x85/0xe20 [ 253.014591] ? amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu] [ 253.014782] ? sched_clock_noinstr+0x9/0x10 [ 253.014795] ? srso_alias_return_thunk+0x5/0xfbef5 [ 253.014808] ? local_clock_noinstr+0xe/0xc0 [ 253.014822] ? amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu] [ 253.015012] ? srso_alias_return_thunk+0x5/0xfbef5 [ 253.015029] mutex_lock_nested+0x1b/0x30 [ 253.015044] ? mutex_lock_nested+0x1b/0x30 [ 253.015057] amdgpu_virt_rlcg_reg_rw+0xf6/0x330 [amdgpu] [ 253.015249] amdgpu_sriov_wreg+0xc5/0xd0 [amdgpu] [ 253.015435] gmc_v11_0_flush_gpu_tlb+0x44b/0x4f0 [amdgpu] [ 253.015667] gfx_v11_0_hw_init+0x499/0x29c0 [amdgpu] [ 253.015901] ? __pfx_smu_v13_0_update_pcie_parameters+0x10/0x10 [amdgpu] [ 253.016159] ? srso_alias_return_thunk+0x5/0xfbef5 [ 253.016173] ? smu_hw_init+0x18d/0x300 [amdgpu] [ 253.016403] amdgpu_device_init+0x29ad/0x36a0 [amdgpu] [ 253.016614] amdgpu_driver_load_kms+0x1a/0xc0 [amdgpu] [ 253.017057] amdgpu_pci_probe+0x1c2/0x660 [amdgpu] [ 253.017493] local_pci_probe+0x4b/0xb0 [ 253.017746] work_for_cpu_fn+0x1a/0x30 [ 253.017995] process_one_work+0x21e/0x680 [ 253.018248] worker_thread+0x190/0x330 [ 253.018500] ? __pfx_worker_thread+0x10/0x10 [ 253.018746] kthread+0xe7/0x120 [ 253.018988] ? __pfx_kthread+0x10/0x10 [ 253.019231] ret_from_fork+0x3c/0x60 [ 253.019468] ? __pfx_kthread+0x10/0x10 [ 253.019701] ret_from_fork_asm+0x1a/0x30 [ 253.019939] v2: s/spin_trylock/spin_lock_irqsave to be safe (Christian). Fixes: e864180ee49b ("drm/amdgpu: Add lock around VF RLCG interface") Cc: lin cao Cc: Jingwen Chen Cc: Victor Skvortsov Cc: Zhigang Luo Cc: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Suggested-by: Alex Deucher Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index 2056efaf157d..e6f0152e5b08 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -1017,6 +1017,7 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 f void *scratch_reg2; void *scratch_reg3; void *spare_int; + unsigned long flags; if (!adev->gfx.rlc.rlcg_reg_access_supported) { dev_err(adev->dev, @@ -1038,7 +1039,7 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 f scratch_reg2 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg2; scratch_reg3 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg3; - mutex_lock(&adev->virt.rlcg_reg_lock); + spin_lock_irqsave(&adev->virt.rlcg_reg_lock, flags); if (reg_access_ctrl->spare_int) spare_int = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->spare_int; @@ -1097,7 +1098,7 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 f ret = readl(scratch_reg0); - mutex_unlock(&adev->virt.rlcg_reg_lock); + spin_unlock_irqrestore(&adev->virt.rlcg_reg_lock, flags); return ret; } -- cgit