From aa5fc4362fac9351557eb27c745579159a2e4520 Mon Sep 17 00:00:00 2001 From: Liu01 Tong Date: Mon, 11 Aug 2025 14:52:37 +0800 Subject: drm/amdgpu: fix task hang from failed job submission during process kill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During process kill, drm_sched_entity_flush() will kill the vm entities. The following job submissions of this process will fail, and the resources of these jobs have not been released, nor have the fences been signalled, causing tasks to hang and timeout. Fix by check entity status in amdgpu_vm_ready() and avoid submit jobs to stopped entity. v2: add amdgpu_vm_ready() check before amdgpu_vm_clear_freed() in function amdgpu_cs_vm_handling(). Fixes: 1f02f2044bda ("drm/amdgpu: Avoid extra evict-restore process.") Signed-off-by: Liu01 Tong Signed-off-by: Lin.Cao Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit f101c13a8720c73e67f8f9d511fbbeda95bcedb1) --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 5cacf5717016..0b87798daebd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -654,11 +654,10 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, * Check if all VM PDs/PTs are ready for updates * * Returns: - * True if VM is not evicting. + * True if VM is not evicting and all VM entities are not stopped */ bool amdgpu_vm_ready(struct amdgpu_vm *vm) { - bool empty; bool ret; amdgpu_vm_eviction_lock(vm); @@ -666,10 +665,18 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm) amdgpu_vm_eviction_unlock(vm); spin_lock(&vm->status_lock); - empty = list_empty(&vm->evicted); + ret &= list_empty(&vm->evicted); spin_unlock(&vm->status_lock); - return ret && empty; + spin_lock(&vm->immediate.lock); + ret &= !vm->immediate.stopped; + spin_unlock(&vm->immediate.lock); + + spin_lock(&vm->delayed.lock); + ret &= !vm->delayed.stopped; + spin_unlock(&vm->delayed.lock); + + return ret; } /** -- cgit From 3eb61d7cb74cea2ea697363669fa256937164758 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 15 Jul 2025 10:26:22 +0200 Subject: Revert "drm/amdgpu: Use dma_buf from GEM object instance" This reverts commit 515986100d176663d0a03219a3056e4252f729e6. The dma_buf field in struct drm_gem_object is not stable over the object instance's lifetime. The field becomes NULL when user space releases the final GEM handle on the buffer object. This resulted in a NULL-pointer deref. Workarounds in commit 5307dce878d4 ("drm/gem: Acquire references on GEM handles for framebuffers") and commit f6bfc9afc751 ("drm/framebuffer: Acquire internal references on GEM handles") only solved the problem partially. They especially don't work for buffer objects without a DRM framebuffer associated. Hence, this revert to going back to using .import_attach->dmabuf. Signed-off-by: Thomas Zimmermann Reviewed-by: Simona Vetter Acked-by: Alex Deucher Link: https://lore.kernel.org/r/20250715082635.34974-1-tzimmermann@suse.de --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 5cacf5717016..8777ed8facd6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -1276,7 +1276,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, struct drm_gem_object *obj = &bo->tbo.base; if (drm_gem_is_imported(obj) && bo_va->is_xgmi) { - struct dma_buf *dma_buf = obj->dma_buf; + struct dma_buf *dma_buf = obj->import_attach->dmabuf; struct drm_gem_object *gobj = dma_buf->priv; struct amdgpu_bo *abo = gem_to_amdgpu_bo(gobj); -- cgit