From df9c8d1aa278c435c30a69b8f2418b4a52fcb929 Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Wed, 8 Jul 2020 15:07:13 +0800 Subject: drm/amdgpu: fix system hang issue during GPU reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit when GPU hang, driver has multi-paths to enter amdgpu_device_gpu_recover, the atomic adev->in_gpu_reset and hive->in_reset are used to avoid re-entering GPU recovery. During GPU reset and resume, it is unsafe that other threads access GPU, which maybe cause GPU reset failed. Therefore the new rw_semaphore adev->reset_sem is introduced, which protect GPU from being accessed by external threads during recovery. v2: 1. add rwlock for some ioctls, debugfs and file-close function. 2. change to use dqm->is_resetting and dqm_lock for protection in kfd driver. 3. remove try_lock and change adev->in_gpu_reset as atomic, to avoid re-enter GPU recovery for the same GPU hang. v3: 1. change back to use adev->reset_sem to protect kfd callback functions, because dqm_lock couldn't protect all codes, for example: free_mqd must be called outside of dqm_lock; [ 1230.176199] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 05/23/2019 [ 1230.177221] Call Trace: [ 1230.178249] dump_stack+0x98/0xd5 [ 1230.179443] amdgpu_virt_kiq_reg_write_reg_wait+0x181/0x190 [amdgpu] [ 1230.180673] gmc_v9_0_flush_gpu_tlb+0xcc/0x310 [amdgpu] [ 1230.181882] amdgpu_gart_unbind+0xa9/0xe0 [amdgpu] [ 1230.183098] amdgpu_ttm_backend_unbind+0x46/0x180 [amdgpu] [ 1230.184239] ? ttm_bo_put+0x171/0x5f0 [ttm] [ 1230.185394] ttm_tt_unbind+0x21/0x40 [ttm] [ 1230.186558] ttm_tt_destroy.part.12+0x12/0x60 [ttm] [ 1230.187707] ttm_tt_destroy+0x13/0x20 [ttm] [ 1230.188832] ttm_bo_cleanup_memtype_use+0x36/0x80 [ttm] [ 1230.189979] ttm_bo_put+0x1be/0x5f0 [ttm] [ 1230.191230] amdgpu_bo_unref+0x1e/0x30 [amdgpu] [ 1230.192522] amdgpu_amdkfd_free_gtt_mem+0xaf/0x140 [amdgpu] [ 1230.193833] free_mqd+0x25/0x40 [amdgpu] [ 1230.195143] destroy_queue_cpsch+0x1a7/0x270 [amdgpu] [ 1230.196475] pqm_destroy_queue+0x105/0x260 [amdgpu] [ 1230.197819] kfd_ioctl_destroy_queue+0x37/0x70 [amdgpu] [ 1230.199154] kfd_ioctl+0x277/0x500 [amdgpu] [ 1230.200458] ? kfd_ioctl_get_clock_counters+0x60/0x60 [amdgpu] [ 1230.201656] ? tomoyo_file_ioctl+0x19/0x20 [ 1230.202831] ksys_ioctl+0x98/0xb0 [ 1230.204004] __x64_sys_ioctl+0x1a/0x20 [ 1230.205174] do_syscall_64+0x5f/0x250 [ 1230.206339] entry_SYSCALL_64_after_hwframe+0x49/0xbe 2. remove try_lock and introduce atomic hive->in_reset, to avoid re-enter GPU recovery. v4: 1. remove an unnecessary whitespace change in kfd_chardev.c 2. remove comment codes in amdgpu_device.c 3. add more detailed comment in commit message 4. define a wrap function amdgpu_in_reset v5: 1. Fix some style issues. Reviewed-by: Hawking Zhang Suggested-by: Andrey Grodzovsky Suggested-by: Christian König Suggested-by: Felix Kuehling Suggested-by: Lijo Lazar Suggested-by: Luben Tukov Signed-off-by: Dennis Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 7f9e50247413..73cc68ab53d0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -671,6 +671,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, bo_va = NULL; } + down_read(&adev->reset_sem); + switch (args->operation) { case AMDGPU_VA_OP_MAP: va_flags = amdgpu_gem_va_map_flags(adev, args->flags); @@ -700,6 +702,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, amdgpu_gem_va_update_vm(adev, &fpriv->vm, bo_va, args->operation); + up_read(&adev->reset_sem); + error_backoff: ttm_eu_backoff_reservation(&ticket, &list); -- cgit From f1403342ebdfcff3c3cf57ae476f19d3078f2767 Mon Sep 17 00:00:00 2001 From: Christian König Date: Wed, 12 Aug 2020 17:48:26 +0200 Subject: drm/amdgpu: revert "fix system hang issue during GPU reset" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The whole approach wasn't thought through till the end. We already had a reset lock like this in the past and it caused the same problems like this one. Completely revert the patch for now and add individual trylock protection to the hardware access functions as necessary. This reverts commit df9c8d1aa278c435c30a69b8f2418b4a52fcb929. Signed-off-by: Christian König Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 73cc68ab53d0..7f9e50247413 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -671,8 +671,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, bo_va = NULL; } - down_read(&adev->reset_sem); - switch (args->operation) { case AMDGPU_VA_OP_MAP: va_flags = amdgpu_gem_va_map_flags(adev, args->flags); @@ -702,8 +700,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, amdgpu_gem_va_update_vm(adev, &fpriv->vm, bo_va, args->operation); - up_read(&adev->reset_sem); - error_backoff: ttm_eu_backoff_reservation(&ticket, &list); -- cgit From 1348969ab68cb864034ec4fe48a86c157cc4e10d Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Mon, 24 Aug 2020 12:27:47 -0400 Subject: drm/amdgpu: drm_device to amdgpu_device by inline-f (v2) Get the amdgpu_device from the DRM device by use of an inline function, drm_to_adev(). The inline function resolves a pointer to struct drm_device to a pointer to struct amdgpu_device. v2: Use a typed visible static inline function instead of an invisible macro. Signed-off-by: Luben Tuikov Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 7f9e50247413..8371a8724087 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -217,7 +217,7 @@ out_unlock: int amdgpu_gem_create_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { - struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_device *adev = drm_to_adev(dev); struct amdgpu_fpriv *fpriv = filp->driver_priv; struct amdgpu_vm *vm = &fpriv->vm; union drm_amdgpu_gem_create *args = data; @@ -298,7 +298,7 @@ int amdgpu_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { struct ttm_operation_ctx ctx = { true, false }; - struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_device *adev = drm_to_adev(dev); struct drm_amdgpu_gem_userptr *args = data; struct drm_gem_object *gobj; struct amdgpu_bo *bo; @@ -587,7 +587,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, struct drm_amdgpu_gem_va *args = data; struct drm_gem_object *gobj; - struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_device *adev = drm_to_adev(dev); struct amdgpu_fpriv *fpriv = filp->driver_priv; struct amdgpu_bo *abo; struct amdgpu_bo_va *bo_va; @@ -711,7 +711,7 @@ error_unref: int amdgpu_gem_op_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { - struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_device *adev = drm_to_adev(dev); struct drm_amdgpu_gem_op *args = data; struct drm_gem_object *gobj; struct amdgpu_vm_bo_base *base; @@ -788,7 +788,7 @@ int amdgpu_mode_dumb_create(struct drm_file *file_priv, struct drm_device *dev, struct drm_mode_create_dumb *args) { - struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_device *adev = drm_to_adev(dev); struct drm_gem_object *gobj; uint32_t handle; u64 flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED | -- cgit From 4a580877bdcb837e7a3754ae20798dcfccb44e80 Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Mon, 24 Aug 2020 12:29:45 -0400 Subject: drm/amdgpu: Get DRM dev from adev by inline-f Add a static inline adev_to_drm() to obtain the DRM device pointer from an amdgpu_device pointer. Signed-off-by: Luben Tuikov Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 8371a8724087..f4c2e2e75b8f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -93,7 +93,7 @@ retry: void amdgpu_gem_force_release(struct amdgpu_device *adev) { - struct drm_device *ddev = adev->ddev; + struct drm_device *ddev = adev_to_drm(adev); struct drm_file *file; mutex_lock(&ddev->filelist_mutex); -- cgit