From 36255b5f619594504e9d76de23b58e99c5e08ceb Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Tue, 20 Apr 2021 10:05:44 -0400 Subject: drm/amdgpu: address remove from fault filter Add interface to remove address from fault filter ring by resetting fault ring entry key, then future vm fault on the address will be processed to recover. Define fault key as atomic64_t type to use atomic read/set/cmpxchg key to protect fault ring access by interrupt handler and interrupt deferred work for vg20. Change fault->timestamp to 48-bit to share same uint64_t with 8-bit fault->next, it is enough for 48bit IH timestamp. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 48 ++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index c39ed9eb0987..dfa67c2255f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc) mc->agp_size >> 20, mc->agp_start, mc->agp_end); } +/** + * amdgpu_gmc_fault_key - get hask key from vm fault address and pasid + * + * @addr: 48 bit physical address, page aligned (36 significant bits) + * @pasid: 16 bit process address space identifier + */ +static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, uint16_t pasid) +{ + return addr << 4 | pasid; +} + /** * amdgpu_gmc_filter_faults - filter VM faults * @@ -348,8 +359,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr, uint16_t pasid, uint64_t timestamp) { struct amdgpu_gmc *gmc = &adev->gmc; - - uint64_t stamp, key = addr << 4 | pasid; + uint64_t stamp, key = amdgpu_gmc_fault_key(addr, pasid); struct amdgpu_gmc_fault *fault; uint32_t hash; @@ -365,7 +375,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr, while (fault->timestamp >= stamp) { uint64_t tmp; - if (fault->key == key) + if (atomic64_read(&fault->key) == key) return true; tmp = fault->timestamp; @@ -378,7 +388,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr, /* Add the fault to the ring */ fault = &gmc->fault_ring[gmc->last_fault]; - fault->key = key; + atomic64_set(&fault->key, key); fault->timestamp = timestamp; /* And update the hash */ @@ -387,6 +397,36 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr, return false; } +/** + * amdgpu_gmc_filter_faults_remove - remove address from VM faults filter + * + * @adev: amdgpu device structure + * @addr: address of the VM fault + * @pasid: PASID of the process causing the fault + * + * Remove the address from fault filter, then future vm fault on this address + * will pass to retry fault handler to recover. + */ +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr, + uint16_t pasid) +{ + struct amdgpu_gmc *gmc = &adev->gmc; + uint64_t key = amdgpu_gmc_fault_key(addr, pasid); + struct amdgpu_gmc_fault *fault; + uint32_t hash; + uint64_t tmp; + + hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER); + fault = &gmc->fault_ring[gmc->fault_hash[hash].idx]; + do { + if (atomic64_cmpxchg(&fault->key, key, 0) == key) + break; + + tmp = fault->timestamp; + fault = &gmc->fault_ring[fault->next]; + } while (fault->timestamp < tmp); +} + int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev) { int r; -- cgit From 7c63694eb97f4edcb20829b4fcdf74a3db016dda Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Wed, 28 Apr 2021 22:20:00 +0800 Subject: drm/amdgpu: init/fini hdp v4_0 ras invoke hdp v4_0 ras init in gmc late_init phase while ras fini in gmc sw_fini phase Signed-off-by: Hawking Zhang Reviewed-by: John Clements Reviewed-by: Dennis Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index dfa67c2255f2..697ab26c9434 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -455,6 +455,13 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev) return r; } + if (adev->hdp.ras_funcs && + adev->hdp.ras_funcs->ras_late_init) { + r = adev->hdp.ras_funcs->ras_late_init(adev); + if (r) + return r; + } + return 0; } @@ -471,6 +478,10 @@ void amdgpu_gmc_ras_fini(struct amdgpu_device *adev) if (adev->gmc.xgmi.ras_funcs && adev->gmc.xgmi.ras_funcs->ras_fini) adev->gmc.xgmi.ras_funcs->ras_fini(adev); + + if (adev->hdp.ras_funcs && + adev->hdp.ras_funcs->ras_fini) + adev->hdp.ras_funcs->ras_fini(adev); } /* -- cgit From 9adaac6eb488ff4e3bd1ef28cbab44956285deaf Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Wed, 28 Apr 2021 23:10:36 +0800 Subject: drm/amdgpu: switch to mmhub ras callback for ras fini invoke callback function for mmhub ras fini Signed-off-by: Hawking Zhang Reviewed-by: John Clements Reviewed-by: Dennis Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index 697ab26c9434..a129ecc73869 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -473,7 +473,7 @@ void amdgpu_gmc_ras_fini(struct amdgpu_device *adev) if (adev->mmhub.ras_funcs && adev->mmhub.ras_funcs->ras_fini) - amdgpu_mmhub_ras_fini(adev); + adev->mmhub.ras_funcs->ras_fini(adev); if (adev->gmc.xgmi.ras_funcs && adev->gmc.xgmi.ras_funcs->ras_fini) -- cgit