From 67a44e659888569a133a8f858c8230e9d7aad1d5 Mon Sep 17 00:00:00 2001 From: Ernst Sjöstrand Date: Thu, 2 Sep 2021 09:50:27 +0200 Subject: drm/amd/amdgpu: Increase HWIP_MAX_INSTANCE to 10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Seems like newer cards can have even more instances now. Found by UBSAN: array-index-out-of-bounds in drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c:318:29 index 8 is out of range for type 'uint32_t *[8]' Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1697 Cc: stable@vger.kernel.org Signed-off-by: Ernst Sjöstrand Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index dc3c6b3a00e5..d356e329e6f8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -758,7 +758,7 @@ enum amd_hw_ip_block_type { MAX_HWIP }; -#define HWIP_MAX_INSTANCE 8 +#define HWIP_MAX_INSTANCE 10 struct amd_powerplay { void *pp_handle; -- cgit From 81d1bf01e4820962d6ea218ff5b9719e81e5812d Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 20 Jul 2021 14:53:37 -0400 Subject: drm/amdgpu: add debugfs access to the IP discovery table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Useful for debugging and new asic validation. Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index d356e329e6f8..cdb963b9bea0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -830,6 +830,7 @@ struct amdgpu_device { struct notifier_block acpi_nb; struct amdgpu_i2c_chan *i2c_bus[AMDGPU_MAX_I2C_BUS]; struct debugfs_blob_wrapper debugfs_vbios_blob; + struct debugfs_blob_wrapper debugfs_discovery_blob; struct mutex srbm_mutex; /* GRBM index mutex. Protects concurrent access to GRBM index */ struct mutex grbm_idx_mutex; -- cgit From 5f52e9a78061cbced92ed5c64d70f342f5c9b68c Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 20 Jul 2021 16:01:41 -0400 Subject: drm/amdgpu: store HW IP versions in the driver structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So we can check the IP versions directly rather than using asic type. Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index cdb963b9bea0..61ef0d81a957 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -760,6 +760,9 @@ enum amd_hw_ip_block_type { #define HWIP_MAX_INSTANCE 10 +#define HW_ID_MAX 300 +#define IP_VERSION(mj, mn, rv) (((mj) << 16) | ((mn) << 8) | (rv)) + struct amd_powerplay { void *pp_handle; const struct amd_pm_funcs *pp_funcs; @@ -1090,6 +1093,7 @@ struct amdgpu_device { struct pci_saved_state *pci_state; struct amdgpu_reset_control *reset_cntl; + uint32_t ip_versions[HW_ID_MAX]; }; static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) -- cgit From 1534db5549b77a10e242d0c72cdc867b33761343 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 26 Jul 2021 15:27:26 -0400 Subject: drm/amdgpu: add XGMI HWIP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So we can track grab the appropriate XGMI info out of the IP discovery table. Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 61ef0d81a957..8df4be702870 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -755,6 +755,7 @@ enum amd_hw_ip_block_type { CLK_HWIP, UMC_HWIP, RSMU_HWIP, + XGMI_HWIP, MAX_HWIP }; -- cgit From 5f931489556d61018da014cd5edb4dff3cf66742 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 30 Jul 2021 12:44:07 -0400 Subject: drm/amdgpu: add DCI HWIP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So we can track grab the appropriate DCE info out of the IP discovery table. This is a separare IP from DCN. Acked-by: Harry Wentland Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 8df4be702870..815db33190ca 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -756,6 +756,7 @@ enum amd_hw_ip_block_type { UMC_HWIP, RSMU_HWIP, XGMI_HWIP, + DCI_HWIP, MAX_HWIP }; -- cgit From 5eceb2019215fe38a9ce972193203d66f1d66f95 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 9 Aug 2021 12:18:08 -0400 Subject: drm/amdgpu: add VCN1 hardware IP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So we can store the VCN IP revision for each instance of VCN. Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 815db33190ca..b153c3740307 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -744,6 +744,7 @@ enum amd_hw_ip_block_type { UVD_HWIP, VCN_HWIP = UVD_HWIP, JPEG_HWIP = VCN_HWIP, + VCN1_HWIP, VCE_HWIP, DF_HWIP, DCE_HWIP, -- cgit From 1d789535a03679e5ce0b56a0d32a5e44596dfcdb Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 4 Oct 2021 15:19:10 -0400 Subject: drm/amdgpu: convert IP version array to include instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow us to query instances versions more cleanly. Instancing support is not consistent unfortunately. SDMA is a good example. Sienna cichlid has 4 total SDMA instances, each enumerated separately (HWIDs 42, 43, 68, 69). Arcturus has 8 total SDMA instances, but they are enumerated as multiple instances of the same HWIDs (4x HWID 42, 4x HWID 43). UMC is another example. On most chips there are multiple instances with the same HWID. This allows us to support both forms. v2: rebase v3: clarify instancing support Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index b153c3740307..f4bceb2624fb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1096,7 +1096,7 @@ struct amdgpu_device { struct pci_saved_state *pci_state; struct amdgpu_reset_control *reset_cntl; - uint32_t ip_versions[HW_ID_MAX]; + uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE]; }; static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) -- cgit From c8365dbda056578eebe164bf110816b1a39b4b7f Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 30 Sep 2021 11:22:51 +0200 Subject: drm/amdgpu: revert "Add autodump debugfs node for gpu reset v8" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 728e7e0cd61899208e924472b9e641dbeb0775c4. Further discussion reveals that this feature is severely broken and needs to be reverted ASAP. GPU reset can never be delayed by userspace even for debugging or otherwise we can run into in kernel deadlocks. Signed-off-by: Christian König Acked-by: Alex Deucher Acked-by: Nirmoy Das Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index f4bceb2624fb..eb1c117e82b4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1085,8 +1085,6 @@ struct amdgpu_device { char product_name[32]; char serial[20]; - struct amdgpu_autodump autodump; - atomic_t throttling_logging_enabled; struct ratelimit_state throttling_logging_rs; uint32_t ras_hw_enabled; -- cgit From e17e27f9bdba274b404454072302cf5ea2282e5d Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Fri, 1 Oct 2021 09:48:50 +0800 Subject: drm/amdgpu: handle the case of pci_channel_io_frozen only in amdgpu_pci_resume In current code, when a PCI error state pci_channel_io_normal is detectd, it will report PCI_ERS_RESULT_CAN_RECOVER status to PCI driver, and PCI driver will continue the execution of PCI resume callback report_resume by pci_walk_bridge, and the callback will go into amdgpu_pci_resume finally, where write lock is releasd unconditionally without acquiring such lock first. In this case, a deadlock will happen when other threads start to acquire the read lock. To fix this, add a member in amdgpu_device strucutre to cache pci_channel_state, and only continue the execution in amdgpu_pci_resume when it's pci_channel_io_frozen. Fixes: c9a6b82f45e2 ("drm/amdgpu: Implement DPC recovery") Suggested-by: Andrey Grodzovsky Signed-off-by: Guchun Chen Reviewed-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index eb1c117e82b4..d58e37fd01f4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1092,6 +1092,7 @@ struct amdgpu_device { bool no_hw_access; struct pci_saved_state *pci_state; + pci_channel_state_t pci_channel_state; struct amdgpu_reset_control *reset_cntl; uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE]; -- cgit