diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-10-02 12:47:25 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-10-02 12:47:25 -0700 |
commit | 58809f614e0e3f4e12b489bddf680bfeb31c0a20 (patch) | |
tree | 6b1468e6c1fbed9e04b0701ae49b634add62f794 /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | |
parent | 05a54fa773284d1a7923cdfdd8f0c8dabb98bd26 (diff) | |
parent | b2ec5ca9d5c2c019e2316f7ba447596d1dcd8fde (diff) |
Merge tag 'drm-next-2025-10-01' of https://gitlab.freedesktop.org/drm/kernel
Pull drm updates from Dave Airlie:
"cross-subsystem:
- i2c-hid: Make elan touch controllers power on after panel is
enabled
- dt bindings for STM32MP25 SoC
- pci vgaarb: use screen_info helpers
- rust pin-init updates
- add MEI driver for late binding firmware update/load
uapi:
- add ioctl for reassigning GEM handles
- provide boot_display attribute on boot-up devices
core:
- document DRM_MODE_PAGE_FLIP_EVENT
- add vendor specific recovery method to drm device wedged uevent
gem:
- Simplify gpuvm locking
ttm:
- add interface to populate buffers
sched:
- Fix race condition in trace code
atomic:
- Reallow no-op async page flips
display:
- dp: Fix command length
video:
- Improve pixel-format handling for struct screen_info
rust:
- drop Opaque<> from ioctl args
- Alloc:
- BorrowedPage type and AsPageIter traits
- Implement Vmalloc::to_page() and VmallocPageIter
- DMA/Scatterlist:
- Add dma::DataDirection and type alias for dma_addr_t
- Abstraction for struct scatterlist and sg_table
- DRM:
- simplify use of generics
- add DriverFile type alias
- drop Object::SIZE
- Rust:
- pin-init tree merge
- Various methods for AsBytes and FromBytes traits
gpuvm:
- Support madvice in Xe driver
gpusvm:
- fix hmm_pfn_to_map_order usage in gpusvm
bridge:
- Improve and fix ref counting on bridge management
- cdns-dsi: Various improvements to mode setting
- Support Solomon SSD2825 plus DT bindings
- Support Waveshare DSI2DPI plus DT bindings
- Support Content Protection property
- display-connector: Improve DP display detection
- Add support for Radxa Ra620 plus DT bindings
- adv7511: Provide SPD and HDMI infoframes
- it6505: Replace crypto_shash with sha()
- synopsys: Add support for DW DPTX Controller plus DT bindings
- adv7511: Write full Audio infoframe
- ite6263: Support vendor-specific infoframes
- simple: Add support for Realtek RTD2171 DP-to-HDMI plus DT bindings
panel:
- panel-edp: Support mt8189 Chromebooks; Support BOE NV140WUM-N64;
Support SHP LQ134Z1; Fixes
- panel-simple: Support Olimex LCD-OLinuXino-5CTS plus DT bindings
- Support Samsung AMS561RA01
- Support Hydis HV101HD1 plus DT bindings
- ilitek-ili9881c: Refactor mode setting; Add support for Bestar
BSD1218-A101KL68 LCD plus DT bindings
- lvds: Add support for Ampire AMP19201200B5TZQW-T03 to DT bindings
- edp: Add support for additonal mt8189 Chromebook panels
- lvds: Add DT bindings for EDT ETML0700Z8DHA
amdgpu:
- add CRIU support for gem objects
- RAS updates
- VCN SRAM load fixes
- EDID read fixes
- eDP ALPM support
- Documentation updates
- Rework PTE flag generation
- DCE6 fixes
- VCN devcoredump cleanup
- MMHUB client id fixes
- VCN 5.0.1 RAS support
- SMU 13.0.x updates
- Expanded PCIe DPC support
- Expanded VCN reset support
- VPE per queue reset support
- give kernel jobs unique id for tracing
- pre-populate exported buffers
- cyan skillfish updates
- make vbios build number available in sysfs
- userq updates
- HDCP updates
- support MMIO remap page as ttm pool
- JPEG parser updates
- DCE6 DC updates
- use devm for i2c buses
- GPUVM locking updates
- Drop non-DC DCE11 code
- improve fallback handling for pixel encoding
amdkfd:
- SVM/page migration fixes
- debugfs fixes
- add CRIO support for gem objects
- SVM updates
radeon:
- use dev_warn_once in CS parsers
xe:
- add madvise interface
- add DRM_IOCTL_XE_VM_QUERY_MEMORY_RANGE_ATTRS to query VMA count
and memory attributes
- drop L# bank mask reporting from media GT3 on Xe3+.
- add SLPC power_profile sysfs interface
- add configs attribs to add post/mid context-switch commands
- handle firmware reported hardware errors notifying userspace with
device wedged uevent
- use same dir structure across sysfs/debugfs
- cleanup and future proof vram region init
- add G-states and PCI link states to debugfs
- Add SRIOV support for CCS surfaces on Xe2+
- Enable SRIOV PF mode by default on supported platforms
- move flush to common code
- extended core workarounds for Xe2/3
- use DRM scheduler for delayed GT TLB invalidations
- configs improvements and allow VF device enablement
- prep work to expose mmio regions to userspace
- VF migration support added
- prepare GPU SVM for THP migration
- start fixing XE_PAGE_SIZE vs PAGE_SIZE
- add PSMI support for hw validation
- resize VF bars to max possible size according to number of VFs
- Ensure GT is in C0 during resume
- pre-populate exported buffers
- replace xe_hmm with gpusvm
- add more SVM GT stats to debugfs
- improve fake pci and WA kunnit handle for new platform testing
- Test GuC to GuC comms to add debugging
- use attribute groups to simplify sysfs registration
- add Late Binding firmware code to interact with MEI
i915:
- apply multiple JSL/EHL/Gen7/Gen6 workarounds properly
- protect against overflow in active_engine()
- Use try_cmpxchg64() in __active_lookup()
- include GuC registers in error state
- get rid of dev->struct_mutex
- iopoll: generalize read_poll_timout
- lots more display refactoring
- Reject HBR3 in any eDP Panel
- Prune modes for YUV420
- Display Wa fix, additions, and updates
- DP: Fix 2.7 Gbps link training on g4x
- DP: Adjust the idle pattern handling
- DP: Shuffle the link training code a bit
- Don't set/read the DSI C clock divider on GLK
- Enable_psr kernel parameter changes
- Type-C enabled/disconnected dp-alt sink
- Wildcat Lake enabling
- DP HDR updates
- DRAM detection
- wait PSR idle on dsb commit
- Remove FBC modulo 4 restriction for ADL-P+
- panic: refactor framebuffer allocation
habanalabs:
- debug/visibility improvements
- vmalloc-backed coherent mmap support
- HLDIO infrastructure
nova-core:
- various register!() macro improvements
- minor vbios/firmware fixes/refactoring
- advance firmware boot stages; process Booter and patch signatures
- process GSP and GSP bootloader
- Add r570.144 firmware bindings and update to it
- Move GSP boot code to own module
- Use new pin-init features to store driver's private data in a
single allocation
- Update ARef import from sync::aref
nova-drm:
- Update ARef import from sync::aref
tyr:
- initial driver skeleton for a rust driver for ARM Mali GPUs
- capable of powering up, query metadata and provide it to userspace.
msm:
- GPU and Core:
- in DT bindings describe clocks per GPU type
- GMU bandwidth voting for x1-85
- a623/a663 speedbins
- cleanup some remaining no-iommu leftovers after VM_BIND conversion
- fix GEM obj 32b size truncation
- add missing VM_BIND param validation
- IFPC for x1-85 and a750
- register xml and gen_header.py sync from mesa
- Display:
- add missing bindings for display on SC8180X
- added DisplayPort MST bindings
- conversion from round_rate() to determine_rate()
amdxdna:
- add IOCTL_AMDXDNA_GET_ARRAY
- support user space allocated buffers
- streamline PM interfaces
- Refactoring wrt. hardware contexts
- improve error reporting
nouveau:
- use GSP firmware by default
- improve error reporting
- Pre-populate exported buffers
ast:
- Clean up detection of DRAM config
exynos:
- add DSIM bridge driver support for Exynos7870
- Document Exynos7870 DSIM compatible in dt-binding
panthor:
- Print task/pid on errors
- Add support for Mali G710, G510, G310, Gx15, Gx20, Gx25
- Improve cache flushing
- Fail VM bind if BO has offset
renesas:
- convert to RUNTIME_PM_OPS
rcar-du:
- Make number of lanes configurable
- Use RUNTIME_PM_OPS
- Add support for DSI commands
rocket:
- Add driver for Rockchip NPU plus DT bindings
- Use kfree() and sizeof() correctly
- Test DMA status
rockchip:
- dsi2: Add support for RK3576 plus DT bindings
- Add support for RK3588 DPTX output
tidss:
- Use crtc_ fields for programming display mode
- Remove other drivers from aperture
pixpaper:
- Add support for Mayqueen Pixpaper plus DT bindings
v3d:
- Support querying nubmer of GPU resets for KHR_robustness
stm:
- Clean up logging
- ltdc: Add support support for STM32MP257F-EV1 plus DT bindings
sitronix:
- st7571-i2c: Add support for inverted displays and 2-bit grayscale
tidss:
- Convert to kernel's FIELD_ macros
vesadrm:
- Support 8-bit palette mode
imagination:
- Improve power management
- Add support for TH1520 GPU
- Support Risc-V architectures
v3d:
- Improve job management and locking
vkms:
- Support variants of ARGB8888, ARGB16161616, RGB565, RGB888 and P01x
- Spport YUV with 16-bit components"
* tag 'drm-next-2025-10-01' of https://gitlab.freedesktop.org/drm/kernel: (1455 commits)
drm/amd: Add name to modes from amdgpu_connector_add_common_modes()
drm/amd: Drop some common modes from amdgpu_connector_add_common_modes()
drm/amdgpu: update MODULE_PARM_DESC for freesync_video
drm/amd: Use dynamic array size declaration for amdgpu_connector_add_common_modes()
drm/amd/display: Share dce100_validate_global with DCE6-8
drm/amd/display: Share dce100_validate_bandwidth with DCE6-8
drm/amdgpu: Fix fence signaling race condition in userqueue
amd/amdkfd: enhance kfd process check in switch partition
amd/amdkfd: resolve a race in amdgpu_amdkfd_device_fini_sw
drm/amd/display: Reject modes with too high pixel clock on DCE6-10
drm/amd: Drop unnecessary check in amdgpu_connector_add_common_modes()
drm/amd/display: Only enable common modes for eDP and LVDS
drm/amdgpu: remove the redeclaration of variable i
drm/amdgpu/userq: assign an error code for invalid userq va
drm/amdgpu: revert "rework reserved VMID handling" v2
drm/amdgpu: remove leftover from enforcing isolation by VMID
drm/amdgpu: Add fallback to pipe reset if KCQ ring reset fails
accel/habanalabs: add Infineon version check
accel/habanalabs/gaudi2: read preboot status after recovering from dirty state
accel/habanalabs: add HL_GET_P_STATE passthrough type
...
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 275 |
1 files changed, 227 insertions, 48 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c8459337fcb8..a77000c2e0bb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -95,6 +95,7 @@ MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); +MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin"); #define AMDGPU_RESUME_MS 2000 #define AMDGPU_MAX_RETRY_LIMIT 2 @@ -178,6 +179,8 @@ struct amdgpu_init_level amdgpu_init_minimal_xgmi = { BIT(AMD_IP_BLOCK_TYPE_PSP) }; +static void amdgpu_device_load_switch_state(struct amdgpu_device *adev); + static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, enum amd_ip_block_type block) { @@ -2445,6 +2448,33 @@ int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, return 1; } +static const char *ip_block_names[] = { + [AMD_IP_BLOCK_TYPE_COMMON] = "common", + [AMD_IP_BLOCK_TYPE_GMC] = "gmc", + [AMD_IP_BLOCK_TYPE_IH] = "ih", + [AMD_IP_BLOCK_TYPE_SMC] = "smu", + [AMD_IP_BLOCK_TYPE_PSP] = "psp", + [AMD_IP_BLOCK_TYPE_DCE] = "dce", + [AMD_IP_BLOCK_TYPE_GFX] = "gfx", + [AMD_IP_BLOCK_TYPE_SDMA] = "sdma", + [AMD_IP_BLOCK_TYPE_UVD] = "uvd", + [AMD_IP_BLOCK_TYPE_VCE] = "vce", + [AMD_IP_BLOCK_TYPE_ACP] = "acp", + [AMD_IP_BLOCK_TYPE_VCN] = "vcn", + [AMD_IP_BLOCK_TYPE_MES] = "mes", + [AMD_IP_BLOCK_TYPE_JPEG] = "jpeg", + [AMD_IP_BLOCK_TYPE_VPE] = "vpe", + [AMD_IP_BLOCK_TYPE_UMSCH_MM] = "umsch_mm", + [AMD_IP_BLOCK_TYPE_ISP] = "isp", +}; + +static const char *ip_block_name(struct amdgpu_device *adev, enum amd_ip_block_type type) +{ + int idx = (int)type; + + return idx < ARRAY_SIZE(ip_block_names) ? ip_block_names[idx] : "unknown"; +} + /** * amdgpu_device_ip_block_add * @@ -2473,8 +2503,13 @@ int amdgpu_device_ip_block_add(struct amdgpu_device *adev, break; } - dev_info(adev->dev, "detected ip block number %d <%s>\n", - adev->num_ip_blocks, ip_block_version->funcs->name); + dev_info(adev->dev, "detected ip block number %d <%s_v%d_%d_%d> (%s)\n", + adev->num_ip_blocks, + ip_block_name(adev, ip_block_version->type), + ip_block_version->major, + ip_block_version->minor, + ip_block_version->rev, + ip_block_version->funcs->name); adev->ip_blocks[adev->num_ip_blocks].adev = adev; @@ -2595,6 +2630,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) return 0; chip_name = "navi12"; break; + case CHIP_CYAN_SKILLFISH: + chip_name = "cyan_skillfish"; + break; } err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, @@ -2674,6 +2712,24 @@ out: return err; } +static void amdgpu_uid_init(struct amdgpu_device *adev) +{ + /* Initialize the UID for the device */ + adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL); + if (!adev->uid_info) { + dev_warn(adev->dev, "Failed to allocate memory for UID\n"); + return; + } + adev->uid_info->adev = adev; +} + +static void amdgpu_uid_fini(struct amdgpu_device *adev) +{ + /* Free the UID memory */ + kfree(adev->uid_info); + adev->uid_info = NULL; +} + /** * amdgpu_device_ip_early_init - run early init for hardware IPs * @@ -2857,6 +2913,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) if (adev->gmc.xgmi.supported) amdgpu_xgmi_early_init(adev); + if (amdgpu_is_multi_aid(adev)) + amdgpu_uid_init(adev); ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); if (ip_block->status.valid != false) amdgpu_amdkfd_device_probe(adev); @@ -3389,7 +3447,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void) for (i = 0; i < mgpu_info.num_dgpu; i++) { gpu_ins = &(mgpu_info.gpu_ins[i]); adev = gpu_ins->adev; - if (!(adev->flags & AMD_IS_APU) && + if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) && !gpu_ins->mgpu_fan_enabled) { ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); if (ret) @@ -3648,6 +3706,7 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) } amdgpu_ras_fini(adev); + amdgpu_uid_fini(adev); return 0; } @@ -4992,7 +5051,8 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev) adev->reset_domain = NULL; kfree(adev->pci_state); - + kfree(adev->pcie_reset_ctx.swds_pcistate); + kfree(adev->pcie_reset_ctx.swus_pcistate); } /** @@ -5012,6 +5072,10 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev) if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) return 0; + /* No need to evict when going to S5 through S4 callbacks */ + if (system_state == SYSTEM_POWER_OFF) + return 0; + ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); if (ret) { dev_warn(adev->dev, "evicting device resources failed\n"); @@ -5697,7 +5761,7 @@ int amdgpu_device_link_reset(struct amdgpu_device *adev) dev_info(adev->dev, "GPU link reset\n"); - if (!adev->pcie_reset_ctx.occurs_dpc) + if (!amdgpu_reset_in_dpc(adev)) ret = amdgpu_dpm_link_reset(adev); if (ret) @@ -5826,6 +5890,7 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) amdgpu_set_init_level(tmp_adev, init_level); if (full_reset) { /* post card */ + amdgpu_reset_set_dpc_status(tmp_adev, false); amdgpu_ras_clear_err_state(tmp_adev); r = amdgpu_device_asic_init(tmp_adev); if (r) { @@ -6132,12 +6197,11 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle) return ret; } -static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev, +static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev, struct list_head *device_list, struct amdgpu_hive_info *hive) { struct amdgpu_device *tmp_adev = NULL; - int r; /* * Build list of devices to reset. @@ -6149,7 +6213,7 @@ static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev, list_add_tail(&tmp_adev->reset_list, device_list); if (adev->shutdown) tmp_adev->shutdown = true; - if (adev->pcie_reset_ctx.occurs_dpc) + if (amdgpu_reset_in_dpc(adev)) tmp_adev->pcie_reset_ctx.in_link_reset = true; } if (!list_is_first(&adev->reset_list, device_list)) @@ -6157,14 +6221,6 @@ static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev, } else { list_add_tail(&adev->reset_list, device_list); } - - if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) { - r = amdgpu_device_health_check(device_list); - if (r) - return r; - } - - return 0; } static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, @@ -6233,9 +6289,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev, drm_client_dev_suspend(adev_to_drm(tmp_adev), false); /* disable ras on ALL IPs */ - if (!need_emergency_restart && - (!adev->pcie_reset_ctx.occurs_dpc) && - amdgpu_device_ip_need_full_reset(tmp_adev)) + if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) && + amdgpu_device_ip_need_full_reset(tmp_adev)) amdgpu_ras_suspend(tmp_adev); for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { @@ -6263,11 +6318,7 @@ static int amdgpu_device_asic_reset(struct amdgpu_device *adev, retry: /* Rest of adevs pre asic reset from XGMI hive. */ list_for_each_entry(tmp_adev, device_list, reset_list) { - if (adev->pcie_reset_ctx.occurs_dpc) - tmp_adev->no_hw_access = true; r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); - if (adev->pcie_reset_ctx.occurs_dpc) - tmp_adev->no_hw_access = false; /*TODO Should we stop ?*/ if (r) { dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", @@ -6445,8 +6496,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, emergency_restart(); } - dev_info(adev->dev, "GPU %s begin!\n", - need_emergency_restart ? "jobs stop":"reset"); + dev_info(adev->dev, "GPU %s begin!. Source: %d\n", + need_emergency_restart ? "jobs stop" : "reset", + reset_context->src); if (!amdgpu_sriov_vf(adev)) hive = amdgpu_get_xgmi_hive(adev); @@ -6457,8 +6509,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, reset_context->hive = hive; INIT_LIST_HEAD(&device_list); - if (amdgpu_device_recovery_prepare(adev, &device_list, hive)) - goto end_reset; + amdgpu_device_recovery_prepare(adev, &device_list, hive); + + if (!amdgpu_sriov_vf(adev)) { + r = amdgpu_device_health_check(&device_list); + if (r) + goto end_reset; + } /* We need to lock reset domain only once both for XGMI and single device */ amdgpu_device_recovery_get_reset_lock(adev, &device_list); @@ -6880,17 +6937,13 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta { struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + struct amdgpu_hive_info *hive __free(xgmi_put_hive) = + amdgpu_get_xgmi_hive(adev); struct amdgpu_reset_context reset_context; struct list_head device_list; dev_info(adev->dev, "PCI error: detected callback!!\n"); - if (!amdgpu_dpm_is_link_reset_supported(adev)) { - dev_warn(adev->dev, "No support for XGMI hive yet...\n"); - return PCI_ERS_RESULT_DISCONNECT; - } - adev->pci_channel_state = state; switch (state) { @@ -6900,10 +6953,23 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta case pci_channel_io_frozen: /* Fatal error, prepare for slot reset */ dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); + if (hive) { + /* Hive devices should be able to support FW based + * link reset on other devices, if not return. + */ + if (!amdgpu_dpm_is_link_reset_supported(adev)) { + dev_warn(adev->dev, + "No support for XGMI hive yet...\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + /* Set dpc status only if device is part of hive + * Non-hive devices should be able to recover after + * link reset. + */ + amdgpu_reset_set_dpc_status(adev, true); - if (hive) mutex_lock(&hive->hive_lock); - adev->pcie_reset_ctx.occurs_dpc = true; + } memset(&reset_context, 0, sizeof(reset_context)); INIT_LIST_HEAD(&device_list); @@ -6911,10 +6977,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta amdgpu_device_recovery_get_reset_lock(adev, &device_list); amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, hive, false); - if (hive) { + if (hive) mutex_unlock(&hive->hive_lock); - amdgpu_put_xgmi_hive(hive); - } return PCI_ERS_RESULT_NEED_RESET; case pci_channel_io_perm_failure: /* Permanent error, prepare for device removal */ @@ -6962,22 +7026,34 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) struct amdgpu_device *tmp_adev; struct amdgpu_hive_info *hive; struct list_head device_list; - int r = 0, i; + struct pci_dev *link_dev; + int r = 0, i, timeout; u32 memsize; - - /* PCI error slot reset should be skipped During RAS recovery */ - if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || - amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && - amdgpu_ras_in_recovery(adev)) - return PCI_ERS_RESULT_RECOVERED; + u16 status; dev_info(adev->dev, "PCI error: slot reset callback!!\n"); memset(&reset_context, 0, sizeof(reset_context)); - /* wait for asic to come out of reset */ - msleep(700); + if (adev->pcie_reset_ctx.swus) + link_dev = adev->pcie_reset_ctx.swus; + else + link_dev = adev->pdev; + /* wait for asic to come out of reset, timeout = 10s */ + timeout = 10000; + do { + usleep_range(10000, 10500); + r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status); + timeout -= 10; + } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) && + (status != PCI_VENDOR_ID_AMD)); + if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) { + r = -ETIME; + goto out; + } + + amdgpu_device_load_switch_state(adev); /* Restore PCI confspace */ amdgpu_device_load_pci_state(pdev); @@ -7072,7 +7148,6 @@ void amdgpu_pci_resume(struct pci_dev *pdev) amdgpu_device_sched_resume(&device_list, NULL, NULL); amdgpu_device_gpu_resume(adev, &device_list, false); amdgpu_device_recovery_put_reset_lock(adev, &device_list); - adev->pcie_reset_ctx.occurs_dpc = false; if (hive) { mutex_unlock(&hive->hive_lock); @@ -7080,6 +7155,58 @@ void amdgpu_pci_resume(struct pci_dev *pdev) } } +static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev) +{ + struct pci_dev *parent = pci_upstream_bridge(adev->pdev); + int r; + + if (!parent || parent->vendor != PCI_VENDOR_ID_ATI) + return; + + /* If already saved, return */ + if (adev->pcie_reset_ctx.swus) + return; + /* Upstream bridge is ATI, assume it's SWUS/DS architecture */ + r = pci_save_state(parent); + if (r) + return; + adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(parent); + + parent = pci_upstream_bridge(parent); + r = pci_save_state(parent); + if (r) + return; + adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(parent); + + adev->pcie_reset_ctx.swus = parent; +} + +static void amdgpu_device_load_switch_state(struct amdgpu_device *adev) +{ + struct pci_dev *pdev; + int r; + + if (!adev->pcie_reset_ctx.swds_pcistate || + !adev->pcie_reset_ctx.swus_pcistate) + return; + + pdev = adev->pcie_reset_ctx.swus; + r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate); + if (!r) { + pci_restore_state(pdev); + } else { + dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r); + return; + } + + pdev = pci_upstream_bridge(adev->pdev); + r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate); + if (!r) + pci_restore_state(pdev); + else + dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r); +} + bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) { struct drm_device *dev = pci_get_drvdata(pdev); @@ -7104,6 +7231,8 @@ bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) return false; } + amdgpu_device_cache_switch_state(adev); + return true; } @@ -7490,3 +7619,53 @@ ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) size += sysfs_emit_at(buf, size, "\n"); return size; } + +void amdgpu_device_set_uid(struct amdgpu_uid *uid_info, + enum amdgpu_uid_type type, uint8_t inst, + uint64_t uid) +{ + if (!uid_info) + return; + + if (type >= AMDGPU_UID_TYPE_MAX) { + dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n", + type); + return; + } + + if (inst >= AMDGPU_UID_INST_MAX) { + dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n", + inst); + return; + } + + if (uid_info->uid[type][inst] != 0) { + dev_warn_once( + uid_info->adev->dev, + "Overwriting existing UID %llu for type %d instance %d\n", + uid_info->uid[type][inst], type, inst); + } + + uid_info->uid[type][inst] = uid; +} + +u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info, + enum amdgpu_uid_type type, uint8_t inst) +{ + if (!uid_info) + return 0; + + if (type >= AMDGPU_UID_TYPE_MAX) { + dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n", + type); + return 0; + } + + if (inst >= AMDGPU_UID_INST_MAX) { + dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n", + inst); + return 0; + } + + return uid_info->uid[type][inst]; +} |