diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1718 |
1 files changed, 1195 insertions, 523 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7f354cd532dc..12201b8e99b3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -71,6 +71,7 @@ #include "amdgpu_xgmi.h" #include "amdgpu_ras.h" +#include "amdgpu_ras_mgr.h" #include "amdgpu_pmu.h" #include "amdgpu_fru_eeprom.h" #include "amdgpu_reset.h" @@ -85,6 +86,7 @@ #if IS_ENABLED(CONFIG_X86) #include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #endif MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); @@ -94,6 +96,7 @@ MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); +MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin"); #define AMDGPU_RESUME_MS 2000 #define AMDGPU_MAX_RETRY_LIMIT 2 @@ -177,6 +180,12 @@ struct amdgpu_init_level amdgpu_init_minimal_xgmi = { BIT(AMD_IP_BLOCK_TYPE_PSP) }; +static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev); +static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev); +static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev); + +static void amdgpu_device_load_switch_state(struct amdgpu_device *adev); + static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, enum amd_ip_block_type block) { @@ -231,7 +240,7 @@ static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev) { int ret = 0; - if (!amdgpu_sriov_vf(adev)) + if (amdgpu_nbio_is_replay_cnt_supported(adev)) ret = sysfs_create_file(&adev->dev->kobj, &dev_attr_pcie_replay_count.attr); @@ -240,7 +249,7 @@ static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev) static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev) { - if (!amdgpu_sriov_vf(adev)) + if (amdgpu_nbio_is_replay_cnt_supported(adev)) sysfs_remove_file(&adev->dev->kobj, &dev_attr_pcie_replay_count.attr); } @@ -410,19 +419,16 @@ static const struct attribute_group amdgpu_board_attrs_group = { static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); - /** * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control * - * @dev: drm_device pointer + * @adev: amdgpu device pointer * * Returns true if the device is a dGPU with ATPX power control, * otherwise return false. */ -bool amdgpu_device_supports_px(struct drm_device *dev) +bool amdgpu_device_supports_px(struct amdgpu_device *adev) { - struct amdgpu_device *adev = drm_to_adev(dev); - if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) return true; return false; @@ -431,15 +437,13 @@ bool amdgpu_device_supports_px(struct drm_device *dev) /** * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources * - * @dev: drm_device pointer + * @adev: amdgpu device pointer * * Returns true if the device is a dGPU with ACPI power control, * otherwise return false. */ -bool amdgpu_device_supports_boco(struct drm_device *dev) +bool amdgpu_device_supports_boco(struct amdgpu_device *adev) { - struct amdgpu_device *adev = drm_to_adev(dev); - if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) return false; @@ -452,29 +456,24 @@ bool amdgpu_device_supports_boco(struct drm_device *dev) /** * amdgpu_device_supports_baco - Does the device support BACO * - * @dev: drm_device pointer + * @adev: amdgpu device pointer * * Return: * 1 if the device supports BACO; * 3 if the device supports MACO (only works if BACO is supported) * otherwise return 0. */ -int amdgpu_device_supports_baco(struct drm_device *dev) +int amdgpu_device_supports_baco(struct amdgpu_device *adev) { - struct amdgpu_device *adev = drm_to_adev(dev); - return amdgpu_asic_supports_baco(adev); } void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev) { - struct drm_device *dev; int bamaco_support; - dev = adev_to_drm(adev); - adev->pm.rpm_mode = AMDGPU_RUNPM_NONE; - bamaco_support = amdgpu_device_supports_baco(dev); + bamaco_support = amdgpu_device_supports_baco(adev); switch (amdgpu_runtime_pm) { case 2: @@ -494,10 +493,12 @@ void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev) break; case -1: case -2: - if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */ + if (amdgpu_device_supports_px(adev)) { + /* enable PX as runtime mode */ adev->pm.rpm_mode = AMDGPU_RUNPM_PX; dev_info(adev->dev, "Using ATPX for runtime pm\n"); - } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */ + } else if (amdgpu_device_supports_boco(adev)) { + /* enable boco as runtime mode */ adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO; dev_info(adev->dev, "Using BOCO for runtime pm\n"); } else { @@ -511,12 +512,13 @@ void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev) break; case CHIP_VEGA10: /* enable BACO as runpm mode if noretry=0 */ - if (!adev->gmc.noretry) + if (!adev->gmc.noretry && !amdgpu_passthrough(adev)) adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; break; default: /* enable BACO as runpm mode on CI+ */ - adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; + if (!amdgpu_passthrough(adev)) + adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; break; } @@ -545,14 +547,14 @@ no_runtime_pm: * amdgpu_device_supports_smart_shift - Is the device dGPU with * smart shift support * - * @dev: drm_device pointer + * @adev: amdgpu device pointer * * Returns true if the device is a dGPU with Smart Shift support, * otherwise returns false. */ -bool amdgpu_device_supports_smart_shift(struct drm_device *dev) +bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev) { - return (amdgpu_device_supports_boco(dev) && + return (amdgpu_device_supports_boco(adev) && amdgpu_acpi_is_power_shift_control_supported()); } @@ -1286,14 +1288,14 @@ u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) */ static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) { - DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); + dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg); BUG(); return 0; } static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) { - DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); + dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg); BUG(); return 0; } @@ -1310,15 +1312,17 @@ static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg */ static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) { - DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", - reg, v); + dev_err(adev->dev, + "Invalid callback to write register 0x%04X with 0x%08X\n", reg, + v); BUG(); } static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) { - DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", - reg, v); + dev_err(adev->dev, + "Invalid callback to write register 0x%llX with 0x%08X\n", reg, + v); BUG(); } @@ -1334,14 +1338,15 @@ static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, ui */ static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) { - DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); + dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n", + reg); BUG(); return 0; } static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) { - DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); + dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg); BUG(); return 0; } @@ -1358,15 +1363,17 @@ static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t r */ static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) { - DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", - reg, v); + dev_err(adev->dev, + "Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", + reg, v); BUG(); } static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) { - DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", - reg, v); + dev_err(adev->dev, + "Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", + reg, v); BUG(); } @@ -1384,8 +1391,9 @@ static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, uint32_t block, uint32_t reg) { - DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", - reg, block); + dev_err(adev->dev, + "Invalid callback to read register 0x%04X in block 0x%04X\n", + reg, block); BUG(); return 0; } @@ -1405,8 +1413,9 @@ static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, uint32_t block, uint32_t reg, uint32_t v) { - DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", - reg, block, v); + dev_err(adev->dev, + "Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", + reg, block, v); BUG(); } @@ -1669,9 +1678,9 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); struct pci_bus *root; struct resource *res; + int max_size, r; unsigned int i; u16 cmd; - int r; if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) return 0; @@ -1680,6 +1689,9 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) if (amdgpu_sriov_vf(adev)) return 0; + if (!amdgpu_rebar) + return 0; + /* resizing on Dell G5 SE platforms causes problems with runtime pm */ if ((amdgpu_runtime_pm != 0) && adev->pdev->vendor == PCI_VENDOR_ID_ATI && @@ -1689,7 +1701,9 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) - DRM_WARN("System can't access extended configuration space, please check!!\n"); + dev_warn( + adev->dev, + "System can't access extended configuration space, please check!!\n"); /* skip if the bios has already enabled large BAR */ if (adev->gmc.real_vram_size && @@ -1712,28 +1726,27 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) return 0; /* Limit the BAR size to what is available */ - rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, - rbar_size); + max_size = pci_rebar_get_max_size(adev->pdev, 0); + if (max_size < 0) + return 0; + rbar_size = min(max_size, rbar_size); /* Disable memory decoding while we change the BAR addresses and size */ pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); pci_write_config_word(adev->pdev, PCI_COMMAND, cmd & ~PCI_COMMAND_MEMORY); - /* Free the VRAM and doorbell BAR, we most likely need to move both. */ + /* Tear down doorbell as resizing will release BARs */ amdgpu_doorbell_fini(adev); - if (adev->asic_type >= CHIP_BONAIRE) - pci_release_resource(adev->pdev, 2); - pci_release_resource(adev->pdev, 0); - - r = pci_resize_resource(adev->pdev, 0, rbar_size); + r = pci_resize_resource(adev->pdev, 0, rbar_size, + (adev->asic_type >= CHIP_BONAIRE) ? 1 << 5 + : 1 << 2); if (r == -ENOSPC) - DRM_INFO("Not enough PCI address space for a large BAR."); + dev_info(adev->dev, + "Not enough PCI address space for a large BAR."); else if (r && r != -ENOTSUPP) - DRM_ERROR("Problem resizing BAR0 (%d).", r); - - pci_assign_unassigned_bus_resources(adev->pdev->bus); + dev_err(adev->dev, "Problem resizing BAR0 (%d).", r); /* When the doorbell or fb BAR isn't available we have no chance of * using the device. @@ -1833,8 +1846,8 @@ bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) case 0: return false; default: - DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", - amdgpu_seamless); + dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n", + amdgpu_seamless); return false; } @@ -1870,6 +1883,42 @@ static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device return true; } +static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev) +{ + /* Enabling ASPM causes randoms hangs on Tahiti and Oland on Zen4. + * It's unclear if this is a platform-specific or GPU-specific issue. + * Disable ASPM on SI for the time being. + */ + if (adev->family == AMDGPU_FAMILY_SI) + return true; + +#if IS_ENABLED(CONFIG_X86) + struct cpuinfo_x86 *c = &cpu_data(0); + + if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || + amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) + return false; + + if (c->x86 == 6 && + adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) { + switch (c->x86_model) { + case VFM_MODEL(INTEL_ALDERLAKE): + case VFM_MODEL(INTEL_ALDERLAKE_L): + case VFM_MODEL(INTEL_RAPTORLAKE): + case VFM_MODEL(INTEL_RAPTORLAKE_P): + case VFM_MODEL(INTEL_RAPTORLAKE_S): + return true; + default: + return false; + } + } else { + return false; + } +#else + return false; +#endif +} + /** * amdgpu_device_should_use_aspm - check if the device should program ASPM * @@ -1894,7 +1943,7 @@ bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) } if (adev->flags & AMD_IS_APU) return false; - if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) + if (amdgpu_device_aspm_support_quirk(adev)) return false; return pcie_aspm_enabled(adev->pdev); } @@ -1981,7 +2030,7 @@ static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) return; if (!is_os_64) { - DRM_WARN("Not 64-bit OS, feature not supported\n"); + dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); goto def_value; } si_meminfo(&si); @@ -1996,7 +2045,7 @@ static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) if (total_memory < dram_size_seven_GB) goto def_value1; } else { - DRM_WARN("Smu memory pool size not supported\n"); + dev_warn(adev->dev, "Smu memory pool size not supported\n"); goto def_value; } adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; @@ -2004,7 +2053,7 @@ static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) return; def_value1: - DRM_WARN("No enough system memory\n"); + dev_warn(adev->dev, "No enough system memory\n"); def_value: adev->pm.smu_prv_buffer_size = 0; } @@ -2112,8 +2161,31 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); - for (i = 0; i < MAX_XCP; i++) - adev->enforce_isolation[i] = !!enforce_isolation; + for (i = 0; i < MAX_XCP; i++) { + switch (amdgpu_enforce_isolation) { + case -1: + case 0: + default: + /* disable */ + adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; + break; + case 1: + /* enable */ + adev->enforce_isolation[i] = + AMDGPU_ENFORCE_ISOLATION_ENABLE; + break; + case 2: + /* enable legacy mode */ + adev->enforce_isolation[i] = + AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; + break; + case 3: + /* enable only process isolation without submitting cleaner shader */ + adev->enforce_isolation[i] = + AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; + break; + } + } return 0; } @@ -2133,7 +2205,8 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, struct drm_device *dev = pci_get_drvdata(pdev); int r; - if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) + if (amdgpu_device_supports_px(drm_to_adev(dev)) && + state == VGA_SWITCHEROO_OFF) return; if (state == VGA_SWITCHEROO_ON) { @@ -2145,12 +2218,13 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, amdgpu_device_load_pci_state(pdev); r = pci_enable_device(pdev); if (r) - DRM_WARN("pci_enable_device failed (%d)\n", r); + dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n", + r); amdgpu_device_resume(dev, true); dev->switch_power_state = DRM_SWITCH_POWER_ON; } else { - pr_info("switched off\n"); + dev_info(&pdev->dev, "switched off\n"); dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; amdgpu_device_prepare(dev); amdgpu_device_suspend(dev, true); @@ -2217,8 +2291,9 @@ int amdgpu_device_ip_set_clockgating_state(void *dev, r = adev->ip_blocks[i].version->funcs->set_clockgating_state( &adev->ip_blocks[i], state); if (r) - DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); + dev_err(adev->dev, + "set_clockgating_state of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, r); } return r; } @@ -2251,8 +2326,9 @@ int amdgpu_device_ip_set_powergating_state(void *dev, r = adev->ip_blocks[i].version->funcs->set_powergating_state( &adev->ip_blocks[i], state); if (r) - DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); + dev_err(adev->dev, + "set_powergating_state of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, r); } return r; } @@ -2314,7 +2390,7 @@ int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, } /** - * amdgpu_device_ip_is_valid - is the hardware IP enabled + * amdgpu_device_ip_is_hw - is the hardware IP enabled * * @adev: amdgpu_device pointer * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) @@ -2322,6 +2398,27 @@ int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, * Check if the hardware IP is enable or not. * Returns true if it the IP is enable, false if not. */ +bool amdgpu_device_ip_is_hw(struct amdgpu_device *adev, + enum amd_ip_block_type block_type) +{ + int i; + + for (i = 0; i < adev->num_ip_blocks; i++) { + if (adev->ip_blocks[i].version->type == block_type) + return adev->ip_blocks[i].status.hw; + } + return false; +} + +/** + * amdgpu_device_ip_is_valid - is the hardware IP valid + * + * @adev: amdgpu_device pointer + * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) + * + * Check if the hardware IP is valid or not. + * Returns true if it the IP is valid, false if not. + */ bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, enum amd_ip_block_type block_type) { @@ -2382,6 +2479,34 @@ int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, return 1; } +static const char *ip_block_names[] = { + [AMD_IP_BLOCK_TYPE_COMMON] = "common", + [AMD_IP_BLOCK_TYPE_GMC] = "gmc", + [AMD_IP_BLOCK_TYPE_IH] = "ih", + [AMD_IP_BLOCK_TYPE_SMC] = "smu", + [AMD_IP_BLOCK_TYPE_PSP] = "psp", + [AMD_IP_BLOCK_TYPE_DCE] = "dce", + [AMD_IP_BLOCK_TYPE_GFX] = "gfx", + [AMD_IP_BLOCK_TYPE_SDMA] = "sdma", + [AMD_IP_BLOCK_TYPE_UVD] = "uvd", + [AMD_IP_BLOCK_TYPE_VCE] = "vce", + [AMD_IP_BLOCK_TYPE_ACP] = "acp", + [AMD_IP_BLOCK_TYPE_VCN] = "vcn", + [AMD_IP_BLOCK_TYPE_MES] = "mes", + [AMD_IP_BLOCK_TYPE_JPEG] = "jpeg", + [AMD_IP_BLOCK_TYPE_VPE] = "vpe", + [AMD_IP_BLOCK_TYPE_UMSCH_MM] = "umsch_mm", + [AMD_IP_BLOCK_TYPE_ISP] = "isp", + [AMD_IP_BLOCK_TYPE_RAS] = "ras", +}; + +static const char *ip_block_name(struct amdgpu_device *adev, enum amd_ip_block_type type) +{ + int idx = (int)type; + + return idx < ARRAY_SIZE(ip_block_names) ? ip_block_names[idx] : "unknown"; +} + /** * amdgpu_device_ip_block_add * @@ -2410,8 +2535,13 @@ int amdgpu_device_ip_block_add(struct amdgpu_device *adev, break; } - dev_info(adev->dev, "detected ip block number %d <%s>\n", - adev->num_ip_blocks, ip_block_version->funcs->name); + dev_info(adev->dev, "detected ip block number %d <%s_v%d_%d_%d> (%s)\n", + adev->num_ip_blocks, + ip_block_name(adev, ip_block_version->type), + ip_block_version->major, + ip_block_version->minor, + ip_block_version->rev, + ip_block_version->funcs->name); adev->ip_blocks[adev->num_ip_blocks].adev = adev; @@ -2468,9 +2598,11 @@ static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) } } - DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", - amdgpu_virtual_display, pci_address_name, - adev->enable_virtual_display, adev->mode_info.num_crtc); + dev_info( + adev->dev, + "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", + amdgpu_virtual_display, pci_address_name, + adev->enable_virtual_display, adev->mode_info.num_crtc); kfree(pciaddstr); } @@ -2481,8 +2613,9 @@ void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { adev->mode_info.num_crtc = 1; adev->enable_virtual_display = true; - DRM_INFO("virtual_display:%d, num_crtc:%d\n", - adev->enable_virtual_display, adev->mode_info.num_crtc); + dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n", + adev->enable_virtual_display, + adev->mode_info.num_crtc); } } @@ -2504,9 +2637,6 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) adev->firmware.gpu_info_fw = NULL; - if (adev->mman.discovery_bin) - return 0; - switch (adev->asic_type) { default: return 0; @@ -2528,8 +2658,15 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) chip_name = "arcturus"; break; case CHIP_NAVI12: + if (adev->discovery.bin) + return 0; chip_name = "navi12"; break; + case CHIP_CYAN_SKILLFISH: + if (adev->discovery.bin) + return 0; + chip_name = "cyan_skillfish"; + break; } err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, @@ -2609,6 +2746,24 @@ out: return err; } +static void amdgpu_uid_init(struct amdgpu_device *adev) +{ + /* Initialize the UID for the device */ + adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL); + if (!adev->uid_info) { + dev_warn(adev->dev, "Failed to allocate memory for UID\n"); + return; + } + adev->uid_info->adev = adev; +} + +static void amdgpu_uid_fini(struct amdgpu_device *adev) +{ + /* Free the UID memory */ + kfree(adev->uid_info); + adev->uid_info = NULL; +} + /** * amdgpu_device_ip_early_init - run early init for hardware IPs * @@ -2633,6 +2788,10 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) r = amdgpu_virt_request_full_gpu(adev, true); if (r) return r; + + r = amdgpu_virt_init_critical_region(adev); + if (r) + return r; } switch (adev->asic_type) { @@ -2689,6 +2848,13 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) break; } + /* Check for IP version 9.4.3 with A0 hardware */ + if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && + !amdgpu_device_get_rev_id(adev)) { + dev_err(adev->dev, "Unsupported A0 hardware\n"); + return -ENODEV; /* device unsupported - no device error */ + } + if (amdgpu_has_atpx() && (amdgpu_is_atpx_hybrid() || amdgpu_has_atpx_dgpu_power_cntl()) && @@ -2701,7 +2867,6 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) adev->has_pr3 = parent ? pci_pr3_present(parent) : false; } - adev->pm.pp_feature = amdgpu_pp_feature_mask; if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) adev->pm.pp_feature &= ~PP_GFXOFF_MASK; @@ -2710,21 +2875,29 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; + adev->virt.is_xgmi_node_migrate_enabled = false; + if (amdgpu_sriov_vf(adev)) { + adev->virt.is_xgmi_node_migrate_enabled = + amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); + } + total = true; for (i = 0; i < adev->num_ip_blocks; i++) { ip_block = &adev->ip_blocks[i]; if ((amdgpu_ip_block_mask & (1 << i)) == 0) { - DRM_WARN("disabled ip block: %d <%s>\n", - i, adev->ip_blocks[i].version->funcs->name); + dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, + adev->ip_blocks[i].version->funcs->name); adev->ip_blocks[i].status.valid = false; } else if (ip_block->version->funcs->early_init) { r = ip_block->version->funcs->early_init(ip_block); if (r == -ENOENT) { adev->ip_blocks[i].status.valid = false; } else if (r) { - DRM_ERROR("early_init of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); + dev_err(adev->dev, + "early_init of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, + r); total = false; } else { adev->ip_blocks[i].status.valid = true; @@ -2778,6 +2951,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) if (adev->gmc.xgmi.supported) amdgpu_xgmi_early_init(adev); + if (amdgpu_is_multi_aid(adev)) + amdgpu_uid_init(adev); ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); if (ip_block->status.valid != false) amdgpu_amdkfd_device_probe(adev); @@ -2805,8 +2980,10 @@ static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); if (r) { - DRM_ERROR("hw_init of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); + dev_err(adev->dev, + "hw_init of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, + r); return r; } adev->ip_blocks[i].status.hw = true; @@ -2830,8 +3007,9 @@ static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) continue; r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); if (r) { - DRM_ERROR("hw_init of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); + dev_err(adev->dev, + "hw_init of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, r); return r; } adev->ip_blocks[i].status.hw = true; @@ -2869,8 +3047,11 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev) } else { r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); if (r) { - DRM_ERROR("hw_init of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); + dev_err(adev->dev, + "hw_init of IP block <%s> failed %d\n", + adev->ip_blocks[i] + .version->funcs->name, + r); return r; } adev->ip_blocks[i].status.hw = true; @@ -2925,25 +3106,29 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) r = drm_sched_init(&ring->sched, &args); if (r) { - DRM_ERROR("Failed to create scheduler on ring %s.\n", - ring->name); + dev_err(adev->dev, + "Failed to create scheduler on ring %s.\n", + ring->name); return r; } r = amdgpu_uvd_entity_init(adev, ring); if (r) { - DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", - ring->name); + dev_err(adev->dev, + "Failed to create UVD scheduling entity on ring %s.\n", + ring->name); return r; } r = amdgpu_vce_entity_init(adev, ring); if (r) { - DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", - ring->name); + dev_err(adev->dev, + "Failed to create VCE scheduling entity on ring %s.\n", + ring->name); return r; } } - amdgpu_xcp_update_partition_sched_list(adev); + if (adev->xcp_mgr) + amdgpu_xcp_update_partition_sched_list(adev); return 0; } @@ -2975,8 +3160,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (adev->ip_blocks[i].version->funcs->sw_init) { r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); if (r) { - DRM_ERROR("sw_init of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); + dev_err(adev->dev, + "sw_init of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, + r); goto init_failed; } } @@ -2990,7 +3177,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) /* need to do common hw init early so everything is set up for gmc */ r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); if (r) { - DRM_ERROR("hw_init %d failed %d\n", i, r); + dev_err(adev->dev, "hw_init %d failed %d\n", i, + r); goto init_failed; } adev->ip_blocks[i].status.hw = true; @@ -3002,17 +3190,21 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) r = amdgpu_device_mem_scratch_init(adev); if (r) { - DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); + dev_err(adev->dev, + "amdgpu_mem_scratch_init failed %d\n", + r); goto init_failed; } r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); if (r) { - DRM_ERROR("hw_init %d failed %d\n", i, r); + dev_err(adev->dev, "hw_init %d failed %d\n", i, + r); goto init_failed; } r = amdgpu_device_wb_init(adev); if (r) { - DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); + dev_err(adev->dev, + "amdgpu_device_wb_init failed %d\n", r); goto init_failed; } adev->ip_blocks[i].status.hw = true; @@ -3024,14 +3216,16 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) AMDGPU_GEM_DOMAIN_GTT, AMDGPU_CSA_SIZE); if (r) { - DRM_ERROR("allocate CSA failed %d\n", r); + dev_err(adev->dev, + "allocate CSA failed %d\n", r); goto init_failed; } } r = amdgpu_seq64_init(adev); if (r) { - DRM_ERROR("allocate seq64 failed %d\n", r); + dev_err(adev->dev, "allocate seq64 failed %d\n", + r); goto init_failed; } } @@ -3172,6 +3366,8 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) * always assumed to be lost. */ switch (amdgpu_asic_reset_method(adev)) { + case AMD_RESET_METHOD_LEGACY: + case AMD_RESET_METHOD_LINK: case AMD_RESET_METHOD_BACO: case AMD_RESET_METHOD_MODE1: return true; @@ -3220,8 +3416,10 @@ int amdgpu_device_set_cg_state(struct amdgpu_device *adev, r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], state); if (r) { - DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); + dev_err(adev->dev, + "set_clockgating_state(gate) of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, + r); return r; } } @@ -3247,18 +3445,21 @@ int amdgpu_device_set_pg_state(struct amdgpu_device *adev, (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) continue; - /* skip CG for VCE/UVD, it's handled specially */ + /* skip CG for VCE/UVD/VPE, it's handled specially */ if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && + adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VPE && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && adev->ip_blocks[i].version->funcs->set_powergating_state) { /* enable powergating to save power */ r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], state); if (r) { - DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); + dev_err(adev->dev, + "set_powergating_state(gate) of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, + r); return r; } } @@ -3285,7 +3486,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void) for (i = 0; i < mgpu_info.num_dgpu; i++) { gpu_ins = &(mgpu_info.gpu_ins[i]); adev = gpu_ins->adev; - if (!(adev->flags & AMD_IS_APU) && + if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) && !gpu_ins->mgpu_fan_enabled) { ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); if (ret) @@ -3324,8 +3525,10 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) if (adev->ip_blocks[i].version->funcs->late_init) { r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); if (r) { - DRM_ERROR("late_init of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); + dev_err(adev->dev, + "late_init of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, + r); return r; } } @@ -3334,7 +3537,7 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) r = amdgpu_ras_late_init(adev); if (r) { - DRM_ERROR("amdgpu_ras_late_init failed %d", r); + dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r); return r; } @@ -3348,7 +3551,7 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) r = amdgpu_device_enable_mgpu_fan_boost(); if (r) - DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); + dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r); /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ if (amdgpu_passthrough(adev) && @@ -3381,7 +3584,9 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) r = amdgpu_xgmi_set_pstate(gpu_instance->adev, AMDGPU_XGMI_PSTATE_MIN); if (r) { - DRM_ERROR("pstate setting failed (%d).\n", r); + dev_err(adev->dev, + "pstate setting failed (%d).\n", + r); break; } } @@ -3395,17 +3600,19 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) { + struct amdgpu_device *adev = ip_block->adev; int r; if (!ip_block->version->funcs->hw_fini) { - DRM_ERROR("hw_fini of IP block <%s> not defined\n", - ip_block->version->funcs->name); + dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", + ip_block->version->funcs->name); } else { r = ip_block->version->funcs->hw_fini(ip_block); /* XXX handle errors */ if (r) { - DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", - ip_block->version->funcs->name, r); + dev_dbg(adev->dev, + "hw_fini of IP block <%s> failed %d\n", + ip_block->version->funcs->name, r); } } @@ -3446,15 +3653,17 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); if (r) { - DRM_DEBUG("early_fini of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); + dev_dbg(adev->dev, + "early_fini of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, r); } } amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); - amdgpu_amdkfd_suspend(adev, false); + amdgpu_amdkfd_suspend(adev, true); + amdgpu_userq_suspend(adev); /* Workaround for ASICs need to disable SMC first */ amdgpu_device_smu_fini_early(adev); @@ -3468,7 +3677,22 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) if (amdgpu_sriov_vf(adev)) { if (amdgpu_virt_release_full_gpu(adev, false)) - DRM_ERROR("failed to release exclusive mode on fini\n"); + dev_err(adev->dev, + "failed to release exclusive mode on fini\n"); + } + + /* + * Driver reload on the APU can fail due to firmware validation because + * the PSP is always running, as it is shared across the whole SoC. + * This same issue does not occur on dGPU because it has a mechanism + * that checks whether the PSP is running. A solution for those issues + * in the APU is to trigger a GPU reset, but this should be done during + * the unload phase to avoid adding boot latency and screen flicker. + */ + if ((adev->flags & AMD_IS_APU) && !adev->gmc.is_app_apu) { + r = amdgpu_asic_reset(adev); + if (r) + dev_err(adev->dev, "asic reset on %s failed\n", __func__); } return 0; @@ -3516,8 +3740,10 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); /* XXX handle errors */ if (r) { - DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); + dev_dbg(adev->dev, + "sw_fini of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, + r); } } adev->ip_blocks[i].status.sw = false; @@ -3533,6 +3759,7 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) } amdgpu_ras_fini(adev); + amdgpu_uid_fini(adev); return 0; } @@ -3550,7 +3777,7 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) r = amdgpu_ib_ring_tests(adev); if (r) - DRM_ERROR("ib ring test failed (%d).\n", r); + dev_err(adev->dev, "ib ring test failed (%d).\n", r); } static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) @@ -3578,7 +3805,7 @@ static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) */ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) { - int i, r; + int i, r, rec; amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); @@ -3599,13 +3826,25 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) continue; - /* XXX handle errors */ r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); if (r) - return r; + goto unwind; } return 0; +unwind: + rec = amdgpu_device_ip_resume_phase3(adev); + if (rec) + dev_err(adev->dev, + "amdgpu_device_ip_resume_phase3 failed during unwind: %d\n", + rec); + + amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW); + + amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); + + return r; } /** @@ -3621,7 +3860,7 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) */ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) { - int i, r; + int i, r, rec; if (adev->in_s0ix) amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); @@ -3682,24 +3921,52 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) continue; - /* XXX handle errors */ r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); - adev->ip_blocks[i].status.hw = false; + if (r) + goto unwind; /* handle putting the SMC in the appropriate state */ if (!amdgpu_sriov_vf(adev)) { if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); if (r) { - DRM_ERROR("SMC failed to set mp1 state %d, %d\n", - adev->mp1_state, r); - return r; + dev_err(adev->dev, + "SMC failed to set mp1 state %d, %d\n", + adev->mp1_state, r); + goto unwind; } } } } return 0; +unwind: + /* suspend phase 2 = resume phase 1 + resume phase 2 */ + rec = amdgpu_device_ip_resume_phase1(adev); + if (rec) { + dev_err(adev->dev, + "amdgpu_device_ip_resume_phase1 failed during unwind: %d\n", + rec); + return r; + } + + rec = amdgpu_device_fw_loading(adev); + if (rec) { + dev_err(adev->dev, + "amdgpu_device_fw_loading failed during unwind: %d\n", + rec); + return r; + } + + rec = amdgpu_device_ip_resume_phase2(adev); + if (rec) { + dev_err(adev->dev, + "amdgpu_device_ip_resume_phase2 failed during unwind: %d\n", + rec); + return r; + } + + return r; } /** @@ -3713,7 +3980,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) * in each IP into a state suitable for suspend. * Returns 0 on success, negative error code on failure. */ -int amdgpu_device_ip_suspend(struct amdgpu_device *adev) +static int amdgpu_device_ip_suspend(struct amdgpu_device *adev) { int r; @@ -3976,12 +4243,14 @@ static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) /** * amdgpu_device_asic_has_dc_support - determine if DC supports the asic * + * @pdev : pci device context * @asic_type: AMD asic type * * Check if there is DC (new modesetting infrastructre) support for an asic. * returns true if DC has support, false if not. */ -bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) +bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev, + enum amd_asic_type asic_type) { switch (asic_type) { #ifdef CONFIG_DRM_AMDGPU_SI @@ -3995,25 +4264,13 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) case CHIP_PITCAIRN: case CHIP_VERDE: case CHIP_OLAND: - /* - * We have systems in the wild with these ASICs that require - * LVDS and VGA support which is not supported with DC. - * - * Fallback to the non-DC driver here by default so as not to - * cause regressions. - */ -#if defined(CONFIG_DRM_AMD_DC_SI) - return amdgpu_dc > 0; -#else - return false; -#endif - case CHIP_BONAIRE: + return amdgpu_dc != 0 && IS_ENABLED(CONFIG_DRM_AMD_DC_SI); case CHIP_KAVERI: case CHIP_KABINI: case CHIP_MULLINS: /* * We have systems in the wild with these ASICs that require - * VGA support which is not supported with DC. + * TRAVIS and NUTMEG support which is not supported with DC. * * Fallback to the non-DC driver here by default so as not to * cause regressions. @@ -4024,7 +4281,9 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) #else default: if (amdgpu_dc > 0) - DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); + dev_info_once( + &pdev->dev, + "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); return false; #endif } @@ -4043,7 +4302,7 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) return false; - return amdgpu_device_asic_has_dc_support(adev->asic_type); + return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type); } static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) @@ -4065,13 +4324,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { task_barrier_enter(&hive->tb); - adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); + adev->asic_reset_res = amdgpu_device_baco_enter(adev); if (adev->asic_reset_res) goto fail; task_barrier_exit(&hive->tb); - adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); + adev->asic_reset_res = amdgpu_device_baco_exit(adev); if (adev->asic_reset_res) goto fail; @@ -4085,7 +4344,8 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) fail: if (adev->asic_reset_res) - DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", + dev_warn(adev->dev, + "ASIC reset failed with error, %d for drm dev, %s", adev->asic_reset_res, adev_to_drm(adev)->unique); amdgpu_put_xgmi_hive(hive); } @@ -4098,66 +4358,53 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) long timeout; int ret = 0; - /* - * By default timeout for non compute jobs is 10000 - * and 60000 for compute jobs. - * In SR-IOV or passthrough mode, timeout for compute - * jobs are 60000 by default. - */ - adev->gfx_timeout = msecs_to_jiffies(10000); - adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; - if (amdgpu_sriov_vf(adev)) - adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? - msecs_to_jiffies(60000) : msecs_to_jiffies(10000); - else - adev->compute_timeout = msecs_to_jiffies(60000); + /* By default timeout for all queues is 2 sec */ + adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = + adev->video_timeout = msecs_to_jiffies(2000); - if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { - while ((timeout_setting = strsep(&input, ",")) && - strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { - ret = kstrtol(timeout_setting, 0, &timeout); - if (ret) - return ret; + if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) + return 0; - if (timeout == 0) { - index++; - continue; - } else if (timeout < 0) { - timeout = MAX_SCHEDULE_TIMEOUT; - dev_warn(adev->dev, "lockup timeout disabled"); - add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); - } else { - timeout = msecs_to_jiffies(timeout); - } + while ((timeout_setting = strsep(&input, ",")) && + strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { + ret = kstrtol(timeout_setting, 0, &timeout); + if (ret) + return ret; - switch (index++) { - case 0: - adev->gfx_timeout = timeout; - break; - case 1: - adev->compute_timeout = timeout; - break; - case 2: - adev->sdma_timeout = timeout; - break; - case 3: - adev->video_timeout = timeout; - break; - default: - break; - } + if (timeout == 0) { + index++; + continue; + } else if (timeout < 0) { + timeout = MAX_SCHEDULE_TIMEOUT; + dev_warn(adev->dev, "lockup timeout disabled"); + add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); + } else { + timeout = msecs_to_jiffies(timeout); } - /* - * There is only one value specified and - * it should apply to all non-compute jobs. - */ - if (index == 1) { - adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; - if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) - adev->compute_timeout = adev->gfx_timeout; + + switch (index++) { + case 0: + adev->gfx_timeout = timeout; + break; + case 1: + adev->compute_timeout = timeout; + break; + case 2: + adev->sdma_timeout = timeout; + break; + case 3: + adev->video_timeout = timeout; + break; + default: + break; } } + /* When only one value specified apply it to all queues. */ + if (index == 1) + adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = + adev->video_timeout = timeout; + return ret; } @@ -4209,7 +4456,56 @@ static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) adev->gfx.mcbp = true; if (adev->gfx.mcbp) - DRM_INFO("MCBP is enabled\n"); + dev_info(adev->dev, "MCBP is enabled\n"); +} + +static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev) +{ + int r; + + r = amdgpu_atombios_sysfs_init(adev); + if (r) + drm_err(&adev->ddev, + "registering atombios sysfs failed (%d).\n", r); + + r = amdgpu_pm_sysfs_init(adev); + if (r) + dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r); + + r = amdgpu_ucode_sysfs_init(adev); + if (r) { + adev->ucode_sysfs_en = false; + dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r); + } else + adev->ucode_sysfs_en = true; + + r = amdgpu_device_attr_sysfs_init(adev); + if (r) + dev_err(adev->dev, "Could not create amdgpu device attr\n"); + + r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); + if (r) + dev_err(adev->dev, + "Could not create amdgpu board attributes\n"); + + amdgpu_fru_sysfs_init(adev); + amdgpu_reg_state_sysfs_init(adev); + amdgpu_xcp_sysfs_init(adev); + + return r; +} + +static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev) +{ + if (adev->pm.sysfs_initialized) + amdgpu_pm_sysfs_fini(adev); + if (adev->ucode_sysfs_en) + amdgpu_ucode_sysfs_fini(adev); + amdgpu_device_attr_sysfs_fini(adev); + amdgpu_fru_sysfs_fini(adev); + + amdgpu_reg_state_sysfs_fini(adev); + amdgpu_xcp_sysfs_fini(adev); } /** @@ -4225,7 +4521,6 @@ static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) int amdgpu_device_init(struct amdgpu_device *adev, uint32_t flags) { - struct drm_device *ddev = adev_to_drm(adev); struct pci_dev *pdev = adev->pdev; int r, i; bool px = false; @@ -4277,9 +4572,11 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; - DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", - amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, - pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); + dev_info( + adev->dev, + "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", + amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, + pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); /* mutex initialization are all done here so we * can recall function without having locking issues @@ -4307,7 +4604,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, amdgpu_sync_create(&adev->isolation[i].active); amdgpu_sync_create(&adev->isolation[i].prev); } - mutex_init(&adev->gfx.kfd_sch_mutex); + mutex_init(&adev->gfx.userq_sch_mutex); mutex_init(&adev->gfx.workload_profile_mutex); mutex_init(&adev->vcn.workload_profile_mutex); @@ -4329,12 +4626,16 @@ int amdgpu_device_init(struct amdgpu_device *adev, spin_lock_init(&adev->virt.rlcg_reg_lock); spin_lock_init(&adev->wb.lock); + xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ); + INIT_LIST_HEAD(&adev->reset_list); INIT_LIST_HEAD(&adev->ras_list); INIT_LIST_HEAD(&adev->pm.od_kobj_list); + xa_init(&adev->userq_doorbell_xa); + INIT_DELAYED_WORK(&adev->delayed_init_work, amdgpu_device_delayed_init_work_handler); INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, @@ -4356,6 +4657,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, } INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); + INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work); adev->gfx.gfx_off_req_count = 1; adev->gfx.gfx_off_residency = 0; @@ -4391,8 +4693,10 @@ int amdgpu_device_init(struct amdgpu_device *adev, if (!adev->rmmio) return -ENOMEM; - DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); - DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); + dev_info(adev->dev, "register mmio base: 0x%08X\n", + (uint32_t)adev->rmmio_base); + dev_info(adev->dev, "register mmio size: %u\n", + (unsigned int)adev->rmmio_size); /* * Reset domain needs to be present early, before XGMI hive discovered @@ -4529,7 +4833,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, r = -EINVAL; goto failed; } - DRM_INFO("GPU posting now...\n"); + dev_info(adev->dev, "GPU posting now...\n"); r = amdgpu_device_asic_init(adev); if (r) { dev_err(adev->dev, "gpu post error!\n"); @@ -4627,39 +4931,14 @@ fence_driver_init: flush_delayed_work(&adev->delayed_init_work); } + if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) + amdgpu_xgmi_reset_on_init(adev); /* * Place those sysfs registering after `late_init`. As some of those * operations performed in `late_init` might affect the sysfs * interfaces creating. */ - r = amdgpu_atombios_sysfs_init(adev); - if (r) - drm_err(&adev->ddev, - "registering atombios sysfs failed (%d).\n", r); - - r = amdgpu_pm_sysfs_init(adev); - if (r) - DRM_ERROR("registering pm sysfs failed (%d).\n", r); - - r = amdgpu_ucode_sysfs_init(adev); - if (r) { - adev->ucode_sysfs_en = false; - DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); - } else - adev->ucode_sysfs_en = true; - - r = amdgpu_device_attr_sysfs_init(adev); - if (r) - dev_err(adev->dev, "Could not create amdgpu device attr\n"); - - r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); - if (r) - dev_err(adev->dev, - "Could not create amdgpu board attributes\n"); - - amdgpu_fru_sysfs_init(adev); - amdgpu_reg_state_sysfs_init(adev); - amdgpu_xcp_cfg_sysfs_init(adev); + r = amdgpu_device_sys_interface_init(adev); if (IS_ENABLED(CONFIG_PERF_EVENTS)) r = amdgpu_pmu_init(adev); @@ -4677,7 +4956,7 @@ fence_driver_init: if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); - px = amdgpu_device_supports_px(ddev); + px = amdgpu_device_supports_px(adev); if (px || (!dev_is_removable(&adev->pdev->dev) && apple_gmux_detect(NULL, NULL))) @@ -4687,9 +4966,6 @@ fence_driver_init: if (px) vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); - if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) - amdgpu_xgmi_reset_on_init(adev); - amdgpu_device_check_iommu_direct_map(adev); adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; @@ -4781,15 +5057,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) } amdgpu_fence_driver_hw_fini(adev); - if (adev->pm.sysfs_initialized) - amdgpu_pm_sysfs_fini(adev); - if (adev->ucode_sysfs_en) - amdgpu_ucode_sysfs_fini(adev); - amdgpu_device_attr_sysfs_fini(adev); - amdgpu_fru_sysfs_fini(adev); - - amdgpu_reg_state_sysfs_fini(adev); - amdgpu_xcp_cfg_sysfs_fini(adev); + amdgpu_device_sys_interface_fini(adev); /* disable ras feature must before hw fini */ amdgpu_ras_pre_fini(adev); @@ -4843,7 +5111,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev) kfree(adev->xcp_mgr); adev->xcp_mgr = NULL; - px = amdgpu_device_supports_px(adev_to_drm(adev)); + px = amdgpu_device_supports_px(adev); if (px || (!dev_is_removable(&adev->pdev->dev) && apple_gmux_detect(NULL, NULL))) @@ -4864,14 +5132,15 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev) if (IS_ENABLED(CONFIG_PERF_EVENTS)) amdgpu_pmu_fini(adev); - if (adev->mman.discovery_bin) + if (adev->discovery.bin) amdgpu_discovery_fini(adev); amdgpu_reset_put_reset_domain(adev->reset_domain); adev->reset_domain = NULL; kfree(adev->pci_state); - + kfree(adev->pcie_reset_ctx.swds_pcistate); + kfree(adev->pcie_reset_ctx.swus_pcistate); } /** @@ -4891,9 +5160,21 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev) if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) return 0; + /* No need to evict when going to S5 through S4 callbacks */ + if (system_state == SYSTEM_POWER_OFF) + return 0; + ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); - if (ret) - DRM_WARN("evicting device resources failed\n"); + if (ret) { + dev_warn(adev->dev, "evicting device resources failed\n"); + return ret; + } + + if (adev->in_s4) { + ret = ttm_device_prepare_hibernation(&adev->mman.bdev); + if (ret) + dev_err(adev->dev, "prepare hibernation failed, %d\n", ret); + } return ret; } @@ -4907,28 +5188,20 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev) * @data: data * * This function is called when the system is about to suspend or hibernate. - * It is used to evict resources from the device before the system goes to - * sleep while there is still access to swap. + * It is used to set the appropriate flags so that eviction can be optimized + * in the pm prepare callback. */ static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, void *data) { struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); - int r; switch (mode) { case PM_HIBERNATION_PREPARE: adev->in_s4 = true; - fallthrough; - case PM_SUSPEND_PREPARE: - r = amdgpu_device_evict_resources(adev); - /* - * This is considered non-fatal at this time because - * amdgpu_device_prepare() will also fatally evict resources. - * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 - */ - if (r) - drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); + break; + case PM_POST_HIBERNATION: + adev->in_s4 = false; break; } @@ -4949,15 +5222,13 @@ int amdgpu_device_prepare(struct drm_device *dev) struct amdgpu_device *adev = drm_to_adev(dev); int i, r; - amdgpu_choose_low_power_state(adev); - if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; /* Evict the majority of BOs before starting suspend sequence */ r = amdgpu_device_evict_resources(adev); if (r) - goto unprepare; + return r; flush_delayed_work(&adev->gfx.gfx_off_delay_work); @@ -4968,15 +5239,32 @@ int amdgpu_device_prepare(struct drm_device *dev) continue; r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); if (r) - goto unprepare; + return r; } return 0; +} -unprepare: - adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; +/** + * amdgpu_device_complete - complete power state transition + * + * @dev: drm dev pointer + * + * Undo the changes from amdgpu_device_prepare. This will be + * called on all resume transitions, including those that failed. + */ +void amdgpu_device_complete(struct drm_device *dev) +{ + struct amdgpu_device *adev = drm_to_adev(dev); + int i; - return r; + for (i = 0; i < adev->num_ip_blocks; i++) { + if (!adev->ip_blocks[i].status.valid) + continue; + if (!adev->ip_blocks[i].version->funcs->complete) + continue; + adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); + } } /** @@ -4992,7 +5280,7 @@ unprepare: int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) { struct amdgpu_device *adev = drm_to_adev(dev); - int r = 0; + int r, rec; if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; @@ -5000,44 +5288,125 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) adev->in_suspend = true; if (amdgpu_sriov_vf(adev)) { + if (!adev->in_runpm) + amdgpu_amdkfd_suspend_process(adev); amdgpu_virt_fini_data_exchange(adev); r = amdgpu_virt_request_full_gpu(adev, false); if (r) return r; } - if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) - DRM_WARN("smart shift update failed\n"); + r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3); + if (r) + goto unwind_sriov; if (notify_clients) - drm_client_dev_suspend(adev_to_drm(adev), false); + drm_client_dev_suspend(adev_to_drm(adev)); cancel_delayed_work_sync(&adev->delayed_init_work); amdgpu_ras_suspend(adev); - amdgpu_device_ip_suspend_phase1(adev); + r = amdgpu_device_ip_suspend_phase1(adev); + if (r) + goto unwind_smartshift; - if (!adev->in_s0ix) - amdgpu_amdkfd_suspend(adev, adev->in_runpm); + amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); + r = amdgpu_userq_suspend(adev); + if (r) + goto unwind_ip_phase1; r = amdgpu_device_evict_resources(adev); if (r) - return r; + goto unwind_userq; amdgpu_ttm_set_buffer_funcs_status(adev, false); amdgpu_fence_driver_hw_fini(adev); - amdgpu_device_ip_suspend_phase2(adev); + r = amdgpu_device_ip_suspend_phase2(adev); + if (r) + goto unwind_evict; if (amdgpu_sriov_vf(adev)) amdgpu_virt_release_full_gpu(adev, false); - r = amdgpu_dpm_notify_rlc_state(adev, false); + return 0; + +unwind_evict: + if (adev->mman.buffer_funcs_ring->sched.ready) + amdgpu_ttm_set_buffer_funcs_status(adev, true); + amdgpu_fence_driver_hw_init(adev); + +unwind_userq: + rec = amdgpu_userq_resume(adev); + if (rec) { + dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec); + return r; + } + rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); + if (rec) { + dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec); + return r; + } + +unwind_ip_phase1: + /* suspend phase 1 = resume phase 3 */ + rec = amdgpu_device_ip_resume_phase3(adev); + if (rec) { + dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec); + return r; + } + +unwind_smartshift: + rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0); + if (rec) { + dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec); + return r; + } + + if (notify_clients) + drm_client_dev_resume(adev_to_drm(adev)); + + amdgpu_ras_resume(adev); + +unwind_sriov: + if (amdgpu_sriov_vf(adev)) { + rec = amdgpu_virt_request_full_gpu(adev, true); + if (rec) { + dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec); + return r; + } + } + + adev->in_suspend = adev->in_s0ix = adev->in_s3 = false; + + return r; +} + +static inline int amdgpu_virt_resume(struct amdgpu_device *adev) +{ + int r; + unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; + + /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) + * may not work. The access could be blocked by nBIF protection as VF isn't in + * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX + * so that QEMU reprograms MSIX table. + */ + amdgpu_restore_msix(adev); + + r = adev->gfxhub.funcs->get_xgmi_info(adev); if (r) return r; + dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", + prev_physical_node_id, adev->gmc.xgmi.physical_node_id); + + adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); + adev->vm_manager.vram_base_offset += + adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; + return 0; } @@ -5062,6 +5431,12 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) return r; } + if (amdgpu_virt_xgmi_migrate_enabled(adev)) { + r = amdgpu_virt_resume(adev); + if (r) + goto exit; + } + if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; @@ -5082,11 +5457,13 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) goto exit; } - if (!adev->in_s0ix) { - r = amdgpu_amdkfd_resume(adev, adev->in_runpm); - if (r) - goto exit; - } + r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); + if (r) + goto exit; + + r = amdgpu_userq_resume(adev); + if (r) + goto exit; r = amdgpu_device_ip_late_init(adev); if (r) @@ -5098,6 +5475,9 @@ exit: if (amdgpu_sriov_vf(adev)) { amdgpu_virt_init_data_exchange(adev); amdgpu_virt_release_full_gpu(adev, true); + + if (!r && !adev->in_runpm) + r = amdgpu_amdkfd_resume_process(adev); } if (r) @@ -5107,7 +5487,7 @@ exit: flush_delayed_work(&adev->delayed_init_work); if (notify_clients) - drm_client_dev_resume(adev_to_drm(adev), false); + drm_client_dev_resume(adev_to_drm(adev)); amdgpu_ras_resume(adev); @@ -5132,13 +5512,12 @@ exit: dev->dev->power.disable_depth--; #endif } - adev->in_suspend = false; - if (adev->enable_mes) - amdgpu_mes_self_test(adev); + amdgpu_vram_mgr_clear_reset_blocks(adev); + adev->in_suspend = false; - if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) - DRM_WARN("smart shift update failed\n"); + if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0)) + dev_warn(adev->dev, "smart shift update failed\n"); return 0; } @@ -5517,6 +5896,29 @@ mode1_reset_failed: return ret; } +int amdgpu_device_link_reset(struct amdgpu_device *adev) +{ + int ret = 0; + + dev_info(adev->dev, "GPU link reset\n"); + + if (!amdgpu_reset_in_dpc(adev)) + ret = amdgpu_dpm_link_reset(adev); + + if (ret) + goto link_reset_failed; + + ret = amdgpu_psp_wait_for_bootloader(adev); + if (ret) + goto link_reset_failed; + + return 0; + +link_reset_failed: + dev_err(adev->dev, "GPU link reset failed\n"); + return ret; +} + int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, struct amdgpu_reset_context *reset_context) { @@ -5541,11 +5943,6 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, if (!amdgpu_ring_sched_ready(ring)) continue; - /* Clear job fence from fence drv to avoid force_completion - * leave NULL and vm flush fence in fence drv - */ - amdgpu_fence_driver_clear_job_fences(ring); - /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ amdgpu_fence_driver_force_completion(ring); } @@ -5629,6 +6026,7 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) amdgpu_set_init_level(tmp_adev, init_level); if (full_reset) { /* post card */ + amdgpu_reset_set_dpc_status(tmp_adev, false); amdgpu_ras_clear_err_state(tmp_adev); r = amdgpu_device_asic_init(tmp_adev); if (r) { @@ -5646,7 +6044,9 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); if (vram_lost) { - DRM_INFO("VRAM is lost due to GPU reset!\n"); + dev_info( + tmp_adev->dev, + "VRAM is lost due to GPU reset!\n"); amdgpu_inc_vram_lost(tmp_adev); } @@ -5687,7 +6087,11 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) if (r) goto out; - drm_client_dev_resume(adev_to_drm(tmp_adev), false); + r = amdgpu_userq_post_reset(tmp_adev, vram_lost); + if (r) + goto out; + + drm_client_dev_resume(adev_to_drm(tmp_adev)); /* * The GPU enters bad state once faulty pages @@ -5821,6 +6225,7 @@ static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) switch (amdgpu_asic_reset_method(adev)) { case AMD_RESET_METHOD_MODE1: + case AMD_RESET_METHOD_LINK: adev->mp1_state = PP_MP1_STATE_SHUTDOWN; break; case AMD_RESET_METHOD_MODE2: @@ -5908,6 +6313,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) if (!amdgpu_sriov_vf(adev)) cancel_work(&adev->reset_work); #endif + cancel_work(&adev->userq_reset_work); if (adev->kfd.dev) cancel_work(&adev->kfd.reset_work); @@ -5924,117 +6330,76 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle) { struct amdgpu_device *tmp_adev; int ret = 0; - u32 status; list_for_each_entry(tmp_adev, device_list_handle, reset_list) { - pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); - if (PCI_POSSIBLE_ERROR(status)) { - dev_err(tmp_adev->dev, "device lost from bus!"); - ret = -ENODEV; - } + ret |= amdgpu_device_bus_status_check(tmp_adev); } return ret; } -/** - * amdgpu_device_gpu_recover - reset the asic and recover scheduler - * - * @adev: amdgpu_device pointer - * @job: which job trigger hang - * @reset_context: amdgpu reset context pointer - * - * Attempt to reset the GPU if it has hung (all asics). - * Attempt to do soft-reset or full-reset and reinitialize Asic - * Returns 0 for success or an error on failure. - */ - -int amdgpu_device_gpu_recover(struct amdgpu_device *adev, - struct amdgpu_job *job, - struct amdgpu_reset_context *reset_context) +static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev, + struct list_head *device_list, + struct amdgpu_hive_info *hive) { - struct list_head device_list, *device_list_handle = NULL; - bool job_signaled = false; - struct amdgpu_hive_info *hive = NULL; struct amdgpu_device *tmp_adev = NULL; - int i, r = 0; - bool need_emergency_restart = false; - bool audio_suspended = false; - int retry_limit = AMDGPU_MAX_RETRY_LIMIT; - - /* - * If it reaches here because of hang/timeout and a RAS error is - * detected at the same time, let RAS recovery take care of it. - */ - if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && - !amdgpu_sriov_vf(adev) && - reset_context->src != AMDGPU_RESET_SRC_RAS) { - dev_dbg(adev->dev, - "Gpu recovery from source: %d yielding to RAS error recovery handling", - reset_context->src); - return 0; - } - /* - * Special case: RAS triggered and full reset isn't supported - */ - need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); /* - * Flush RAM to disk so that after reboot - * the user can read log and see why the system rebooted. - */ - if (need_emergency_restart && amdgpu_ras_get_context(adev) && - amdgpu_ras_get_context(adev)->reboot) { - DRM_WARN("Emergency reboot."); - - ksys_sync_helper(); - emergency_restart(); - } - - dev_info(adev->dev, "GPU %s begin!\n", - need_emergency_restart ? "jobs stop":"reset"); - - if (!amdgpu_sriov_vf(adev)) - hive = amdgpu_get_xgmi_hive(adev); - if (hive) - mutex_lock(&hive->hive_lock); - - reset_context->job = job; - reset_context->hive = hive; - /* * Build list of devices to reset. * In case we are in XGMI hive mode, resort the device list * to put adev in the 1st position. */ - INIT_LIST_HEAD(&device_list); if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { - list_add_tail(&tmp_adev->reset_list, &device_list); + list_add_tail(&tmp_adev->reset_list, device_list); if (adev->shutdown) tmp_adev->shutdown = true; + if (amdgpu_reset_in_dpc(adev)) + tmp_adev->pcie_reset_ctx.in_link_reset = true; } - if (!list_is_first(&adev->reset_list, &device_list)) - list_rotate_to_front(&adev->reset_list, &device_list); - device_list_handle = &device_list; + if (!list_is_first(&adev->reset_list, device_list)) + list_rotate_to_front(&adev->reset_list, device_list); } else { - list_add_tail(&adev->reset_list, &device_list); - device_list_handle = &device_list; + list_add_tail(&adev->reset_list, device_list); } +} - if (!amdgpu_sriov_vf(adev)) { - r = amdgpu_device_health_check(device_list_handle); - if (r) - goto end_reset; - } +static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, + struct list_head *device_list) +{ + struct amdgpu_device *tmp_adev = NULL; - /* We need to lock reset domain only once both for XGMI and single device */ - tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, - reset_list); + if (list_empty(device_list)) + return; + tmp_adev = + list_first_entry(device_list, struct amdgpu_device, reset_list); amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); +} - /* block all schedulers and reset given job's ring */ - list_for_each_entry(tmp_adev, device_list_handle, reset_list) { +static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, + struct list_head *device_list) +{ + struct amdgpu_device *tmp_adev = NULL; + + if (list_empty(device_list)) + return; + tmp_adev = + list_first_entry(device_list, struct amdgpu_device, reset_list); + amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); +} + +static void amdgpu_device_halt_activities(struct amdgpu_device *adev, + struct amdgpu_job *job, + struct amdgpu_reset_context *reset_context, + struct list_head *device_list, + struct amdgpu_hive_info *hive, + bool need_emergency_restart) +{ + struct amdgpu_device *tmp_adev = NULL; + int i; + /* block all schedulers and reset given job's ring */ + list_for_each_entry(tmp_adev, device_list, reset_list) { amdgpu_device_set_mp1_state(tmp_adev); /* @@ -6048,7 +6413,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * some audio codec errors. */ if (!amdgpu_device_suspend_display_audio(tmp_adev)) - audio_suspended = true; + tmp_adev->pcie_reset_ctx.audio_suspended = true; amdgpu_ras_set_error_query_ready(tmp_adev, false); @@ -6062,13 +6427,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, */ amdgpu_unregister_gpu_instance(tmp_adev); - drm_client_dev_suspend(adev_to_drm(tmp_adev), false); + drm_client_dev_suspend(adev_to_drm(tmp_adev)); /* disable ras on ALL IPs */ - if (!need_emergency_restart && - amdgpu_device_ip_need_full_reset(tmp_adev)) + if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) && + amdgpu_device_ip_need_full_reset(tmp_adev)) amdgpu_ras_suspend(tmp_adev); + amdgpu_userq_pre_reset(tmp_adev); + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = tmp_adev->rings[i]; @@ -6082,24 +6449,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, } atomic_inc(&tmp_adev->gpu_reset_counter); } +} - if (need_emergency_restart) - goto skip_sched_resume; - - /* - * Must check guilty signal here since after this point all old - * HW fences are force signaled. - * - * job->base holds a reference to parent fence - */ - if (job && dma_fence_is_signaled(&job->hw_fence)) { - job_signaled = true; - dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); - goto skip_hw_reset; - } +static int amdgpu_device_asic_reset(struct amdgpu_device *adev, + struct list_head *device_list, + struct amdgpu_reset_context *reset_context) +{ + struct amdgpu_device *tmp_adev = NULL; + int retry_limit = AMDGPU_MAX_RETRY_LIMIT; + int r = 0; retry: /* Rest of adevs pre asic reset from XGMI hive. */ - list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + list_for_each_entry(tmp_adev, device_list, reset_list) { r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); /*TODO Should we stop ?*/ if (r) { @@ -6112,6 +6473,11 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ /* Actual ASIC resets if needed.*/ /* Host driver will handle XGMI hive reset for SRIOV */ if (amdgpu_sriov_vf(adev)) { + + /* Bail out of reset early */ + if (amdgpu_ras_is_rma(adev)) + return -ENODEV; + if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); amdgpu_ras_set_fed(adev, true); @@ -6126,12 +6492,12 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ if (r) adev->asic_reset_res = r; } else { - r = amdgpu_do_asic_reset(device_list_handle, reset_context); + r = amdgpu_do_asic_reset(device_list, reset_context); if (r && r == -EAGAIN) goto retry; } - list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + list_for_each_entry(tmp_adev, device_list, reset_list) { /* * Drop any pending non scheduler resets queued before reset is done. * Any reset scheduled after this point would be valid. Scheduler resets @@ -6141,10 +6507,18 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ amdgpu_device_stop_pending_resets(tmp_adev); } -skip_hw_reset: + return r; +} + +static int amdgpu_device_sched_resume(struct list_head *device_list, + struct amdgpu_reset_context *reset_context, + bool job_signaled) +{ + struct amdgpu_device *tmp_adev = NULL; + int i, r = 0; /* Post ASIC reset for all devs .*/ - list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + list_for_each_entry(tmp_adev, device_list, reset_list) { for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = tmp_adev->rings[i]; @@ -6158,30 +6532,45 @@ skip_hw_reset: if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); - if (tmp_adev->asic_reset_res) - r = tmp_adev->asic_reset_res; - - tmp_adev->asic_reset_res = 0; - - if (r) { + if (tmp_adev->asic_reset_res) { /* bad news, how to tell it to userspace ? * for ras error, we should report GPU bad status instead of * reset failure */ if (reset_context->src != AMDGPU_RESET_SRC_RAS || !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) - dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", - atomic_read(&tmp_adev->gpu_reset_counter)); - amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); + dev_info( + tmp_adev->dev, + "GPU reset(%d) failed with error %d \n", + atomic_read( + &tmp_adev->gpu_reset_counter), + tmp_adev->asic_reset_res); + amdgpu_vf_error_put(tmp_adev, + AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, + tmp_adev->asic_reset_res); + if (!r) + r = tmp_adev->asic_reset_res; + tmp_adev->asic_reset_res = 0; } else { - dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); - if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) - DRM_WARN("smart shift update failed\n"); + dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", + atomic_read(&tmp_adev->gpu_reset_counter)); + if (amdgpu_acpi_smart_shift_update(tmp_adev, + AMDGPU_SS_DEV_D0)) + dev_warn(tmp_adev->dev, + "smart shift update failed\n"); } } -skip_sched_resume: - list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + return r; +} + +static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, + struct list_head *device_list, + bool need_emergency_restart) +{ + struct amdgpu_device *tmp_adev = NULL; + + list_for_each_entry(tmp_adev, device_list, reset_list) { /* unlock kfd: SRIOV would do it separately */ if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) amdgpu_amdkfd_post_reset(tmp_adev); @@ -6192,18 +6581,126 @@ skip_sched_resume: if (!adev->kfd.init_complete) amdgpu_amdkfd_device_init(adev); - if (audio_suspended) + if (tmp_adev->pcie_reset_ctx.audio_suspended) amdgpu_device_resume_display_audio(tmp_adev); amdgpu_device_unset_mp1_state(tmp_adev); amdgpu_ras_set_error_query_ready(tmp_adev, true); + } +} - tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, - reset_list); - amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); +/** + * amdgpu_device_gpu_recover - reset the asic and recover scheduler + * + * @adev: amdgpu_device pointer + * @job: which job trigger hang + * @reset_context: amdgpu reset context pointer + * + * Attempt to reset the GPU if it has hung (all asics). + * Attempt to do soft-reset or full-reset and reinitialize Asic + * Returns 0 for success or an error on failure. + */ + +int amdgpu_device_gpu_recover(struct amdgpu_device *adev, + struct amdgpu_job *job, + struct amdgpu_reset_context *reset_context) +{ + struct list_head device_list; + bool job_signaled = false; + struct amdgpu_hive_info *hive = NULL; + int r = 0; + bool need_emergency_restart = false; + /* save the pasid here as the job may be freed before the end of the reset */ + int pasid = job ? job->pasid : -EINVAL; + + /* + * If it reaches here because of hang/timeout and a RAS error is + * detected at the same time, let RAS recovery take care of it. + */ + if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && + !amdgpu_sriov_vf(adev) && + reset_context->src != AMDGPU_RESET_SRC_RAS) { + dev_dbg(adev->dev, + "Gpu recovery from source: %d yielding to RAS error recovery handling", + reset_context->src); + return 0; + } + + /* + * Special case: RAS triggered and full reset isn't supported + */ + need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); + + /* + * Flush RAM to disk so that after reboot + * the user can read log and see why the system rebooted. + */ + if (need_emergency_restart && amdgpu_ras_get_context(adev) && + amdgpu_ras_get_context(adev)->reboot) { + dev_warn(adev->dev, "Emergency reboot."); + + ksys_sync_helper(); + emergency_restart(); + } + + dev_info(adev->dev, "GPU %s begin!. Source: %d\n", + need_emergency_restart ? "jobs stop" : "reset", + reset_context->src); + + if (!amdgpu_sriov_vf(adev)) + hive = amdgpu_get_xgmi_hive(adev); + if (hive) + mutex_lock(&hive->hive_lock); + + reset_context->job = job; + reset_context->hive = hive; + INIT_LIST_HEAD(&device_list); + + amdgpu_device_recovery_prepare(adev, &device_list, hive); + + if (!amdgpu_sriov_vf(adev)) { + r = amdgpu_device_health_check(&device_list); + if (r) + goto end_reset; + } + + /* Cannot be called after locking reset domain */ + amdgpu_ras_pre_reset(adev, &device_list); + + /* We need to lock reset domain only once both for XGMI and single device */ + amdgpu_device_recovery_get_reset_lock(adev, &device_list); + + amdgpu_device_halt_activities(adev, job, reset_context, &device_list, + hive, need_emergency_restart); + if (need_emergency_restart) + goto skip_sched_resume; + /* + * Must check guilty signal here since after this point all old + * HW fences are force signaled. + * + * job->base holds a reference to parent fence + */ + if (job && dma_fence_is_signaled(&job->hw_fence->base)) { + job_signaled = true; + dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); + goto skip_hw_reset; + } + + r = amdgpu_device_asic_reset(adev, &device_list, reset_context); + if (r) + goto reset_unlock; +skip_hw_reset: + r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); + if (r) + goto reset_unlock; +skip_sched_resume: + amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); +reset_unlock: + amdgpu_device_recovery_put_reset_lock(adev, &device_list); + amdgpu_ras_post_reset(adev, &device_list); end_reset: if (hive) { mutex_unlock(&hive->hive_lock); @@ -6215,8 +6712,21 @@ end_reset: atomic_set(&adev->reset_domain->reset_res, r); - if (!r) - drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); + if (!r) { + struct amdgpu_task_info *ti = NULL; + + /* + * The job may already be freed at this point via the sched tdr workqueue so + * use the cached pasid. + */ + if (pasid >= 0) + ti = amdgpu_vm_get_task_info_pasid(adev, pasid); + + drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, + ti ? &ti->task : NULL); + + amdgpu_vm_put_task_info(ti); + } return r; } @@ -6535,12 +7045,11 @@ bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, #endif } -int amdgpu_device_baco_enter(struct drm_device *dev) +int amdgpu_device_baco_enter(struct amdgpu_device *adev) { - struct amdgpu_device *adev = drm_to_adev(dev); struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - if (!amdgpu_device_supports_baco(dev)) + if (!amdgpu_device_supports_baco(adev)) return -ENOTSUPP; if (ras && adev->ras_enabled && @@ -6550,13 +7059,12 @@ int amdgpu_device_baco_enter(struct drm_device *dev) return amdgpu_dpm_baco_enter(adev); } -int amdgpu_device_baco_exit(struct drm_device *dev) +int amdgpu_device_baco_exit(struct amdgpu_device *adev) { - struct amdgpu_device *adev = drm_to_adev(dev); struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); int ret = 0; - if (!amdgpu_device_supports_baco(dev)) + if (!amdgpu_device_supports_baco(adev)) return -ENOTSUPP; ret = amdgpu_dpm_baco_exit(adev); @@ -6587,45 +7095,52 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta { struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); - int i; - - DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); + struct amdgpu_hive_info *hive __free(xgmi_put_hive) = + amdgpu_get_xgmi_hive(adev); + struct amdgpu_reset_context reset_context; + struct list_head device_list; - if (adev->gmc.xgmi.num_physical_nodes > 1) { - DRM_WARN("No support for XGMI hive yet..."); - return PCI_ERS_RESULT_DISCONNECT; - } + dev_info(adev->dev, "PCI error: detected callback!!\n"); adev->pci_channel_state = state; switch (state) { case pci_channel_io_normal: + dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); return PCI_ERS_RESULT_CAN_RECOVER; - /* Fatal error, prepare for slot reset */ case pci_channel_io_frozen: - /* - * Locking adev->reset_domain->sem will prevent any external access - * to GPU during PCI error recovery - */ - amdgpu_device_lock_reset_domain(adev->reset_domain); - amdgpu_device_set_mp1_state(adev); - - /* - * Block any work scheduling as we do for regular GPU reset - * for the duration of the recovery - */ - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { - struct amdgpu_ring *ring = adev->rings[i]; - - if (!amdgpu_ring_sched_ready(ring)) - continue; + /* Fatal error, prepare for slot reset */ + dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); + if (hive) { + /* Hive devices should be able to support FW based + * link reset on other devices, if not return. + */ + if (!amdgpu_dpm_is_link_reset_supported(adev)) { + dev_warn(adev->dev, + "No support for XGMI hive yet...\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + /* Set dpc status only if device is part of hive + * Non-hive devices should be able to recover after + * link reset. + */ + amdgpu_reset_set_dpc_status(adev, true); - drm_sched_stop(&ring->sched, NULL); + mutex_lock(&hive->hive_lock); } - atomic_inc(&adev->gpu_reset_counter); + memset(&reset_context, 0, sizeof(reset_context)); + INIT_LIST_HEAD(&device_list); + + amdgpu_device_recovery_prepare(adev, &device_list, hive); + amdgpu_device_recovery_get_reset_lock(adev, &device_list); + amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, + hive, false); + if (hive) + mutex_unlock(&hive->hive_lock); return PCI_ERS_RESULT_NEED_RESET; case pci_channel_io_perm_failure: /* Permanent error, prepare for device removal */ + dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); return PCI_ERS_RESULT_DISCONNECT; } @@ -6638,8 +7153,10 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta */ pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) { + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); - DRM_INFO("PCI error: mmio enabled callback!!\n"); + dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); /* TODO - dump whatever for debugging purposes */ @@ -6663,27 +7180,38 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) { struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); - int r, i; struct amdgpu_reset_context reset_context; - u32 memsize; + struct amdgpu_device *tmp_adev; + struct amdgpu_hive_info *hive; struct list_head device_list; + struct pci_dev *link_dev; + int r = 0, i, timeout; + u32 memsize; + u16 status; - /* PCI error slot reset should be skipped During RAS recovery */ - if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || - amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && - amdgpu_ras_in_recovery(adev)) - return PCI_ERS_RESULT_RECOVERED; - - DRM_INFO("PCI error: slot reset callback!!\n"); + dev_info(adev->dev, "PCI error: slot reset callback!!\n"); memset(&reset_context, 0, sizeof(reset_context)); - INIT_LIST_HEAD(&device_list); - list_add_tail(&adev->reset_list, &device_list); + if (adev->pcie_reset_ctx.swus) + link_dev = adev->pcie_reset_ctx.swus; + else + link_dev = adev->pdev; + /* wait for asic to come out of reset, timeout = 10s */ + timeout = 10000; + do { + usleep_range(10000, 10500); + r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status); + timeout -= 10; + } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) && + (status != PCI_VENDOR_ID_AMD)); - /* wait for asic to come out of reset */ - msleep(500); + if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) { + r = -ETIME; + goto out; + } + amdgpu_device_load_switch_state(adev); /* Restore PCI confspace */ amdgpu_device_load_pci_state(pdev); @@ -6703,26 +7231,40 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); - set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); - - adev->no_hw_access = true; - r = amdgpu_device_pre_asic_reset(adev, &reset_context); - adev->no_hw_access = false; - if (r) - goto out; + set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); + INIT_LIST_HEAD(&device_list); - r = amdgpu_do_asic_reset(&device_list, &reset_context); + hive = amdgpu_get_xgmi_hive(adev); + if (hive) { + mutex_lock(&hive->hive_lock); + reset_context.hive = hive; + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { + tmp_adev->pcie_reset_ctx.in_link_reset = true; + list_add_tail(&tmp_adev->reset_list, &device_list); + } + } else { + set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); + list_add_tail(&adev->reset_list, &device_list); + } + r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); out: if (!r) { if (amdgpu_device_cache_pci_state(adev->pdev)) pci_restore_state(adev->pdev); - - DRM_INFO("PCIe error recovery succeeded\n"); + dev_info(adev->dev, "PCIe error recovery succeeded\n"); } else { - DRM_ERROR("PCIe error recovery failed, err:%d", r); - amdgpu_device_unset_mp1_state(adev); - amdgpu_device_unlock_reset_domain(adev->reset_domain); + dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); + if (hive) { + list_for_each_entry(tmp_adev, &device_list, reset_list) + amdgpu_device_unset_mp1_state(tmp_adev); + } + amdgpu_device_recovery_put_reset_lock(adev, &device_list); + } + + if (hive) { + mutex_unlock(&hive->hive_lock); + amdgpu_put_xgmi_hive(hive); } return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; @@ -6739,26 +7281,95 @@ void amdgpu_pci_resume(struct pci_dev *pdev) { struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); - int i; - + struct list_head device_list; + struct amdgpu_hive_info *hive = NULL; + struct amdgpu_device *tmp_adev = NULL; - DRM_INFO("PCI error: resume callback!!\n"); + dev_info(adev->dev, "PCI error: resume callback!!\n"); /* Only continue execution for the case of pci_channel_io_frozen */ if (adev->pci_channel_state != pci_channel_io_frozen) return; - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { - struct amdgpu_ring *ring = adev->rings[i]; + INIT_LIST_HEAD(&device_list); - if (!amdgpu_ring_sched_ready(ring)) - continue; + hive = amdgpu_get_xgmi_hive(adev); + if (hive) { + mutex_lock(&hive->hive_lock); + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { + tmp_adev->pcie_reset_ctx.in_link_reset = false; + list_add_tail(&tmp_adev->reset_list, &device_list); + } + } else + list_add_tail(&adev->reset_list, &device_list); - drm_sched_start(&ring->sched, 0); + amdgpu_device_sched_resume(&device_list, NULL, NULL); + amdgpu_device_gpu_resume(adev, &device_list, false); + amdgpu_device_recovery_put_reset_lock(adev, &device_list); + + if (hive) { + mutex_unlock(&hive->hive_lock); + amdgpu_put_xgmi_hive(hive); } +} + +static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev) +{ + struct pci_dev *swus, *swds; + int r; - amdgpu_device_unset_mp1_state(adev); - amdgpu_device_unlock_reset_domain(adev->reset_domain); + swds = pci_upstream_bridge(adev->pdev); + if (!swds || swds->vendor != PCI_VENDOR_ID_ATI || + pci_pcie_type(swds) != PCI_EXP_TYPE_DOWNSTREAM) + return; + swus = pci_upstream_bridge(swds); + if (!swus || + (swus->vendor != PCI_VENDOR_ID_ATI && + swus->vendor != PCI_VENDOR_ID_AMD) || + pci_pcie_type(swus) != PCI_EXP_TYPE_UPSTREAM) + return; + + /* If already saved, return */ + if (adev->pcie_reset_ctx.swus) + return; + /* Upstream bridge is ATI, assume it's SWUS/DS architecture */ + r = pci_save_state(swds); + if (r) + return; + adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(swds); + + r = pci_save_state(swus); + if (r) + return; + adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(swus); + + adev->pcie_reset_ctx.swus = swus; +} + +static void amdgpu_device_load_switch_state(struct amdgpu_device *adev) +{ + struct pci_dev *pdev; + int r; + + if (!adev->pcie_reset_ctx.swds_pcistate || + !adev->pcie_reset_ctx.swus_pcistate) + return; + + pdev = adev->pcie_reset_ctx.swus; + r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate); + if (!r) { + pci_restore_state(pdev); + } else { + dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r); + return; + } + + pdev = pci_upstream_bridge(adev->pdev); + r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate); + if (!r) + pci_restore_state(pdev); + else + dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r); } bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) @@ -6777,14 +7388,16 @@ bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) adev->pci_state = pci_store_saved_state(pdev); if (!adev->pci_state) { - DRM_ERROR("Failed to store PCI saved state"); + dev_err(adev->dev, "Failed to store PCI saved state"); return false; } } else { - DRM_WARN("Failed to save PCI state, err:%d\n", r); + dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r); return false; } + amdgpu_device_cache_switch_state(adev); + return true; } @@ -6802,7 +7415,7 @@ bool amdgpu_device_load_pci_state(struct pci_dev *pdev) if (!r) { pci_restore_state(pdev); } else { - DRM_WARN("Failed to load PCI state, err:%d\n", r); + dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r); return false; } @@ -6819,10 +7432,17 @@ void amdgpu_device_flush_hdp(struct amdgpu_device *adev, if (adev->gmc.xgmi.connected_to_cpu) return; - if (ring && ring->funcs->emit_hdp_flush) + if (ring && ring->funcs->emit_hdp_flush) { amdgpu_ring_emit_hdp_flush(ring); - else - amdgpu_asic_flush_hdp(adev, ring); + return; + } + + if (!ring && amdgpu_sriov_runtime(adev)) { + if (!amdgpu_kiq_hdp_flush(adev)) + return; + } + + amdgpu_hdp_flush(adev, ring); } void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, @@ -6835,7 +7455,7 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, if (adev->gmc.xgmi.connected_to_cpu) return; - amdgpu_asic_invalidate_hdp(adev, ring); + amdgpu_hdp_invalidate(adev, ring); } int amdgpu_in_reset(struct amdgpu_device *adev) @@ -7048,7 +7668,7 @@ struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev, dep = amdgpu_sync_peek_fence(&isolation->prev, ring); r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT); if (r) - DRM_WARN("OOM tracking isolation\n"); + dev_warn(adev->dev, "OOM tracking isolation\n"); out_grab_ref: dma_fence_get(dep); @@ -7116,9 +7736,11 @@ uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, tmp_ = RREG32(reg_addr); loop--; if (!loop) { - DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn", - inst, reg_name, (uint32_t)expected_value, - (uint32_t)(tmp_ & (mask))); + dev_warn( + adev->dev, + "Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn", + inst, reg_name, (uint32_t)expected_value, + (uint32_t)(tmp_ & (mask))); ret = -ETIMEDOUT; break; } @@ -7169,3 +7791,53 @@ ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) size += sysfs_emit_at(buf, size, "\n"); return size; } + +void amdgpu_device_set_uid(struct amdgpu_uid *uid_info, + enum amdgpu_uid_type type, uint8_t inst, + uint64_t uid) +{ + if (!uid_info) + return; + + if (type >= AMDGPU_UID_TYPE_MAX) { + dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n", + type); + return; + } + + if (inst >= AMDGPU_UID_INST_MAX) { + dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n", + inst); + return; + } + + if (uid_info->uid[type][inst] != 0) { + dev_warn_once( + uid_info->adev->dev, + "Overwriting existing UID %llu for type %d instance %d\n", + uid_info->uid[type][inst], type, inst); + } + + uid_info->uid[type][inst] = uid; +} + +u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info, + enum amdgpu_uid_type type, uint8_t inst) +{ + if (!uid_info) + return 0; + + if (type >= AMDGPU_UID_TYPE_MAX) { + dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n", + type); + return 0; + } + + if (inst >= AMDGPU_UID_INST_MAX) { + dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n", + inst); + return 0; + } + + return uid_info->uid[type][inst]; +} |
