diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 140 |
1 files changed, 135 insertions, 5 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 7de449fae1e3..b47422b0b5b1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -667,6 +667,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) task_barrier_init(&hive->tb); hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; hive->hi_req_gpu = NULL; + atomic_set(&hive->requested_nps_mode, UNKNOWN_MEMORY_PARTITION_MODE); /* * hive pstate on boot is high in vega20 so we have to go to low @@ -800,6 +801,23 @@ int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev, return -EINVAL; } +bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev, + struct amdgpu_device *peer_adev) +{ + struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; + int i; + + /* Sharing should always be enabled for non-SRIOV. */ + if (!amdgpu_sriov_vf(adev)) + return true; + + for (i = 0 ; i < top->num_nodes; ++i) + if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id) + return !!top->nodes[i].is_sharing_enabled; + + return false; +} + /* * Devices that support extended data require the entire hive to initialize with * the shared memory buffer flag set. @@ -860,8 +878,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) if (!adev->gmc.xgmi.supported) return 0; - if (!adev->gmc.xgmi.pending_reset && - amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { + if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { ret = psp_xgmi_initialize(&adev->psp, false, true); if (ret) { dev_err(adev->dev, @@ -907,8 +924,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) task_barrier_add_task(&hive->tb); - if (!adev->gmc.xgmi.pending_reset && - amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { + if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { /* update node list for other device in the hive */ if (tmp_adev != adev) { @@ -985,7 +1001,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) } } - if (!ret && !adev->gmc.xgmi.pending_reset) + if (!ret) ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive); exit_unlock: @@ -1500,3 +1516,117 @@ int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev) return 0; } + +static void amdgpu_xgmi_reset_on_init_work(struct work_struct *work) +{ + struct amdgpu_hive_info *hive = + container_of(work, struct amdgpu_hive_info, reset_on_init_work); + struct amdgpu_reset_context reset_context; + struct amdgpu_device *tmp_adev; + struct list_head device_list; + int r; + + mutex_lock(&hive->hive_lock); + + INIT_LIST_HEAD(&device_list); + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) + list_add_tail(&tmp_adev->reset_list, &device_list); + + tmp_adev = list_first_entry(&device_list, struct amdgpu_device, + reset_list); + amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); + + reset_context.method = AMD_RESET_METHOD_ON_INIT; + reset_context.reset_req_dev = tmp_adev; + reset_context.hive = hive; + reset_context.reset_device_list = &device_list; + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); + + amdgpu_reset_do_xgmi_reset_on_init(&reset_context); + mutex_unlock(&hive->hive_lock); + amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); + + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { + r = amdgpu_ras_init_badpage_info(tmp_adev); + if (r && r != -EHWPOISON) + dev_err(tmp_adev->dev, + "error during bad page data initialization"); + } +} + +static void amdgpu_xgmi_schedule_reset_on_init(struct amdgpu_hive_info *hive) +{ + INIT_WORK(&hive->reset_on_init_work, amdgpu_xgmi_reset_on_init_work); + amdgpu_reset_domain_schedule(hive->reset_domain, + &hive->reset_on_init_work); +} + +int amdgpu_xgmi_reset_on_init(struct amdgpu_device *adev) +{ + struct amdgpu_hive_info *hive; + bool reset_scheduled; + int num_devs; + + hive = amdgpu_get_xgmi_hive(adev); + if (!hive) + return -EINVAL; + + mutex_lock(&hive->hive_lock); + num_devs = atomic_read(&hive->number_devices); + reset_scheduled = false; + if (num_devs == adev->gmc.xgmi.num_physical_nodes) { + amdgpu_xgmi_schedule_reset_on_init(hive); + reset_scheduled = true; + } + + mutex_unlock(&hive->hive_lock); + amdgpu_put_xgmi_hive(hive); + + if (reset_scheduled) + flush_work(&hive->reset_on_init_work); + + return 0; +} + +int amdgpu_xgmi_request_nps_change(struct amdgpu_device *adev, + struct amdgpu_hive_info *hive, + int req_nps_mode) +{ + struct amdgpu_device *tmp_adev; + int cur_nps_mode, r; + + /* This is expected to be called only during unload of driver. The + * request needs to be placed only once for all devices in the hive. If + * one of them fail, revert the request for previous successful devices. + * After placing the request, make hive mode as UNKNOWN so that other + * devices don't request anymore. + */ + mutex_lock(&hive->hive_lock); + if (atomic_read(&hive->requested_nps_mode) == + UNKNOWN_MEMORY_PARTITION_MODE) { + dev_dbg(adev->dev, "Unexpected entry for hive NPS change"); + mutex_unlock(&hive->hive_lock); + return 0; + } + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { + r = adev->gmc.gmc_funcs->request_mem_partition_mode( + tmp_adev, req_nps_mode); + if (r) + break; + } + if (r) { + /* Request back current mode if one of the requests failed */ + cur_nps_mode = + adev->gmc.gmc_funcs->query_mem_partition_mode(tmp_adev); + list_for_each_entry_continue_reverse( + tmp_adev, &hive->device_list, gmc.xgmi.head) + adev->gmc.gmc_funcs->request_mem_partition_mode( + tmp_adev, cur_nps_mode); + } + /* Set to UNKNOWN so that other devices don't request anymore */ + atomic_set(&hive->requested_nps_mode, UNKNOWN_MEMORY_PARTITION_MODE); + mutex_unlock(&hive->hive_lock); + + return r; +} |