Diffstat (limited to 'drivers/accel')
75 files changed, 6070 insertions, 1034 deletions
diff --git a/drivers/accel/Kconfig b/drivers/accel/Kconfig index bb01cebc42bf..bdf48ccafcf2 100644 --- a/drivers/accel/Kconfig +++ b/drivers/accel/Kconfig @@ -25,6 +25,7 @@ menuconfig DRM_ACCEL and debugfs). source "drivers/accel/amdxdna/Kconfig" +source "drivers/accel/ethosu/Kconfig" source "drivers/accel/habanalabs/Kconfig" source "drivers/accel/ivpu/Kconfig" source "drivers/accel/qaic/Kconfig" diff --git a/drivers/accel/Makefile b/drivers/accel/Makefile index ffc3fa588666..1d3a7251b950 100644 --- a/drivers/accel/Makefile +++ b/drivers/accel/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_DRM_ACCEL_AMDXDNA) += amdxdna/ +obj-$(CONFIG_DRM_ACCEL_ARM_ETHOSU) += ethosu/ obj-$(CONFIG_DRM_ACCEL_HABANALABS) += habanalabs/ obj-$(CONFIG_DRM_ACCEL_IVPU) += ivpu/ obj-$(CONFIG_DRM_ACCEL_QAIC) += qaic/ diff --git a/drivers/accel/amdxdna/Makefile b/drivers/accel/amdxdna/Makefile index 6797dac65efa..6344aaf523fa 100644 --- a/drivers/accel/amdxdna/Makefile +++ b/drivers/accel/amdxdna/Makefile @@ -14,6 +14,7 @@ amdxdna-y := \ amdxdna_mailbox.o \ amdxdna_mailbox_helper.o \ amdxdna_pci_drv.o \ + amdxdna_pm.o \ amdxdna_sysfs.o \ amdxdna_ubuf.o \ npu1_regs.o \ diff --git a/drivers/accel/amdxdna/TODO b/drivers/accel/amdxdna/TODO index ad8ac6e315b6..0e4bbebeaedf 100644 --- a/drivers/accel/amdxdna/TODO +++ b/drivers/accel/amdxdna/TODO @@ -1,2 +1 @@ - Add debugfs support -- Add debug BO support diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c index e9f9b1fa5dc1..42d876a427c5 100644 --- a/drivers/accel/amdxdna/aie2_ctx.c +++ b/drivers/accel/amdxdna/aie2_ctx.c @@ -21,6 +21,7 @@ #include "amdxdna_gem.h" #include "amdxdna_mailbox.h" #include "amdxdna_pci_drv.h" +#include "amdxdna_pm.h" static bool force_cmdlist; module_param(force_cmdlist, bool, 0600); @@ -88,7 +89,7 @@ static int aie2_hwctx_restart(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hw goto out; } - ret = aie2_config_cu(hwctx); + ret = aie2_config_cu(hwctx, NULL); if (ret) { XDNA_ERR(xdna, "Config cu failed, ret %d", ret); goto out; @@ -167,14 +168,11 @@ static int aie2_hwctx_resume_cb(struct amdxdna_hwctx *hwctx, void *arg) int aie2_hwctx_resume(struct amdxdna_client *client) { - struct amdxdna_dev *xdna = client->xdna; - /* * The resume path cannot guarantee that mailbox channel can be * regenerated. If this happen, when submit message to this * mailbox channel, error will return. 
*/ - drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); return amdxdna_hwctx_walk(client, NULL, aie2_hwctx_resume_cb); } @@ -184,12 +182,13 @@ aie2_sched_notify(struct amdxdna_sched_job *job) struct dma_fence *fence = job->fence; trace_xdna_job(&job->base, job->hwctx->name, "signaled fence", job->seq); + + amdxdna_pm_suspend_put(job->hwctx->client->xdna); job->hwctx->priv->completed++; dma_fence_signal(fence); up(&job->hwctx->priv->job_sem); job->job_done = true; - dma_fence_put(fence); mmput_async(job->mm); aie2_job_put(job); } @@ -204,10 +203,13 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size) cmd_abo = job->cmd_bo; - if (unlikely(!data)) + if (unlikely(job->job_timeout)) { + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_TIMEOUT); + ret = -EINVAL; goto out; + } - if (unlikely(size != sizeof(u32))) { + if (unlikely(!data) || unlikely(size != sizeof(u32))) { amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT); ret = -EINVAL; goto out; @@ -226,11 +228,10 @@ out: } static int -aie2_sched_nocmd_resp_handler(void *handle, void __iomem *data, size_t size) +aie2_sched_drvcmd_resp_handler(void *handle, void __iomem *data, size_t size) { struct amdxdna_sched_job *job = handle; int ret = 0; - u32 status; if (unlikely(!data)) goto out; @@ -240,8 +241,7 @@ aie2_sched_nocmd_resp_handler(void *handle, void __iomem *data, size_t size) goto out; } - status = readl(data); - XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status); + job->drv_cmd->result = readl(data); out: aie2_sched_notify(job); @@ -260,6 +260,13 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size) int ret = 0; cmd_abo = job->cmd_bo; + + if (unlikely(job->job_timeout)) { + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_TIMEOUT); + ret = -EINVAL; + goto out; + } + if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) { amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT); ret = -EINVAL; @@ -314,8 +321,18 @@ aie2_sched_job_run(struct drm_sched_job *sched_job) kref_get(&job->refcnt); fence = dma_fence_get(job->fence); - if (unlikely(!cmd_abo)) { - ret = aie2_sync_bo(hwctx, job, aie2_sched_nocmd_resp_handler); + if (job->drv_cmd) { + switch (job->drv_cmd->opcode) { + case SYNC_DEBUG_BO: + ret = aie2_sync_bo(hwctx, job, aie2_sched_drvcmd_resp_handler); + break; + case ATTACH_DEBUG_BO: + ret = aie2_config_debug_bo(hwctx, job, aie2_sched_drvcmd_resp_handler); + break; + default: + ret = -EINVAL; + break; + } goto out; } @@ -362,6 +379,7 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job) xdna = hwctx->client->xdna; trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq); + job->job_timeout = true; mutex_lock(&xdna->dev_lock); aie2_hwctx_stop(xdna, hwctx, sched_job); @@ -531,13 +549,12 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx) .num_rqs = DRM_SCHED_PRIORITY_COUNT, .credit_limit = HWCTX_MAX_CMDS, .timeout = msecs_to_jiffies(HWCTX_MAX_TIMEOUT), - .name = hwctx->name, + .name = "amdxdna_js", .dev = xdna->ddev.dev, }; struct drm_gpu_scheduler *sched; struct amdxdna_hwctx_priv *priv; struct amdxdna_gem_obj *heap; - struct amdxdna_dev_hdl *ndev; int i, ret; priv = kzalloc(sizeof(*hwctx->priv), GFP_KERNEL); @@ -610,10 +627,14 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx) goto free_entity; } + ret = amdxdna_pm_resume_get(xdna); + if (ret) + goto free_col_list; + ret = aie2_alloc_resource(hwctx); if (ret) { XDNA_ERR(xdna, "Alloc hw resource failed, ret %d", ret); - goto free_col_list; + goto suspend_put; } ret = aie2_map_host_buf(xdna->dev_handle, 
hwctx->fw_ctx_id, @@ -628,10 +649,9 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx) XDNA_ERR(xdna, "Create syncobj failed, ret %d", ret); goto release_resource; } + amdxdna_pm_suspend_put(xdna); hwctx->status = HWCTX_STAT_INIT; - ndev = xdna->dev_handle; - ndev->hwctx_num++; init_waitqueue_head(&priv->job_free_wq); XDNA_DBG(xdna, "hwctx %s init completed", hwctx->name); @@ -640,6 +660,8 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx) release_resource: aie2_release_resource(hwctx); +suspend_put: + amdxdna_pm_suspend_put(xdna); free_col_list: kfree(hwctx->col_list); free_entity: @@ -662,26 +684,25 @@ free_priv: void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx) { - struct amdxdna_dev_hdl *ndev; struct amdxdna_dev *xdna; int idx; xdna = hwctx->client->xdna; - ndev = xdna->dev_handle; - ndev->hwctx_num--; XDNA_DBG(xdna, "%s sequence number %lld", hwctx->name, hwctx->priv->seq); - drm_sched_entity_destroy(&hwctx->priv->entity); - aie2_hwctx_wait_for_idle(hwctx); /* Request fw to destroy hwctx and cancel the rest pending requests */ aie2_release_resource(hwctx); + mutex_unlock(&xdna->dev_lock); + drm_sched_entity_destroy(&hwctx->priv->entity); + /* Wait for all submitted jobs to be completed or canceled */ wait_event(hwctx->priv->job_free_wq, atomic64_read(&hwctx->job_submit_cnt) == atomic64_read(&hwctx->job_free_cnt)); + mutex_lock(&xdna->dev_lock); drm_sched_fini(&hwctx->priv->sched); aie2_ctx_syncobj_destroy(hwctx); @@ -697,6 +718,14 @@ void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx) kfree(hwctx->cus); } +static int aie2_config_cu_resp_handler(void *handle, void __iomem *data, size_t size) +{ + struct amdxdna_hwctx *hwctx = handle; + + amdxdna_pm_suspend_put(hwctx->client->xdna); + return 0; +} + static int aie2_hwctx_cu_config(struct amdxdna_hwctx *hwctx, void *buf, u32 size) { struct amdxdna_hwctx_param_config_cu *config = buf; @@ -728,10 +757,14 @@ static int aie2_hwctx_cu_config(struct amdxdna_hwctx *hwctx, void *buf, u32 size if (!hwctx->cus) return -ENOMEM; - ret = aie2_config_cu(hwctx); + ret = amdxdna_pm_resume_get(xdna); + if (ret) + goto free_cus; + + ret = aie2_config_cu(hwctx, aie2_config_cu_resp_handler); if (ret) { XDNA_ERR(xdna, "Config CU to firmware failed, ret %d", ret); - goto free_cus; + goto pm_suspend_put; } wmb(); /* To avoid locking in command submit when check status */ @@ -739,12 +772,82 @@ static int aie2_hwctx_cu_config(struct amdxdna_hwctx *hwctx, void *buf, u32 size return 0; +pm_suspend_put: + amdxdna_pm_suspend_put(xdna); free_cus: kfree(hwctx->cus); hwctx->cus = NULL; return ret; } +static void aie2_cmd_wait(struct amdxdna_hwctx *hwctx, u64 seq) +{ + struct dma_fence *out_fence = aie2_cmd_get_out_fence(hwctx, seq); + + if (!out_fence) { + XDNA_ERR(hwctx->client->xdna, "Failed to get fence"); + return; + } + + dma_fence_wait_timeout(out_fence, false, MAX_SCHEDULE_TIMEOUT); + dma_fence_put(out_fence); +} + +static int aie2_hwctx_cfg_debug_bo(struct amdxdna_hwctx *hwctx, u32 bo_hdl, + bool attach) +{ + struct amdxdna_client *client = hwctx->client; + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_drv_cmd cmd = { 0 }; + struct amdxdna_gem_obj *abo; + u64 seq; + int ret; + + abo = amdxdna_gem_get_obj(client, bo_hdl, AMDXDNA_BO_DEV); + if (!abo) { + XDNA_ERR(xdna, "Get bo %d failed", bo_hdl); + return -EINVAL; + } + + if (attach) { + if (abo->assigned_hwctx != AMDXDNA_INVALID_CTX_HANDLE) { + ret = -EBUSY; + goto put_obj; + } + cmd.opcode = ATTACH_DEBUG_BO; + } else { + if (abo->assigned_hwctx != hwctx->id) { + ret = -EINVAL; + goto put_obj; + } + 
cmd.opcode = DETACH_DEBUG_BO; + } + + ret = amdxdna_cmd_submit(client, &cmd, AMDXDNA_INVALID_BO_HANDLE, + &bo_hdl, 1, hwctx->id, &seq); + if (ret) { + XDNA_ERR(xdna, "Submit command failed"); + goto put_obj; + } + + aie2_cmd_wait(hwctx, seq); + if (cmd.result) { + XDNA_ERR(xdna, "Response failure 0x%x", cmd.result); + goto put_obj; + } + + if (attach) + abo->assigned_hwctx = hwctx->id; + else + abo->assigned_hwctx = AMDXDNA_INVALID_CTX_HANDLE; + + XDNA_DBG(xdna, "Config debug BO %d to %s", bo_hdl, hwctx->name); + +put_obj: + amdxdna_gem_put_obj(abo); + return ret; +} + int aie2_hwctx_config(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size) { struct amdxdna_dev *xdna = hwctx->client->xdna; @@ -754,14 +857,40 @@ int aie2_hwctx_config(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *bu case DRM_AMDXDNA_HWCTX_CONFIG_CU: return aie2_hwctx_cu_config(hwctx, buf, size); case DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF: + return aie2_hwctx_cfg_debug_bo(hwctx, (u32)value, true); case DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF: - return -EOPNOTSUPP; + return aie2_hwctx_cfg_debug_bo(hwctx, (u32)value, false); default: XDNA_DBG(xdna, "Not supported type %d", type); return -EOPNOTSUPP; } } +int aie2_hwctx_sync_debug_bo(struct amdxdna_hwctx *hwctx, u32 debug_bo_hdl) +{ + struct amdxdna_client *client = hwctx->client; + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_drv_cmd cmd = { 0 }; + u64 seq; + int ret; + + cmd.opcode = SYNC_DEBUG_BO; + ret = amdxdna_cmd_submit(client, &cmd, AMDXDNA_INVALID_BO_HANDLE, + &debug_bo_hdl, 1, hwctx->id, &seq); + if (ret) { + XDNA_ERR(xdna, "Submit command failed"); + return ret; + } + + aie2_cmd_wait(hwctx, seq); + if (cmd.result) { + XDNA_ERR(xdna, "Response failure 0x%x", cmd.result); + return -EINVAL; + } + + return 0; +} + static int aie2_populate_range(struct amdxdna_gem_obj *abo) { struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev); @@ -862,11 +991,15 @@ int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, goto free_chain; } + ret = amdxdna_pm_resume_get(xdna); + if (ret) + goto cleanup_job; + retry: ret = drm_gem_lock_reservations(job->bos, job->bo_cnt, &acquire_ctx); if (ret) { XDNA_WARN(xdna, "Failed to lock BOs, ret %d", ret); - goto cleanup_job; + goto suspend_put; } for (i = 0; i < job->bo_cnt; i++) { @@ -874,7 +1007,7 @@ retry: if (ret) { XDNA_WARN(xdna, "Failed to reserve fences %d", ret); drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx); - goto cleanup_job; + goto suspend_put; } } @@ -889,12 +1022,12 @@ retry: msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); } else if (time_after(jiffies, timeout)) { ret = -ETIME; - goto cleanup_job; + goto suspend_put; } ret = aie2_populate_range(abo); if (ret) - goto cleanup_job; + goto suspend_put; goto retry; } } @@ -920,6 +1053,8 @@ retry: return 0; +suspend_put: + amdxdna_pm_suspend_put(xdna); cleanup_job: drm_sched_job_cleanup(&job->base); free_chain: diff --git a/drivers/accel/amdxdna/aie2_error.c b/drivers/accel/amdxdna/aie2_error.c index 5ee905632a39..d452008ec4f4 100644 --- a/drivers/accel/amdxdna/aie2_error.c +++ b/drivers/accel/amdxdna/aie2_error.c @@ -13,6 +13,7 @@ #include "aie2_msg_priv.h" #include "aie2_pci.h" +#include "amdxdna_error.h" #include "amdxdna_mailbox.h" #include "amdxdna_pci_drv.h" @@ -46,6 +47,7 @@ enum aie_module_type { AIE_MEM_MOD = 0, AIE_CORE_MOD, AIE_PL_MOD, + AIE_UNKNOWN_MOD, }; enum aie_error_category { @@ -143,6 +145,31 @@ static const struct aie_event_category aie_ml_shim_tile_event_cat[] = { EVENT_CATEGORY(74U, 
AIE_ERROR_LOCK), }; +static const enum amdxdna_error_num aie_cat_err_num_map[] = { + [AIE_ERROR_SATURATION] = AMDXDNA_ERROR_NUM_AIE_SATURATION, + [AIE_ERROR_FP] = AMDXDNA_ERROR_NUM_AIE_FP, + [AIE_ERROR_STREAM] = AMDXDNA_ERROR_NUM_AIE_STREAM, + [AIE_ERROR_ACCESS] = AMDXDNA_ERROR_NUM_AIE_ACCESS, + [AIE_ERROR_BUS] = AMDXDNA_ERROR_NUM_AIE_BUS, + [AIE_ERROR_INSTRUCTION] = AMDXDNA_ERROR_NUM_AIE_INSTRUCTION, + [AIE_ERROR_ECC] = AMDXDNA_ERROR_NUM_AIE_ECC, + [AIE_ERROR_LOCK] = AMDXDNA_ERROR_NUM_AIE_LOCK, + [AIE_ERROR_DMA] = AMDXDNA_ERROR_NUM_AIE_DMA, + [AIE_ERROR_MEM_PARITY] = AMDXDNA_ERROR_NUM_AIE_MEM_PARITY, + [AIE_ERROR_UNKNOWN] = AMDXDNA_ERROR_NUM_UNKNOWN, +}; + +static_assert(ARRAY_SIZE(aie_cat_err_num_map) == AIE_ERROR_UNKNOWN + 1); + +static const enum amdxdna_error_module aie_err_mod_map[] = { + [AIE_MEM_MOD] = AMDXDNA_ERROR_MODULE_AIE_MEMORY, + [AIE_CORE_MOD] = AMDXDNA_ERROR_MODULE_AIE_CORE, + [AIE_PL_MOD] = AMDXDNA_ERROR_MODULE_AIE_PL, + [AIE_UNKNOWN_MOD] = AMDXDNA_ERROR_MODULE_UNKNOWN, +}; + +static_assert(ARRAY_SIZE(aie_err_mod_map) == AIE_UNKNOWN_MOD + 1); + static enum aie_error_category aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type) { @@ -176,12 +203,40 @@ aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type) if (event_id != lut[i].event_id) continue; + if (lut[i].category > AIE_ERROR_UNKNOWN) + return AIE_ERROR_UNKNOWN; + return lut[i].category; } return AIE_ERROR_UNKNOWN; } +static void aie2_update_last_async_error(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err) +{ + struct aie_error *errs = err_info; + enum amdxdna_error_module err_mod; + enum aie_error_category aie_err; + enum amdxdna_error_num err_num; + struct aie_error *last_err; + + last_err = &errs[num_err - 1]; + if (last_err->mod_type >= AIE_UNKNOWN_MOD) { + err_num = aie_cat_err_num_map[AIE_ERROR_UNKNOWN]; + err_mod = aie_err_mod_map[AIE_UNKNOWN_MOD]; + } else { + aie_err = aie_get_error_category(last_err->row, + last_err->event_id, + last_err->mod_type); + err_num = aie_cat_err_num_map[aie_err]; + err_mod = aie_err_mod_map[last_err->mod_type]; + } + + ndev->last_async_err.err_code = AMDXDNA_ERROR_ENCODE(err_num, err_mod); + ndev->last_async_err.ts_us = ktime_to_us(ktime_get_real()); + ndev->last_async_err.ex_err_code = AMDXDNA_EXTRA_ERR_ENCODE(last_err->row, last_err->col); +} + static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err) { struct aie_error *errs = err_info; @@ -264,29 +319,14 @@ static void aie2_error_worker(struct work_struct *err_work) } mutex_lock(&xdna->dev_lock); + aie2_update_last_async_error(e->ndev, info->payload, info->err_cnt); + /* Re-sent this event to firmware */ if (aie2_error_event_send(e)) XDNA_WARN(xdna, "Unable to register async event"); mutex_unlock(&xdna->dev_lock); } -int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev) -{ - struct amdxdna_dev *xdna = ndev->xdna; - struct async_event *e; - int i, ret; - - drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); - for (i = 0; i < ndev->async_events->event_cnt; i++) { - e = &ndev->async_events->event[i]; - ret = aie2_error_event_send(e); - if (ret) - return ret; - } - - return 0; -} - void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev) { struct amdxdna_dev *xdna = ndev->xdna; @@ -341,6 +381,10 @@ int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev) e->size = ASYNC_BUF_SIZE; e->resp.status = MAX_AIE2_STATUS_CODE; INIT_WORK(&e->work, aie2_error_worker); + + ret = aie2_error_event_send(e); + if (ret) + goto 
free_wq; } ndev->async_events = events; @@ -349,6 +393,8 @@ int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev) events->event_cnt, events->size); return 0; +free_wq: + destroy_workqueue(events->wq); free_buf: dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf, events->addr, DMA_FROM_DEVICE); @@ -356,3 +402,18 @@ free_events: kfree(events); return ret; } + +int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev, struct amdxdna_drm_get_array *args) +{ + struct amdxdna_dev *xdna = ndev->xdna; + + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + + args->num_element = 1; + args->element_size = sizeof(ndev->last_async_err); + if (copy_to_user(u64_to_user_ptr(args->buffer), + &ndev->last_async_err, args->element_size)) + return -EFAULT; + + return 0; +} diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c index 9caad083543d..d493bb1c3360 100644 --- a/drivers/accel/amdxdna/aie2_message.c +++ b/drivers/accel/amdxdna/aie2_message.c @@ -27,6 +27,8 @@ #define DECLARE_AIE2_MSG(name, op) \ DECLARE_XDNA_MSG_COMMON(name, op, MAX_AIE2_STATUS_CODE) +#define EXEC_MSG_OPS(xdna) ((xdna)->dev_handle->exec_msg_ops) + static int aie2_send_mgmt_msg_wait(struct amdxdna_dev_hdl *ndev, struct xdna_mailbox_msg *msg) { @@ -37,7 +39,7 @@ static int aie2_send_mgmt_msg_wait(struct amdxdna_dev_hdl *ndev, if (!ndev->mgmt_chann) return -ENODEV; - drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + drm_WARN_ON(&xdna->ddev, xdna->rpm_on && !mutex_is_locked(&xdna->dev_lock)); ret = xdna_send_msg_wait(xdna, ndev->mgmt_chann, msg); if (ret == -ETIME) { xdna_mailbox_stop_channel(ndev->mgmt_chann); @@ -45,7 +47,7 @@ static int aie2_send_mgmt_msg_wait(struct amdxdna_dev_hdl *ndev, ndev->mgmt_chann = NULL; } - if (!ret && *hdl->data != AIE2_STATUS_SUCCESS) { + if (!ret && *hdl->status != AIE2_STATUS_SUCCESS) { XDNA_ERR(xdna, "command opcode 0x%x failed, status 0x%x", msg->opcode, *hdl->data); ret = -EINVAL; @@ -208,6 +210,14 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct hwctx->fw_ctx_id = resp.context_id; WARN_ONCE(hwctx->fw_ctx_id == -1, "Unexpected context id"); + if (ndev->force_preempt_enabled) { + ret = aie2_runtime_cfg(ndev, AIE2_RT_CFG_FORCE_PREEMPT, &hwctx->fw_ctx_id); + if (ret) { + XDNA_ERR(xdna, "failed to enable force preempt %d", ret); + return ret; + } + } + cq_pair = &resp.cq_pair[0]; x2i.mb_head_ptr_reg = AIE2_MBOX_OFF(ndev, cq_pair->x2i_q.head_addr); x2i.mb_tail_ptr_reg = AIE2_MBOX_OFF(ndev, cq_pair->x2i_q.tail_addr); @@ -233,6 +243,7 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct ret = -EINVAL; goto out_destroy_context; } + ndev->hwctx_num++; XDNA_DBG(xdna, "%s mailbox channel irq: %d, msix_id: %d", hwctx->name, ret, resp.msix_id); @@ -267,6 +278,7 @@ int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwc hwctx->fw_ctx_id); hwctx->priv->mbox_chann = NULL; hwctx->fw_ctx_id = -1; + ndev->hwctx_num--; return ret; } @@ -332,11 +344,6 @@ int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf, goto fail; } - if (resp.status != AIE2_STATUS_SUCCESS) { - XDNA_ERR(xdna, "Query NPU status failed, status 0x%x", resp.status); - ret = -EINVAL; - goto fail; - } XDNA_DBG(xdna, "Query NPU status completed"); if (size < resp.size) { @@ -358,6 +365,55 @@ fail: return ret; } +int aie2_query_telemetry(struct amdxdna_dev_hdl *ndev, + char __user *buf, u32 size, + struct amdxdna_drm_query_telemetry_header *header) +{ + 
DECLARE_AIE2_MSG(get_telemetry, MSG_OP_GET_TELEMETRY); + struct amdxdna_dev *xdna = ndev->xdna; + dma_addr_t dma_addr; + u8 *addr; + int ret; + + if (header->type >= MAX_TELEMETRY_TYPE) + return -EINVAL; + + addr = dma_alloc_noncoherent(xdna->ddev.dev, size, &dma_addr, + DMA_FROM_DEVICE, GFP_KERNEL); + if (!addr) + return -ENOMEM; + + req.buf_addr = dma_addr; + req.buf_size = size; + req.type = header->type; + + drm_clflush_virt_range(addr, size); /* device can access */ + ret = aie2_send_mgmt_msg_wait(ndev, &msg); + if (ret) { + XDNA_ERR(xdna, "Query telemetry failed, status %d", ret); + goto free_buf; + } + + if (size < resp.size) { + ret = -EINVAL; + XDNA_ERR(xdna, "Bad buffer size. Available: %u. Needs: %u", size, resp.size); + goto free_buf; + } + + if (copy_to_user(buf, addr, resp.size)) { + ret = -EFAULT; + XDNA_ERR(xdna, "Failed to copy telemetry to user space"); + goto free_buf; + } + + header->major = resp.major; + header->minor = resp.minor; + +free_buf: + dma_free_noncoherent(xdna->ddev.dev, size, addr, dma_addr, DMA_FROM_DEVICE); + return ret; +} + int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, u32 size, void *handle, int (*cb)(void*, void __iomem *, size_t)) { @@ -377,15 +433,17 @@ int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, return xdna_mailbox_send_msg(ndev->mgmt_chann, &msg, TX_TIMEOUT); } -int aie2_config_cu(struct amdxdna_hwctx *hwctx) +int aie2_config_cu(struct amdxdna_hwctx *hwctx, + int (*notify_cb)(void *, void __iomem *, size_t)) { struct mailbox_channel *chann = hwctx->priv->mbox_chann; struct amdxdna_dev *xdna = hwctx->client->xdna; u32 shift = xdna->dev_info->dev_mem_buf_shift; - DECLARE_AIE2_MSG(config_cu, MSG_OP_CONFIG_CU); + struct config_cu_req req = { 0 }; + struct xdna_mailbox_msg msg; struct drm_gem_object *gobj; struct amdxdna_gem_obj *abo; - int ret, i; + int i; if (!chann) return -ENODEV; @@ -423,191 +481,386 @@ int aie2_config_cu(struct amdxdna_hwctx *hwctx) } req.num_cus = hwctx->cus->num_cus; - ret = xdna_send_msg_wait(xdna, chann, &msg); - if (ret == -ETIME) - aie2_destroy_context(xdna->dev_handle, hwctx); + msg.send_data = (u8 *)&req; + msg.send_size = sizeof(req); + msg.handle = hwctx; + msg.opcode = MSG_OP_CONFIG_CU; + msg.notify_cb = notify_cb; + return xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT); +} - if (resp.status == AIE2_STATUS_SUCCESS) { - XDNA_DBG(xdna, "Configure %d CUs, ret %d", req.num_cus, ret); - return 0; - } +static int aie2_init_exec_cu_req(struct amdxdna_gem_obj *cmd_bo, void *req, + size_t *size, u32 *msg_op) +{ + struct execute_buffer_req *cu_req = req; + u32 cmd_len; + void *cmd; - XDNA_ERR(xdna, "Command opcode 0x%x failed, status 0x%x ret %d", - msg.opcode, resp.status, ret); - return ret; + cmd = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); + if (cmd_len > sizeof(cu_req->payload)) + return -EINVAL; + + cu_req->cu_idx = amdxdna_cmd_get_cu_idx(cmd_bo); + if (cu_req->cu_idx == INVALID_CU_IDX) + return -EINVAL; + + memcpy(cu_req->payload, cmd, cmd_len); + + *size = sizeof(*cu_req); + *msg_op = MSG_OP_EXECUTE_BUFFER_CF; + return 0; } -int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, - int (*notify_cb)(void *, void __iomem *, size_t)) +static int aie2_init_exec_dpu_req(struct amdxdna_gem_obj *cmd_bo, void *req, + size_t *size, u32 *msg_op) { - struct mailbox_channel *chann = hwctx->priv->mbox_chann; - struct amdxdna_dev *xdna = hwctx->client->xdna; - struct amdxdna_gem_obj *cmd_abo = job->cmd_bo; - union { - struct execute_buffer_req 
ebuf; - struct exec_dpu_req dpu; - } req; - struct xdna_mailbox_msg msg; - u32 payload_len; - void *payload; - int cu_idx; - int ret; - u32 op; + struct exec_dpu_req *dpu_req = req; + struct amdxdna_cmd_start_npu *sn; + u32 cmd_len; - if (!chann) - return -ENODEV; + sn = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); + if (cmd_len - sizeof(*sn) > sizeof(dpu_req->payload)) + return -EINVAL; - payload = amdxdna_cmd_get_payload(cmd_abo, &payload_len); - if (!payload) { - XDNA_ERR(xdna, "Invalid command, cannot get payload"); + dpu_req->cu_idx = amdxdna_cmd_get_cu_idx(cmd_bo); + if (dpu_req->cu_idx == INVALID_CU_IDX) return -EINVAL; - } - cu_idx = amdxdna_cmd_get_cu_idx(cmd_abo); - if (cu_idx < 0) { - XDNA_DBG(xdna, "Invalid cu idx"); + dpu_req->inst_buf_addr = sn->buffer; + dpu_req->inst_size = sn->buffer_size; + dpu_req->inst_prop_cnt = sn->prop_count; + memcpy(dpu_req->payload, sn->prop_args, cmd_len - sizeof(*sn)); + + *size = sizeof(*dpu_req); + *msg_op = MSG_OP_EXEC_DPU; + return 0; +} + +static void aie2_init_exec_chain_req(void *req, u64 slot_addr, size_t size, u32 cmd_cnt) +{ + struct cmd_chain_req *chain_req = req; + + chain_req->buf_addr = slot_addr; + chain_req->buf_size = size; + chain_req->count = cmd_cnt; +} + +static void aie2_init_npu_chain_req(void *req, u64 slot_addr, size_t size, u32 cmd_cnt) +{ + struct cmd_chain_npu_req *npu_chain_req = req; + + npu_chain_req->flags = 0; + npu_chain_req->reserved = 0; + npu_chain_req->buf_addr = slot_addr; + npu_chain_req->buf_size = size; + npu_chain_req->count = cmd_cnt; +} + +static int +aie2_cmdlist_fill_cf(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size) +{ + struct cmd_chain_slot_execbuf_cf *cf_slot = slot; + u32 cmd_len; + void *cmd; + + cmd = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); + if (*size < sizeof(*cf_slot) + cmd_len) return -EINVAL; - } - op = amdxdna_cmd_get_op(cmd_abo); - switch (op) { + cf_slot->cu_idx = amdxdna_cmd_get_cu_idx(cmd_bo); + if (cf_slot->cu_idx == INVALID_CU_IDX) + return -EINVAL; + + cf_slot->arg_cnt = cmd_len / sizeof(u32); + memcpy(cf_slot->args, cmd, cmd_len); + /* Accurate slot size to hint firmware to do necessary copy */ + *size = sizeof(*cf_slot) + cmd_len; + return 0; +} + +static int +aie2_cmdlist_fill_dpu(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size) +{ + struct cmd_chain_slot_dpu *dpu_slot = slot; + struct amdxdna_cmd_start_npu *sn; + u32 cmd_len; + u32 arg_sz; + + sn = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); + arg_sz = cmd_len - sizeof(*sn); + if (cmd_len < sizeof(*sn) || arg_sz > MAX_DPU_ARGS_SIZE) + return -EINVAL; + + if (*size < sizeof(*dpu_slot) + arg_sz) + return -EINVAL; + + dpu_slot->cu_idx = amdxdna_cmd_get_cu_idx(cmd_bo); + if (dpu_slot->cu_idx == INVALID_CU_IDX) + return -EINVAL; + + dpu_slot->inst_buf_addr = sn->buffer; + dpu_slot->inst_size = sn->buffer_size; + dpu_slot->inst_prop_cnt = sn->prop_count; + dpu_slot->arg_cnt = arg_sz / sizeof(u32); + memcpy(dpu_slot->args, sn->prop_args, arg_sz); + + /* Accurate slot size to hint firmware to do necessary copy */ + *size = sizeof(*dpu_slot) + arg_sz; + return 0; +} + +static int aie2_cmdlist_unsupp(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size) +{ + return -EOPNOTSUPP; +} + +static u32 aie2_get_chain_msg_op(u32 cmd_op) +{ + switch (cmd_op) { case ERT_START_CU: - if (unlikely(payload_len > sizeof(req.ebuf.payload))) - XDNA_DBG(xdna, "Invalid ebuf payload len: %d", payload_len); - req.ebuf.cu_idx = cu_idx; - memcpy(req.ebuf.payload, payload, sizeof(req.ebuf.payload)); - msg.send_size = sizeof(req.ebuf); - 
msg.opcode = MSG_OP_EXECUTE_BUFFER_CF; - break; - case ERT_START_NPU: { - struct amdxdna_cmd_start_npu *sn = payload; - - if (unlikely(payload_len - sizeof(*sn) > sizeof(req.dpu.payload))) - XDNA_DBG(xdna, "Invalid dpu payload len: %d", payload_len); - req.dpu.inst_buf_addr = sn->buffer; - req.dpu.inst_size = sn->buffer_size; - req.dpu.inst_prop_cnt = sn->prop_count; - req.dpu.cu_idx = cu_idx; - memcpy(req.dpu.payload, sn->prop_args, sizeof(req.dpu.payload)); - msg.send_size = sizeof(req.dpu); - msg.opcode = MSG_OP_EXEC_DPU; + return MSG_OP_CHAIN_EXEC_BUFFER_CF; + case ERT_START_NPU: + return MSG_OP_CHAIN_EXEC_DPU; + default: break; } - default: - XDNA_DBG(xdna, "Invalid ERT cmd op code: %d", op); + + return MSG_OP_MAX_OPCODE; +} + +static struct aie2_exec_msg_ops legacy_exec_message_ops = { + .init_cu_req = aie2_init_exec_cu_req, + .init_dpu_req = aie2_init_exec_dpu_req, + .init_chain_req = aie2_init_exec_chain_req, + .fill_cf_slot = aie2_cmdlist_fill_cf, + .fill_dpu_slot = aie2_cmdlist_fill_dpu, + .fill_preempt_slot = aie2_cmdlist_unsupp, + .fill_elf_slot = aie2_cmdlist_unsupp, + .get_chain_msg_op = aie2_get_chain_msg_op, +}; + +static int +aie2_cmdlist_fill_npu_cf(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size) +{ + struct cmd_chain_slot_npu *npu_slot = slot; + u32 cmd_len; + void *cmd; + + cmd = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); + if (*size < sizeof(*npu_slot) + cmd_len) return -EINVAL; - } - msg.handle = job; - msg.notify_cb = notify_cb; - msg.send_data = (u8 *)&req; - print_hex_dump_debug("cmd: ", DUMP_PREFIX_OFFSET, 16, 4, &req, - 0x40, false); - ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT); - if (ret) { - XDNA_ERR(xdna, "Send message failed"); - return ret; - } + npu_slot->cu_idx = amdxdna_cmd_get_cu_idx(cmd_bo); + if (npu_slot->cu_idx == INVALID_CU_IDX) + return -EINVAL; + memset(npu_slot, 0, sizeof(*npu_slot)); + npu_slot->type = EXEC_NPU_TYPE_NON_ELF; + npu_slot->arg_cnt = cmd_len / sizeof(u32); + memcpy(npu_slot->args, cmd, cmd_len); + + *size = sizeof(*npu_slot) + cmd_len; return 0; } static int -aie2_cmdlist_fill_one_slot_cf(void *cmd_buf, u32 offset, - struct amdxdna_gem_obj *abo, u32 *size) +aie2_cmdlist_fill_npu_dpu(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size) { - struct cmd_chain_slot_execbuf_cf *buf = cmd_buf + offset; - int cu_idx = amdxdna_cmd_get_cu_idx(abo); - u32 payload_len; - void *payload; + struct cmd_chain_slot_npu *npu_slot = slot; + struct amdxdna_cmd_start_npu *sn; + u32 cmd_len; + u32 arg_sz; - if (cu_idx < 0) + sn = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); + arg_sz = cmd_len - sizeof(*sn); + if (cmd_len < sizeof(*sn) || arg_sz > MAX_NPU_ARGS_SIZE) return -EINVAL; - payload = amdxdna_cmd_get_payload(abo, &payload_len); - if (!payload) + if (*size < sizeof(*npu_slot) + arg_sz) return -EINVAL; - if (!slot_has_space(*buf, offset, payload_len)) - return -ENOSPC; + npu_slot->cu_idx = amdxdna_cmd_get_cu_idx(cmd_bo); + if (npu_slot->cu_idx == INVALID_CU_IDX) + return -EINVAL; + + memset(npu_slot, 0, sizeof(*npu_slot)); + npu_slot->type = EXEC_NPU_TYPE_PARTIAL_ELF; + npu_slot->inst_buf_addr = sn->buffer; + npu_slot->inst_size = sn->buffer_size; + npu_slot->inst_prop_cnt = sn->prop_count; + npu_slot->arg_cnt = arg_sz / sizeof(u32); + memcpy(npu_slot->args, sn->prop_args, arg_sz); - buf->cu_idx = cu_idx; - buf->arg_cnt = payload_len / sizeof(u32); - memcpy(buf->args, payload, payload_len); - /* Accurate buf size to hint firmware to do necessary copy */ - *size = sizeof(*buf) + payload_len; + *size = sizeof(*npu_slot) + 
arg_sz; return 0; } static int -aie2_cmdlist_fill_one_slot_dpu(void *cmd_buf, u32 offset, - struct amdxdna_gem_obj *abo, u32 *size) +aie2_cmdlist_fill_npu_preempt(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size) { - struct cmd_chain_slot_dpu *buf = cmd_buf + offset; - int cu_idx = amdxdna_cmd_get_cu_idx(abo); - struct amdxdna_cmd_start_npu *sn; - u32 payload_len; - void *payload; + struct cmd_chain_slot_npu *npu_slot = slot; + struct amdxdna_cmd_preempt_data *pd; + u32 cmd_len; u32 arg_sz; - if (cu_idx < 0) + pd = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); + arg_sz = cmd_len - sizeof(*pd); + if (cmd_len < sizeof(*pd) || arg_sz > MAX_NPU_ARGS_SIZE) return -EINVAL; - payload = amdxdna_cmd_get_payload(abo, &payload_len); - if (!payload) + if (*size < sizeof(*npu_slot) + arg_sz) return -EINVAL; - sn = payload; - arg_sz = payload_len - sizeof(*sn); - if (payload_len < sizeof(*sn) || arg_sz > MAX_DPU_ARGS_SIZE) + + npu_slot->cu_idx = amdxdna_cmd_get_cu_idx(cmd_bo); + if (npu_slot->cu_idx == INVALID_CU_IDX) return -EINVAL; - if (!slot_has_space(*buf, offset, arg_sz)) - return -ENOSPC; + memset(npu_slot, 0, sizeof(*npu_slot)); + npu_slot->type = EXEC_NPU_TYPE_PREEMPT; + npu_slot->inst_buf_addr = pd->inst_buf; + npu_slot->save_buf_addr = pd->save_buf; + npu_slot->restore_buf_addr = pd->restore_buf; + npu_slot->inst_size = pd->inst_size; + npu_slot->save_size = pd->save_size; + npu_slot->restore_size = pd->restore_size; + npu_slot->inst_prop_cnt = pd->inst_prop_cnt; + npu_slot->arg_cnt = arg_sz / sizeof(u32); + memcpy(npu_slot->args, pd->prop_args, arg_sz); + + *size = sizeof(*npu_slot) + arg_sz; + return 0; +} + +static int +aie2_cmdlist_fill_npu_elf(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size) +{ + struct cmd_chain_slot_npu *npu_slot = slot; + struct amdxdna_cmd_preempt_data *pd; + u32 cmd_len; + u32 arg_sz; + + pd = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); + arg_sz = cmd_len - sizeof(*pd); + if (cmd_len < sizeof(*pd) || arg_sz > MAX_NPU_ARGS_SIZE) + return -EINVAL; - buf->inst_buf_addr = sn->buffer; - buf->inst_size = sn->buffer_size; - buf->inst_prop_cnt = sn->prop_count; - buf->cu_idx = cu_idx; - buf->arg_cnt = arg_sz / sizeof(u32); - memcpy(buf->args, sn->prop_args, arg_sz); + if (*size < sizeof(*npu_slot) + arg_sz) + return -EINVAL; - /* Accurate buf size to hint firmware to do necessary copy */ - *size = sizeof(*buf) + arg_sz; + memset(npu_slot, 0, sizeof(*npu_slot)); + npu_slot->type = EXEC_NPU_TYPE_ELF; + npu_slot->inst_buf_addr = pd->inst_buf; + npu_slot->save_buf_addr = pd->save_buf; + npu_slot->restore_buf_addr = pd->restore_buf; + npu_slot->inst_size = pd->inst_size; + npu_slot->save_size = pd->save_size; + npu_slot->restore_size = pd->restore_size; + npu_slot->inst_prop_cnt = pd->inst_prop_cnt; + npu_slot->arg_cnt = 1; + npu_slot->args[0] = AIE2_EXEC_BUFFER_KERNEL_OP_TXN; + + *size = struct_size(npu_slot, args, npu_slot->arg_cnt); return 0; } -static int -aie2_cmdlist_fill_one_slot(u32 op, struct amdxdna_gem_obj *cmdbuf_abo, u32 offset, - struct amdxdna_gem_obj *abo, u32 *size) +static u32 aie2_get_npu_chain_msg_op(u32 cmd_op) +{ + return MSG_OP_CHAIN_EXEC_NPU; +} + +static struct aie2_exec_msg_ops npu_exec_message_ops = { + .init_cu_req = aie2_init_exec_cu_req, + .init_dpu_req = aie2_init_exec_dpu_req, + .init_chain_req = aie2_init_npu_chain_req, + .fill_cf_slot = aie2_cmdlist_fill_npu_cf, + .fill_dpu_slot = aie2_cmdlist_fill_npu_dpu, + .fill_preempt_slot = aie2_cmdlist_fill_npu_preempt, + .fill_elf_slot = aie2_cmdlist_fill_npu_elf, + .get_chain_msg_op = 
aie2_get_npu_chain_msg_op, +}; + +static int aie2_init_exec_req(void *req, struct amdxdna_gem_obj *cmd_abo, + size_t *size, u32 *msg_op) { - u32 this_op = amdxdna_cmd_get_op(abo); - void *cmd_buf = cmdbuf_abo->mem.kva; + struct amdxdna_dev *xdna = cmd_abo->client->xdna; int ret; + u32 op; - if (this_op != op) { - ret = -EINVAL; - goto done; - } + op = amdxdna_cmd_get_op(cmd_abo); switch (op) { case ERT_START_CU: - ret = aie2_cmdlist_fill_one_slot_cf(cmd_buf, offset, abo, size); + ret = EXEC_MSG_OPS(xdna)->init_cu_req(cmd_abo, req, size, msg_op); + if (ret) { + XDNA_DBG(xdna, "Init CU req failed ret %d", ret); + return ret; + } break; case ERT_START_NPU: - ret = aie2_cmdlist_fill_one_slot_dpu(cmd_buf, offset, abo, size); + ret = EXEC_MSG_OPS(xdna)->init_dpu_req(cmd_abo, req, size, msg_op); + if (ret) { + XDNA_DBG(xdna, "Init DPU req failed ret %d", ret); + return ret; + } + break; default: + XDNA_ERR(xdna, "Unsupported op %d", op); ret = -EOPNOTSUPP; + break; } -done: - if (ret) { - XDNA_ERR(abo->client->xdna, "Can't fill slot for cmd op %d ret %d", - op, ret); + return ret; +} + +static int +aie2_cmdlist_fill_slot(void *slot, struct amdxdna_gem_obj *cmd_abo, + size_t *size, u32 *cmd_op) +{ + struct amdxdna_dev *xdna = cmd_abo->client->xdna; + int ret; + u32 op; + + op = amdxdna_cmd_get_op(cmd_abo); + if (*cmd_op == ERT_INVALID_CMD) + *cmd_op = op; + else if (op != *cmd_op) + return -EINVAL; + + switch (op) { + case ERT_START_CU: + ret = EXEC_MSG_OPS(xdna)->fill_cf_slot(cmd_abo, slot, size); + break; + case ERT_START_NPU: + ret = EXEC_MSG_OPS(xdna)->fill_dpu_slot(cmd_abo, slot, size); + break; + case ERT_START_NPU_PREEMPT: + if (!AIE2_FEATURE_ON(xdna->dev_handle, AIE2_PREEMPT)) + return -EOPNOTSUPP; + ret = EXEC_MSG_OPS(xdna)->fill_preempt_slot(cmd_abo, slot, size); + break; + case ERT_START_NPU_PREEMPT_ELF: + if (!AIE2_FEATURE_ON(xdna->dev_handle, AIE2_PREEMPT)) + return -EOPNOTSUPP; + ret = EXEC_MSG_OPS(xdna)->fill_elf_slot(cmd_abo, slot, size); + break; + default: + XDNA_INFO(xdna, "Unsupported op %d", op); + ret = -EOPNOTSUPP; + break; } + return ret; } +void aie2_msg_init(struct amdxdna_dev_hdl *ndev) +{ + if (AIE2_FEATURE_ON(ndev, AIE2_NPU_COMMAND)) + ndev->exec_msg_ops = &npu_exec_message_ops; + else + ndev->exec_msg_ops = &legacy_exec_message_ops; +} + static inline struct amdxdna_gem_obj * aie2_cmdlist_get_cmd_buf(struct amdxdna_sched_job *job) { @@ -616,29 +869,36 @@ aie2_cmdlist_get_cmd_buf(struct amdxdna_sched_job *job) return job->hwctx->priv->cmd_buf[idx]; } -static void -aie2_cmdlist_prepare_request(struct cmd_chain_req *req, - struct amdxdna_gem_obj *cmdbuf_abo, u32 size, u32 cnt) +int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, + int (*notify_cb)(void *, void __iomem *, size_t)) { - req->buf_addr = cmdbuf_abo->mem.dev_addr; - req->buf_size = size; - req->count = cnt; - drm_clflush_virt_range(cmdbuf_abo->mem.kva, size); - XDNA_DBG(cmdbuf_abo->client->xdna, "Command buf addr 0x%llx size 0x%x count %d", - req->buf_addr, size, cnt); -} + struct mailbox_channel *chann = hwctx->priv->mbox_chann; + struct amdxdna_dev *xdna = hwctx->client->xdna; + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo; + struct xdna_mailbox_msg msg; + union exec_req req; + int ret; -static inline u32 -aie2_cmd_op_to_msg_op(u32 op) -{ - switch (op) { - case ERT_START_CU: - return MSG_OP_CHAIN_EXEC_BUFFER_CF; - case ERT_START_NPU: - return MSG_OP_CHAIN_EXEC_DPU; - default: - return MSG_OP_MAX_OPCODE; + if (!chann) + return -ENODEV; + + ret = aie2_init_exec_req(&req, cmd_abo, 
&msg.send_size, &msg.opcode); + if (ret) + return ret; + + msg.handle = job; + msg.notify_cb = notify_cb; + msg.send_data = (u8 *)&req; + print_hex_dump_debug("cmd: ", DUMP_PREFIX_OFFSET, 16, 4, &req, + 0x40, false); + + ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT); + if (ret) { + XDNA_ERR(xdna, "Send message failed"); + return ret; } + + return 0; } int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx, @@ -649,12 +909,13 @@ int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx, struct mailbox_channel *chann = hwctx->priv->mbox_chann; struct amdxdna_client *client = hwctx->client; struct amdxdna_gem_obj *cmd_abo = job->cmd_bo; + struct amdxdna_dev *xdna = client->xdna; struct amdxdna_cmd_chain *payload; struct xdna_mailbox_msg msg; - struct cmd_chain_req req; + union exec_chain_req req; u32 payload_len; u32 offset = 0; - u32 size; + size_t size; int ret; u32 op; u32 i; @@ -665,41 +926,42 @@ int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx, payload_len < struct_size(payload, data, payload->command_count)) return -EINVAL; + op = ERT_INVALID_CMD; for (i = 0; i < payload->command_count; i++) { u32 boh = (u32)(payload->data[i]); struct amdxdna_gem_obj *abo; abo = amdxdna_gem_get_obj(client, boh, AMDXDNA_BO_CMD); if (!abo) { - XDNA_ERR(client->xdna, "Failed to find cmd BO %d", boh); + XDNA_ERR(xdna, "Failed to find cmd BO %d", boh); return -ENOENT; } - /* All sub-cmd should have same op, use the first one. */ - if (i == 0) - op = amdxdna_cmd_get_op(abo); - - ret = aie2_cmdlist_fill_one_slot(op, cmdbuf_abo, offset, abo, &size); + size = cmdbuf_abo->mem.size - offset; + ret = aie2_cmdlist_fill_slot(cmdbuf_abo->mem.kva + offset, + abo, &size, &op); amdxdna_gem_put_obj(abo); if (ret) - return -EINVAL; + return ret; offset += size; } + msg.opcode = EXEC_MSG_OPS(xdna)->get_chain_msg_op(op); + if (msg.opcode == MSG_OP_MAX_OPCODE) + return -EOPNOTSUPP; /* The offset is the accumulated total size of the cmd buffer */ - aie2_cmdlist_prepare_request(&req, cmdbuf_abo, offset, payload->command_count); + EXEC_MSG_OPS(xdna)->init_chain_req(&req, cmdbuf_abo->mem.dev_addr, + offset, payload->command_count); + drm_clflush_virt_range(cmdbuf_abo->mem.kva, offset); - msg.opcode = aie2_cmd_op_to_msg_op(op); - if (msg.opcode == MSG_OP_MAX_OPCODE) - return -EOPNOTSUPP; msg.handle = job; msg.notify_cb = notify_cb; msg.send_data = (u8 *)&req; msg.send_size = sizeof(req); ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT); if (ret) { - XDNA_ERR(hwctx->client->xdna, "Send message failed"); + XDNA_ERR(xdna, "Send message failed"); return ret; } @@ -712,23 +974,27 @@ int aie2_cmdlist_single_execbuf(struct amdxdna_hwctx *hwctx, { struct amdxdna_gem_obj *cmdbuf_abo = aie2_cmdlist_get_cmd_buf(job); struct mailbox_channel *chann = hwctx->priv->mbox_chann; + struct amdxdna_dev *xdna = hwctx->client->xdna; struct amdxdna_gem_obj *cmd_abo = job->cmd_bo; struct xdna_mailbox_msg msg; - struct cmd_chain_req req; - u32 size; + union exec_chain_req req; + u32 op = ERT_INVALID_CMD; + size_t size; int ret; - u32 op; - op = amdxdna_cmd_get_op(cmd_abo); - ret = aie2_cmdlist_fill_one_slot(op, cmdbuf_abo, 0, cmd_abo, &size); + size = cmdbuf_abo->mem.size; + ret = aie2_cmdlist_fill_slot(cmdbuf_abo->mem.kva, cmd_abo, &size, &op); if (ret) return ret; - aie2_cmdlist_prepare_request(&req, cmdbuf_abo, size, 1); - - msg.opcode = aie2_cmd_op_to_msg_op(op); + msg.opcode = EXEC_MSG_OPS(xdna)->get_chain_msg_op(op); if (msg.opcode == MSG_OP_MAX_OPCODE) return -EOPNOTSUPP; + + EXEC_MSG_OPS(xdna)->init_chain_req(&req, 
cmdbuf_abo->mem.dev_addr, + size, 1); + drm_clflush_virt_range(cmdbuf_abo->mem.kva, size); + msg.handle = job; msg.notify_cb = notify_cb; msg.send_data = (u8 *)&req; @@ -753,7 +1019,7 @@ int aie2_sync_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, int ret = 0; req.src_addr = 0; - req.dst_addr = abo->mem.dev_addr - hwctx->client->dev_heap->mem.dev_addr; + req.dst_addr = amdxdna_dev_bo_offset(abo); req.size = abo->mem.size; /* Device to Host */ @@ -777,3 +1043,32 @@ int aie2_sync_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, return 0; } + +int aie2_config_debug_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, + int (*notify_cb)(void *, void __iomem *, size_t)) +{ + struct mailbox_channel *chann = hwctx->priv->mbox_chann; + struct amdxdna_gem_obj *abo = to_xdna_obj(job->bos[0]); + struct amdxdna_dev *xdna = hwctx->client->xdna; + struct config_debug_bo_req req; + struct xdna_mailbox_msg msg; + + if (job->drv_cmd->opcode == ATTACH_DEBUG_BO) + req.config = DEBUG_BO_REGISTER; + else + req.config = DEBUG_BO_UNREGISTER; + + req.offset = amdxdna_dev_bo_offset(abo); + req.size = abo->mem.size; + + XDNA_DBG(xdna, "offset 0x%llx size 0x%llx config %d", + req.offset, req.size, req.config); + + msg.handle = job; + msg.notify_cb = notify_cb; + msg.send_data = (u8 *)&req; + msg.send_size = sizeof(req); + msg.opcode = MSG_OP_CONFIG_DEBUG_BO; + + return xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT); +} diff --git a/drivers/accel/amdxdna/aie2_msg_priv.h b/drivers/accel/amdxdna/aie2_msg_priv.h index 6df9065b13f6..1c957a6298d3 100644 --- a/drivers/accel/amdxdna/aie2_msg_priv.h +++ b/drivers/accel/amdxdna/aie2_msg_priv.h @@ -9,7 +9,8 @@ enum aie2_msg_opcode { MSG_OP_CREATE_CONTEXT = 0x2, MSG_OP_DESTROY_CONTEXT = 0x3, - MSG_OP_SYNC_BO = 0x7, + MSG_OP_GET_TELEMETRY = 0x4, + MSG_OP_SYNC_BO = 0x7, MSG_OP_EXECUTE_BUFFER_CF = 0xC, MSG_OP_QUERY_COL_STATUS = 0xD, MSG_OP_QUERY_AIE_TILE_INFO = 0xE, @@ -18,6 +19,8 @@ enum aie2_msg_opcode { MSG_OP_CONFIG_CU = 0x11, MSG_OP_CHAIN_EXEC_BUFFER_CF = 0x12, MSG_OP_CHAIN_EXEC_DPU = 0x13, + MSG_OP_CONFIG_DEBUG_BO = 0x14, + MSG_OP_CHAIN_EXEC_NPU = 0x18, MSG_OP_MAX_XRT_OPCODE, MSG_OP_SUSPEND = 0x101, MSG_OP_RESUME = 0x102, @@ -135,6 +138,28 @@ struct destroy_ctx_resp { enum aie2_msg_status status; } __packed; +enum telemetry_type { + TELEMETRY_TYPE_DISABLED, + TELEMETRY_TYPE_HEALTH, + TELEMETRY_TYPE_ERROR_INFO, + TELEMETRY_TYPE_PROFILING, + TELEMETRY_TYPE_DEBUG, + MAX_TELEMETRY_TYPE +}; + +struct get_telemetry_req { + enum telemetry_type type; + __u64 buf_addr; + __u32 buf_size; +} __packed; + +struct get_telemetry_resp { + __u32 major; + __u32 minor; + __u32 size; + enum aie2_msg_status status; +} __packed; + struct execute_buffer_req { __u32 cu_idx; __u32 payload[19]; @@ -148,6 +173,18 @@ struct exec_dpu_req { __u32 payload[35]; } __packed; +enum exec_npu_type { + EXEC_NPU_TYPE_NON_ELF = 0x1, + EXEC_NPU_TYPE_PARTIAL_ELF = 0x2, + EXEC_NPU_TYPE_PREEMPT = 0x3, + EXEC_NPU_TYPE_ELF = 0x4, +}; + +union exec_req { + struct execute_buffer_req ebuf; + struct exec_dpu_req dpu_req; +}; + struct execute_buffer_resp { enum aie2_msg_status status; } __packed; @@ -319,9 +356,6 @@ struct async_event_msg_resp { } __packed; #define MAX_CHAIN_CMDBUF_SIZE SZ_4K -#define slot_has_space(slot, offset, payload_size) \ - (MAX_CHAIN_CMDBUF_SIZE >= (offset) + (payload_size) + \ - sizeof(typeof(slot))) struct cmd_chain_slot_execbuf_cf { __u32 cu_idx; @@ -339,12 +373,41 @@ struct cmd_chain_slot_dpu { __u32 args[] __counted_by(arg_cnt); }; +#define 
MAX_NPU_ARGS_SIZE (26 * sizeof(__u32)) +#define AIE2_EXEC_BUFFER_KERNEL_OP_TXN 3 +struct cmd_chain_slot_npu { + enum exec_npu_type type; + u64 inst_buf_addr; + u64 save_buf_addr; + u64 restore_buf_addr; + u32 inst_size; + u32 save_size; + u32 restore_size; + u32 inst_prop_cnt; + u32 cu_idx; + u32 arg_cnt; + u32 args[] __counted_by(arg_cnt); +} __packed; + struct cmd_chain_req { __u64 buf_addr; __u32 buf_size; __u32 count; } __packed; +struct cmd_chain_npu_req { + u32 flags; + u32 reserved; + u64 buf_addr; + u32 buf_size; + u32 count; +} __packed; + +union exec_chain_req { + struct cmd_chain_npu_req npu_req; + struct cmd_chain_req req; +}; + struct cmd_chain_resp { enum aie2_msg_status status; __u32 fail_cmd_idx; @@ -365,4 +428,21 @@ struct sync_bo_req { struct sync_bo_resp { enum aie2_msg_status status; } __packed; + +#define DEBUG_BO_UNREGISTER 0 +#define DEBUG_BO_REGISTER 1 +struct config_debug_bo_req { + __u64 offset; + __u64 size; + /* + * config operations. + * DEBUG_BO_REGISTER: Register debug buffer + * DEBUG_BO_UNREGISTER: Unregister debug buffer + */ + __u32 config; +} __packed; + +struct config_debug_bo_resp { + enum aie2_msg_status status; +} __packed; #endif /* _AIE2_MSG_PRIV_H_ */ diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c index 87c425e3d2b9..ceef1c502e9e 100644 --- a/drivers/accel/amdxdna/aie2_pci.c +++ b/drivers/accel/amdxdna/aie2_pci.c @@ -25,6 +25,7 @@ #include "amdxdna_gem.h" #include "amdxdna_mailbox.h" #include "amdxdna_pci_drv.h" +#include "amdxdna_pm.h" static int aie2_max_col = XRS_MAX_COL; module_param(aie2_max_col, uint, 0600); @@ -54,6 +55,7 @@ struct mgmt_mbox_chann_info { static int aie2_check_protocol(struct amdxdna_dev_hdl *ndev, u32 fw_major, u32 fw_minor) { + const struct aie2_fw_feature_tbl *feature; struct amdxdna_dev *xdna = ndev->xdna; /* @@ -77,6 +79,17 @@ static int aie2_check_protocol(struct amdxdna_dev_hdl *ndev, u32 fw_major, u32 f XDNA_ERR(xdna, "Firmware minor version smaller than supported"); return -EINVAL; } + + for (feature = ndev->priv->fw_feature_tbl; feature && feature->min_minor; + feature++) { + if (fw_minor < feature->min_minor) + continue; + if (feature->max_minor > 0 && fw_minor > feature->max_minor) + continue; + + set_bit(feature->feature, &ndev->feature_mask); + } + return 0; } @@ -170,6 +183,10 @@ int aie2_runtime_cfg(struct amdxdna_dev_hdl *ndev, if (cfg->category != category) continue; + if (cfg->feature_mask && + bitmap_subset(&cfg->feature_mask, &ndev->feature_mask, AIE2_FEATURE_MAX)) + continue; + value = val ? 
*val : cfg->value; ret = aie2_set_runtime_cfg(ndev, cfg->type, value); if (ret) { @@ -223,15 +240,6 @@ static int aie2_mgmt_fw_init(struct amdxdna_dev_hdl *ndev) return ret; } - if (!ndev->async_events) - return 0; - - ret = aie2_error_async_events_send(ndev); - if (ret) { - XDNA_ERR(ndev->xdna, "Send async events failed"); - return ret; - } - return 0; } @@ -257,6 +265,8 @@ static int aie2_mgmt_fw_query(struct amdxdna_dev_hdl *ndev) return ret; } + ndev->total_col = min(aie2_max_col, ndev->metadata.cols); + return 0; } @@ -338,6 +348,7 @@ static void aie2_hw_stop(struct amdxdna_dev *xdna) ndev->mbox = NULL; aie2_psp_stop(ndev->psp_hdl); aie2_smu_fini(ndev); + aie2_error_async_events_free(ndev); pci_disable_device(pdev); ndev->dev_status = AIE2_DEV_INIT; @@ -424,6 +435,18 @@ static int aie2_hw_start(struct amdxdna_dev *xdna) goto destroy_mgmt_chann; } + ret = aie2_mgmt_fw_query(ndev); + if (ret) { + XDNA_ERR(xdna, "failed to query fw, ret %d", ret); + goto destroy_mgmt_chann; + } + + ret = aie2_error_async_events_alloc(ndev); + if (ret) { + XDNA_ERR(xdna, "Allocate async events failed, ret %d", ret); + goto destroy_mgmt_chann; + } + ndev->dev_status = AIE2_DEV_START; return 0; @@ -459,7 +482,6 @@ static int aie2_hw_resume(struct amdxdna_dev *xdna) struct amdxdna_client *client; int ret; - guard(mutex)(&xdna->dev_lock); ret = aie2_hw_start(xdna); if (ret) { XDNA_ERR(xdna, "Start hardware failed, %d", ret); @@ -565,13 +587,6 @@ static int aie2_init(struct amdxdna_dev *xdna) goto release_fw; } - ret = aie2_mgmt_fw_query(ndev); - if (ret) { - XDNA_ERR(xdna, "Query firmware failed, ret %d", ret); - goto stop_hw; - } - ndev->total_col = min(aie2_max_col, ndev->metadata.cols); - xrs_cfg.clk_list.num_levels = ndev->max_dpm_level + 1; for (i = 0; i < xrs_cfg.clk_list.num_levels; i++) xrs_cfg.clk_list.cu_clk_list[i] = ndev->priv->dpm_clk_tbl[i].hclk; @@ -587,30 +602,11 @@ static int aie2_init(struct amdxdna_dev *xdna) goto stop_hw; } - ret = aie2_error_async_events_alloc(ndev); - if (ret) { - XDNA_ERR(xdna, "Allocate async events failed, ret %d", ret); - goto stop_hw; - } - - ret = aie2_error_async_events_send(ndev); - if (ret) { - XDNA_ERR(xdna, "Send async events failed, ret %d", ret); - goto async_event_free; - } - - /* Issue a command to make sure firmware handled async events */ - ret = aie2_query_firmware_version(ndev, &ndev->xdna->fw_ver); - if (ret) { - XDNA_ERR(xdna, "Re-query firmware version failed"); - goto async_event_free; - } - release_firmware(fw); + aie2_msg_init(ndev); + amdxdna_pm_init(xdna); return 0; -async_event_free: - aie2_error_async_events_free(ndev); stop_hw: aie2_hw_stop(xdna); release_fw: @@ -621,10 +617,8 @@ release_fw: static void aie2_fini(struct amdxdna_dev *xdna) { - struct amdxdna_dev_hdl *ndev = xdna->dev_handle; - + amdxdna_pm_fini(xdna); aie2_hw_stop(xdna); - aie2_error_async_events_free(ndev); } static int aie2_get_aie_status(struct amdxdna_client *client, @@ -845,7 +839,120 @@ static int aie2_get_hwctx_status(struct amdxdna_client *client, } args->buffer_size -= (u32)(array_args.buffer - args->buffer); - return ret; + return 0; +} + +static int aie2_query_resource_info(struct amdxdna_client *client, + struct amdxdna_drm_get_info *args) +{ + struct amdxdna_drm_get_resource_info res_info; + const struct amdxdna_dev_priv *priv; + struct amdxdna_dev_hdl *ndev; + struct amdxdna_dev *xdna; + + xdna = client->xdna; + ndev = xdna->dev_handle; + priv = ndev->priv; + + res_info.npu_clk_max = priv->dpm_clk_tbl[ndev->max_dpm_level].hclk; + res_info.npu_tops_max = 
ndev->max_tops; + res_info.npu_task_max = priv->hwctx_limit; + res_info.npu_tops_curr = ndev->curr_tops; + res_info.npu_task_curr = ndev->hwctx_num; + + if (copy_to_user(u64_to_user_ptr(args->buffer), &res_info, sizeof(res_info))) + return -EFAULT; + + return 0; +} + +static int aie2_fill_hwctx_map(struct amdxdna_hwctx *hwctx, void *arg) +{ + struct amdxdna_dev *xdna = hwctx->client->xdna; + u32 *map = arg; + + if (hwctx->fw_ctx_id >= xdna->dev_handle->priv->hwctx_limit) { + XDNA_ERR(xdna, "Invalid fw ctx id %d/%d ", hwctx->fw_ctx_id, + xdna->dev_handle->priv->hwctx_limit); + return -EINVAL; + } + + map[hwctx->fw_ctx_id] = hwctx->id; + return 0; +} + +static int aie2_get_telemetry(struct amdxdna_client *client, + struct amdxdna_drm_get_info *args) +{ + struct amdxdna_drm_query_telemetry_header *header __free(kfree) = NULL; + u32 telemetry_data_sz, header_sz, elem_num; + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_client *tmp_client; + int ret; + + elem_num = xdna->dev_handle->priv->hwctx_limit; + header_sz = struct_size(header, map, elem_num); + if (args->buffer_size <= header_sz) { + XDNA_ERR(xdna, "Invalid buffer size"); + return -EINVAL; + } + + telemetry_data_sz = args->buffer_size - header_sz; + if (telemetry_data_sz > SZ_4M) { + XDNA_ERR(xdna, "Buffer size is too big, %d", telemetry_data_sz); + return -EINVAL; + } + + header = kzalloc(header_sz, GFP_KERNEL); + if (!header) + return -ENOMEM; + + if (copy_from_user(header, u64_to_user_ptr(args->buffer), sizeof(*header))) { + XDNA_ERR(xdna, "Failed to copy telemetry header from user"); + return -EFAULT; + } + + header->map_num_elements = elem_num; + list_for_each_entry(tmp_client, &xdna->client_list, node) { + ret = amdxdna_hwctx_walk(tmp_client, &header->map, + aie2_fill_hwctx_map); + if (ret) + return ret; + } + + ret = aie2_query_telemetry(xdna->dev_handle, + u64_to_user_ptr(args->buffer + header_sz), + telemetry_data_sz, header); + if (ret) { + XDNA_ERR(xdna, "Query telemetry failed ret %d", ret); + return ret; + } + + if (copy_to_user(u64_to_user_ptr(args->buffer), header, header_sz)) { + XDNA_ERR(xdna, "Copy header failed"); + return -EFAULT; + } + + return 0; +} + +static int aie2_get_preempt_state(struct amdxdna_client *client, + struct amdxdna_drm_get_info *args) +{ + struct amdxdna_drm_attribute_state state = {}; + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_dev_hdl *ndev; + + ndev = xdna->dev_handle; + if (args->param == DRM_AMDXDNA_GET_FORCE_PREEMPT_STATE) + state.state = ndev->force_preempt_enabled; + else if (args->param == DRM_AMDXDNA_GET_FRAME_BOUNDARY_PREEMPT_STATE) + state.state = ndev->frame_boundary_preempt; + + if (copy_to_user(u64_to_user_ptr(args->buffer), &state, sizeof(state))) + return -EFAULT; + + return 0; } static int aie2_get_info(struct amdxdna_client *client, struct amdxdna_drm_get_info *args) @@ -856,6 +963,10 @@ static int aie2_get_info(struct amdxdna_client *client, struct amdxdna_drm_get_i if (!drm_dev_enter(&xdna->ddev, &idx)) return -ENODEV; + ret = amdxdna_pm_resume_get(xdna); + if (ret) + goto dev_exit; + switch (args->param) { case DRM_AMDXDNA_QUERY_AIE_STATUS: ret = aie2_get_aie_status(client, args); @@ -878,12 +989,25 @@ static int aie2_get_info(struct amdxdna_client *client, struct amdxdna_drm_get_i case DRM_AMDXDNA_GET_POWER_MODE: ret = aie2_get_power_mode(client, args); break; + case DRM_AMDXDNA_QUERY_TELEMETRY: + ret = aie2_get_telemetry(client, args); + break; + case DRM_AMDXDNA_QUERY_RESOURCE_INFO: + ret = aie2_query_resource_info(client, args); + break; + case 
DRM_AMDXDNA_GET_FORCE_PREEMPT_STATE: + case DRM_AMDXDNA_GET_FRAME_BOUNDARY_PREEMPT_STATE: + ret = aie2_get_preempt_state(client, args); + break; default: XDNA_ERR(xdna, "Not supported request parameter %u", args->param); ret = -EOPNOTSUPP; } + + amdxdna_pm_suspend_put(xdna); XDNA_DBG(xdna, "Got param %d", args->param); +dev_exit: drm_dev_exit(idx); return ret; } @@ -898,6 +1022,12 @@ static int aie2_query_ctx_status_array(struct amdxdna_client *client, drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + if (args->element_size > SZ_4K || args->num_element > SZ_1K) { + XDNA_DBG(xdna, "Invalid element size %d or number of element %d", + args->element_size, args->num_element); + return -EINVAL; + } + array_args.element_size = min(args->element_size, sizeof(struct amdxdna_drm_hwctx_entry)); array_args.buffer = args->buffer; @@ -914,7 +1044,7 @@ static int aie2_query_ctx_status_array(struct amdxdna_client *client, args->num_element = (u32)((array_args.buffer - args->buffer) / args->element_size); - return ret; + return 0; } static int aie2_get_array(struct amdxdna_client *client, @@ -926,16 +1056,26 @@ static int aie2_get_array(struct amdxdna_client *client, if (!drm_dev_enter(&xdna->ddev, &idx)) return -ENODEV; + ret = amdxdna_pm_resume_get(xdna); + if (ret) + goto dev_exit; + switch (args->param) { case DRM_AMDXDNA_HW_CONTEXT_ALL: ret = aie2_query_ctx_status_array(client, args); break; + case DRM_AMDXDNA_HW_LAST_ASYNC_ERR: + ret = aie2_get_array_async_error(xdna->dev_handle, args); + break; default: XDNA_ERR(xdna, "Not supported request parameter %u", args->param); ret = -EOPNOTSUPP; } + + amdxdna_pm_suspend_put(xdna); XDNA_DBG(xdna, "Got param %d", args->param); +dev_exit: drm_dev_exit(idx); return ret; } @@ -965,6 +1105,38 @@ static int aie2_set_power_mode(struct amdxdna_client *client, return aie2_pm_set_mode(xdna->dev_handle, power_mode); } +static int aie2_set_preempt_state(struct amdxdna_client *client, + struct amdxdna_drm_set_state *args) +{ + struct amdxdna_dev_hdl *ndev = client->xdna->dev_handle; + struct amdxdna_drm_attribute_state state; + u32 val; + int ret; + + if (copy_from_user(&state, u64_to_user_ptr(args->buffer), sizeof(state))) + return -EFAULT; + + if (state.state > 1) + return -EINVAL; + + if (XDNA_MBZ_DBG(client->xdna, state.pad, sizeof(state.pad))) + return -EINVAL; + + if (args->param == DRM_AMDXDNA_SET_FORCE_PREEMPT) { + ndev->force_preempt_enabled = state.state; + } else if (args->param == DRM_AMDXDNA_SET_FRAME_BOUNDARY_PREEMPT) { + val = state.state; + ret = aie2_runtime_cfg(ndev, AIE2_RT_CFG_FRAME_BOUNDARY_PREEMPT, + &val); + if (ret) + return ret; + + ndev->frame_boundary_preempt = state.state; + } + + return 0; +} + static int aie2_set_state(struct amdxdna_client *client, struct amdxdna_drm_set_state *args) { @@ -974,16 +1146,26 @@ static int aie2_set_state(struct amdxdna_client *client, if (!drm_dev_enter(&xdna->ddev, &idx)) return -ENODEV; + ret = amdxdna_pm_resume_get(xdna); + if (ret) + goto dev_exit; + switch (args->param) { case DRM_AMDXDNA_SET_POWER_MODE: ret = aie2_set_power_mode(client, args); break; + case DRM_AMDXDNA_SET_FORCE_PREEMPT: + case DRM_AMDXDNA_SET_FRAME_BOUNDARY_PREEMPT: + ret = aie2_set_preempt_state(client, args); + break; default: XDNA_ERR(xdna, "Not supported request parameter %u", args->param); ret = -EOPNOTSUPP; break; } + amdxdna_pm_suspend_put(xdna); +dev_exit: drm_dev_exit(idx); return ret; } @@ -998,6 +1180,7 @@ const struct amdxdna_dev_ops aie2_ops = { .hwctx_init = aie2_hwctx_init, .hwctx_fini = aie2_hwctx_fini, 
.hwctx_config = aie2_hwctx_config, + .hwctx_sync_debug_bo = aie2_hwctx_sync_debug_bo, .cmd_submit = aie2_cmd_submit, .hmm_invalidate = aie2_hmm_invalidate, .get_array = aie2_get_array, diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h index 91a8e948f82a..a5f9c42155d1 100644 --- a/drivers/accel/amdxdna/aie2_pci.h +++ b/drivers/accel/amdxdna/aie2_pci.h @@ -110,12 +110,15 @@ struct aie_metadata { enum rt_config_category { AIE2_RT_CFG_INIT, AIE2_RT_CFG_CLK_GATING, + AIE2_RT_CFG_FORCE_PREEMPT, + AIE2_RT_CFG_FRAME_BOUNDARY_PREEMPT, }; struct rt_config { u32 type; u32 value; u32 category; + unsigned long feature_mask; }; struct dpm_clk_freq { @@ -156,6 +159,19 @@ enum aie2_dev_status { AIE2_DEV_START, }; +struct aie2_exec_msg_ops { + int (*init_cu_req)(struct amdxdna_gem_obj *cmd_bo, void *req, + size_t *size, u32 *msg_op); + int (*init_dpu_req)(struct amdxdna_gem_obj *cmd_bo, void *req, + size_t *size, u32 *msg_op); + void (*init_chain_req)(void *req, u64 slot_addr, size_t size, u32 cmd_cnt); + int (*fill_cf_slot)(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size); + int (*fill_dpu_slot)(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size); + int (*fill_preempt_slot)(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size); + int (*fill_elf_slot)(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size); + u32 (*get_chain_msg_op)(u32 cmd_op); +}; + struct amdxdna_dev_hdl { struct amdxdna_dev *xdna; const struct amdxdna_dev_priv *priv; @@ -173,6 +189,8 @@ struct amdxdna_dev_hdl { u32 total_col; struct aie_version version; struct aie_metadata metadata; + unsigned long feature_mask; + struct aie2_exec_msg_ops *exec_msg_ops; /* power management and clock*/ enum amdxdna_power_mode_type pw_mode; @@ -182,6 +200,10 @@ struct amdxdna_dev_hdl { u32 clk_gating; u32 npuclk_freq; u32 hclk_freq; + u32 max_tops; + u32 curr_tops; + u32 force_preempt_enabled; + u32 frame_boundary_preempt; /* Mailbox and the management channel */ struct mailbox *mbox; @@ -190,6 +212,8 @@ struct amdxdna_dev_hdl { enum aie2_dev_status dev_status; u32 hwctx_num; + + struct amdxdna_async_error last_async_err; }; #define DEFINE_BAR_OFFSET(reg_name, bar, reg_addr) \ @@ -204,12 +228,27 @@ struct aie2_hw_ops { int (*set_dpm)(struct amdxdna_dev_hdl *ndev, u32 dpm_level); }; +enum aie2_fw_feature { + AIE2_NPU_COMMAND, + AIE2_PREEMPT, + AIE2_FEATURE_MAX +}; + +struct aie2_fw_feature_tbl { + enum aie2_fw_feature feature; + u32 max_minor; + u32 min_minor; +}; + +#define AIE2_FEATURE_ON(ndev, feature) test_bit(feature, &(ndev)->feature_mask) + struct amdxdna_dev_priv { const char *fw_path; u64 protocol_major; u64 protocol_minor; const struct rt_config *rt_config; const struct dpm_clk_freq *dpm_clk_tbl; + const struct aie2_fw_feature_tbl *fw_feature_tbl; #define COL_ALIGN_NONE 0 #define COL_ALIGN_NATURE 1 @@ -217,6 +256,7 @@ struct amdxdna_dev_priv { u32 mbox_dev_addr; /* If mbox_size is 0, use BAR size. 
See MBOX_SIZE macro */ u32 mbox_size; + u32 hwctx_limit; u32 sram_dev_addr; struct aie2_bar_off_pair sram_offs[SRAM_MAX_INDEX]; struct aie2_bar_off_pair psp_regs_off[PSP_MAX_REGS]; @@ -234,6 +274,7 @@ extern const struct dpm_clk_freq npu1_dpm_clk_table[]; extern const struct dpm_clk_freq npu4_dpm_clk_table[]; extern const struct rt_config npu1_default_rt_cfg[]; extern const struct rt_config npu4_default_rt_cfg[]; +extern const struct aie2_fw_feature_tbl npu4_fw_feature_table[]; /* aie2_smu.c */ int aie2_smu_init(struct amdxdna_dev_hdl *ndev); @@ -253,10 +294,12 @@ void aie2_psp_stop(struct psp_device *psp); /* aie2_error.c */ int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev); void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev); -int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev); int aie2_error_async_msg_thread(void *data); +int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev, + struct amdxdna_drm_get_array *args); /* aie2_message.c */ +void aie2_msg_init(struct amdxdna_dev_hdl *ndev); int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev); int aie2_resume_fw(struct amdxdna_dev_hdl *ndev); int aie2_set_runtime_cfg(struct amdxdna_dev_hdl *ndev, u32 type, u64 value); @@ -270,9 +313,13 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx); int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size); int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf, u32 size, u32 *cols_filled); +int aie2_query_telemetry(struct amdxdna_dev_hdl *ndev, + char __user *buf, u32 size, + struct amdxdna_drm_query_telemetry_header *header); int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, u32 size, void *handle, int (*cb)(void*, void __iomem *, size_t)); -int aie2_config_cu(struct amdxdna_hwctx *hwctx); +int aie2_config_cu(struct amdxdna_hwctx *hwctx, + int (*notify_cb)(void *, void __iomem *, size_t)); int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, int (*notify_cb)(void *, void __iomem *, size_t)); int aie2_cmdlist_single_execbuf(struct amdxdna_hwctx *hwctx, @@ -283,11 +330,14 @@ int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx, int (*notify_cb)(void *, void __iomem *, size_t)); int aie2_sync_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, int (*notify_cb)(void *, void __iomem *, size_t)); +int aie2_config_debug_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, + int (*notify_cb)(void *, void __iomem *, size_t)); /* aie2_hwctx.c */ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx); void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx); int aie2_hwctx_config(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size); +int aie2_hwctx_sync_debug_bo(struct amdxdna_hwctx *hwctx, u32 debug_bo_hdl); void aie2_hwctx_suspend(struct amdxdna_client *client); int aie2_hwctx_resume(struct amdxdna_client *client); int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq); diff --git a/drivers/accel/amdxdna/aie2_smu.c b/drivers/accel/amdxdna/aie2_smu.c index d303701b0ded..bd94ee96c2bc 100644 --- a/drivers/accel/amdxdna/aie2_smu.c +++ b/drivers/accel/amdxdna/aie2_smu.c @@ -11,6 +11,7 @@ #include "aie2_pci.h" #include "amdxdna_pci_drv.h" +#include "amdxdna_pm.h" #define SMU_RESULT_OK 1 @@ -22,6 +23,13 @@ #define AIE2_SMU_SET_SOFT_DPMLEVEL 0x7 #define AIE2_SMU_SET_HARD_DPMLEVEL 0x8 +#define 
NPU4_DPM_TOPS(ndev, dpm_level) \ +({ \ + typeof(ndev) _ndev = ndev; \ + (4096 * (_ndev)->total_col * \ + (_ndev)->priv->dpm_clk_tbl[dpm_level].hclk / 1000000); \ +}) + static int aie2_smu_exec(struct amdxdna_dev_hdl *ndev, u32 reg_cmd, u32 reg_arg, u32 *out) { @@ -59,12 +67,16 @@ int npu1_set_dpm(struct amdxdna_dev_hdl *ndev, u32 dpm_level) u32 freq; int ret; + ret = amdxdna_pm_resume_get(ndev->xdna); + if (ret) + return ret; + ret = aie2_smu_exec(ndev, AIE2_SMU_SET_MPNPUCLK_FREQ, ndev->priv->dpm_clk_tbl[dpm_level].npuclk, &freq); if (ret) { XDNA_ERR(ndev->xdna, "Set npu clock to %d failed, ret %d\n", ndev->priv->dpm_clk_tbl[dpm_level].npuclk, ret); - return ret; + goto suspend_put; } ndev->npuclk_freq = freq; @@ -73,49 +85,78 @@ int npu1_set_dpm(struct amdxdna_dev_hdl *ndev, u32 dpm_level) if (ret) { XDNA_ERR(ndev->xdna, "Set h clock to %d failed, ret %d\n", ndev->priv->dpm_clk_tbl[dpm_level].hclk, ret); - return ret; + goto suspend_put; } + + amdxdna_pm_suspend_put(ndev->xdna); ndev->hclk_freq = freq; ndev->dpm_level = dpm_level; + ndev->max_tops = 2 * ndev->total_col; + ndev->curr_tops = ndev->max_tops * freq / 1028; XDNA_DBG(ndev->xdna, "MP-NPU clock %d, H clock %d\n", ndev->npuclk_freq, ndev->hclk_freq); return 0; + +suspend_put: + amdxdna_pm_suspend_put(ndev->xdna); + return ret; } int npu4_set_dpm(struct amdxdna_dev_hdl *ndev, u32 dpm_level) { int ret; + ret = amdxdna_pm_resume_get(ndev->xdna); + if (ret) + return ret; + ret = aie2_smu_exec(ndev, AIE2_SMU_SET_HARD_DPMLEVEL, dpm_level, NULL); if (ret) { XDNA_ERR(ndev->xdna, "Set hard dpm level %d failed, ret %d ", dpm_level, ret); - return ret; + goto suspend_put; } ret = aie2_smu_exec(ndev, AIE2_SMU_SET_SOFT_DPMLEVEL, dpm_level, NULL); if (ret) { XDNA_ERR(ndev->xdna, "Set soft dpm level %d failed, ret %d", dpm_level, ret); - return ret; + goto suspend_put; } + amdxdna_pm_suspend_put(ndev->xdna); ndev->npuclk_freq = ndev->priv->dpm_clk_tbl[dpm_level].npuclk; ndev->hclk_freq = ndev->priv->dpm_clk_tbl[dpm_level].hclk; ndev->dpm_level = dpm_level; + ndev->max_tops = NPU4_DPM_TOPS(ndev, ndev->max_dpm_level); + ndev->curr_tops = NPU4_DPM_TOPS(ndev, dpm_level); XDNA_DBG(ndev->xdna, "MP-NPU clock %d, H clock %d\n", ndev->npuclk_freq, ndev->hclk_freq); return 0; + +suspend_put: + amdxdna_pm_suspend_put(ndev->xdna); + return ret; } int aie2_smu_init(struct amdxdna_dev_hdl *ndev) { int ret; + /* + * Failing to set power off indicates an unrecoverable hardware or + * firmware error. 
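A minimal standalone sketch of the NPU4_DPM_TOPS() arithmetic above, using hypothetical column and clock values (the 4096 constant is read here as operations per column per H-clock cycle, which the macro implies but does not state):

#include <stdio.h>

/*
 * Same arithmetic as NPU4_DPM_TOPS(): with hclk in MHz,
 * ops/s = 4096 * total_col * hclk * 1e6, so
 * TOPS  = 4096 * total_col * hclk / 1e6 (integer division, as in the macro).
 */
static unsigned int npu4_dpm_tops(unsigned int total_col, unsigned int hclk_mhz)
{
	return 4096u * total_col * hclk_mhz / 1000000u;
}

int main(void)
{
	/* Hypothetical example: 4 active columns at a 1800 MHz H-clock -> prints "29 TOPS". */
	printf("%u TOPS\n", npu4_dpm_tops(4, 1800));
	return 0;
}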
+ */ + ret = aie2_smu_exec(ndev, AIE2_SMU_POWER_OFF, 0, NULL); + if (ret) { + XDNA_ERR(ndev->xdna, "Access power failed, ret %d", ret); + return ret; + } + ret = aie2_smu_exec(ndev, AIE2_SMU_POWER_ON, 0, NULL); if (ret) { XDNA_ERR(ndev->xdna, "Power on failed, ret %d", ret); diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c index 4bfe4ef20550..d17aef89a0ad 100644 --- a/drivers/accel/amdxdna/amdxdna_ctx.c +++ b/drivers/accel/amdxdna/amdxdna_ctx.c @@ -113,14 +113,14 @@ void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size) return &cmd->data[num_masks]; } -int amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo) +u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo) { struct amdxdna_cmd *cmd = abo->mem.kva; u32 num_masks, i; u32 *cu_mask; if (amdxdna_cmd_get_op(abo) == ERT_CMD_CHAIN) - return -1; + return INVALID_CU_IDX; num_masks = 1 + FIELD_GET(AMDXDNA_CMD_EXTRA_CU_MASK, cmd->header); cu_mask = cmd->data; @@ -129,7 +129,7 @@ int amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo) return ffs(cu_mask[i]) - 1; } - return -1; + return INVALID_CU_IDX; } /* @@ -161,19 +161,14 @@ int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct dr if (args->ext || args->ext_flags) return -EINVAL; - if (!drm_dev_enter(dev, &idx)) - return -ENODEV; - hwctx = kzalloc(sizeof(*hwctx), GFP_KERNEL); - if (!hwctx) { - ret = -ENOMEM; - goto exit; - } + if (!hwctx) + return -ENOMEM; if (copy_from_user(&hwctx->qos, u64_to_user_ptr(args->qos_p), sizeof(hwctx->qos))) { XDNA_ERR(xdna, "Access QoS info failed"); - ret = -EFAULT; - goto free_hwctx; + kfree(hwctx); + return -EFAULT; } hwctx->client = client; @@ -181,30 +176,36 @@ int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct dr hwctx->num_tiles = args->num_tiles; hwctx->mem_size = args->mem_size; hwctx->max_opc = args->max_opc; - ret = xa_alloc_cyclic(&client->hwctx_xa, &hwctx->id, hwctx, - XA_LIMIT(AMDXDNA_INVALID_CTX_HANDLE + 1, MAX_HWCTX_ID), - &client->next_hwctxid, GFP_KERNEL); - if (ret < 0) { - XDNA_ERR(xdna, "Allocate hwctx ID failed, ret %d", ret); + + guard(mutex)(&xdna->dev_lock); + + if (!drm_dev_enter(dev, &idx)) { + ret = -ENODEV; goto free_hwctx; } - hwctx->name = kasprintf(GFP_KERNEL, "hwctx.%d.%d", client->pid, hwctx->id); + ret = xdna->dev_info->ops->hwctx_init(hwctx); + if (ret) { + XDNA_ERR(xdna, "Init hwctx failed, ret %d", ret); + goto dev_exit; + } + + hwctx->name = kasprintf(GFP_KERNEL, "hwctx.%d.%d", client->pid, hwctx->fw_ctx_id); if (!hwctx->name) { ret = -ENOMEM; - goto rm_id; + goto fini_hwctx; } - mutex_lock(&xdna->dev_lock); - ret = xdna->dev_info->ops->hwctx_init(hwctx); - if (ret) { - mutex_unlock(&xdna->dev_lock); - XDNA_ERR(xdna, "Init hwctx failed, ret %d", ret); + ret = xa_alloc_cyclic(&client->hwctx_xa, &hwctx->id, hwctx, + XA_LIMIT(AMDXDNA_INVALID_CTX_HANDLE + 1, MAX_HWCTX_ID), + &client->next_hwctxid, GFP_KERNEL); + if (ret < 0) { + XDNA_ERR(xdna, "Allocate hwctx ID failed, ret %d", ret); goto free_name; } + args->handle = hwctx->id; args->syncobj_handle = hwctx->syncobj_hdl; - mutex_unlock(&xdna->dev_lock); atomic64_set(&hwctx->job_submit_cnt, 0); atomic64_set(&hwctx->job_free_cnt, 0); @@ -214,12 +215,12 @@ int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct dr free_name: kfree(hwctx->name); -rm_id: - xa_erase(&client->hwctx_xa, hwctx->id); +fini_hwctx: + xdna->dev_info->ops->hwctx_fini(hwctx); +dev_exit: + drm_dev_exit(idx); free_hwctx: kfree(hwctx); -exit: - drm_dev_exit(idx); return ret; } @@ -327,6 
+328,38 @@ unlock_srcu: return ret; } +int amdxdna_hwctx_sync_debug_bo(struct amdxdna_client *client, u32 debug_bo_hdl) +{ + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_hwctx *hwctx; + struct amdxdna_gem_obj *abo; + struct drm_gem_object *gobj; + int ret, idx; + + if (!xdna->dev_info->ops->hwctx_sync_debug_bo) + return -EOPNOTSUPP; + + gobj = drm_gem_object_lookup(client->filp, debug_bo_hdl); + if (!gobj) + return -EINVAL; + + abo = to_xdna_obj(gobj); + guard(mutex)(&xdna->dev_lock); + idx = srcu_read_lock(&client->hwctx_srcu); + hwctx = xa_load(&client->hwctx_xa, abo->assigned_hwctx); + if (!hwctx) { + ret = -EINVAL; + goto unlock_srcu; + } + + ret = xdna->dev_info->ops->hwctx_sync_debug_bo(hwctx, debug_bo_hdl); + +unlock_srcu: + srcu_read_unlock(&client->hwctx_srcu, idx); + drm_gem_object_put(gobj); + return ret; +} + static void amdxdna_arg_bos_put(struct amdxdna_sched_job *job) { @@ -389,9 +422,11 @@ void amdxdna_sched_job_cleanup(struct amdxdna_sched_job *job) trace_amdxdna_debug_point(job->hwctx->name, job->seq, "job release"); amdxdna_arg_bos_put(job); amdxdna_gem_put_obj(job->cmd_bo); + dma_fence_put(job->fence); } int amdxdna_cmd_submit(struct amdxdna_client *client, + struct amdxdna_drv_cmd *drv_cmd, u32 cmd_bo_hdl, u32 *arg_bo_hdls, u32 arg_bo_cnt, u32 hwctx_hdl, u64 *seq) { @@ -405,6 +440,8 @@ int amdxdna_cmd_submit(struct amdxdna_client *client, if (!job) return -ENOMEM; + job->drv_cmd = drv_cmd; + if (cmd_bo_hdl != AMDXDNA_INVALID_BO_HANDLE) { job->cmd_bo = amdxdna_gem_get_obj(client, cmd_bo_hdl, AMDXDNA_BO_CMD); if (!job->cmd_bo) { @@ -412,8 +449,6 @@ int amdxdna_cmd_submit(struct amdxdna_client *client, ret = -EINVAL; goto free_job; } - } else { - job->cmd_bo = NULL; } ret = amdxdna_arg_bos_lookup(client, job, arg_bo_hdls, arg_bo_cnt); @@ -431,11 +466,6 @@ int amdxdna_cmd_submit(struct amdxdna_client *client, goto unlock_srcu; } - if (hwctx->status != HWCTX_STAT_READY) { - XDNA_ERR(xdna, "HW Context is not ready"); - ret = -EINVAL; - goto unlock_srcu; - } job->hwctx = hwctx; job->mm = current->mm; @@ -512,7 +542,7 @@ static int amdxdna_drm_submit_execbuf(struct amdxdna_client *client, } } - ret = amdxdna_cmd_submit(client, cmd_bo_hdl, arg_bo_hdls, + ret = amdxdna_cmd_submit(client, NULL, cmd_bo_hdl, arg_bo_hdls, args->arg_count, args->hwctx, &args->seq); if (ret) XDNA_DBG(xdna, "Submit cmds failed, ret %d", ret); diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h index 7cd7a55936f0..b6151244d64f 100644 --- a/drivers/accel/amdxdna/amdxdna_ctx.h +++ b/drivers/accel/amdxdna/amdxdna_ctx.h @@ -13,9 +13,12 @@ struct amdxdna_hwctx_priv; enum ert_cmd_opcode { - ERT_START_CU = 0, - ERT_CMD_CHAIN = 19, - ERT_START_NPU = 20, + ERT_START_CU = 0, + ERT_CMD_CHAIN = 19, + ERT_START_NPU = 20, + ERT_START_NPU_PREEMPT = 21, + ERT_START_NPU_PREEMPT_ELF = 22, + ERT_INVALID_CMD = ~0U, }; enum ert_cmd_state { @@ -54,6 +57,21 @@ struct amdxdna_cmd_chain { u64 data[] __counted_by(command_count); }; +/* + * Interpretation of the beginning of data payload for ERT_START_NPU_PREEMPT in + * amdxdna_cmd. The rest of the payload in amdxdna_cmd is regular kernel args. 
+ */ +struct amdxdna_cmd_preempt_data { + u64 inst_buf; /* instruction buffer address */ + u64 save_buf; /* save buffer address */ + u64 restore_buf; /* restore buffer address */ + u32 inst_size; /* size of instruction buffer in bytes */ + u32 save_size; /* size of save buffer in bytes */ + u32 restore_size; /* size of restore buffer in bytes */ + u32 inst_prop_cnt; /* properties count */ + u32 prop_args[]; /* properties and regular kernel arguments */ +}; + /* Exec buffer command header format */ #define AMDXDNA_CMD_STATE GENMASK(3, 0) #define AMDXDNA_CMD_EXTRA_CU_MASK GENMASK(11, 10) @@ -64,6 +82,8 @@ struct amdxdna_cmd { u32 data[]; }; +#define INVALID_CU_IDX (~0U) + struct amdxdna_hwctx { struct amdxdna_client *client; struct amdxdna_hwctx_priv *priv; @@ -95,6 +115,17 @@ struct amdxdna_hwctx { #define drm_job_to_xdna_job(j) \ container_of(j, struct amdxdna_sched_job, base) +enum amdxdna_job_opcode { + SYNC_DEBUG_BO, + ATTACH_DEBUG_BO, + DETACH_DEBUG_BO, +}; + +struct amdxdna_drv_cmd { + enum amdxdna_job_opcode opcode; + u32 result; +}; + struct amdxdna_sched_job { struct drm_sched_job base; struct kref refcnt; @@ -105,7 +136,9 @@ struct amdxdna_sched_job { /* user can wait on this fence */ struct dma_fence *out_fence; bool job_done; + bool job_timeout; u64 seq; + struct amdxdna_drv_cmd *drv_cmd; struct amdxdna_gem_obj *cmd_bo; size_t bo_cnt; struct drm_gem_object *bos[] __counted_by(bo_cnt); @@ -137,15 +170,17 @@ amdxdna_cmd_get_state(struct amdxdna_gem_obj *abo) } void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size); -int amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo); +u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo); void amdxdna_sched_job_cleanup(struct amdxdna_sched_job *job); void amdxdna_hwctx_remove_all(struct amdxdna_client *client); int amdxdna_hwctx_walk(struct amdxdna_client *client, void *arg, int (*walk)(struct amdxdna_hwctx *hwctx, void *arg)); +int amdxdna_hwctx_sync_debug_bo(struct amdxdna_client *client, u32 debug_bo_hdl); int amdxdna_cmd_submit(struct amdxdna_client *client, - u32 cmd_bo_hdls, u32 *arg_bo_hdls, u32 arg_bo_cnt, + struct amdxdna_drv_cmd *drv_cmd, u32 cmd_bo_hdls, + u32 *arg_bo_hdls, u32 arg_bo_cnt, u32 hwctx_hdl, u64 *seq); int amdxdna_cmd_wait(struct amdxdna_client *client, u32 hwctx_hdl, diff --git a/drivers/accel/amdxdna/amdxdna_error.h b/drivers/accel/amdxdna/amdxdna_error.h new file mode 100644 index 000000000000..c51de86ec12b --- /dev/null +++ b/drivers/accel/amdxdna/amdxdna_error.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025, Advanced Micro Devices, Inc. 
+ */ + +#ifndef _AMDXDNA_ERROR_H_ +#define _AMDXDNA_ERROR_H_ + +#include <linux/bitfield.h> +#include <linux/bits.h> + +#define AMDXDNA_ERR_DRV_AIE 4 +#define AMDXDNA_ERR_SEV_CRITICAL 3 +#define AMDXDNA_ERR_CLASS_AIE 2 + +#define AMDXDNA_ERR_NUM_MASK GENMASK_U64(15, 0) +#define AMDXDNA_ERR_DRV_MASK GENMASK_U64(23, 16) +#define AMDXDNA_ERR_SEV_MASK GENMASK_U64(31, 24) +#define AMDXDNA_ERR_MOD_MASK GENMASK_U64(39, 32) +#define AMDXDNA_ERR_CLASS_MASK GENMASK_U64(47, 40) + +enum amdxdna_error_num { + AMDXDNA_ERROR_NUM_AIE_SATURATION = 3, + AMDXDNA_ERROR_NUM_AIE_FP, + AMDXDNA_ERROR_NUM_AIE_STREAM, + AMDXDNA_ERROR_NUM_AIE_ACCESS, + AMDXDNA_ERROR_NUM_AIE_BUS, + AMDXDNA_ERROR_NUM_AIE_INSTRUCTION, + AMDXDNA_ERROR_NUM_AIE_ECC, + AMDXDNA_ERROR_NUM_AIE_LOCK, + AMDXDNA_ERROR_NUM_AIE_DMA, + AMDXDNA_ERROR_NUM_AIE_MEM_PARITY, + AMDXDNA_ERROR_NUM_UNKNOWN = 15, +}; + +enum amdxdna_error_module { + AMDXDNA_ERROR_MODULE_AIE_CORE = 3, + AMDXDNA_ERROR_MODULE_AIE_MEMORY, + AMDXDNA_ERROR_MODULE_AIE_SHIM, + AMDXDNA_ERROR_MODULE_AIE_NOC, + AMDXDNA_ERROR_MODULE_AIE_PL, + AMDXDNA_ERROR_MODULE_UNKNOWN = 8, +}; + +#define AMDXDNA_ERROR_ENCODE(err_num, err_mod) \ + (FIELD_PREP(AMDXDNA_ERR_NUM_MASK, err_num) | \ + FIELD_PREP_CONST(AMDXDNA_ERR_DRV_MASK, AMDXDNA_ERR_DRV_AIE) | \ + FIELD_PREP_CONST(AMDXDNA_ERR_SEV_MASK, AMDXDNA_ERR_SEV_CRITICAL) | \ + FIELD_PREP(AMDXDNA_ERR_MOD_MASK, err_mod) | \ + FIELD_PREP_CONST(AMDXDNA_ERR_CLASS_MASK, AMDXDNA_ERR_CLASS_AIE)) + +#define AMDXDNA_EXTRA_ERR_COL_MASK GENMASK_U64(7, 0) +#define AMDXDNA_EXTRA_ERR_ROW_MASK GENMASK_U64(15, 8) + +#define AMDXDNA_EXTRA_ERR_ENCODE(row, col) \ + (FIELD_PREP(AMDXDNA_EXTRA_ERR_COL_MASK, col) | \ + FIELD_PREP(AMDXDNA_EXTRA_ERR_ROW_MASK, row)) + +#endif /* _AMDXDNA_ERROR_H_ */ diff --git a/drivers/accel/amdxdna/amdxdna_gem.c b/drivers/accel/amdxdna/amdxdna_gem.c index d407a36eb412..dfa916eeb2d9 100644 --- a/drivers/accel/amdxdna/amdxdna_gem.c +++ b/drivers/accel/amdxdna/amdxdna_gem.c @@ -8,6 +8,7 @@ #include <drm/drm_device.h> #include <drm/drm_gem.h> #include <drm/drm_gem_shmem_helper.h> +#include <drm/drm_print.h> #include <drm/gpu_scheduler.h> #include <linux/dma-buf.h> #include <linux/dma-direct.h> @@ -392,35 +393,33 @@ static const struct dma_buf_ops amdxdna_dmabuf_ops = { .vunmap = drm_gem_dmabuf_vunmap, }; -static int amdxdna_gem_obj_vmap(struct drm_gem_object *obj, struct iosys_map *map) +static int amdxdna_gem_obj_vmap(struct amdxdna_gem_obj *abo, void **vaddr) { - struct amdxdna_gem_obj *abo = to_xdna_obj(obj); - - iosys_map_clear(map); - - dma_resv_assert_held(obj->resv); + struct iosys_map map = IOSYS_MAP_INIT_VADDR(NULL); + int ret; if (is_import_bo(abo)) - dma_buf_vmap(abo->dma_buf, map); + ret = dma_buf_vmap_unlocked(abo->dma_buf, &map); else - drm_gem_shmem_object_vmap(obj, map); + ret = drm_gem_vmap(to_gobj(abo), &map); - if (!map->vaddr) - return -ENOMEM; - - return 0; + *vaddr = map.vaddr; + return ret; } -static void amdxdna_gem_obj_vunmap(struct drm_gem_object *obj, struct iosys_map *map) +static void amdxdna_gem_obj_vunmap(struct amdxdna_gem_obj *abo) { - struct amdxdna_gem_obj *abo = to_xdna_obj(obj); + struct iosys_map map; + + if (!abo->mem.kva) + return; - dma_resv_assert_held(obj->resv); + iosys_map_set_vaddr(&map, abo->mem.kva); if (is_import_bo(abo)) - dma_buf_vunmap(abo->dma_buf, map); + dma_buf_vunmap_unlocked(abo->dma_buf, &map); else - drm_gem_shmem_object_vunmap(obj, map); + drm_gem_vunmap(to_gobj(abo), &map); } static struct dma_buf *amdxdna_gem_prime_export(struct drm_gem_object *gobj, int flags) @@ -455,7 +454,6 @@ 
static void amdxdna_gem_obj_free(struct drm_gem_object *gobj) { struct amdxdna_dev *xdna = to_xdna_dev(gobj->dev); struct amdxdna_gem_obj *abo = to_xdna_obj(gobj); - struct iosys_map map = IOSYS_MAP_INIT_VADDR(abo->mem.kva); XDNA_DBG(xdna, "BO type %d xdna_addr 0x%llx", abo->type, abo->mem.dev_addr); @@ -468,7 +466,7 @@ static void amdxdna_gem_obj_free(struct drm_gem_object *gobj) if (abo->type == AMDXDNA_BO_DEV_HEAP) drm_mm_takedown(&abo->mm); - drm_gem_vunmap(gobj, &map); + amdxdna_gem_obj_vunmap(abo); mutex_destroy(&abo->lock); if (is_import_bo(abo)) { @@ -489,8 +487,8 @@ static const struct drm_gem_object_funcs amdxdna_gem_shmem_funcs = { .pin = drm_gem_shmem_object_pin, .unpin = drm_gem_shmem_object_unpin, .get_sg_table = drm_gem_shmem_object_get_sg_table, - .vmap = amdxdna_gem_obj_vmap, - .vunmap = amdxdna_gem_obj_vunmap, + .vmap = drm_gem_shmem_object_vmap, + .vunmap = drm_gem_shmem_object_vunmap, .mmap = amdxdna_gem_obj_mmap, .vm_ops = &drm_gem_shmem_vm_ops, .export = amdxdna_gem_prime_export, @@ -663,7 +661,6 @@ amdxdna_drm_create_dev_heap(struct drm_device *dev, struct drm_file *filp) { struct amdxdna_client *client = filp->driver_priv; - struct iosys_map map = IOSYS_MAP_INIT_VADDR(NULL); struct amdxdna_dev *xdna = to_xdna_dev(dev); struct amdxdna_gem_obj *abo; int ret; @@ -692,12 +689,11 @@ amdxdna_drm_create_dev_heap(struct drm_device *dev, abo->mem.dev_addr = client->xdna->dev_info->dev_mem_base; drm_mm_init(&abo->mm, abo->mem.dev_addr, abo->mem.size); - ret = drm_gem_vmap(to_gobj(abo), &map); + ret = amdxdna_gem_obj_vmap(abo, &abo->mem.kva); if (ret) { XDNA_ERR(xdna, "Vmap heap bo failed, ret %d", ret); goto release_obj; } - abo->mem.kva = map.vaddr; client->dev_heap = abo; drm_gem_object_get(to_gobj(abo)); @@ -748,7 +744,6 @@ amdxdna_drm_create_cmd_bo(struct drm_device *dev, struct amdxdna_drm_create_bo *args, struct drm_file *filp) { - struct iosys_map map = IOSYS_MAP_INIT_VADDR(NULL); struct amdxdna_dev *xdna = to_xdna_dev(dev); struct amdxdna_gem_obj *abo; int ret; @@ -770,12 +765,11 @@ amdxdna_drm_create_cmd_bo(struct drm_device *dev, abo->type = AMDXDNA_BO_CMD; abo->client = filp->driver_priv; - ret = drm_gem_vmap(to_gobj(abo), &map); + ret = amdxdna_gem_obj_vmap(abo, &abo->mem.kva); if (ret) { XDNA_ERR(xdna, "Vmap cmd bo failed, ret %d", ret); goto release_obj; } - abo->mem.kva = map.vaddr; return abo; @@ -969,6 +963,9 @@ int amdxdna_drm_sync_bo_ioctl(struct drm_device *dev, XDNA_DBG(xdna, "Sync bo %d offset 0x%llx, size 0x%llx\n", args->handle, args->offset, args->size); + if (args->direction == SYNC_DIRECT_FROM_DEVICE) + ret = amdxdna_hwctx_sync_debug_bo(abo->client, args->handle); + put_obj: drm_gem_object_put(gobj); return ret; diff --git a/drivers/accel/amdxdna/amdxdna_gem.h b/drivers/accel/amdxdna/amdxdna_gem.h index ae29db94a9d3..f79fc7f3c93b 100644 --- a/drivers/accel/amdxdna/amdxdna_gem.h +++ b/drivers/accel/amdxdna/amdxdna_gem.h @@ -7,6 +7,7 @@ #define _AMDXDNA_GEM_H_ #include <linux/hmm.h> +#include "amdxdna_pci_drv.h" struct amdxdna_umap { struct vm_area_struct *vma; @@ -62,6 +63,11 @@ static inline void amdxdna_gem_put_obj(struct amdxdna_gem_obj *abo) drm_gem_object_put(to_gobj(abo)); } +static inline u64 amdxdna_dev_bo_offset(struct amdxdna_gem_obj *abo) +{ + return abo->mem.dev_addr - abo->client->dev_heap->mem.dev_addr; +} + void amdxdna_umap_put(struct amdxdna_umap *mapp); struct drm_gem_object * diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c b/drivers/accel/amdxdna/amdxdna_mailbox.c index da1ac89bb78f..858df97cd3fb 100644 --- 
a/drivers/accel/amdxdna/amdxdna_mailbox.c +++ b/drivers/accel/amdxdna/amdxdna_mailbox.c @@ -194,7 +194,8 @@ static void mailbox_release_msg(struct mailbox_channel *mb_chann, { MB_DBG(mb_chann, "msg_id 0x%x msg opcode 0x%x", mb_msg->pkg.header.id, mb_msg->pkg.header.opcode); - mb_msg->notify_cb(mb_msg->handle, NULL, 0); + if (mb_msg->notify_cb) + mb_msg->notify_cb(mb_msg->handle, NULL, 0); kfree(mb_msg); } @@ -248,7 +249,7 @@ mailbox_get_resp(struct mailbox_channel *mb_chann, struct xdna_msg_header *heade { struct mailbox_msg *mb_msg; int msg_id; - int ret; + int ret = 0; msg_id = header->id; if (!mailbox_validate_msgid(msg_id)) { @@ -265,9 +266,11 @@ mailbox_get_resp(struct mailbox_channel *mb_chann, struct xdna_msg_header *heade MB_DBG(mb_chann, "opcode 0x%x size %d id 0x%x", header->opcode, header->total_size, header->id); - ret = mb_msg->notify_cb(mb_msg->handle, data, header->total_size); - if (unlikely(ret)) - MB_ERR(mb_chann, "Message callback ret %d", ret); + if (mb_msg->notify_cb) { + ret = mb_msg->notify_cb(mb_msg->handle, data, header->total_size); + if (unlikely(ret)) + MB_ERR(mb_chann, "Message callback ret %d", ret); + } kfree(mb_msg); return ret; @@ -513,6 +516,7 @@ xdna_mailbox_create_channel(struct mailbox *mb, } mb_chann->bad_state = false; + mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0); MB_DBG(mb_chann, "Mailbox channel created (irq: %d)", mb_chann->msix_irq); return mb_chann; diff --git a/drivers/accel/amdxdna/amdxdna_mailbox_helper.h b/drivers/accel/amdxdna/amdxdna_mailbox_helper.h index 710ff8873d61..556c712cad0a 100644 --- a/drivers/accel/amdxdna/amdxdna_mailbox_helper.h +++ b/drivers/accel/amdxdna/amdxdna_mailbox_helper.h @@ -16,16 +16,18 @@ struct xdna_notify { u32 *data; size_t size; int error; + u32 *status; }; -#define DECLARE_XDNA_MSG_COMMON(name, op, status) \ +#define DECLARE_XDNA_MSG_COMMON(name, op, s) \ struct name##_req req = { 0 }; \ - struct name##_resp resp = { status }; \ + struct name##_resp resp = { .status = s }; \ struct xdna_notify hdl = { \ .error = 0, \ .data = (u32 *)&resp, \ .size = sizeof(resp), \ .comp = COMPLETION_INITIALIZER_ONSTACK(hdl.comp), \ + .status = (u32 *)&resp.status, \ }; \ struct xdna_mailbox_msg msg = { \ .send_data = (u8 *)&req, \ diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.c b/drivers/accel/amdxdna/amdxdna_pci_drv.c index 569cd703729d..1973ab67721b 100644 --- a/drivers/accel/amdxdna/amdxdna_pci_drv.c +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.c @@ -13,13 +13,11 @@ #include <drm/gpu_scheduler.h> #include <linux/iommu.h> #include <linux/pci.h> -#include <linux/pm_runtime.h> #include "amdxdna_ctx.h" #include "amdxdna_gem.h" #include "amdxdna_pci_drv.h" - -#define AMDXDNA_AUTOSUSPEND_DELAY 5000 /* milliseconds */ +#include "amdxdna_pm.h" MODULE_FIRMWARE("amdnpu/1502_00/npu.sbin"); MODULE_FIRMWARE("amdnpu/17f0_10/npu.sbin"); @@ -29,9 +27,14 @@ MODULE_FIRMWARE("amdnpu/17f0_20/npu.sbin"); /* * 0.0: Initial version * 0.1: Support getting all hardware contexts by DRM_IOCTL_AMDXDNA_GET_ARRAY + * 0.2: Support getting last error hardware error + * 0.3: Support firmware debug buffer + * 0.4: Support getting resource information + * 0.5: Support getting telemetry data + * 0.6: Support preemption */ #define AMDXDNA_DRIVER_MAJOR 0 -#define AMDXDNA_DRIVER_MINOR 1 +#define AMDXDNA_DRIVER_MINOR 6 /* * Bind the driver base on (vendor_id, device_id) pair and later use the @@ -61,17 +64,9 @@ static int amdxdna_drm_open(struct drm_device *ddev, struct drm_file *filp) struct amdxdna_client *client; int ret; - ret = 
pm_runtime_resume_and_get(ddev->dev); - if (ret) { - XDNA_ERR(xdna, "Failed to get rpm, ret %d", ret); - return ret; - } - client = kzalloc(sizeof(*client), GFP_KERNEL); - if (!client) { - ret = -ENOMEM; - goto put_rpm; - } + if (!client) + return -ENOMEM; client->pid = pid_nr(rcu_access_pointer(filp->pid)); client->xdna = xdna; @@ -106,9 +101,6 @@ unbind_sva: iommu_sva_unbind_device(client->sva); failed: kfree(client); -put_rpm: - pm_runtime_mark_last_busy(ddev->dev); - pm_runtime_put_autosuspend(ddev->dev); return ret; } @@ -130,8 +122,6 @@ static void amdxdna_drm_close(struct drm_device *ddev, struct drm_file *filp) XDNA_DBG(xdna, "pid %d closed", client->pid); kfree(client); - pm_runtime_mark_last_busy(ddev->dev); - pm_runtime_put_autosuspend(ddev->dev); } static int amdxdna_flush(struct file *f, fl_owner_t id) @@ -310,19 +300,12 @@ static int amdxdna_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto failed_dev_fini; } - pm_runtime_set_autosuspend_delay(dev, AMDXDNA_AUTOSUSPEND_DELAY); - pm_runtime_use_autosuspend(dev); - pm_runtime_allow(dev); - ret = drm_dev_register(&xdna->ddev, 0); if (ret) { XDNA_ERR(xdna, "DRM register failed, ret %d", ret); - pm_runtime_forbid(dev); goto failed_sysfs_fini; } - pm_runtime_mark_last_busy(dev); - pm_runtime_put_autosuspend(dev); return 0; failed_sysfs_fini: @@ -339,14 +322,10 @@ destroy_notifier_wq: static void amdxdna_remove(struct pci_dev *pdev) { struct amdxdna_dev *xdna = pci_get_drvdata(pdev); - struct device *dev = &pdev->dev; struct amdxdna_client *client; destroy_workqueue(xdna->notifier_wq); - pm_runtime_get_noresume(dev); - pm_runtime_forbid(dev); - drm_dev_unplug(&xdna->ddev); amdxdna_sysfs_fini(xdna); @@ -365,29 +344,9 @@ static void amdxdna_remove(struct pci_dev *pdev) mutex_unlock(&xdna->dev_lock); } -static int amdxdna_pmops_suspend(struct device *dev) -{ - struct amdxdna_dev *xdna = pci_get_drvdata(to_pci_dev(dev)); - - if (!xdna->dev_info->ops->suspend) - return -EOPNOTSUPP; - - return xdna->dev_info->ops->suspend(xdna); -} - -static int amdxdna_pmops_resume(struct device *dev) -{ - struct amdxdna_dev *xdna = pci_get_drvdata(to_pci_dev(dev)); - - if (!xdna->dev_info->ops->resume) - return -EOPNOTSUPP; - - return xdna->dev_info->ops->resume(xdna); -} - static const struct dev_pm_ops amdxdna_pm_ops = { - SYSTEM_SLEEP_PM_OPS(amdxdna_pmops_suspend, amdxdna_pmops_resume) - RUNTIME_PM_OPS(amdxdna_pmops_suspend, amdxdna_pmops_resume, NULL) + SYSTEM_SLEEP_PM_OPS(amdxdna_pm_suspend, amdxdna_pm_resume) + RUNTIME_PM_OPS(amdxdna_pm_suspend, amdxdna_pm_resume, NULL) }; static struct pci_driver amdxdna_pci_driver = { diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.h b/drivers/accel/amdxdna/amdxdna_pci_drv.h index 72d6696d49da..c99477f5e454 100644 --- a/drivers/accel/amdxdna/amdxdna_pci_drv.h +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.h @@ -6,6 +6,7 @@ #ifndef _AMDXDNA_PCI_DRV_H_ #define _AMDXDNA_PCI_DRV_H_ +#include <drm/drm_print.h> #include <linux/workqueue.h> #include <linux/xarray.h> @@ -54,6 +55,7 @@ struct amdxdna_dev_ops { int (*hwctx_init)(struct amdxdna_hwctx *hwctx); void (*hwctx_fini)(struct amdxdna_hwctx *hwctx); int (*hwctx_config)(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size); + int (*hwctx_sync_debug_bo)(struct amdxdna_hwctx *hwctx, u32 debug_bo_hdl); void (*hmm_invalidate)(struct amdxdna_gem_obj *abo, unsigned long cur_seq); int (*cmd_submit)(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq); int (*get_aie_info)(struct amdxdna_client *client, struct 
amdxdna_drm_get_info *args); @@ -99,6 +101,7 @@ struct amdxdna_dev { struct amdxdna_fw_ver fw_ver; struct rw_semaphore notifier_lock; /* for mmu notifier*/ struct workqueue_struct *notifier_wq; + bool rpm_on; }; /* diff --git a/drivers/accel/amdxdna/amdxdna_pm.c b/drivers/accel/amdxdna/amdxdna_pm.c new file mode 100644 index 000000000000..fa38e65d617c --- /dev/null +++ b/drivers/accel/amdxdna/amdxdna_pm.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025, Advanced Micro Devices, Inc. + */ + +#include <drm/amdxdna_accel.h> +#include <drm/drm_drv.h> +#include <linux/pm_runtime.h> + +#include "amdxdna_pm.h" + +#define AMDXDNA_AUTOSUSPEND_DELAY 5000 /* milliseconds */ + +int amdxdna_pm_suspend(struct device *dev) +{ + struct amdxdna_dev *xdna = to_xdna_dev(dev_get_drvdata(dev)); + int ret = -EOPNOTSUPP; + bool rpm; + + if (xdna->dev_info->ops->suspend) { + rpm = xdna->rpm_on; + xdna->rpm_on = false; + ret = xdna->dev_info->ops->suspend(xdna); + xdna->rpm_on = rpm; + } + + XDNA_DBG(xdna, "Suspend done ret %d", ret); + return ret; +} + +int amdxdna_pm_resume(struct device *dev) +{ + struct amdxdna_dev *xdna = to_xdna_dev(dev_get_drvdata(dev)); + int ret = -EOPNOTSUPP; + bool rpm; + + if (xdna->dev_info->ops->resume) { + rpm = xdna->rpm_on; + xdna->rpm_on = false; + ret = xdna->dev_info->ops->resume(xdna); + xdna->rpm_on = rpm; + } + + XDNA_DBG(xdna, "Resume done ret %d", ret); + return ret; +} + +int amdxdna_pm_resume_get(struct amdxdna_dev *xdna) +{ + struct device *dev = xdna->ddev.dev; + int ret; + + if (!xdna->rpm_on) + return 0; + + ret = pm_runtime_resume_and_get(dev); + if (ret) { + XDNA_ERR(xdna, "Resume failed: %d", ret); + pm_runtime_set_suspended(dev); + } + + return ret; +} + +void amdxdna_pm_suspend_put(struct amdxdna_dev *xdna) +{ + struct device *dev = xdna->ddev.dev; + + if (!xdna->rpm_on) + return; + + pm_runtime_put_autosuspend(dev); +} + +void amdxdna_pm_init(struct amdxdna_dev *xdna) +{ + struct device *dev = xdna->ddev.dev; + + pm_runtime_set_active(dev); + pm_runtime_set_autosuspend_delay(dev, AMDXDNA_AUTOSUSPEND_DELAY); + pm_runtime_use_autosuspend(dev); + pm_runtime_allow(dev); + pm_runtime_put_autosuspend(dev); + xdna->rpm_on = true; +} + +void amdxdna_pm_fini(struct amdxdna_dev *xdna) +{ + struct device *dev = xdna->ddev.dev; + + xdna->rpm_on = false; + pm_runtime_get_noresume(dev); + pm_runtime_forbid(dev); +} diff --git a/drivers/accel/amdxdna/amdxdna_pm.h b/drivers/accel/amdxdna/amdxdna_pm.h new file mode 100644 index 000000000000..77b2d6e45570 --- /dev/null +++ b/drivers/accel/amdxdna/amdxdna_pm.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025, Advanced Micro Devices, Inc. 
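amdxdna_pm_resume_get() and amdxdna_pm_suspend_put(), added just above in amdxdna_pm.c, are what the rest of the patch wraps around every hardware-touching path (aie2_get_info(), aie2_set_state(), npu1_set_dpm(), ...). A hedged sketch of a caller, where my_fw_request() is a made-up placeholder rather than a function from the patch:

#include "amdxdna_pm.h"

/* Placeholder for a real mailbox/firmware request -- not part of the patch. */
static int my_fw_request(struct amdxdna_dev *xdna)
{
	return 0;
}

/* Sketch only: bracketing mirrors what aie2_get_info() does after this change. */
static int my_query(struct amdxdna_dev *xdna)
{
	int ret;

	ret = amdxdna_pm_resume_get(xdna);	/* no-op until amdxdna_pm_init() sets rpm_on */
	if (ret)
		return ret;

	ret = my_fw_request(xdna);		/* hypothetical hardware access */

	amdxdna_pm_suspend_put(xdna);		/* re-arm runtime autosuspend */
	return ret;
}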
+ */ + +#ifndef _AMDXDNA_PM_H_ +#define _AMDXDNA_PM_H_ + +#include "amdxdna_pci_drv.h" + +int amdxdna_pm_suspend(struct device *dev); +int amdxdna_pm_resume(struct device *dev); +int amdxdna_pm_resume_get(struct amdxdna_dev *xdna); +void amdxdna_pm_suspend_put(struct amdxdna_dev *xdna); +void amdxdna_pm_init(struct amdxdna_dev *xdna); +void amdxdna_pm_fini(struct amdxdna_dev *xdna); + +#endif /* _AMDXDNA_PM_H_ */ diff --git a/drivers/accel/amdxdna/npu1_regs.c b/drivers/accel/amdxdna/npu1_regs.c index e4f6dac7d00f..ec407f3b48fc 100644 --- a/drivers/accel/amdxdna/npu1_regs.c +++ b/drivers/accel/amdxdna/npu1_regs.c @@ -46,6 +46,7 @@ const struct rt_config npu1_default_rt_cfg[] = { { 2, 1, AIE2_RT_CFG_INIT }, /* PDI APP LOAD MODE */ + { 4, 1, AIE2_RT_CFG_INIT }, /* Debug BO */ { 1, 1, AIE2_RT_CFG_CLK_GATING }, /* Clock gating on */ { 0 }, }; @@ -62,16 +63,23 @@ const struct dpm_clk_freq npu1_dpm_clk_table[] = { { 0 } }; +static const struct aie2_fw_feature_tbl npu1_fw_feature_table[] = { + { .feature = AIE2_NPU_COMMAND, .min_minor = 8 }, + { 0 } +}; + static const struct amdxdna_dev_priv npu1_dev_priv = { .fw_path = "amdnpu/1502_00/npu.sbin", .protocol_major = 0x5, .protocol_minor = 0x7, .rt_config = npu1_default_rt_cfg, .dpm_clk_tbl = npu1_dpm_clk_table, + .fw_feature_tbl = npu1_fw_feature_table, .col_align = COL_ALIGN_NONE, .mbox_dev_addr = NPU1_MBOX_BAR_BASE, .mbox_size = 0, /* Use BAR size */ .sram_dev_addr = NPU1_SRAM_BAR_BASE, + .hwctx_limit = 6, .sram_offs = { DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU1_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU1_SRAM, MPNPU_SRAM_I2X_MAILBOX_15), diff --git a/drivers/accel/amdxdna/npu2_regs.c b/drivers/accel/amdxdna/npu2_regs.c index a081cac75ee0..86f87d0d1354 100644 --- a/drivers/accel/amdxdna/npu2_regs.c +++ b/drivers/accel/amdxdna/npu2_regs.c @@ -67,10 +67,12 @@ static const struct amdxdna_dev_priv npu2_dev_priv = { .protocol_minor = 0x6, .rt_config = npu4_default_rt_cfg, .dpm_clk_tbl = npu4_dpm_clk_table, + .fw_feature_tbl = npu4_fw_feature_table, .col_align = COL_ALIGN_NATURE, .mbox_dev_addr = NPU2_MBOX_BAR_BASE, .mbox_size = 0, /* Use BAR size */ .sram_dev_addr = NPU2_SRAM_BAR_BASE, + .hwctx_limit = 16, .sram_offs = { DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU2_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU2_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), diff --git a/drivers/accel/amdxdna/npu4_regs.c b/drivers/accel/amdxdna/npu4_regs.c index 9f2e33182ec6..986a5f28ba24 100644 --- a/drivers/accel/amdxdna/npu4_regs.c +++ b/drivers/accel/amdxdna/npu4_regs.c @@ -63,10 +63,14 @@ const struct rt_config npu4_default_rt_cfg[] = { { 5, 1, AIE2_RT_CFG_INIT }, /* PDI APP LOAD MODE */ + { 10, 1, AIE2_RT_CFG_INIT }, /* DEBUG BUF */ + { 14, 0, AIE2_RT_CFG_INIT, BIT_U64(AIE2_PREEMPT) }, /* Frame boundary preemption */ { 1, 1, AIE2_RT_CFG_CLK_GATING }, /* Clock gating on */ { 2, 1, AIE2_RT_CFG_CLK_GATING }, /* Clock gating on */ { 3, 1, AIE2_RT_CFG_CLK_GATING }, /* Clock gating on */ { 4, 1, AIE2_RT_CFG_CLK_GATING }, /* Clock gating on */ + { 13, 0, AIE2_RT_CFG_FORCE_PREEMPT }, + { 14, 0, AIE2_RT_CFG_FRAME_BOUNDARY_PREEMPT }, { 0 }, }; @@ -82,16 +86,24 @@ const struct dpm_clk_freq npu4_dpm_clk_table[] = { { 0 } }; +const struct aie2_fw_feature_tbl npu4_fw_feature_table[] = { + { .feature = AIE2_NPU_COMMAND, .min_minor = 15 }, + { .feature = AIE2_PREEMPT, .min_minor = 12 }, + { 0 } +}; + static const struct amdxdna_dev_priv npu4_dev_priv = { .fw_path = "amdnpu/17f0_10/npu.sbin", .protocol_major = 0x6, .protocol_minor = 12, .rt_config = 
npu4_default_rt_cfg, .dpm_clk_tbl = npu4_dpm_clk_table, + .fw_feature_tbl = npu4_fw_feature_table, .col_align = COL_ALIGN_NATURE, .mbox_dev_addr = NPU4_MBOX_BAR_BASE, .mbox_size = 0, /* Use BAR size */ .sram_dev_addr = NPU4_SRAM_BAR_BASE, + .hwctx_limit = 16, .sram_offs = { DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU4_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU4_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), diff --git a/drivers/accel/amdxdna/npu5_regs.c b/drivers/accel/amdxdna/npu5_regs.c index 5f1cf83461c4..75ad97f0b937 100644 --- a/drivers/accel/amdxdna/npu5_regs.c +++ b/drivers/accel/amdxdna/npu5_regs.c @@ -67,10 +67,12 @@ static const struct amdxdna_dev_priv npu5_dev_priv = { .protocol_minor = 12, .rt_config = npu4_default_rt_cfg, .dpm_clk_tbl = npu4_dpm_clk_table, + .fw_feature_tbl = npu4_fw_feature_table, .col_align = COL_ALIGN_NATURE, .mbox_dev_addr = NPU5_MBOX_BAR_BASE, .mbox_size = 0, /* Use BAR size */ .sram_dev_addr = NPU5_SRAM_BAR_BASE, + .hwctx_limit = 16, .sram_offs = { DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU5_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU5_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), diff --git a/drivers/accel/amdxdna/npu6_regs.c b/drivers/accel/amdxdna/npu6_regs.c index 94a7005685a7..758dc013fe13 100644 --- a/drivers/accel/amdxdna/npu6_regs.c +++ b/drivers/accel/amdxdna/npu6_regs.c @@ -67,10 +67,12 @@ static const struct amdxdna_dev_priv npu6_dev_priv = { .protocol_minor = 12, .rt_config = npu4_default_rt_cfg, .dpm_clk_tbl = npu4_dpm_clk_table, + .fw_feature_tbl = npu4_fw_feature_table, .col_align = COL_ALIGN_NATURE, .mbox_dev_addr = NPU6_MBOX_BAR_BASE, .mbox_size = 0, /* Use BAR size */ .sram_dev_addr = NPU6_SRAM_BAR_BASE, + .hwctx_limit = 16, .sram_offs = { DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU6_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU6_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), diff --git a/drivers/accel/ethosu/Kconfig b/drivers/accel/ethosu/Kconfig new file mode 100644 index 000000000000..d25f9b3eb317 --- /dev/null +++ b/drivers/accel/ethosu/Kconfig @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config DRM_ACCEL_ARM_ETHOSU + tristate "Arm Ethos-U65/U85 NPU" + depends on HAS_IOMEM + depends on DRM_ACCEL + select DRM_GEM_DMA_HELPER + select DRM_SCHED + select GENERIC_ALLOCATOR + help + Enables driver for Arm Ethos-U65/U85 NPUs diff --git a/drivers/accel/ethosu/Makefile b/drivers/accel/ethosu/Makefile new file mode 100644 index 000000000000..17db5a600416 --- /dev/null +++ b/drivers/accel/ethosu/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_DRM_ACCEL_ARM_ETHOSU) := ethosu.o +ethosu-y += ethosu_drv.o ethosu_gem.o ethosu_job.o diff --git a/drivers/accel/ethosu/ethosu_device.h b/drivers/accel/ethosu/ethosu_device.h new file mode 100644 index 000000000000..b189fa783d6a --- /dev/null +++ b/drivers/accel/ethosu/ethosu_device.h @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: GPL-2.0-only or MIT */ +/* Copyright 2025 Arm, Ltd. 
*/ + +#ifndef __ETHOSU_DEVICE_H__ +#define __ETHOSU_DEVICE_H__ + +#include <linux/bitfield.h> +#include <linux/bits.h> +#include <linux/types.h> + +#include <drm/drm_device.h> +#include <drm/gpu_scheduler.h> + +#include <drm/ethosu_accel.h> + +struct clk; +struct gen_pool; + +#define NPU_REG_ID 0x0000 +#define NPU_REG_STATUS 0x0004 +#define NPU_REG_CMD 0x0008 +#define NPU_REG_RESET 0x000c +#define NPU_REG_QBASE 0x0010 +#define NPU_REG_QBASE_HI 0x0014 +#define NPU_REG_QREAD 0x0018 +#define NPU_REG_QCONFIG 0x001c +#define NPU_REG_QSIZE 0x0020 +#define NPU_REG_PROT 0x0024 +#define NPU_REG_CONFIG 0x0028 +#define NPU_REG_REGIONCFG 0x003c +#define NPU_REG_AXILIMIT0 0x0040 // U65 +#define NPU_REG_AXILIMIT1 0x0044 // U65 +#define NPU_REG_AXILIMIT2 0x0048 // U65 +#define NPU_REG_AXILIMIT3 0x004c // U65 +#define NPU_REG_MEM_ATTR0 0x0040 // U85 +#define NPU_REG_MEM_ATTR1 0x0044 // U85 +#define NPU_REG_MEM_ATTR2 0x0048 // U85 +#define NPU_REG_MEM_ATTR3 0x004c // U85 +#define NPU_REG_AXI_SRAM 0x0050 // U85 +#define NPU_REG_AXI_EXT 0x0054 // U85 + +#define NPU_REG_BASEP(x) (0x0080 + (x) * 8) +#define NPU_REG_BASEP_HI(x) (0x0084 + (x) * 8) +#define NPU_BASEP_REGION_MAX 8 + +#define ID_ARCH_MAJOR_MASK GENMASK(31, 28) +#define ID_ARCH_MINOR_MASK GENMASK(27, 20) +#define ID_ARCH_PATCH_MASK GENMASK(19, 16) +#define ID_VER_MAJOR_MASK GENMASK(11, 8) +#define ID_VER_MINOR_MASK GENMASK(7, 4) + +#define CONFIG_MACS_PER_CC_MASK GENMASK(3, 0) +#define CONFIG_CMD_STREAM_VER_MASK GENMASK(7, 4) + +#define STATUS_STATE_RUNNING BIT(0) +#define STATUS_IRQ_RAISED BIT(1) +#define STATUS_BUS_STATUS BIT(2) +#define STATUS_RESET_STATUS BIT(3) +#define STATUS_CMD_PARSE_ERR BIT(4) +#define STATUS_CMD_END_REACHED BIT(5) + +#define CMD_CLEAR_IRQ BIT(1) +#define CMD_TRANSITION_TO_RUN BIT(0) + +#define RESET_PENDING_CSL BIT(1) +#define RESET_PENDING_CPL BIT(0) + +#define PROT_ACTIVE_CSL BIT(1) + +enum ethosu_cmds { + NPU_OP_CONV = 0x2, + NPU_OP_DEPTHWISE = 0x3, + NPU_OP_POOL = 0x5, + NPU_OP_ELEMENTWISE = 0x6, + NPU_OP_RESIZE = 0x7, // U85 only + NPU_OP_DMA_START = 0x10, + NPU_SET_IFM_PAD_TOP = 0x100, + NPU_SET_IFM_PAD_LEFT = 0x101, + NPU_SET_IFM_PAD_RIGHT = 0x102, + NPU_SET_IFM_PAD_BOTTOM = 0x103, + NPU_SET_IFM_DEPTH_M1 = 0x104, + NPU_SET_IFM_PRECISION = 0x105, + NPU_SET_IFM_BROADCAST = 0x108, + NPU_SET_IFM_WIDTH0_M1 = 0x10a, + NPU_SET_IFM_HEIGHT0_M1 = 0x10b, + NPU_SET_IFM_HEIGHT1_M1 = 0x10c, + NPU_SET_IFM_REGION = 0x10f, + NPU_SET_OFM_WIDTH_M1 = 0x111, + NPU_SET_OFM_HEIGHT_M1 = 0x112, + NPU_SET_OFM_DEPTH_M1 = 0x113, + NPU_SET_OFM_PRECISION = 0x114, + NPU_SET_OFM_WIDTH0_M1 = 0x11a, + NPU_SET_OFM_HEIGHT0_M1 = 0x11b, + NPU_SET_OFM_HEIGHT1_M1 = 0x11c, + NPU_SET_OFM_REGION = 0x11f, + NPU_SET_KERNEL_WIDTH_M1 = 0x120, + NPU_SET_KERNEL_HEIGHT_M1 = 0x121, + NPU_SET_KERNEL_STRIDE = 0x122, + NPU_SET_WEIGHT_REGION = 0x128, + NPU_SET_SCALE_REGION = 0x129, + NPU_SET_DMA0_SRC_REGION = 0x130, + NPU_SET_DMA0_DST_REGION = 0x131, + NPU_SET_DMA0_SIZE0 = 0x132, + NPU_SET_DMA0_SIZE1 = 0x133, + NPU_SET_IFM2_BROADCAST = 0x180, + NPU_SET_IFM2_PRECISION = 0x185, + NPU_SET_IFM2_WIDTH0_M1 = 0x18a, + NPU_SET_IFM2_HEIGHT0_M1 = 0x18b, + NPU_SET_IFM2_HEIGHT1_M1 = 0x18c, + NPU_SET_IFM2_REGION = 0x18f, + NPU_SET_IFM_BASE0 = 0x4000, + NPU_SET_IFM_BASE1 = 0x4001, + NPU_SET_IFM_BASE2 = 0x4002, + NPU_SET_IFM_BASE3 = 0x4003, + NPU_SET_IFM_STRIDE_X = 0x4004, + NPU_SET_IFM_STRIDE_Y = 0x4005, + NPU_SET_IFM_STRIDE_C = 0x4006, + NPU_SET_OFM_BASE0 = 0x4010, + NPU_SET_OFM_BASE1 = 0x4011, + NPU_SET_OFM_BASE2 = 0x4012, + NPU_SET_OFM_BASE3 = 0x4013, + NPU_SET_OFM_STRIDE_X = 
0x4014, + NPU_SET_OFM_STRIDE_Y = 0x4015, + NPU_SET_OFM_STRIDE_C = 0x4016, + NPU_SET_WEIGHT_BASE = 0x4020, + NPU_SET_WEIGHT_LENGTH = 0x4021, + NPU_SET_SCALE_BASE = 0x4022, + NPU_SET_SCALE_LENGTH = 0x4023, + NPU_SET_DMA0_SRC = 0x4030, + NPU_SET_DMA0_DST = 0x4031, + NPU_SET_DMA0_LEN = 0x4032, + NPU_SET_DMA0_SRC_STRIDE0 = 0x4033, + NPU_SET_DMA0_SRC_STRIDE1 = 0x4034, + NPU_SET_DMA0_DST_STRIDE0 = 0x4035, + NPU_SET_DMA0_DST_STRIDE1 = 0x4036, + NPU_SET_IFM2_BASE0 = 0x4080, + NPU_SET_IFM2_BASE1 = 0x4081, + NPU_SET_IFM2_BASE2 = 0x4082, + NPU_SET_IFM2_BASE3 = 0x4083, + NPU_SET_IFM2_STRIDE_X = 0x4084, + NPU_SET_IFM2_STRIDE_Y = 0x4085, + NPU_SET_IFM2_STRIDE_C = 0x4086, + NPU_SET_WEIGHT1_BASE = 0x4090, + NPU_SET_WEIGHT1_LENGTH = 0x4091, + NPU_SET_SCALE1_BASE = 0x4092, + NPU_SET_WEIGHT2_BASE = 0x4092, + NPU_SET_SCALE1_LENGTH = 0x4093, + NPU_SET_WEIGHT2_LENGTH = 0x4093, + NPU_SET_WEIGHT3_BASE = 0x4094, + NPU_SET_WEIGHT3_LENGTH = 0x4095, +}; + +#define ETHOSU_SRAM_REGION 2 /* Matching Vela compiler */ + +/** + * struct ethosu_device - Ethosu device + */ +struct ethosu_device { + /** @base: Base drm_device. */ + struct drm_device base; + + /** @iomem: CPU mapping of the registers. */ + void __iomem *regs; + + void __iomem *sram; + struct gen_pool *srampool; + dma_addr_t sramphys; + + struct clk_bulk_data *clks; + int num_clks; + int irq; + + struct drm_ethosu_npu_info npu_info; + + struct ethosu_job *in_flight_job; + /* For in_flight_job and ethosu_job_hw_submit() */ + struct mutex job_lock; + + /* For dma_fence */ + spinlock_t fence_lock; + + struct drm_gpu_scheduler sched; + /* For ethosu_job_do_push() */ + struct mutex sched_lock; + u64 fence_context; + u64 emit_seqno; +}; + +#define to_ethosu_device(drm_dev) \ + ((struct ethosu_device *)container_of(drm_dev, struct ethosu_device, base)) + +static inline bool ethosu_is_u65(const struct ethosu_device *ethosudev) +{ + return FIELD_GET(ID_ARCH_MAJOR_MASK, ethosudev->npu_info.id) == 1; +} + +#endif diff --git a/drivers/accel/ethosu/ethosu_drv.c b/drivers/accel/ethosu/ethosu_drv.c new file mode 100644 index 000000000000..e05a69bf5574 --- /dev/null +++ b/drivers/accel/ethosu/ethosu_drv.c @@ -0,0 +1,403 @@ +// SPDX-License-Identifier: GPL-2.0-only or MIT +// Copyright (C) 2025 Arm, Ltd. 
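The dev_info() in ethosu_init() later in this file decodes NPU_REG_ID and NPU_REG_CONFIG with the FIELD_GET() masks from ethosu_device.h. A self-contained user-space sketch of the same decode, using invented register values purely for illustration:

#include <stdint.h>
#include <stdio.h>

/* Plain-C stand-in for the kernel's GENMASK()/FIELD_GET() pair. */
#define FIELD(val, hi, lo)  (((val) >> (lo)) & ((1u << ((hi) - (lo) + 1u)) - 1u))

int main(void)
{
	/* Invented values (field layout per ethosu_device.h); arch major 2 is "not U65" per ethosu_is_u65(). */
	uint32_t id = 0x20000100;	/* arch v2.0.0, rev r1p0 */
	uint32_t config = 0x18;		/* cmd stream ver 1, 2^8 = 256 MACs per clock cycle */

	printf("arch v%u.%u.%u, rev r%up%u, cmd stream ver %u, %u MACs\n",
	       FIELD(id, 31, 28), FIELD(id, 27, 20), FIELD(id, 19, 16),
	       FIELD(id, 11, 8), FIELD(id, 7, 4),
	       FIELD(config, 7, 4), 1u << FIELD(config, 3, 0));
	return 0;
}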
+ +#include <linux/bitfield.h> +#include <linux/clk.h> +#include <linux/genalloc.h> +#include <linux/io.h> +#include <linux/iopoll.h> +#include <linux/module.h> +#include <linux/mod_devicetable.h> +#include <linux/platform_device.h> +#include <linux/pm_runtime.h> + +#include <drm/drm_drv.h> +#include <drm/drm_ioctl.h> +#include <drm/drm_utils.h> +#include <drm/drm_gem.h> +#include <drm/drm_accel.h> +#include <drm/ethosu_accel.h> + +#include "ethosu_drv.h" +#include "ethosu_device.h" +#include "ethosu_gem.h" +#include "ethosu_job.h" + +static int ethosu_ioctl_dev_query(struct drm_device *ddev, void *data, + struct drm_file *file) +{ + struct ethosu_device *ethosudev = to_ethosu_device(ddev); + struct drm_ethosu_dev_query *args = data; + + if (!args->pointer) { + switch (args->type) { + case DRM_ETHOSU_DEV_QUERY_NPU_INFO: + args->size = sizeof(ethosudev->npu_info); + return 0; + default: + return -EINVAL; + } + } + + switch (args->type) { + case DRM_ETHOSU_DEV_QUERY_NPU_INFO: + if (args->size < offsetofend(struct drm_ethosu_npu_info, sram_size)) + return -EINVAL; + return copy_struct_to_user(u64_to_user_ptr(args->pointer), + args->size, + ðosudev->npu_info, + sizeof(ethosudev->npu_info), NULL); + default: + return -EINVAL; + } +} + +#define ETHOSU_BO_FLAGS DRM_ETHOSU_BO_NO_MMAP + +static int ethosu_ioctl_bo_create(struct drm_device *ddev, void *data, + struct drm_file *file) +{ + struct drm_ethosu_bo_create *args = data; + int cookie, ret; + + if (!drm_dev_enter(ddev, &cookie)) + return -ENODEV; + + if (!args->size || (args->flags & ~ETHOSU_BO_FLAGS)) { + ret = -EINVAL; + goto out_dev_exit; + } + + ret = ethosu_gem_create_with_handle(file, ddev, &args->size, + args->flags, &args->handle); + +out_dev_exit: + drm_dev_exit(cookie); + return ret; +} + +static int ethosu_ioctl_bo_wait(struct drm_device *ddev, void *data, + struct drm_file *file) +{ + struct drm_ethosu_bo_wait *args = data; + int cookie, ret; + unsigned long timeout = drm_timeout_abs_to_jiffies(args->timeout_ns); + + if (args->pad) + return -EINVAL; + + if (!drm_dev_enter(ddev, &cookie)) + return -ENODEV; + + ret = drm_gem_dma_resv_wait(file, args->handle, true, timeout); + + drm_dev_exit(cookie); + return ret; +} + +static int ethosu_ioctl_bo_mmap_offset(struct drm_device *ddev, void *data, + struct drm_file *file) +{ + struct drm_ethosu_bo_mmap_offset *args = data; + struct drm_gem_object *obj; + + if (args->pad) + return -EINVAL; + + obj = drm_gem_object_lookup(file, args->handle); + if (!obj) + return -ENOENT; + + args->offset = drm_vma_node_offset_addr(&obj->vma_node); + drm_gem_object_put(obj); + return 0; +} + +static int ethosu_ioctl_cmdstream_bo_create(struct drm_device *ddev, void *data, + struct drm_file *file) +{ + struct drm_ethosu_cmdstream_bo_create *args = data; + int cookie, ret; + + if (!drm_dev_enter(ddev, &cookie)) + return -ENODEV; + + if (!args->size || !args->data || args->pad || args->flags) { + ret = -EINVAL; + goto out_dev_exit; + } + + args->flags |= DRM_ETHOSU_BO_NO_MMAP; + + ret = ethosu_gem_cmdstream_create(file, ddev, args->size, args->data, + args->flags, &args->handle); + +out_dev_exit: + drm_dev_exit(cookie); + return ret; +} + +static int ethosu_open(struct drm_device *ddev, struct drm_file *file) +{ + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + + struct ethosu_file_priv __free(kfree) *priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { + ret = -ENOMEM; + goto err_put_mod; + } + priv->edev = to_ethosu_device(ddev); + + ret = ethosu_job_open(priv); + if (ret) + 
goto err_put_mod; + + file->driver_priv = no_free_ptr(priv); + return 0; + +err_put_mod: + module_put(THIS_MODULE); + return ret; +} + +static void ethosu_postclose(struct drm_device *ddev, struct drm_file *file) +{ + ethosu_job_close(file->driver_priv); + kfree(file->driver_priv); + module_put(THIS_MODULE); +} + +static const struct drm_ioctl_desc ethosu_drm_driver_ioctls[] = { +#define ETHOSU_IOCTL(n, func, flags) \ + DRM_IOCTL_DEF_DRV(ETHOSU_##n, ethosu_ioctl_##func, flags) + + ETHOSU_IOCTL(DEV_QUERY, dev_query, 0), + ETHOSU_IOCTL(BO_CREATE, bo_create, 0), + ETHOSU_IOCTL(BO_WAIT, bo_wait, 0), + ETHOSU_IOCTL(BO_MMAP_OFFSET, bo_mmap_offset, 0), + ETHOSU_IOCTL(CMDSTREAM_BO_CREATE, cmdstream_bo_create, 0), + ETHOSU_IOCTL(SUBMIT, submit, 0), +}; + +DEFINE_DRM_ACCEL_FOPS(ethosu_drm_driver_fops); + +/* + * Ethosu driver version: + * - 1.0 - initial interface + */ +static const struct drm_driver ethosu_drm_driver = { + .driver_features = DRIVER_COMPUTE_ACCEL | DRIVER_GEM, + .open = ethosu_open, + .postclose = ethosu_postclose, + .ioctls = ethosu_drm_driver_ioctls, + .num_ioctls = ARRAY_SIZE(ethosu_drm_driver_ioctls), + .fops = ðosu_drm_driver_fops, + .name = "ethosu", + .desc = "Arm Ethos-U Accel driver", + .major = 1, + .minor = 0, + + .gem_create_object = ethosu_gem_create_object, +}; + +#define U65_DRAM_AXI_LIMIT_CFG 0x1f3f0002 +#define U65_SRAM_AXI_LIMIT_CFG 0x1f3f00b0 +#define U85_AXI_EXT_CFG 0x00021f3f +#define U85_AXI_SRAM_CFG 0x00021f3f +#define U85_MEM_ATTR0_CFG 0x00000000 +#define U85_MEM_ATTR2_CFG 0x000000b7 + +static int ethosu_reset(struct ethosu_device *ethosudev) +{ + int ret; + u32 reg; + + writel_relaxed(RESET_PENDING_CSL, ethosudev->regs + NPU_REG_RESET); + ret = readl_poll_timeout(ethosudev->regs + NPU_REG_STATUS, reg, + !FIELD_GET(STATUS_RESET_STATUS, reg), + USEC_PER_MSEC, USEC_PER_SEC); + if (ret) + return ret; + + if (!FIELD_GET(PROT_ACTIVE_CSL, readl_relaxed(ethosudev->regs + NPU_REG_PROT))) { + dev_warn(ethosudev->base.dev, "Could not reset to non-secure mode (PROT = %x)\n", + readl_relaxed(ethosudev->regs + NPU_REG_PROT)); + } + + /* + * Assign region 2 (SRAM) to AXI M0 (AXILIMIT0), + * everything else to AXI M1 (AXILIMIT2) + */ + writel_relaxed(0x0000aa8a, ethosudev->regs + NPU_REG_REGIONCFG); + if (ethosu_is_u65(ethosudev)) { + writel_relaxed(U65_SRAM_AXI_LIMIT_CFG, ethosudev->regs + NPU_REG_AXILIMIT0); + writel_relaxed(U65_DRAM_AXI_LIMIT_CFG, ethosudev->regs + NPU_REG_AXILIMIT2); + } else { + writel_relaxed(U85_AXI_SRAM_CFG, ethosudev->regs + NPU_REG_AXI_SRAM); + writel_relaxed(U85_AXI_EXT_CFG, ethosudev->regs + NPU_REG_AXI_EXT); + writel_relaxed(U85_MEM_ATTR0_CFG, ethosudev->regs + NPU_REG_MEM_ATTR0); // SRAM + writel_relaxed(U85_MEM_ATTR2_CFG, ethosudev->regs + NPU_REG_MEM_ATTR2); // DRAM + } + + if (ethosudev->sram) + memset_io(ethosudev->sram, 0, ethosudev->npu_info.sram_size); + + return 0; +} + +static int ethosu_device_resume(struct device *dev) +{ + struct ethosu_device *ethosudev = dev_get_drvdata(dev); + int ret; + + ret = clk_bulk_prepare_enable(ethosudev->num_clks, ethosudev->clks); + if (ret) + return ret; + + ret = ethosu_reset(ethosudev); + if (!ret) + return 0; + + clk_bulk_disable_unprepare(ethosudev->num_clks, ethosudev->clks); + return ret; +} + +static int ethosu_device_suspend(struct device *dev) +{ + struct ethosu_device *ethosudev = dev_get_drvdata(dev); + + clk_bulk_disable_unprepare(ethosudev->num_clks, ethosudev->clks); + return 0; +} + +static int ethosu_sram_init(struct ethosu_device *ethosudev) +{ + ethosudev->npu_info.sram_size = 0; + + 
ethosudev->srampool = of_gen_pool_get(ethosudev->base.dev->of_node, "sram", 0); + if (!ethosudev->srampool) + return 0; + + ethosudev->npu_info.sram_size = gen_pool_size(ethosudev->srampool); + + ethosudev->sram = (void __iomem *)gen_pool_dma_alloc(ethosudev->srampool, + ethosudev->npu_info.sram_size, + ðosudev->sramphys); + if (!ethosudev->sram) { + dev_err(ethosudev->base.dev, "failed to allocate from SRAM pool\n"); + return -ENOMEM; + } + + return 0; +} + +static int ethosu_init(struct ethosu_device *ethosudev) +{ + int ret; + u32 id, config; + + ret = ethosu_device_resume(ethosudev->base.dev); + if (ret) + return ret; + + pm_runtime_set_autosuspend_delay(ethosudev->base.dev, 50); + pm_runtime_use_autosuspend(ethosudev->base.dev); + ret = devm_pm_runtime_set_active_enabled(ethosudev->base.dev); + if (ret) + return ret; + pm_runtime_get_noresume(ethosudev->base.dev); + + ethosudev->npu_info.id = id = readl_relaxed(ethosudev->regs + NPU_REG_ID); + ethosudev->npu_info.config = config = readl_relaxed(ethosudev->regs + NPU_REG_CONFIG); + + ethosu_sram_init(ethosudev); + + dev_info(ethosudev->base.dev, + "Ethos-U NPU, arch v%ld.%ld.%ld, rev r%ldp%ld, cmd stream ver%ld, %d MACs, %dKB SRAM\n", + FIELD_GET(ID_ARCH_MAJOR_MASK, id), + FIELD_GET(ID_ARCH_MINOR_MASK, id), + FIELD_GET(ID_ARCH_PATCH_MASK, id), + FIELD_GET(ID_VER_MAJOR_MASK, id), + FIELD_GET(ID_VER_MINOR_MASK, id), + FIELD_GET(CONFIG_CMD_STREAM_VER_MASK, config), + 1 << FIELD_GET(CONFIG_MACS_PER_CC_MASK, config), + ethosudev->npu_info.sram_size / 1024); + + return 0; +} + +static int ethosu_probe(struct platform_device *pdev) +{ + int ret; + struct ethosu_device *ethosudev; + + ethosudev = devm_drm_dev_alloc(&pdev->dev, ðosu_drm_driver, + struct ethosu_device, base); + if (IS_ERR(ethosudev)) + return -ENOMEM; + platform_set_drvdata(pdev, ethosudev); + + dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(40)); + + ethosudev->regs = devm_platform_ioremap_resource(pdev, 0); + + ethosudev->num_clks = devm_clk_bulk_get_all(&pdev->dev, ðosudev->clks); + if (ethosudev->num_clks < 0) + return ethosudev->num_clks; + + ret = ethosu_job_init(ethosudev); + if (ret) + return ret; + + ret = ethosu_init(ethosudev); + if (ret) + return ret; + + ret = drm_dev_register(ðosudev->base, 0); + if (ret) + pm_runtime_dont_use_autosuspend(ethosudev->base.dev); + + pm_runtime_put_autosuspend(ethosudev->base.dev); + return ret; +} + +static void ethosu_remove(struct platform_device *pdev) +{ + struct ethosu_device *ethosudev = dev_get_drvdata(&pdev->dev); + + drm_dev_unregister(ðosudev->base); + ethosu_job_fini(ethosudev); + if (ethosudev->sram) + gen_pool_free(ethosudev->srampool, (unsigned long)ethosudev->sram, + ethosudev->npu_info.sram_size); +} + +static const struct of_device_id dt_match[] = { + { .compatible = "arm,ethos-u65" }, + { .compatible = "arm,ethos-u85" }, + {} +}; +MODULE_DEVICE_TABLE(of, dt_match); + +static DEFINE_RUNTIME_DEV_PM_OPS(ethosu_pm_ops, + ethosu_device_suspend, + ethosu_device_resume, + NULL); + +static struct platform_driver ethosu_driver = { + .probe = ethosu_probe, + .remove = ethosu_remove, + .driver = { + .name = "ethosu", + .pm = pm_ptr(ðosu_pm_ops), + .of_match_table = dt_match, + }, +}; +module_platform_driver(ethosu_driver); + +MODULE_AUTHOR("Rob Herring <robh@kernel.org>"); +MODULE_DESCRIPTION("Arm Ethos-U Accel Driver"); +MODULE_LICENSE("Dual MIT/GPL"); diff --git a/drivers/accel/ethosu/ethosu_drv.h b/drivers/accel/ethosu/ethosu_drv.h new file mode 100644 index 000000000000..9e21dfe94184 --- /dev/null +++ 
b/drivers/accel/ethosu/ethosu_drv.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR MIT */ +/* Copyright 2025 Arm, Ltd. */ +#ifndef __ETHOSU_DRV_H__ +#define __ETHOSU_DRV_H__ + +#include <drm/gpu_scheduler.h> + +struct ethosu_device; + +struct ethosu_file_priv { + struct ethosu_device *edev; + struct drm_sched_entity sched_entity; +}; + +#endif diff --git a/drivers/accel/ethosu/ethosu_gem.c b/drivers/accel/ethosu/ethosu_gem.c new file mode 100644 index 000000000000..473b5f5d7514 --- /dev/null +++ b/drivers/accel/ethosu/ethosu_gem.c @@ -0,0 +1,704 @@ +// SPDX-License-Identifier: GPL-2.0-only or MIT +/* Copyright 2025 Arm, Ltd. */ + +#include <linux/err.h> +#include <linux/slab.h> + +#include <drm/ethosu_accel.h> + +#include "ethosu_device.h" +#include "ethosu_gem.h" + +static void ethosu_gem_free_object(struct drm_gem_object *obj) +{ + struct ethosu_gem_object *bo = to_ethosu_bo(obj); + + kfree(bo->info); + drm_gem_free_mmap_offset(&bo->base.base); + drm_gem_dma_free(&bo->base); +} + +static int ethosu_gem_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma) +{ + struct ethosu_gem_object *bo = to_ethosu_bo(obj); + + /* Don't allow mmap on objects that have the NO_MMAP flag set. */ + if (bo->flags & DRM_ETHOSU_BO_NO_MMAP) + return -EINVAL; + + return drm_gem_dma_object_mmap(obj, vma); +} + +static const struct drm_gem_object_funcs ethosu_gem_funcs = { + .free = ethosu_gem_free_object, + .print_info = drm_gem_dma_object_print_info, + .get_sg_table = drm_gem_dma_object_get_sg_table, + .vmap = drm_gem_dma_object_vmap, + .mmap = ethosu_gem_mmap, + .vm_ops = &drm_gem_dma_vm_ops, +}; + +/** + * ethosu_gem_create_object - Implementation of driver->gem_create_object. + * @ddev: DRM device + * @size: Size in bytes of the memory the object will reference + * + * This lets the GEM helpers allocate object structs for us, and keep + * our BO stats correct. + */ +struct drm_gem_object *ethosu_gem_create_object(struct drm_device *ddev, size_t size) +{ + struct ethosu_gem_object *obj; + + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return ERR_PTR(-ENOMEM); + + obj->base.base.funcs = ðosu_gem_funcs; + return &obj->base.base; +} + +/** + * ethosu_gem_create_with_handle() - Create a GEM object and attach it to a handle. + * @file: DRM file. + * @ddev: DRM device. + * @size: Size of the GEM object to allocate. + * @flags: Combination of drm_ethosu_bo_flags flags. + * @handle: Pointer holding the handle pointing to the new GEM object. + * + * Return: Zero on success + */ +int ethosu_gem_create_with_handle(struct drm_file *file, + struct drm_device *ddev, + u64 *size, u32 flags, u32 *handle) +{ + struct drm_gem_dma_object *mem; + struct ethosu_gem_object *bo; + int ret; + + mem = drm_gem_dma_create(ddev, *size); + if (IS_ERR(mem)) + return PTR_ERR(mem); + + bo = to_ethosu_bo(&mem->base); + bo->flags = flags; + + /* + * Allocate an id of idr table where the obj is registered + * and handle has the id what user can see. + */ + ret = drm_gem_handle_create(file, &mem->base, handle); + if (!ret) + *size = bo->base.base.size; + + /* drop reference from allocate - handle holds it now. 
*/ + drm_gem_object_put(&mem->base); + + return ret; +} + +struct dma { + s8 region; + u64 len; + u64 offset; + s64 stride[2]; +}; + +struct dma_state { + u16 size0; + u16 size1; + s8 mode; + struct dma src; + struct dma dst; +}; + +struct buffer { + u64 base; + u32 length; + s8 region; +}; + +struct feat_matrix { + u64 base[4]; + s64 stride_x; + s64 stride_y; + s64 stride_c; + s8 region; + u8 broadcast; + u16 stride_kernel; + u16 precision; + u16 depth; + u16 width; + u16 width0; + u16 height[3]; + u8 pad_top; + u8 pad_left; + u8 pad_bottom; + u8 pad_right; +}; + +struct cmd_state { + struct dma_state dma; + struct buffer scale[2]; + struct buffer weight[4]; + struct feat_matrix ofm; + struct feat_matrix ifm; + struct feat_matrix ifm2; +}; + +static void cmd_state_init(struct cmd_state *st) +{ + /* Initialize to all 1s to detect missing setup */ + memset(st, 0xff, sizeof(*st)); +} + +static u64 cmd_to_addr(u32 *cmd) +{ + return ((u64)((cmd[0] & 0xff0000) << 16)) | cmd[1]; +} + +static u64 dma_length(struct ethosu_validated_cmdstream_info *info, + struct dma_state *dma_st, struct dma *dma) +{ + s8 mode = dma_st->mode; + u64 len = dma->len; + + if (mode >= 1) { + len += dma->stride[0]; + len *= dma_st->size0; + } + if (mode == 2) { + len += dma->stride[1]; + len *= dma_st->size1; + } + if (dma->region >= 0) + info->region_size[dma->region] = max(info->region_size[dma->region], + len + dma->offset); + + return len; +} + +static u64 feat_matrix_length(struct ethosu_validated_cmdstream_info *info, + struct feat_matrix *fm, + u32 x, u32 y, u32 c) +{ + u32 element_size, storage = fm->precision >> 14; + int tile = 0; + u64 addr; + + if (fm->region < 0) + return U64_MAX; + + switch (storage) { + case 0: + if (x >= fm->width0 + 1) { + x -= fm->width0 + 1; + tile += 1; + } + if (y >= fm->height[tile] + 1) { + y -= fm->height[tile] + 1; + tile += 2; + } + break; + case 1: + if (y >= fm->height[1] + 1) { + y -= fm->height[1] + 1; + tile = 2; + } else if (y >= fm->height[0] + 1) { + y -= fm->height[0] + 1; + tile = 1; + } + break; + } + if (fm->base[tile] == U64_MAX) + return U64_MAX; + + addr = fm->base[tile] + y * fm->stride_y; + + switch ((fm->precision >> 6) & 0x3) { // format + case 0: //nhwc: + addr += x * fm->stride_x + c; + break; + case 1: //nhcwb16: + element_size = BIT((fm->precision >> 1) & 0x3); + + addr += (c / 16) * fm->stride_c + (16 * x + (c & 0xf)) * element_size; + break; + } + + info->region_size[fm->region] = max(info->region_size[fm->region], addr + 1); + + return addr; +} + +static int calc_sizes(struct drm_device *ddev, + struct ethosu_validated_cmdstream_info *info, + u16 op, struct cmd_state *st, + bool ifm, bool ifm2, bool weight, bool scale) +{ + u64 len; + + if (ifm) { + if (st->ifm.stride_kernel == U16_MAX) + return -EINVAL; + u32 stride_y = ((st->ifm.stride_kernel >> 8) & 0x2) + + ((st->ifm.stride_kernel >> 1) & 0x1) + 1; + u32 stride_x = ((st->ifm.stride_kernel >> 5) & 0x2) + + (st->ifm.stride_kernel & 0x1) + 1; + u32 ifm_height = st->ofm.height[2] * stride_y + + st->ifm.height[2] - (st->ifm.pad_top + st->ifm.pad_bottom); + u32 ifm_width = st->ofm.width * stride_x + + st->ifm.width - (st->ifm.pad_left + st->ifm.pad_right); + + len = feat_matrix_length(info, &st->ifm, ifm_width, + ifm_height, st->ifm.depth); + dev_dbg(ddev->dev, "op %d: IFM:%d:0x%llx-0x%llx\n", + op, st->ifm.region, st->ifm.base[0], len); + if (len == U64_MAX) + return -EINVAL; + } + + if (ifm2) { + len = feat_matrix_length(info, &st->ifm2, st->ifm.depth, + 0, st->ofm.depth); + dev_dbg(ddev->dev, "op %d: 
IFM2:%d:0x%llx-0x%llx\n", + op, st->ifm2.region, st->ifm2.base[0], len); + if (len == U64_MAX) + return -EINVAL; + } + + if (weight) { + dev_dbg(ddev->dev, "op %d: W:%d:0x%llx-0x%llx\n", + op, st->weight[0].region, st->weight[0].base, + st->weight[0].base + st->weight[0].length - 1); + if (st->weight[0].region < 0 || st->weight[0].base == U64_MAX || + st->weight[0].length == U32_MAX) + return -EINVAL; + info->region_size[st->weight[0].region] = + max(info->region_size[st->weight[0].region], + st->weight[0].base + st->weight[0].length); + } + + if (scale) { + dev_dbg(ddev->dev, "op %d: S:%d:0x%llx-0x%llx\n", + op, st->scale[0].region, st->scale[0].base, + st->scale[0].base + st->scale[0].length - 1); + if (st->scale[0].region < 0 || st->scale[0].base == U64_MAX || + st->scale[0].length == U32_MAX) + return -EINVAL; + info->region_size[st->scale[0].region] = + max(info->region_size[st->scale[0].region], + st->scale[0].base + st->scale[0].length); + } + + len = feat_matrix_length(info, &st->ofm, st->ofm.width, + st->ofm.height[2], st->ofm.depth); + dev_dbg(ddev->dev, "op %d: OFM:%d:0x%llx-0x%llx\n", + op, st->ofm.region, st->ofm.base[0], len); + if (len == U64_MAX) + return -EINVAL; + info->output_region[st->ofm.region] = true; + + return 0; +} + +static int calc_sizes_elemwise(struct drm_device *ddev, + struct ethosu_validated_cmdstream_info *info, + u16 op, struct cmd_state *st, + bool ifm, bool ifm2) +{ + u32 height, width, depth; + u64 len; + + if (ifm) { + height = st->ifm.broadcast & 0x1 ? 0 : st->ofm.height[2]; + width = st->ifm.broadcast & 0x2 ? 0 : st->ofm.width; + depth = st->ifm.broadcast & 0x4 ? 0 : st->ofm.depth; + + len = feat_matrix_length(info, &st->ifm, width, + height, depth); + dev_dbg(ddev->dev, "op %d: IFM:%d:0x%llx-0x%llx\n", + op, st->ifm.region, st->ifm.base[0], len); + if (len == U64_MAX) + return -EINVAL; + } + + if (ifm2) { + height = st->ifm2.broadcast & 0x1 ? 0 : st->ofm.height[2]; + width = st->ifm2.broadcast & 0x2 ? 0 : st->ofm.width; + depth = st->ifm2.broadcast & 0x4 ? 
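+			/* broadcast bit set: only element 0 of this axis contributes to the size bound */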
0 : st->ofm.depth; + + len = feat_matrix_length(info, &st->ifm2, width, + height, depth); + dev_dbg(ddev->dev, "op %d: IFM2:%d:0x%llx-0x%llx\n", + op, st->ifm2.region, st->ifm2.base[0], len); + if (len == U64_MAX) + return -EINVAL; + } + + len = feat_matrix_length(info, &st->ofm, st->ofm.width, + st->ofm.height[2], st->ofm.depth); + dev_dbg(ddev->dev, "op %d: OFM:%d:0x%llx-0x%llx\n", + op, st->ofm.region, st->ofm.base[0], len); + if (len == U64_MAX) + return -EINVAL; + info->output_region[st->ofm.region] = true; + + return 0; +} + +static int ethosu_gem_cmdstream_copy_and_validate(struct drm_device *ddev, + u32 __user *ucmds, + struct ethosu_gem_object *bo, + u32 size) +{ + struct ethosu_validated_cmdstream_info __free(kfree) *info = kzalloc(sizeof(*info), GFP_KERNEL); + struct ethosu_device *edev = to_ethosu_device(ddev); + u32 *bocmds = bo->base.vaddr; + struct cmd_state st; + int i, ret; + + if (!info) + return -ENOMEM; + info->cmd_size = size; + + cmd_state_init(&st); + + for (i = 0; i < size / 4; i++) { + bool use_ifm, use_ifm2, use_scale; + u64 dstlen, srclen; + u16 cmd, param; + u32 cmds[2]; + u64 addr; + + if (get_user(cmds[0], ucmds++)) + return -EFAULT; + + bocmds[i] = cmds[0]; + + cmd = cmds[0]; + param = cmds[0] >> 16; + + if (cmd & 0x4000) { + if (get_user(cmds[1], ucmds++)) + return -EFAULT; + + i++; + bocmds[i] = cmds[1]; + addr = cmd_to_addr(cmds); + } + + switch (cmd) { + case NPU_OP_DMA_START: + srclen = dma_length(info, &st.dma, &st.dma.src); + dstlen = dma_length(info, &st.dma, &st.dma.dst); + + if (st.dma.dst.region >= 0) + info->output_region[st.dma.dst.region] = true; + dev_dbg(ddev->dev, "cmd: DMA SRC:%d:0x%llx+0x%llx DST:%d:0x%llx+0x%llx\n", + st.dma.src.region, st.dma.src.offset, srclen, + st.dma.dst.region, st.dma.dst.offset, dstlen); + break; + case NPU_OP_CONV: + case NPU_OP_DEPTHWISE: + use_ifm2 = param & 0x1; // weights_ifm2 + use_scale = !(st.ofm.precision & 0x100); + ret = calc_sizes(ddev, info, cmd, &st, true, use_ifm2, + !use_ifm2, use_scale); + if (ret) + return ret; + break; + case NPU_OP_POOL: + use_ifm = param != 0x4; // pooling mode + use_scale = !(st.ofm.precision & 0x100); + ret = calc_sizes(ddev, info, cmd, &st, use_ifm, false, + false, use_scale); + if (ret) + return ret; + break; + case NPU_OP_ELEMENTWISE: + use_ifm2 = !((st.ifm2.broadcast == 8) || (param == 5) || + (param == 6) || (param == 7) || (param == 0x24)); + use_ifm = st.ifm.broadcast != 8; + ret = calc_sizes_elemwise(ddev, info, cmd, &st, use_ifm, use_ifm2); + if (ret) + return ret; + break; + case NPU_OP_RESIZE: // U85 only + WARN_ON(1); // TODO + break; + case NPU_SET_KERNEL_WIDTH_M1: + st.ifm.width = param; + break; + case NPU_SET_KERNEL_HEIGHT_M1: + st.ifm.height[2] = param; + break; + case NPU_SET_KERNEL_STRIDE: + st.ifm.stride_kernel = param; + break; + case NPU_SET_IFM_PAD_TOP: + st.ifm.pad_top = param & 0x7f; + break; + case NPU_SET_IFM_PAD_LEFT: + st.ifm.pad_left = param & 0x7f; + break; + case NPU_SET_IFM_PAD_RIGHT: + st.ifm.pad_right = param & 0xff; + break; + case NPU_SET_IFM_PAD_BOTTOM: + st.ifm.pad_bottom = param & 0xff; + break; + case NPU_SET_IFM_DEPTH_M1: + st.ifm.depth = param; + break; + case NPU_SET_IFM_PRECISION: + st.ifm.precision = param; + break; + case NPU_SET_IFM_BROADCAST: + st.ifm.broadcast = param; + break; + case NPU_SET_IFM_REGION: + st.ifm.region = param & 0x7f; + break; + case NPU_SET_IFM_WIDTH0_M1: + st.ifm.width0 = param; + break; + case NPU_SET_IFM_HEIGHT0_M1: + st.ifm.height[0] = param; + break; + case NPU_SET_IFM_HEIGHT1_M1: + st.ifm.height[1] = 
param; + break; + case NPU_SET_IFM_BASE0: + case NPU_SET_IFM_BASE1: + case NPU_SET_IFM_BASE2: + case NPU_SET_IFM_BASE3: + st.ifm.base[cmd & 0x3] = addr; + break; + case NPU_SET_IFM_STRIDE_X: + st.ifm.stride_x = addr; + break; + case NPU_SET_IFM_STRIDE_Y: + st.ifm.stride_y = addr; + break; + case NPU_SET_IFM_STRIDE_C: + st.ifm.stride_c = addr; + break; + + case NPU_SET_OFM_WIDTH_M1: + st.ofm.width = param; + break; + case NPU_SET_OFM_HEIGHT_M1: + st.ofm.height[2] = param; + break; + case NPU_SET_OFM_DEPTH_M1: + st.ofm.depth = param; + break; + case NPU_SET_OFM_PRECISION: + st.ofm.precision = param; + break; + case NPU_SET_OFM_REGION: + st.ofm.region = param & 0x7; + break; + case NPU_SET_OFM_WIDTH0_M1: + st.ofm.width0 = param; + break; + case NPU_SET_OFM_HEIGHT0_M1: + st.ofm.height[0] = param; + break; + case NPU_SET_OFM_HEIGHT1_M1: + st.ofm.height[1] = param; + break; + case NPU_SET_OFM_BASE0: + case NPU_SET_OFM_BASE1: + case NPU_SET_OFM_BASE2: + case NPU_SET_OFM_BASE3: + st.ofm.base[cmd & 0x3] = addr; + break; + case NPU_SET_OFM_STRIDE_X: + st.ofm.stride_x = addr; + break; + case NPU_SET_OFM_STRIDE_Y: + st.ofm.stride_y = addr; + break; + case NPU_SET_OFM_STRIDE_C: + st.ofm.stride_c = addr; + break; + + case NPU_SET_IFM2_BROADCAST: + st.ifm2.broadcast = param; + break; + case NPU_SET_IFM2_PRECISION: + st.ifm2.precision = param; + break; + case NPU_SET_IFM2_REGION: + st.ifm2.region = param & 0x7; + break; + case NPU_SET_IFM2_WIDTH0_M1: + st.ifm2.width0 = param; + break; + case NPU_SET_IFM2_HEIGHT0_M1: + st.ifm2.height[0] = param; + break; + case NPU_SET_IFM2_HEIGHT1_M1: + st.ifm2.height[1] = param; + break; + case NPU_SET_IFM2_BASE0: + case NPU_SET_IFM2_BASE1: + case NPU_SET_IFM2_BASE2: + case NPU_SET_IFM2_BASE3: + st.ifm2.base[cmd & 0x3] = addr; + break; + case NPU_SET_IFM2_STRIDE_X: + st.ifm2.stride_x = addr; + break; + case NPU_SET_IFM2_STRIDE_Y: + st.ifm2.stride_y = addr; + break; + case NPU_SET_IFM2_STRIDE_C: + st.ifm2.stride_c = addr; + break; + + case NPU_SET_WEIGHT_REGION: + st.weight[0].region = param & 0x7; + break; + case NPU_SET_SCALE_REGION: + st.scale[0].region = param & 0x7; + break; + case NPU_SET_WEIGHT_BASE: + st.weight[0].base = addr; + break; + case NPU_SET_WEIGHT_LENGTH: + st.weight[0].length = cmds[1]; + break; + case NPU_SET_SCALE_BASE: + st.scale[0].base = addr; + break; + case NPU_SET_SCALE_LENGTH: + st.scale[0].length = cmds[1]; + break; + case NPU_SET_WEIGHT1_BASE: + st.weight[1].base = addr; + break; + case NPU_SET_WEIGHT1_LENGTH: + st.weight[1].length = cmds[1]; + break; + case NPU_SET_SCALE1_BASE: // NPU_SET_WEIGHT2_BASE (U85) + if (ethosu_is_u65(edev)) + st.scale[1].base = addr; + else + st.weight[2].base = addr; + break; + case NPU_SET_SCALE1_LENGTH: // NPU_SET_WEIGHT2_LENGTH (U85) + if (ethosu_is_u65(edev)) + st.scale[1].length = cmds[1]; + else + st.weight[1].length = cmds[1]; + break; + case NPU_SET_WEIGHT3_BASE: + st.weight[3].base = addr; + break; + case NPU_SET_WEIGHT3_LENGTH: + st.weight[3].length = cmds[1]; + break; + + case NPU_SET_DMA0_SRC_REGION: + if (param & 0x100) + st.dma.src.region = -1; + else + st.dma.src.region = param & 0x7; + st.dma.mode = (param >> 9) & 0x3; + break; + case NPU_SET_DMA0_DST_REGION: + if (param & 0x100) + st.dma.dst.region = -1; + else + st.dma.dst.region = param & 0x7; + break; + case NPU_SET_DMA0_SIZE0: + st.dma.size0 = param; + break; + case NPU_SET_DMA0_SIZE1: + st.dma.size1 = param; + break; + case NPU_SET_DMA0_SRC_STRIDE0: + st.dma.src.stride[0] = ((s64)addr << 24) >> 24; + break; + case NPU_SET_DMA0_SRC_STRIDE1: + 
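+			/*
+			 * Strides are 40-bit two's complement values: shifting
+			 * up and back down sign-extends bit 39, e.g.
+			 * 0xff_ffff_fff0 becomes -16.
+			 */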
+			st.dma.src.stride[1] = ((s64)addr << 24) >> 24;
+			break;
+		case NPU_SET_DMA0_DST_STRIDE0:
+			st.dma.dst.stride[0] = ((s64)addr << 24) >> 24;
+			break;
+		case NPU_SET_DMA0_DST_STRIDE1:
+			st.dma.dst.stride[1] = ((s64)addr << 24) >> 24;
+			break;
+		case NPU_SET_DMA0_SRC:
+			st.dma.src.offset = addr;
+			break;
+		case NPU_SET_DMA0_DST:
+			st.dma.dst.offset = addr;
+			break;
+		case NPU_SET_DMA0_LEN:
+			st.dma.src.len = st.dma.dst.len = addr;
+			break;
+		default:
+			break;
+		}
+	}
+
+	for (i = 0; i < NPU_BASEP_REGION_MAX; i++) {
+		if (!info->region_size[i])
+			continue;
+		dev_dbg(ddev->dev, "region %d max size: 0x%llx\n",
+			i, info->region_size[i]);
+	}
+
+	bo->info = no_free_ptr(info);
+	return 0;
+}
+
+/**
+ * ethosu_gem_cmdstream_create() - Create a command stream GEM object and attach it to a handle.
+ * @file: DRM file.
+ * @ddev: DRM device.
+ * @size: Size of the GEM object to allocate.
+ * @data: Userspace pointer to the command stream to copy and validate.
+ * @flags: Combination of drm_ethosu_bo_flags flags.
+ * @handle: Pointer holding the handle pointing to the new GEM object.
+ *
+ * Return: Zero on success.
+ */
+int ethosu_gem_cmdstream_create(struct drm_file *file,
+				struct drm_device *ddev,
+				u32 size, u64 data, u32 flags, u32 *handle)
+{
+	int ret;
+	struct drm_gem_dma_object *mem;
+	struct ethosu_gem_object *bo;
+
+	mem = drm_gem_dma_create(ddev, size);
+	if (IS_ERR(mem))
+		return PTR_ERR(mem);
+
+	bo = to_ethosu_bo(&mem->base);
+	bo->flags = flags;
+
+	ret = ethosu_gem_cmdstream_copy_and_validate(ddev,
+						     (void __user *)(uintptr_t)data,
+						     bo, size);
+	if (ret)
+		goto fail;
+
+	/*
+	 * Create a handle for the object; the handle is the ID that
+	 * userspace uses to refer to it.
+	 */
+	ret = drm_gem_handle_create(file, &mem->base, handle);
+
+fail:
+	/* drop reference from allocate - handle holds it now. */
+	drm_gem_object_put(&mem->base);
+
+	return ret;
+}
diff --git a/drivers/accel/ethosu/ethosu_gem.h b/drivers/accel/ethosu/ethosu_gem.h
new file mode 100644
index 000000000000..3922895a60fb
--- /dev/null
+++ b/drivers/accel/ethosu/ethosu_gem.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-only OR MIT */
+/* Copyright 2025 Arm, Ltd. */
+
+#ifndef __ETHOSU_GEM_H__
+#define __ETHOSU_GEM_H__
+
+#include "ethosu_device.h"
+#include <drm/drm_gem_dma_helper.h>
+
+struct ethosu_validated_cmdstream_info {
+	u32 cmd_size;
+	u64 region_size[NPU_BASEP_REGION_MAX];
+	bool output_region[NPU_BASEP_REGION_MAX];
+};
+
+/**
+ * struct ethosu_gem_object - Driver specific GEM object.
+ */
+struct ethosu_gem_object {
+	/** @base: Inherit from drm_gem_dma_object. */
+	struct drm_gem_dma_object base;
+
+	struct ethosu_validated_cmdstream_info *info;
+
+	/** @flags: Combination of drm_ethosu_bo_flags flags.
*/ + u32 flags; +}; + +static inline +struct ethosu_gem_object *to_ethosu_bo(struct drm_gem_object *obj) +{ + return container_of(to_drm_gem_dma_obj(obj), struct ethosu_gem_object, base); +} + +struct drm_gem_object *ethosu_gem_create_object(struct drm_device *ddev, + size_t size); + +int ethosu_gem_create_with_handle(struct drm_file *file, + struct drm_device *ddev, + u64 *size, u32 flags, uint32_t *handle); + +int ethosu_gem_cmdstream_create(struct drm_file *file, + struct drm_device *ddev, + u32 size, u64 data, u32 flags, u32 *handle); + +#endif /* __ETHOSU_GEM_H__ */ diff --git a/drivers/accel/ethosu/ethosu_job.c b/drivers/accel/ethosu/ethosu_job.c new file mode 100644 index 000000000000..26e7a2f64d71 --- /dev/null +++ b/drivers/accel/ethosu/ethosu_job.c @@ -0,0 +1,497 @@ +// SPDX-License-Identifier: GPL-2.0-only OR MIT +/* Copyright 2024-2025 Tomeu Vizoso <tomeu@tomeuvizoso.net> */ +/* Copyright 2025 Arm, Ltd. */ + +#include <linux/bitfield.h> +#include <linux/genalloc.h> +#include <linux/interrupt.h> +#include <linux/iopoll.h> +#include <linux/platform_device.h> +#include <linux/pm_runtime.h> + +#include <drm/drm_file.h> +#include <drm/drm_gem.h> +#include <drm/drm_gem_dma_helper.h> +#include <drm/drm_print.h> +#include <drm/ethosu_accel.h> + +#include "ethosu_device.h" +#include "ethosu_drv.h" +#include "ethosu_gem.h" +#include "ethosu_job.h" + +#define JOB_TIMEOUT_MS 500 + +static struct ethosu_job *to_ethosu_job(struct drm_sched_job *sched_job) +{ + return container_of(sched_job, struct ethosu_job, base); +} + +static const char *ethosu_fence_get_driver_name(struct dma_fence *fence) +{ + return "ethosu"; +} + +static const char *ethosu_fence_get_timeline_name(struct dma_fence *fence) +{ + return "ethosu-npu"; +} + +static const struct dma_fence_ops ethosu_fence_ops = { + .get_driver_name = ethosu_fence_get_driver_name, + .get_timeline_name = ethosu_fence_get_timeline_name, +}; + +static void ethosu_job_hw_submit(struct ethosu_device *dev, struct ethosu_job *job) +{ + struct drm_gem_dma_object *cmd_bo = to_drm_gem_dma_obj(job->cmd_bo); + struct ethosu_validated_cmdstream_info *cmd_info = to_ethosu_bo(job->cmd_bo)->info; + + for (int i = 0; i < job->region_cnt; i++) { + struct drm_gem_dma_object *bo; + int region = job->region_bo_num[i]; + + bo = to_drm_gem_dma_obj(job->region_bo[i]); + writel_relaxed(lower_32_bits(bo->dma_addr), dev->regs + NPU_REG_BASEP(region)); + writel_relaxed(upper_32_bits(bo->dma_addr), dev->regs + NPU_REG_BASEP_HI(region)); + dev_dbg(dev->base.dev, "Region %d base addr = %pad\n", region, &bo->dma_addr); + } + + if (job->sram_size) { + writel_relaxed(lower_32_bits(dev->sramphys), + dev->regs + NPU_REG_BASEP(ETHOSU_SRAM_REGION)); + writel_relaxed(upper_32_bits(dev->sramphys), + dev->regs + NPU_REG_BASEP_HI(ETHOSU_SRAM_REGION)); + dev_dbg(dev->base.dev, "Region %d base addr = %pad (SRAM)\n", + ETHOSU_SRAM_REGION, &dev->sramphys); + } + + writel_relaxed(lower_32_bits(cmd_bo->dma_addr), dev->regs + NPU_REG_QBASE); + writel_relaxed(upper_32_bits(cmd_bo->dma_addr), dev->regs + NPU_REG_QBASE_HI); + writel_relaxed(cmd_info->cmd_size, dev->regs + NPU_REG_QSIZE); + + writel(CMD_TRANSITION_TO_RUN, dev->regs + NPU_REG_CMD); + + dev_dbg(dev->base.dev, + "Submitted cmd at %pad to core\n", &cmd_bo->dma_addr); +} + +static int ethosu_acquire_object_fences(struct ethosu_job *job) +{ + int i, ret; + struct drm_gem_object **bos = job->region_bo; + struct ethosu_validated_cmdstream_info *info = to_ethosu_bo(job->cmd_bo)->info; + + for (i = 0; i < job->region_cnt; i++) { + bool 
is_write; + + if (!bos[i]) + break; + + ret = dma_resv_reserve_fences(bos[i]->resv, 1); + if (ret) + return ret; + + is_write = info->output_region[job->region_bo_num[i]]; + ret = drm_sched_job_add_implicit_dependencies(&job->base, bos[i], + is_write); + if (ret) + return ret; + } + + return 0; +} + +static void ethosu_attach_object_fences(struct ethosu_job *job) +{ + int i; + struct dma_fence *fence = job->inference_done_fence; + struct drm_gem_object **bos = job->region_bo; + struct ethosu_validated_cmdstream_info *info = to_ethosu_bo(job->cmd_bo)->info; + + for (i = 0; i < job->region_cnt; i++) + if (info->output_region[job->region_bo_num[i]]) + dma_resv_add_fence(bos[i]->resv, fence, DMA_RESV_USAGE_WRITE); +} + +static int ethosu_job_push(struct ethosu_job *job) +{ + struct ww_acquire_ctx acquire_ctx; + int ret; + + ret = drm_gem_lock_reservations(job->region_bo, job->region_cnt, &acquire_ctx); + if (ret) + return ret; + + ret = ethosu_acquire_object_fences(job); + if (ret) + goto out; + + ret = pm_runtime_resume_and_get(job->dev->base.dev); + if (!ret) { + guard(mutex)(&job->dev->sched_lock); + + drm_sched_job_arm(&job->base); + job->inference_done_fence = dma_fence_get(&job->base.s_fence->finished); + kref_get(&job->refcount); /* put by scheduler job completion */ + drm_sched_entity_push_job(&job->base); + ethosu_attach_object_fences(job); + } + +out: + drm_gem_unlock_reservations(job->region_bo, job->region_cnt, &acquire_ctx); + return ret; +} + +static void ethosu_job_cleanup(struct kref *ref) +{ + struct ethosu_job *job = container_of(ref, struct ethosu_job, + refcount); + unsigned int i; + + pm_runtime_put_autosuspend(job->dev->base.dev); + + dma_fence_put(job->done_fence); + dma_fence_put(job->inference_done_fence); + + for (i = 0; i < job->region_cnt; i++) + drm_gem_object_put(job->region_bo[i]); + + drm_gem_object_put(job->cmd_bo); + + kfree(job); +} + +static void ethosu_job_put(struct ethosu_job *job) +{ + kref_put(&job->refcount, ethosu_job_cleanup); +} + +static void ethosu_job_free(struct drm_sched_job *sched_job) +{ + struct ethosu_job *job = to_ethosu_job(sched_job); + + drm_sched_job_cleanup(sched_job); + ethosu_job_put(job); +} + +static struct dma_fence *ethosu_job_run(struct drm_sched_job *sched_job) +{ + struct ethosu_job *job = to_ethosu_job(sched_job); + struct ethosu_device *dev = job->dev; + struct dma_fence *fence = job->done_fence; + + if (unlikely(job->base.s_fence->finished.error)) + return NULL; + + dma_fence_init(fence, ðosu_fence_ops, &dev->fence_lock, + dev->fence_context, ++dev->emit_seqno); + dma_fence_get(fence); + + scoped_guard(mutex, &dev->job_lock) { + dev->in_flight_job = job; + ethosu_job_hw_submit(dev, job); + } + + return fence; +} + +static void ethosu_job_handle_irq(struct ethosu_device *dev) +{ + u32 status = readl_relaxed(dev->regs + NPU_REG_STATUS); + + if (status & (STATUS_BUS_STATUS | STATUS_CMD_PARSE_ERR)) { + dev_err(dev->base.dev, "Error IRQ - %x\n", status); + drm_sched_fault(&dev->sched); + return; + } + + scoped_guard(mutex, &dev->job_lock) { + if (dev->in_flight_job) { + dma_fence_signal(dev->in_flight_job->done_fence); + dev->in_flight_job = NULL; + } + } +} + +static irqreturn_t ethosu_job_irq_handler_thread(int irq, void *data) +{ + struct ethosu_device *dev = data; + + ethosu_job_handle_irq(dev); + + return IRQ_HANDLED; +} + +static irqreturn_t ethosu_job_irq_handler(int irq, void *data) +{ + struct ethosu_device *dev = data; + u32 status = readl_relaxed(dev->regs + NPU_REG_STATUS); + + if (!(status & STATUS_IRQ_RAISED)) + 
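+		/* Not our interrupt; the IRQ line is shared (IRQF_SHARED). */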
return IRQ_NONE; + + writel_relaxed(CMD_CLEAR_IRQ, dev->regs + NPU_REG_CMD); + return IRQ_WAKE_THREAD; +} + +static enum drm_gpu_sched_stat ethosu_job_timedout(struct drm_sched_job *bad) +{ + struct ethosu_job *job = to_ethosu_job(bad); + struct ethosu_device *dev = job->dev; + bool running; + u32 *bocmds = to_drm_gem_dma_obj(job->cmd_bo)->vaddr; + u32 cmdaddr; + + cmdaddr = readl_relaxed(dev->regs + NPU_REG_QREAD); + running = FIELD_GET(STATUS_STATE_RUNNING, readl_relaxed(dev->regs + NPU_REG_STATUS)); + + if (running) { + int ret; + u32 reg; + + ret = readl_relaxed_poll_timeout(dev->regs + NPU_REG_QREAD, + reg, + reg != cmdaddr, + USEC_PER_MSEC, 100 * USEC_PER_MSEC); + + /* If still running and progress is being made, just return */ + if (!ret) + return DRM_GPU_SCHED_STAT_NO_HANG; + } + + dev_err(dev->base.dev, "NPU sched timed out: NPU %s, cmdstream offset 0x%x: 0x%x\n", + running ? "running" : "stopped", + cmdaddr, bocmds[cmdaddr / 4]); + + drm_sched_stop(&dev->sched, bad); + + scoped_guard(mutex, &dev->job_lock) + dev->in_flight_job = NULL; + + /* Proceed with reset now. */ + pm_runtime_force_suspend(dev->base.dev); + pm_runtime_force_resume(dev->base.dev); + + /* Restart the scheduler */ + drm_sched_start(&dev->sched, 0); + + return DRM_GPU_SCHED_STAT_RESET; +} + +static const struct drm_sched_backend_ops ethosu_sched_ops = { + .run_job = ethosu_job_run, + .timedout_job = ethosu_job_timedout, + .free_job = ethosu_job_free +}; + +int ethosu_job_init(struct ethosu_device *edev) +{ + struct device *dev = edev->base.dev; + struct drm_sched_init_args args = { + .ops = ðosu_sched_ops, + .num_rqs = DRM_SCHED_PRIORITY_COUNT, + .credit_limit = 1, + .timeout = msecs_to_jiffies(JOB_TIMEOUT_MS), + .name = dev_name(dev), + .dev = dev, + }; + int ret; + + spin_lock_init(&edev->fence_lock); + ret = devm_mutex_init(dev, &edev->job_lock); + if (ret) + return ret; + ret = devm_mutex_init(dev, &edev->sched_lock); + if (ret) + return ret; + + edev->irq = platform_get_irq(to_platform_device(dev), 0); + if (edev->irq < 0) + return edev->irq; + + ret = devm_request_threaded_irq(dev, edev->irq, + ethosu_job_irq_handler, + ethosu_job_irq_handler_thread, + IRQF_SHARED, KBUILD_MODNAME, + edev); + if (ret) { + dev_err(dev, "failed to request irq\n"); + return ret; + } + + edev->fence_context = dma_fence_context_alloc(1); + + ret = drm_sched_init(&edev->sched, &args); + if (ret) { + dev_err(dev, "Failed to create scheduler: %d\n", ret); + goto err_sched; + } + + return 0; + +err_sched: + drm_sched_fini(&edev->sched); + return ret; +} + +void ethosu_job_fini(struct ethosu_device *dev) +{ + drm_sched_fini(&dev->sched); +} + +int ethosu_job_open(struct ethosu_file_priv *ethosu_priv) +{ + struct ethosu_device *dev = ethosu_priv->edev; + struct drm_gpu_scheduler *sched = &dev->sched; + int ret; + + ret = drm_sched_entity_init(ðosu_priv->sched_entity, + DRM_SCHED_PRIORITY_NORMAL, + &sched, 1, NULL); + return WARN_ON(ret); +} + +void ethosu_job_close(struct ethosu_file_priv *ethosu_priv) +{ + struct drm_sched_entity *entity = ðosu_priv->sched_entity; + + drm_sched_entity_destroy(entity); +} + +static int ethosu_ioctl_submit_job(struct drm_device *dev, struct drm_file *file, + struct drm_ethosu_job *job) +{ + struct ethosu_device *edev = to_ethosu_device(dev); + struct ethosu_file_priv *file_priv = file->driver_priv; + struct ethosu_job *ejob = NULL; + struct ethosu_validated_cmdstream_info *cmd_info; + int ret = 0; + + /* BO region 2 is reserved if SRAM is used */ + if (job->region_bo_handles[ETHOSU_SRAM_REGION] && 
+	    job->sram_size)
+		return -EINVAL;
+
+	if (edev->npu_info.sram_size < job->sram_size)
+		return -EINVAL;
+
+	ejob = kzalloc(sizeof(*ejob), GFP_KERNEL);
+	if (!ejob)
+		return -ENOMEM;
+
+	kref_init(&ejob->refcount);
+
+	ejob->dev = edev;
+	ejob->sram_size = job->sram_size;
+
+	ejob->done_fence = kzalloc(sizeof(*ejob->done_fence), GFP_KERNEL);
+	if (!ejob->done_fence) {
+		ret = -ENOMEM;
+		goto out_put_job;
+	}
+
+	ret = drm_sched_job_init(&ejob->base,
+				 &file_priv->sched_entity,
+				 1, NULL, file->client_id);
+	if (ret)
+		goto out_put_job;
+
+	ejob->cmd_bo = drm_gem_object_lookup(file, job->cmd_bo);
+	if (!ejob->cmd_bo) {
+		ret = -ENOENT;
+		goto out_cleanup_job;
+	}
+	cmd_info = to_ethosu_bo(ejob->cmd_bo)->info;
+	if (!cmd_info) {
+		ret = -EINVAL;
+		goto out_cleanup_job;
+	}
+
+	for (int i = 0; i < NPU_BASEP_REGION_MAX; i++) {
+		struct drm_gem_object *gem;
+
+		/* Can only omit a BO handle if the region is not used or used for SRAM */
+		if (!job->region_bo_handles[i] &&
+		    (!cmd_info->region_size[i] || (i == ETHOSU_SRAM_REGION && job->sram_size)))
+			continue;
+
+		if (job->region_bo_handles[i] && !cmd_info->region_size[i]) {
+			dev_err(dev->dev,
+				"Cmdstream BO handle %d set for unused region %d\n",
+				job->region_bo_handles[i], i);
+			ret = -EINVAL;
+			goto out_cleanup_job;
+		}
+
+		gem = drm_gem_object_lookup(file, job->region_bo_handles[i]);
+		if (!gem) {
+			dev_err(dev->dev,
+				"Invalid BO handle %d for region %d\n",
+				job->region_bo_handles[i], i);
+			ret = -ENOENT;
+			goto out_cleanup_job;
+		}
+
+		ejob->region_bo[ejob->region_cnt] = gem;
+		ejob->region_bo_num[ejob->region_cnt] = i;
+		ejob->region_cnt++;
+
+		if (to_ethosu_bo(gem)->info) {
+			dev_err(dev->dev,
+				"Cmdstream BO handle %d cannot be used for region %d\n",
+				job->region_bo_handles[i], i);
+			ret = -EINVAL;
+			goto out_cleanup_job;
+		}
+
+		/* Verify the command stream doesn't have accesses outside the BO */
+		if (cmd_info->region_size[i] > gem->size) {
+			dev_err(dev->dev,
+				"cmd stream region %d size greater than BO size (%llu > %zu)\n",
+				i, cmd_info->region_size[i], gem->size);
+			ret = -EOVERFLOW;
+			goto out_cleanup_job;
+		}
+	}
+	ret = ethosu_job_push(ejob);
+
+out_cleanup_job:
+	if (ret)
+		drm_sched_job_cleanup(&ejob->base);
+out_put_job:
+	ethosu_job_put(ejob);
+
+	return ret;
+}
+
+int ethosu_ioctl_submit(struct drm_device *dev, void *data, struct drm_file *file)
+{
+	struct drm_ethosu_submit *args = data;
+	int ret = 0;
+	unsigned int i = 0;
+
+	if (args->pad) {
+		drm_dbg(dev, "Reserved field in drm_ethosu_submit struct should be 0.\n");
+		return -EINVAL;
+	}
+
+	struct drm_ethosu_job __free(kvfree) *jobs =
+		kvmalloc_array(args->job_count, sizeof(*jobs), GFP_KERNEL);
+	if (!jobs)
+		return -ENOMEM;
+
+	if (copy_from_user(jobs,
+			   (void __user *)(uintptr_t)args->jobs,
+			   args->job_count * sizeof(*jobs))) {
+		drm_dbg(dev, "Failed to copy incoming job array\n");
+		return -EFAULT;
+	}
+
+	for (i = 0; i < args->job_count; i++) {
+		ret = ethosu_ioctl_submit_job(dev, file, &jobs[i]);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
diff --git a/drivers/accel/ethosu/ethosu_job.h b/drivers/accel/ethosu/ethosu_job.h
new file mode 100644
index 000000000000..ff1cf448d094
--- /dev/null
+++ b/drivers/accel/ethosu/ethosu_job.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0-only OR MIT */
+/* Copyright 2024-2025 Tomeu Vizoso <tomeu@tomeuvizoso.net> */
+/* Copyright 2025 Arm, Ltd.
*/ + +#ifndef __ETHOSU_JOB_H__ +#define __ETHOSU_JOB_H__ + +#include <linux/kref.h> +#include <drm/gpu_scheduler.h> + +struct ethosu_device; +struct ethosu_file_priv; + +struct ethosu_job { + struct drm_sched_job base; + struct ethosu_device *dev; + + struct drm_gem_object *cmd_bo; + struct drm_gem_object *region_bo[NPU_BASEP_REGION_MAX]; + u8 region_bo_num[NPU_BASEP_REGION_MAX]; + u8 region_cnt; + u32 sram_size; + + /* Fence to be signaled by drm-sched once its done with the job */ + struct dma_fence *inference_done_fence; + + /* Fence to be signaled by IRQ handler when the job is complete. */ + struct dma_fence *done_fence; + + struct kref refcount; +}; + +int ethosu_ioctl_submit(struct drm_device *dev, void *data, struct drm_file *file); + +int ethosu_job_init(struct ethosu_device *dev); +void ethosu_job_fini(struct ethosu_device *dev); +int ethosu_job_open(struct ethosu_file_priv *ethosu_priv); +void ethosu_job_close(struct ethosu_file_priv *ethosu_priv); + +#endif diff --git a/drivers/accel/ivpu/Makefile b/drivers/accel/ivpu/Makefile index 1029e0bab061..dbf76b8a5b4c 100644 --- a/drivers/accel/ivpu/Makefile +++ b/drivers/accel/ivpu/Makefile @@ -6,6 +6,7 @@ intel_vpu-y := \ ivpu_fw.o \ ivpu_fw_log.o \ ivpu_gem.o \ + ivpu_gem_userptr.o \ ivpu_hw.o \ ivpu_hw_btrs.o \ ivpu_hw_ip.o \ diff --git a/drivers/accel/ivpu/ivpu_debugfs.c b/drivers/accel/ivpu/ivpu_debugfs.c index cd24ccd20ba6..3bd85ee6c26b 100644 --- a/drivers/accel/ivpu/ivpu_debugfs.c +++ b/drivers/accel/ivpu/ivpu_debugfs.c @@ -398,35 +398,25 @@ static int dct_active_set(void *data, u64 active_percent) DEFINE_DEBUGFS_ATTRIBUTE(ivpu_dct_fops, dct_active_get, dct_active_set, "%llu\n"); +static void print_priority_band(struct seq_file *s, struct ivpu_hw_info *hw, + int band, const char *name) +{ + seq_printf(s, "%-9s: grace_period %9u process_grace_period %9u process_quantum %9u\n", + name, + hw->hws.grace_period[band], + hw->hws.process_grace_period[band], + hw->hws.process_quantum[band]); +} + static int priority_bands_show(struct seq_file *s, void *v) { struct ivpu_device *vdev = s->private; struct ivpu_hw_info *hw = vdev->hw; - for (int band = VPU_JOB_SCHEDULING_PRIORITY_BAND_IDLE; - band < VPU_JOB_SCHEDULING_PRIORITY_BAND_COUNT; band++) { - switch (band) { - case VPU_JOB_SCHEDULING_PRIORITY_BAND_IDLE: - seq_puts(s, "Idle: "); - break; - - case VPU_JOB_SCHEDULING_PRIORITY_BAND_NORMAL: - seq_puts(s, "Normal: "); - break; - - case VPU_JOB_SCHEDULING_PRIORITY_BAND_FOCUS: - seq_puts(s, "Focus: "); - break; - - case VPU_JOB_SCHEDULING_PRIORITY_BAND_REALTIME: - seq_puts(s, "Realtime: "); - break; - } - - seq_printf(s, "grace_period %9u process_grace_period %9u process_quantum %9u\n", - hw->hws.grace_period[band], hw->hws.process_grace_period[band], - hw->hws.process_quantum[band]); - } + print_priority_band(s, hw, VPU_JOB_SCHEDULING_PRIORITY_BAND_IDLE, "Idle"); + print_priority_band(s, hw, VPU_JOB_SCHEDULING_PRIORITY_BAND_NORMAL, "Normal"); + print_priority_band(s, hw, VPU_JOB_SCHEDULING_PRIORITY_BAND_FOCUS, "Focus"); + print_priority_band(s, hw, VPU_JOB_SCHEDULING_PRIORITY_BAND_REALTIME, "Realtime"); return 0; } diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index 3289751b4757..3d6fccdefdd6 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -57,7 +57,7 @@ MODULE_PARM_DESC(pll_max_ratio, "Maximum PLL ratio used to set NPU frequency"); int ivpu_sched_mode = IVPU_SCHED_MODE_AUTO; module_param_named(sched_mode, ivpu_sched_mode, int, 0444); -MODULE_PARM_DESC(sched_mode, "Scheduler 
mode: -1 - Use default scheduler, 0 - Use OS scheduler, 1 - Use HW scheduler"); +MODULE_PARM_DESC(sched_mode, "Scheduler mode: -1 - Use default scheduler, 0 - Use OS scheduler (supported on 27XX - 50XX), 1 - Use HW scheduler"); bool ivpu_disable_mmu_cont_pages; module_param_named(disable_mmu_cont_pages, ivpu_disable_mmu_cont_pages, bool, 0444); @@ -134,6 +134,8 @@ bool ivpu_is_capable(struct ivpu_device *vdev, u32 capability) return true; case DRM_IVPU_CAP_DMA_MEMORY_RANGE: return true; + case DRM_IVPU_CAP_BO_CREATE_FROM_USERPTR: + return true; case DRM_IVPU_CAP_MANAGE_CMDQ: return vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW; default: @@ -200,6 +202,9 @@ static int ivpu_get_param_ioctl(struct drm_device *dev, void *data, struct drm_f case DRM_IVPU_PARAM_CAPABILITIES: args->value = ivpu_is_capable(vdev, args->index); break; + case DRM_IVPU_PARAM_PREEMPT_BUFFER_SIZE: + args->value = ivpu_fw_preempt_buf_size(vdev); + break; default: ret = -EINVAL; break; @@ -310,6 +315,7 @@ static const struct drm_ioctl_desc ivpu_drm_ioctls[] = { DRM_IOCTL_DEF_DRV(IVPU_CMDQ_CREATE, ivpu_cmdq_create_ioctl, 0), DRM_IOCTL_DEF_DRV(IVPU_CMDQ_DESTROY, ivpu_cmdq_destroy_ioctl, 0), DRM_IOCTL_DEF_DRV(IVPU_CMDQ_SUBMIT, ivpu_cmdq_submit_ioctl, 0), + DRM_IOCTL_DEF_DRV(IVPU_BO_CREATE_FROM_USERPTR, ivpu_bo_create_from_userptr_ioctl, 0), }; static int ivpu_wait_for_ready(struct ivpu_device *vdev) @@ -377,8 +383,7 @@ int ivpu_boot(struct ivpu_device *vdev) drm_WARN_ON(&vdev->drm, atomic_read(&vdev->job_timeout_counter)); drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa)); - /* Update boot params located at first 4KB of FW memory */ - ivpu_fw_boot_params_setup(vdev, ivpu_bo_vaddr(vdev->fw->mem)); + ivpu_fw_boot_params_setup(vdev, ivpu_bo_vaddr(vdev->fw->mem_bp)); ret = ivpu_hw_boot_fw(vdev); if (ret) { @@ -450,6 +455,9 @@ int ivpu_shutdown(struct ivpu_device *vdev) static const struct file_operations ivpu_fops = { .owner = THIS_MODULE, DRM_ACCEL_FOPS, +#ifdef CONFIG_PROC_FS + .show_fdinfo = drm_show_fdinfo, +#endif }; static const struct drm_driver driver = { @@ -464,6 +472,9 @@ static const struct drm_driver driver = { .ioctls = ivpu_drm_ioctls, .num_ioctls = ARRAY_SIZE(ivpu_drm_ioctls), .fops = &ivpu_fops, +#ifdef CONFIG_PROC_FS + .show_fdinfo = drm_show_memory_stats, +#endif .name = DRIVER_NAME, .desc = DRIVER_DESC, @@ -705,6 +716,7 @@ static struct pci_device_id ivpu_pci_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_LNL) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PTL_P) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_WCL) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_NVL) }, { } }; MODULE_DEVICE_TABLE(pci, ivpu_pci_ids); diff --git a/drivers/accel/ivpu/ivpu_drv.h b/drivers/accel/ivpu/ivpu_drv.h index 62ab1c654e63..5b34b6f50e69 100644 --- a/drivers/accel/ivpu/ivpu_drv.h +++ b/drivers/accel/ivpu/ivpu_drv.h @@ -27,6 +27,7 @@ #define PCI_DEVICE_ID_LNL 0x643e #define PCI_DEVICE_ID_PTL_P 0xb03e #define PCI_DEVICE_ID_WCL 0xfd3e +#define PCI_DEVICE_ID_NVL 0xd71d #define IVPU_HW_IP_37XX 37 #define IVPU_HW_IP_40XX 40 @@ -78,6 +79,7 @@ #define IVPU_DBG_KREF BIT(11) #define IVPU_DBG_RPM BIT(12) #define IVPU_DBG_MMU_MAP BIT(13) +#define IVPU_DBG_IOCTL BIT(14) #define ivpu_err(vdev, fmt, ...) 
\ drm_err(&(vdev)->drm, "%s(): " fmt, __func__, ##__VA_ARGS__) @@ -245,6 +247,8 @@ static inline int ivpu_hw_ip_gen(struct ivpu_device *vdev) case PCI_DEVICE_ID_PTL_P: case PCI_DEVICE_ID_WCL: return IVPU_HW_IP_50XX; + case PCI_DEVICE_ID_NVL: + return IVPU_HW_IP_60XX; default: dump_stack(); ivpu_err(vdev, "Unknown NPU IP generation\n"); @@ -261,6 +265,7 @@ static inline int ivpu_hw_btrs_gen(struct ivpu_device *vdev) case PCI_DEVICE_ID_LNL: case PCI_DEVICE_ID_PTL_P: case PCI_DEVICE_ID_WCL: + case PCI_DEVICE_ID_NVL: return IVPU_HW_BTRS_LNL; default: dump_stack(); diff --git a/drivers/accel/ivpu/ivpu_fw.c b/drivers/accel/ivpu/ivpu_fw.c index 9db741695401..48386d2cddbb 100644 --- a/drivers/accel/ivpu/ivpu_fw.c +++ b/drivers/accel/ivpu/ivpu_fw.c @@ -17,15 +17,10 @@ #include "ivpu_ipc.h" #include "ivpu_pm.h" -#define FW_GLOBAL_MEM_START (2ull * SZ_1G) -#define FW_GLOBAL_MEM_END (3ull * SZ_1G) -#define FW_SHARED_MEM_SIZE SZ_256M /* Must be aligned to FW_SHARED_MEM_ALIGNMENT */ -#define FW_SHARED_MEM_ALIGNMENT SZ_128K /* VPU MTRR limitation */ -#define FW_RUNTIME_MAX_SIZE SZ_512M #define FW_SHAVE_NN_MAX_SIZE SZ_2M -#define FW_RUNTIME_MIN_ADDR (FW_GLOBAL_MEM_START) -#define FW_RUNTIME_MAX_ADDR (FW_GLOBAL_MEM_END - FW_SHARED_MEM_SIZE) #define FW_FILE_IMAGE_OFFSET (VPU_FW_HEADER_SIZE + FW_VERSION_HEADER_SIZE) +#define FW_PREEMPT_BUF_MIN_SIZE SZ_4K +#define FW_PREEMPT_BUF_MAX_SIZE SZ_32M #define WATCHDOG_MSS_REDIRECT 32 #define WATCHDOG_NCE_REDIRECT 33 @@ -61,12 +56,14 @@ static struct { { IVPU_HW_IP_40XX, "intel/vpu/vpu_40xx_v0.0.bin" }, { IVPU_HW_IP_50XX, "intel/vpu/vpu_50xx_v1.bin" }, { IVPU_HW_IP_50XX, "intel/vpu/vpu_50xx_v0.0.bin" }, + { IVPU_HW_IP_60XX, "intel/vpu/vpu_60xx_v1.bin" }, }; /* Production fw_names from the table above */ MODULE_FIRMWARE("intel/vpu/vpu_37xx_v1.bin"); MODULE_FIRMWARE("intel/vpu/vpu_40xx_v1.bin"); MODULE_FIRMWARE("intel/vpu/vpu_50xx_v1.bin"); +MODULE_FIRMWARE("intel/vpu/vpu_60xx_v1.bin"); static int ivpu_fw_request(struct ivpu_device *vdev) { @@ -131,9 +128,14 @@ ivpu_fw_check_api_ver_lt(struct ivpu_device *vdev, const struct vpu_firmware_hea return false; } -static bool is_within_range(u64 addr, size_t size, u64 range_start, size_t range_size) +bool ivpu_is_within_range(u64 addr, size_t size, struct ivpu_addr_range *range) { - if (addr < range_start || addr + size > range_start + range_size) + u64 addr_end; + + if (!range || check_add_overflow(addr, size, &addr_end)) + return false; + + if (addr < range->start || addr_end > range->end) return false; return true; @@ -142,6 +144,12 @@ static bool is_within_range(u64 addr, size_t size, u64 range_start, size_t range static u32 ivpu_fw_sched_mode_select(struct ivpu_device *vdev, const struct vpu_firmware_header *fw_hdr) { + if (ivpu_hw_ip_gen(vdev) >= IVPU_HW_IP_60XX && + ivpu_sched_mode == VPU_SCHEDULING_MODE_OS) { + ivpu_warn(vdev, "OS sched mode is not supported, using HW mode\n"); + return VPU_SCHEDULING_MODE_HW; + } + if (ivpu_sched_mode != IVPU_SCHED_MODE_AUTO) return ivpu_sched_mode; @@ -151,11 +159,56 @@ ivpu_fw_sched_mode_select(struct ivpu_device *vdev, const struct vpu_firmware_he return VPU_SCHEDULING_MODE_HW; } +static void +ivpu_preemption_config_parse(struct ivpu_device *vdev, const struct vpu_firmware_header *fw_hdr) +{ + struct ivpu_fw_info *fw = vdev->fw; + u32 primary_preempt_buf_size, secondary_preempt_buf_size; + + if (fw_hdr->preemption_buffer_1_max_size) + primary_preempt_buf_size = fw_hdr->preemption_buffer_1_max_size; + else + primary_preempt_buf_size = fw_hdr->preemption_buffer_1_size; + + if 
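+	/* Use the max size when the FW header provides one, else the fixed size. */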
(fw_hdr->preemption_buffer_2_max_size) + secondary_preempt_buf_size = fw_hdr->preemption_buffer_2_max_size; + else + secondary_preempt_buf_size = fw_hdr->preemption_buffer_2_size; + + ivpu_dbg(vdev, FW_BOOT, "Preemption buffer size, primary: %u, secondary: %u\n", + primary_preempt_buf_size, secondary_preempt_buf_size); + + if (primary_preempt_buf_size < FW_PREEMPT_BUF_MIN_SIZE || + secondary_preempt_buf_size < FW_PREEMPT_BUF_MIN_SIZE) { + ivpu_warn(vdev, "Preemption buffers size too small\n"); + return; + } + + if (primary_preempt_buf_size > FW_PREEMPT_BUF_MAX_SIZE || + secondary_preempt_buf_size > FW_PREEMPT_BUF_MAX_SIZE) { + ivpu_warn(vdev, "Preemption buffers size too big\n"); + return; + } + + if (fw->sched_mode != VPU_SCHEDULING_MODE_HW) + return; + + if (ivpu_test_mode & IVPU_TEST_MODE_MIP_DISABLE) + return; + + vdev->fw->primary_preempt_buf_size = ALIGN(primary_preempt_buf_size, PAGE_SIZE); + vdev->fw->secondary_preempt_buf_size = ALIGN(secondary_preempt_buf_size, PAGE_SIZE); +} + static int ivpu_fw_parse(struct ivpu_device *vdev) { struct ivpu_fw_info *fw = vdev->fw; const struct vpu_firmware_header *fw_hdr = (const void *)fw->file->data; - u64 runtime_addr, image_load_addr, runtime_size, image_size; + struct ivpu_addr_range fw_image_range; + u64 boot_params_addr, boot_params_size; + u64 fw_version_addr, fw_version_size; + u64 runtime_addr, runtime_size; + u64 image_load_addr, image_size; if (fw->file->size <= FW_FILE_IMAGE_OFFSET) { ivpu_err(vdev, "Firmware file is too small: %zu\n", fw->file->size); @@ -167,18 +220,37 @@ static int ivpu_fw_parse(struct ivpu_device *vdev) return -EINVAL; } - runtime_addr = fw_hdr->boot_params_load_address; - runtime_size = fw_hdr->runtime_size; - image_load_addr = fw_hdr->image_load_address; - image_size = fw_hdr->image_size; + boot_params_addr = fw_hdr->boot_params_load_address; + boot_params_size = SZ_4K; - if (runtime_addr < FW_RUNTIME_MIN_ADDR || runtime_addr > FW_RUNTIME_MAX_ADDR) { - ivpu_err(vdev, "Invalid firmware runtime address: 0x%llx\n", runtime_addr); + if (!ivpu_is_within_range(boot_params_addr, boot_params_size, &vdev->hw->ranges.runtime)) { + ivpu_err(vdev, "Invalid boot params address: 0x%llx\n", boot_params_addr); return -EINVAL; } - if (runtime_size < fw->file->size || runtime_size > FW_RUNTIME_MAX_SIZE) { - ivpu_err(vdev, "Invalid firmware runtime size: %llu\n", runtime_size); + fw_version_addr = fw_hdr->firmware_version_load_address; + fw_version_size = ALIGN(fw_hdr->firmware_version_size, SZ_4K); + + if (fw_version_size != SZ_4K) { + ivpu_err(vdev, "Invalid firmware version size: %u\n", + fw_hdr->firmware_version_size); + return -EINVAL; + } + + if (!ivpu_is_within_range(fw_version_addr, fw_version_size, &vdev->hw->ranges.runtime)) { + ivpu_err(vdev, "Invalid firmware version address: 0x%llx\n", fw_version_addr); + return -EINVAL; + } + + runtime_addr = fw_hdr->image_load_address; + runtime_size = fw_hdr->runtime_size - boot_params_size - fw_version_size; + + image_load_addr = fw_hdr->image_load_address; + image_size = fw_hdr->image_size; + + if (!ivpu_is_within_range(runtime_addr, runtime_size, &vdev->hw->ranges.runtime)) { + ivpu_err(vdev, "Invalid firmware runtime address: 0x%llx and size %llu\n", + runtime_addr, runtime_size); return -EINVAL; } @@ -187,23 +259,25 @@ static int ivpu_fw_parse(struct ivpu_device *vdev) return -EINVAL; } - if (image_load_addr < runtime_addr || - image_load_addr + image_size > runtime_addr + runtime_size) { - ivpu_err(vdev, "Invalid firmware load address size: 0x%llx and size %llu\n", + if 
(!ivpu_is_within_range(image_load_addr, image_size, &vdev->hw->ranges.runtime)) { + ivpu_err(vdev, "Invalid firmware load address: 0x%llx and size %llu\n", image_load_addr, image_size); return -EINVAL; } - if (fw_hdr->shave_nn_fw_size > FW_SHAVE_NN_MAX_SIZE) { - ivpu_err(vdev, "SHAVE NN firmware is too big: %u\n", fw_hdr->shave_nn_fw_size); + if (ivpu_hw_range_init(vdev, &fw_image_range, image_load_addr, image_size)) return -EINVAL; - } - if (fw_hdr->entry_point < image_load_addr || - fw_hdr->entry_point >= image_load_addr + image_size) { + if (!ivpu_is_within_range(fw_hdr->entry_point, SZ_4K, &fw_image_range)) { ivpu_err(vdev, "Invalid entry point: 0x%llx\n", fw_hdr->entry_point); return -EINVAL; } + + if (fw_hdr->shave_nn_fw_size > FW_SHAVE_NN_MAX_SIZE) { + ivpu_err(vdev, "SHAVE NN firmware is too big: %u\n", fw_hdr->shave_nn_fw_size); + return -EINVAL; + } + ivpu_dbg(vdev, FW_BOOT, "Header version: 0x%x, format 0x%x\n", fw_hdr->header_version, fw_hdr->image_format); @@ -217,6 +291,10 @@ static int ivpu_fw_parse(struct ivpu_device *vdev) if (IVPU_FW_CHECK_API_COMPAT(vdev, fw_hdr, JSM, 3)) return -EINVAL; + fw->boot_params_addr = boot_params_addr; + fw->boot_params_size = boot_params_size; + fw->fw_version_addr = fw_version_addr; + fw->fw_version_size = fw_version_size; fw->runtime_addr = runtime_addr; fw->runtime_size = runtime_size; fw->image_load_offset = image_load_addr - runtime_addr; @@ -235,22 +313,13 @@ static int ivpu_fw_parse(struct ivpu_device *vdev) fw->sched_mode = ivpu_fw_sched_mode_select(vdev, fw_hdr); ivpu_info(vdev, "Scheduler mode: %s\n", fw->sched_mode ? "HW" : "OS"); - if (fw_hdr->preemption_buffer_1_max_size) - fw->primary_preempt_buf_size = fw_hdr->preemption_buffer_1_max_size; - else - fw->primary_preempt_buf_size = fw_hdr->preemption_buffer_1_size; + ivpu_preemption_config_parse(vdev, fw_hdr); + ivpu_dbg(vdev, FW_BOOT, "Mid-inference preemption %s supported\n", + ivpu_fw_preempt_buf_size(vdev) ? 
"is" : "is not"); - if (fw_hdr->preemption_buffer_2_max_size) - fw->secondary_preempt_buf_size = fw_hdr->preemption_buffer_2_max_size; - else - fw->secondary_preempt_buf_size = fw_hdr->preemption_buffer_2_size; - ivpu_dbg(vdev, FW_BOOT, "Preemption buffer sizes: primary %u, secondary %u\n", - fw->primary_preempt_buf_size, fw->secondary_preempt_buf_size); - - if (fw_hdr->ro_section_start_address && !is_within_range(fw_hdr->ro_section_start_address, - fw_hdr->ro_section_size, - fw_hdr->image_load_address, - fw_hdr->image_size)) { + if (fw_hdr->ro_section_start_address && + !ivpu_is_within_range(fw_hdr->ro_section_start_address, fw_hdr->ro_section_size, + &fw_image_range)) { ivpu_err(vdev, "Invalid read-only section: start address 0x%llx, size %u\n", fw_hdr->ro_section_start_address, fw_hdr->ro_section_size); return -EINVAL; @@ -259,12 +328,18 @@ static int ivpu_fw_parse(struct ivpu_device *vdev) fw->read_only_addr = fw_hdr->ro_section_start_address; fw->read_only_size = fw_hdr->ro_section_size; - ivpu_dbg(vdev, FW_BOOT, "Size: file %lu image %u runtime %u shavenn %u\n", - fw->file->size, fw->image_size, fw->runtime_size, fw->shave_nn_size); - ivpu_dbg(vdev, FW_BOOT, "Address: runtime 0x%llx, load 0x%llx, entry point 0x%llx\n", - fw->runtime_addr, image_load_addr, fw->entry_point); + ivpu_dbg(vdev, FW_BOOT, "Boot params: address 0x%llx, size %llu\n", + fw->boot_params_addr, fw->boot_params_size); + ivpu_dbg(vdev, FW_BOOT, "FW version: address 0x%llx, size %llu\n", + fw->fw_version_addr, fw->fw_version_size); + ivpu_dbg(vdev, FW_BOOT, "Runtime: address 0x%llx, size %u\n", + fw->runtime_addr, fw->runtime_size); + ivpu_dbg(vdev, FW_BOOT, "Image load offset: 0x%llx, size %u\n", + fw->image_load_offset, fw->image_size); ivpu_dbg(vdev, FW_BOOT, "Read-only section: address 0x%llx, size %u\n", fw->read_only_addr, fw->read_only_size); + ivpu_dbg(vdev, FW_BOOT, "FW entry point: 0x%llx\n", fw->entry_point); + ivpu_dbg(vdev, FW_BOOT, "SHAVE NN size: %u\n", fw->shave_nn_size); return 0; } @@ -291,39 +366,33 @@ ivpu_fw_init_wa(struct ivpu_device *vdev) IVPU_PRINT_WA(disable_d0i3_msg); } -static int ivpu_fw_update_global_range(struct ivpu_device *vdev) -{ - struct ivpu_fw_info *fw = vdev->fw; - u64 start = ALIGN(fw->runtime_addr + fw->runtime_size, FW_SHARED_MEM_ALIGNMENT); - u64 size = FW_SHARED_MEM_SIZE; - - if (start + size > FW_GLOBAL_MEM_END) { - ivpu_err(vdev, "No space for shared region, start %lld, size %lld\n", start, size); - return -EINVAL; - } - - ivpu_hw_range_init(&vdev->hw->ranges.global, start, size); - return 0; -} - static int ivpu_fw_mem_init(struct ivpu_device *vdev) { struct ivpu_fw_info *fw = vdev->fw; - struct ivpu_addr_range fw_range; int log_verb_size; int ret; - ret = ivpu_fw_update_global_range(vdev); - if (ret) - return ret; + fw->mem_bp = ivpu_bo_create_runtime(vdev, fw->boot_params_addr, fw->boot_params_size, + DRM_IVPU_BO_WC | DRM_IVPU_BO_MAPPABLE); + if (!fw->mem_bp) { + ivpu_err(vdev, "Failed to create firmware boot params memory buffer\n"); + return -ENOMEM; + } - fw_range.start = fw->runtime_addr; - fw_range.end = fw->runtime_addr + fw->runtime_size; - fw->mem = ivpu_bo_create(vdev, &vdev->gctx, &fw_range, fw->runtime_size, - DRM_IVPU_BO_WC | DRM_IVPU_BO_MAPPABLE); + fw->mem_fw_ver = ivpu_bo_create_runtime(vdev, fw->fw_version_addr, fw->fw_version_size, + DRM_IVPU_BO_WC | DRM_IVPU_BO_MAPPABLE); + if (!fw->mem_fw_ver) { + ivpu_err(vdev, "Failed to create firmware version memory buffer\n"); + ret = -ENOMEM; + goto err_free_bp; + } + + fw->mem = ivpu_bo_create_runtime(vdev, 
fw->runtime_addr, fw->runtime_size, + DRM_IVPU_BO_WC | DRM_IVPU_BO_MAPPABLE); if (!fw->mem) { ivpu_err(vdev, "Failed to create firmware runtime memory buffer\n"); - return -ENOMEM; + ret = -ENOMEM; + goto err_free_fw_ver; } ret = ivpu_mmu_context_set_pages_ro(vdev, &vdev->gctx, fw->read_only_addr, @@ -372,6 +441,10 @@ err_free_log_crit: ivpu_bo_free(fw->mem_log_crit); err_free_fw_mem: ivpu_bo_free(fw->mem); +err_free_fw_ver: + ivpu_bo_free(fw->mem_fw_ver); +err_free_bp: + ivpu_bo_free(fw->mem_bp); return ret; } @@ -387,10 +460,14 @@ static void ivpu_fw_mem_fini(struct ivpu_device *vdev) ivpu_bo_free(fw->mem_log_verb); ivpu_bo_free(fw->mem_log_crit); ivpu_bo_free(fw->mem); + ivpu_bo_free(fw->mem_fw_ver); + ivpu_bo_free(fw->mem_bp); fw->mem_log_verb = NULL; fw->mem_log_crit = NULL; fw->mem = NULL; + fw->mem_fw_ver = NULL; + fw->mem_bp = NULL; } int ivpu_fw_init(struct ivpu_device *vdev) @@ -483,11 +560,6 @@ static void ivpu_fw_boot_params_print(struct ivpu_device *vdev, struct vpu_boot_ ivpu_dbg(vdev, FW_BOOT, "boot_params.cache_defaults[VPU_BOOT_L2_CACHE_CFG_NN].cfg = 0x%x\n", boot_params->cache_defaults[VPU_BOOT_L2_CACHE_CFG_NN].cfg); - ivpu_dbg(vdev, FW_BOOT, "boot_params.global_memory_allocator_base = 0x%llx\n", - boot_params->global_memory_allocator_base); - ivpu_dbg(vdev, FW_BOOT, "boot_params.global_memory_allocator_size = 0x%x\n", - boot_params->global_memory_allocator_size); - ivpu_dbg(vdev, FW_BOOT, "boot_params.shave_nn_fw_base = 0x%llx\n", boot_params->shave_nn_fw_base); @@ -495,10 +567,6 @@ static void ivpu_fw_boot_params_print(struct ivpu_device *vdev, struct vpu_boot_ boot_params->watchdog_irq_mss); ivpu_dbg(vdev, FW_BOOT, "boot_params.watchdog_irq_nce = 0x%x\n", boot_params->watchdog_irq_nce); - ivpu_dbg(vdev, FW_BOOT, "boot_params.host_to_vpu_irq = 0x%x\n", - boot_params->host_to_vpu_irq); - ivpu_dbg(vdev, FW_BOOT, "boot_params.job_done_irq = 0x%x\n", - boot_params->job_done_irq); ivpu_dbg(vdev, FW_BOOT, "boot_params.host_version_id = 0x%x\n", boot_params->host_version_id); @@ -546,6 +614,8 @@ static void ivpu_fw_boot_params_print(struct ivpu_device *vdev, struct vpu_boot_ boot_params->system_time_us); ivpu_dbg(vdev, FW_BOOT, "boot_params.power_profile = 0x%x\n", boot_params->power_profile); + ivpu_dbg(vdev, FW_BOOT, "boot_params.vpu_uses_ecc_mca_signal = 0x%x\n", + boot_params->vpu_uses_ecc_mca_signal); } void ivpu_fw_boot_params_setup(struct ivpu_device *vdev, struct vpu_boot_params *boot_params) @@ -572,6 +642,7 @@ void ivpu_fw_boot_params_setup(struct ivpu_device *vdev, struct vpu_boot_params return; } + memset(boot_params, 0, sizeof(*boot_params)); vdev->pm->is_warmboot = false; boot_params->magic = VPU_BOOT_PARAMS_MAGIC; @@ -647,6 +718,8 @@ void ivpu_fw_boot_params_setup(struct ivpu_device *vdev, struct vpu_boot_params boot_params->d0i3_entry_vpu_ts = 0; if (IVPU_WA(disable_d0i2)) boot_params->power_profile |= BIT(1); + boot_params->vpu_uses_ecc_mca_signal = + ivpu_hw_uses_ecc_mca_signal(vdev) ? 
VPU_BOOT_MCA_ECC_BOTH : 0; boot_params->system_time_us = ktime_to_us(ktime_get_real()); wmb(); /* Flush WC buffers after writing bootparams */ diff --git a/drivers/accel/ivpu/ivpu_fw.h b/drivers/accel/ivpu/ivpu_fw.h index 7081913fb0dd..00945892b55e 100644 --- a/drivers/accel/ivpu/ivpu_fw.h +++ b/drivers/accel/ivpu/ivpu_fw.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation */ #ifndef __IVPU_FW_H__ @@ -19,10 +19,16 @@ struct ivpu_fw_info { const struct firmware *file; const char *name; char version[FW_VERSION_STR_SIZE]; + struct ivpu_bo *mem_bp; + struct ivpu_bo *mem_fw_ver; struct ivpu_bo *mem; struct ivpu_bo *mem_shave_nn; struct ivpu_bo *mem_log_crit; struct ivpu_bo *mem_log_verb; + u64 boot_params_addr; + u64 boot_params_size; + u64 fw_version_addr; + u64 fw_version_size; u64 runtime_addr; u32 runtime_size; u64 image_load_offset; @@ -42,6 +48,7 @@ struct ivpu_fw_info { u64 last_heartbeat; }; +bool ivpu_is_within_range(u64 addr, size_t size, struct ivpu_addr_range *range); int ivpu_fw_init(struct ivpu_device *vdev); void ivpu_fw_fini(struct ivpu_device *vdev); void ivpu_fw_load(struct ivpu_device *vdev); @@ -52,4 +59,9 @@ static inline bool ivpu_fw_is_cold_boot(struct ivpu_device *vdev) return vdev->fw->entry_point == vdev->fw->cold_boot_entry_point; } +static inline u32 ivpu_fw_preempt_buf_size(struct ivpu_device *vdev) +{ + return vdev->fw->primary_preempt_buf_size + vdev->fw->secondary_preempt_buf_size; +} + #endif /* __IVPU_FW_H__ */ diff --git a/drivers/accel/ivpu/ivpu_gem.c b/drivers/accel/ivpu/ivpu_gem.c index 59cfcf3eaded..ece68f570b7e 100644 --- a/drivers/accel/ivpu/ivpu_gem.c +++ b/drivers/accel/ivpu/ivpu_gem.c @@ -15,6 +15,7 @@ #include <drm/drm_utils.h> #include "ivpu_drv.h" +#include "ivpu_fw.h" #include "ivpu_gem.h" #include "ivpu_hw.h" #include "ivpu_mmu.h" @@ -27,8 +28,8 @@ static const struct drm_gem_object_funcs ivpu_gem_funcs; static inline void ivpu_dbg_bo(struct ivpu_device *vdev, struct ivpu_bo *bo, const char *action) { ivpu_dbg(vdev, BO, - "%6s: bo %8p vpu_addr %9llx size %8zu ctx %d has_pages %d dma_mapped %d mmu_mapped %d wc %d imported %d\n", - action, bo, bo->vpu_addr, ivpu_bo_size(bo), bo->ctx_id, + "%6s: bo %8p size %9zu ctx %d vpu_addr %9llx pages %d sgt %d mmu_mapped %d wc %d imported %d\n", + action, bo, ivpu_bo_size(bo), bo->ctx_id, bo->vpu_addr, (bool)bo->base.pages, (bool)bo->base.sgt, bo->mmu_mapped, bo->base.map_wc, (bool)drm_gem_is_imported(&bo->base.base)); } @@ -43,22 +44,47 @@ static inline void ivpu_bo_unlock(struct ivpu_bo *bo) dma_resv_unlock(bo->base.base.resv); } +static struct sg_table *ivpu_bo_map_attachment(struct ivpu_device *vdev, struct ivpu_bo *bo) +{ + struct sg_table *sgt; + + drm_WARN_ON(&vdev->drm, !bo->base.base.import_attach); + + ivpu_bo_lock(bo); + + sgt = bo->base.sgt; + if (!sgt) { + sgt = dma_buf_map_attachment(bo->base.base.import_attach, DMA_BIDIRECTIONAL); + if (IS_ERR(sgt)) + ivpu_err(vdev, "Failed to map BO in IOMMU: %ld\n", PTR_ERR(sgt)); + else + bo->base.sgt = sgt; + } + + ivpu_bo_unlock(bo); + + return sgt; +} + /* - * ivpu_bo_pin() - pin the backing physical pages and map them to VPU. + * ivpu_bo_bind() - pin the backing physical pages and map them to VPU. * * This function pins physical memory pages, then maps the physical pages * to IOMMU address space and finally updates the VPU MMU page tables * to allow the VPU to translate VPU address to IOMMU address. 
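+ *
+ * For imported dma-buf objects the backing sg_table is obtained from the
+ * dma-buf attachment via ivpu_bo_map_attachment() instead of the shmem
+ * pages helper.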
*/ -int __must_check ivpu_bo_pin(struct ivpu_bo *bo) +int __must_check ivpu_bo_bind(struct ivpu_bo *bo) { struct ivpu_device *vdev = ivpu_bo_to_vdev(bo); struct sg_table *sgt; int ret = 0; - ivpu_dbg_bo(vdev, bo, "pin"); + ivpu_dbg_bo(vdev, bo, "bind"); - sgt = drm_gem_shmem_get_pages_sgt(&bo->base); + if (bo->base.base.import_attach) + sgt = ivpu_bo_map_attachment(vdev, bo); + else + sgt = drm_gem_shmem_get_pages_sgt(&bo->base); if (IS_ERR(sgt)) { ret = PTR_ERR(sgt); ivpu_err(vdev, "Failed to map BO in IOMMU: %d\n", ret); @@ -70,7 +96,7 @@ int __must_check ivpu_bo_pin(struct ivpu_bo *bo) if (!bo->mmu_mapped) { drm_WARN_ON(&vdev->drm, !bo->ctx); ret = ivpu_mmu_context_map_sgt(vdev, bo->ctx, bo->vpu_addr, sgt, - ivpu_bo_is_snooped(bo)); + ivpu_bo_is_snooped(bo), ivpu_bo_is_read_only(bo)); if (ret) { ivpu_err(vdev, "Failed to map BO in MMU: %d\n", ret); goto unlock; @@ -99,9 +125,9 @@ ivpu_bo_alloc_vpu_addr(struct ivpu_bo *bo, struct ivpu_mmu_context *ctx, ret = ivpu_mmu_context_insert_node(ctx, range, ivpu_bo_size(bo), &bo->mm_node); if (!ret) { bo->ctx = ctx; + bo->ctx_id = ctx->id; bo->vpu_addr = bo->mm_node.start; - } else { - ivpu_err(vdev, "Failed to add BO to context %u: %d\n", ctx->id, ret); + ivpu_dbg_bo(vdev, bo, "vaddr"); } ivpu_bo_unlock(bo); @@ -115,7 +141,7 @@ static void ivpu_bo_unbind_locked(struct ivpu_bo *bo) { struct ivpu_device *vdev = ivpu_bo_to_vdev(bo); - lockdep_assert(dma_resv_held(bo->base.base.resv) || !kref_read(&bo->base.base.refcount)); + dma_resv_assert_held(bo->base.base.resv); if (bo->mmu_mapped) { drm_WARN_ON(&vdev->drm, !bo->ctx); @@ -130,13 +156,15 @@ static void ivpu_bo_unbind_locked(struct ivpu_bo *bo) bo->ctx = NULL; } - if (drm_gem_is_imported(&bo->base.base)) - return; - if (bo->base.sgt) { - dma_unmap_sgtable(vdev->drm.dev, bo->base.sgt, DMA_BIDIRECTIONAL, 0); - sg_free_table(bo->base.sgt); - kfree(bo->base.sgt); + if (bo->base.base.import_attach) { + dma_buf_unmap_attachment(bo->base.base.import_attach, + bo->base.sgt, DMA_BIDIRECTIONAL); + } else { + dma_unmap_sgtable(vdev->drm.dev, bo->base.sgt, DMA_BIDIRECTIONAL, 0); + sg_free_table(bo->base.sgt); + kfree(bo->base.sgt); + } bo->base.sgt = NULL; } } @@ -182,10 +210,11 @@ struct drm_gem_object *ivpu_gem_create_object(struct drm_device *dev, size_t siz struct drm_gem_object *ivpu_gem_prime_import(struct drm_device *dev, struct dma_buf *dma_buf) { + struct ivpu_device *vdev = to_ivpu_device(dev); struct device *attach_dev = dev->dev; struct dma_buf_attachment *attach; - struct sg_table *sgt; struct drm_gem_object *obj; + struct ivpu_bo *bo; int ret; attach = dma_buf_attach(dma_buf, attach_dev); @@ -194,25 +223,25 @@ struct drm_gem_object *ivpu_gem_prime_import(struct drm_device *dev, get_dma_buf(dma_buf); - sgt = dma_buf_map_attachment_unlocked(attach, DMA_BIDIRECTIONAL); - if (IS_ERR(sgt)) { - ret = PTR_ERR(sgt); - goto fail_detach; - } - - obj = drm_gem_shmem_prime_import_sg_table(dev, attach, sgt); + obj = drm_gem_shmem_prime_import_sg_table(dev, attach, NULL); if (IS_ERR(obj)) { ret = PTR_ERR(obj); - goto fail_unmap; + goto fail_detach; } obj->import_attach = attach; obj->resv = dma_buf->resv; + bo = to_ivpu_bo(obj); + + mutex_lock(&vdev->bo_list_lock); + list_add_tail(&bo->bo_list_node, &vdev->bo_list); + mutex_unlock(&vdev->bo_list_lock); + + ivpu_dbg(vdev, BO, "import: bo %8p size %9zu\n", bo, ivpu_bo_size(bo)); + return obj; -fail_unmap: - dma_buf_unmap_attachment_unlocked(attach, sgt, DMA_BIDIRECTIONAL); fail_detach: dma_buf_detach(dma_buf, attach); dma_buf_put(dma_buf); @@ -220,7 +249,7 @@ 
fail_detach: return ERR_PTR(ret); } -static struct ivpu_bo *ivpu_bo_alloc(struct ivpu_device *vdev, u64 size, u32 flags, u32 ctx_id) +static struct ivpu_bo *ivpu_bo_alloc(struct ivpu_device *vdev, u64 size, u32 flags) { struct drm_gem_shmem_object *shmem; struct ivpu_bo *bo; @@ -238,7 +267,6 @@ static struct ivpu_bo *ivpu_bo_alloc(struct ivpu_device *vdev, u64 size, u32 fla return ERR_CAST(shmem); bo = to_ivpu_bo(&shmem->base); - bo->ctx_id = ctx_id; bo->base.map_wc = flags & DRM_IVPU_BO_WC; bo->flags = flags; @@ -246,7 +274,7 @@ static struct ivpu_bo *ivpu_bo_alloc(struct ivpu_device *vdev, u64 size, u32 fla list_add_tail(&bo->bo_list_node, &vdev->bo_list); mutex_unlock(&vdev->bo_list_lock); - ivpu_dbg_bo(vdev, bo, "alloc"); + ivpu_dbg(vdev, BO, " alloc: bo %8p size %9llu\n", bo, size); return bo; } @@ -259,8 +287,8 @@ static int ivpu_gem_bo_open(struct drm_gem_object *obj, struct drm_file *file) struct ivpu_addr_range *range; if (bo->ctx) { - ivpu_warn(vdev, "Can't add BO to ctx %u: already in ctx %u\n", - file_priv->ctx.id, bo->ctx->id); + ivpu_dbg(vdev, IOCTL, "Can't add BO %pe to ctx %u: already in ctx %u\n", + bo, file_priv->ctx.id, bo->ctx->id); return -EALREADY; } @@ -281,23 +309,41 @@ static void ivpu_gem_bo_free(struct drm_gem_object *obj) ivpu_dbg_bo(vdev, bo, "free"); + drm_WARN_ON(&vdev->drm, list_empty(&bo->bo_list_node)); + mutex_lock(&vdev->bo_list_lock); list_del(&bo->bo_list_node); - mutex_unlock(&vdev->bo_list_lock); drm_WARN_ON(&vdev->drm, !drm_gem_is_imported(&bo->base.base) && !dma_resv_test_signaled(obj->resv, DMA_RESV_USAGE_READ)); drm_WARN_ON(&vdev->drm, ivpu_bo_size(bo) == 0); drm_WARN_ON(&vdev->drm, bo->base.vaddr); + ivpu_bo_lock(bo); ivpu_bo_unbind_locked(bo); + ivpu_bo_unlock(bo); + + mutex_unlock(&vdev->bo_list_lock); + drm_WARN_ON(&vdev->drm, bo->mmu_mapped); drm_WARN_ON(&vdev->drm, bo->ctx); drm_WARN_ON(obj->dev, refcount_read(&bo->base.pages_use_count) > 1); + drm_WARN_ON(obj->dev, bo->base.base.vma_node.vm_files.rb_node); drm_gem_shmem_free(&bo->base); } +static enum drm_gem_object_status ivpu_gem_status(struct drm_gem_object *obj) +{ + struct ivpu_bo *bo = to_ivpu_bo(obj); + enum drm_gem_object_status status = 0; + + if (ivpu_bo_is_resident(bo)) + status |= DRM_GEM_OBJECT_RESIDENT; + + return status; +} + static const struct drm_gem_object_funcs ivpu_gem_funcs = { .free = ivpu_gem_bo_free, .open = ivpu_gem_bo_open, @@ -308,6 +354,7 @@ static const struct drm_gem_object_funcs ivpu_gem_funcs = { .vmap = drm_gem_shmem_object_vmap, .vunmap = drm_gem_shmem_object_vunmap, .mmap = drm_gem_shmem_object_mmap, + .status = ivpu_gem_status, .vm_ops = &drm_gem_shmem_vm_ops, }; @@ -320,25 +367,33 @@ int ivpu_bo_create_ioctl(struct drm_device *dev, void *data, struct drm_file *fi struct ivpu_bo *bo; int ret; - if (args->flags & ~DRM_IVPU_BO_FLAGS) + if (args->flags & ~DRM_IVPU_BO_FLAGS) { + ivpu_dbg(vdev, IOCTL, "Invalid BO flags 0x%x\n", args->flags); return -EINVAL; + } - if (size == 0) + if (size == 0) { + ivpu_dbg(vdev, IOCTL, "Invalid BO size %llu\n", args->size); return -EINVAL; + } - bo = ivpu_bo_alloc(vdev, size, args->flags, file_priv->ctx.id); + bo = ivpu_bo_alloc(vdev, size, args->flags); if (IS_ERR(bo)) { - ivpu_err(vdev, "Failed to allocate BO: %pe (ctx %u size %llu flags 0x%x)", + ivpu_dbg(vdev, IOCTL, "Failed to allocate BO: %pe ctx %u size %llu flags 0x%x\n", bo, file_priv->ctx.id, args->size, args->flags); return PTR_ERR(bo); } + drm_WARN_ON(&vdev->drm, bo->base.base.handle_count != 0); + ret = drm_gem_handle_create(file, &bo->base.base, 
&args->handle); - if (ret) - ivpu_err(vdev, "Failed to create handle for BO: %pe (ctx %u size %llu flags 0x%x)", + if (ret) { + ivpu_dbg(vdev, IOCTL, "Failed to create handle for BO: %pe ctx %u size %llu flags 0x%x\n", bo, file_priv->ctx.id, args->size, args->flags); - else + } else { args->vpu_addr = bo->vpu_addr; + drm_WARN_ON(&vdev->drm, bo->base.base.handle_count != 1); + } drm_gem_object_put(&bo->base.base); @@ -360,18 +415,21 @@ ivpu_bo_create(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, drm_WARN_ON(&vdev->drm, !PAGE_ALIGNED(range->end)); drm_WARN_ON(&vdev->drm, !PAGE_ALIGNED(size)); - bo = ivpu_bo_alloc(vdev, size, flags, IVPU_GLOBAL_CONTEXT_MMU_SSID); + bo = ivpu_bo_alloc(vdev, size, flags); if (IS_ERR(bo)) { - ivpu_err(vdev, "Failed to allocate BO: %pe (vpu_addr 0x%llx size %llu flags 0x%x)", + ivpu_err(vdev, "Failed to allocate BO: %pe vpu_addr 0x%llx size %llu flags 0x%x\n", bo, range->start, size, flags); return NULL; } ret = ivpu_bo_alloc_vpu_addr(bo, ctx, range); - if (ret) + if (ret) { + ivpu_err(vdev, "Failed to allocate NPU address for BO: %pe ctx %u size %llu: %d\n", + bo, ctx->id, size, ret); goto err_put; + } - ret = ivpu_bo_pin(bo); + ret = ivpu_bo_bind(bo); if (ret) goto err_put; @@ -391,6 +449,21 @@ err_put: return NULL; } +struct ivpu_bo *ivpu_bo_create_runtime(struct ivpu_device *vdev, u64 addr, u64 size, u32 flags) +{ + struct ivpu_addr_range range; + + if (!ivpu_is_within_range(addr, size, &vdev->hw->ranges.runtime)) { + ivpu_err(vdev, "Invalid runtime BO address 0x%llx size %llu\n", addr, size); + return NULL; + } + + if (ivpu_hw_range_init(vdev, &range, addr, size)) + return NULL; + + return ivpu_bo_create(vdev, &vdev->gctx, &range, size, flags); +} + struct ivpu_bo *ivpu_bo_create_global(struct ivpu_device *vdev, u64 size, u32 flags) { return ivpu_bo_create(vdev, &vdev->gctx, &vdev->hw->ranges.global, size, flags); diff --git a/drivers/accel/ivpu/ivpu_gem.h b/drivers/accel/ivpu/ivpu_gem.h index aa8ff14f7aae..0c3350f22b55 100644 --- a/drivers/accel/ivpu/ivpu_gem.h +++ b/drivers/accel/ivpu/ivpu_gem.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation */ #ifndef __IVPU_GEM_H__ #define __IVPU_GEM_H__ @@ -24,19 +24,22 @@ struct ivpu_bo { bool mmu_mapped; }; -int ivpu_bo_pin(struct ivpu_bo *bo); +int ivpu_bo_bind(struct ivpu_bo *bo); void ivpu_bo_unbind_all_bos_from_context(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx); struct drm_gem_object *ivpu_gem_create_object(struct drm_device *dev, size_t size); struct drm_gem_object *ivpu_gem_prime_import(struct drm_device *dev, struct dma_buf *dma_buf); struct ivpu_bo *ivpu_bo_create(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, struct ivpu_addr_range *range, u64 size, u32 flags); +struct ivpu_bo *ivpu_bo_create_runtime(struct ivpu_device *vdev, u64 addr, u64 size, u32 flags); struct ivpu_bo *ivpu_bo_create_global(struct ivpu_device *vdev, u64 size, u32 flags); void ivpu_bo_free(struct ivpu_bo *bo); int ivpu_bo_create_ioctl(struct drm_device *dev, void *data, struct drm_file *file); int ivpu_bo_info_ioctl(struct drm_device *dev, void *data, struct drm_file *file); int ivpu_bo_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file); +int ivpu_bo_create_from_userptr_ioctl(struct drm_device *dev, void *data, + struct drm_file *file); void ivpu_bo_list(struct drm_device *dev, struct drm_printer *p); void ivpu_bo_list_print(struct drm_device *dev); @@ -74,6 +77,16 @@ static inline 
bool ivpu_bo_is_snooped(struct ivpu_bo *bo) return ivpu_bo_cache_mode(bo) == DRM_IVPU_BO_CACHED; } +static inline bool ivpu_bo_is_read_only(struct ivpu_bo *bo) +{ + return bo->flags & DRM_IVPU_BO_READ_ONLY; +} + +static inline bool ivpu_bo_is_resident(struct ivpu_bo *bo) +{ + return !!bo->base.pages; +} + static inline void *ivpu_to_cpu_addr(struct ivpu_bo *bo, u32 vpu_addr) { if (vpu_addr < bo->vpu_addr) @@ -96,4 +109,9 @@ static inline u32 cpu_to_vpu_addr(struct ivpu_bo *bo, void *cpu_addr) return bo->vpu_addr + (cpu_addr - ivpu_bo_vaddr(bo)); } +static inline bool ivpu_bo_is_mappable(struct ivpu_bo *bo) +{ + return bo->flags & DRM_IVPU_BO_MAPPABLE; +} + #endif /* __IVPU_GEM_H__ */ diff --git a/drivers/accel/ivpu/ivpu_gem_userptr.c b/drivers/accel/ivpu/ivpu_gem_userptr.c new file mode 100644 index 000000000000..25ba606164c0 --- /dev/null +++ b/drivers/accel/ivpu/ivpu_gem_userptr.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020-2025 Intel Corporation + */ + +#include <linux/dma-buf.h> +#include <linux/err.h> +#include <linux/highmem.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/scatterlist.h> +#include <linux/slab.h> +#include <linux/capability.h> + +#include <drm/drm_device.h> +#include <drm/drm_file.h> +#include <drm/drm_gem.h> + +#include "ivpu_drv.h" +#include "ivpu_gem.h" + +static struct sg_table * +ivpu_gem_userptr_dmabuf_map(struct dma_buf_attachment *attachment, + enum dma_data_direction direction) +{ + struct sg_table *sgt = attachment->dmabuf->priv; + int ret; + + ret = dma_map_sgtable(attachment->dev, sgt, direction, DMA_ATTR_SKIP_CPU_SYNC); + if (ret) + return ERR_PTR(ret); + + return sgt; +} + +static void ivpu_gem_userptr_dmabuf_unmap(struct dma_buf_attachment *attachment, + struct sg_table *sgt, + enum dma_data_direction direction) +{ + dma_unmap_sgtable(attachment->dev, sgt, direction, DMA_ATTR_SKIP_CPU_SYNC); +} + +static void ivpu_gem_userptr_dmabuf_release(struct dma_buf *dma_buf) +{ + struct sg_table *sgt = dma_buf->priv; + struct sg_page_iter page_iter; + struct page *page; + + for_each_sgtable_page(sgt, &page_iter, 0) { + page = sg_page_iter_page(&page_iter); + unpin_user_page(page); + } + + sg_free_table(sgt); + kfree(sgt); +} + +static const struct dma_buf_ops ivpu_gem_userptr_dmabuf_ops = { + .map_dma_buf = ivpu_gem_userptr_dmabuf_map, + .unmap_dma_buf = ivpu_gem_userptr_dmabuf_unmap, + .release = ivpu_gem_userptr_dmabuf_release, +}; + +static struct dma_buf * +ivpu_create_userptr_dmabuf(struct ivpu_device *vdev, void __user *user_ptr, + size_t size, uint32_t flags) +{ + struct dma_buf_export_info exp_info = {}; + struct dma_buf *dma_buf; + struct sg_table *sgt; + struct page **pages; + unsigned long nr_pages = size >> PAGE_SHIFT; + unsigned int gup_flags = FOLL_LONGTERM; + int ret, i, pinned; + + /* Add FOLL_WRITE only if the BO is not read-only */ + if (!(flags & DRM_IVPU_BO_READ_ONLY)) + gup_flags |= FOLL_WRITE; + + pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); + + pinned = pin_user_pages_fast((unsigned long)user_ptr, nr_pages, gup_flags, pages); + if (pinned < 0) { + ret = pinned; + ivpu_dbg(vdev, IOCTL, "Failed to pin user pages: %d\n", ret); + goto free_pages_array; + } + + if (pinned != nr_pages) { + ivpu_dbg(vdev, IOCTL, "Pinned %d pages, expected %lu\n", pinned, nr_pages); + ret = -EFAULT; + goto unpin_pages; + } + + sgt = kmalloc(sizeof(*sgt), GFP_KERNEL); + if (!sgt) { + ret = -ENOMEM; + goto unpin_pages; + } + + ret = 
sg_alloc_table_from_pages(sgt, pages, nr_pages, 0, size, GFP_KERNEL); + if (ret) { + ivpu_dbg(vdev, IOCTL, "Failed to create sg table: %d\n", ret); + goto free_sgt; + } + + exp_info.exp_name = "ivpu_userptr_dmabuf"; + exp_info.owner = THIS_MODULE; + exp_info.ops = &ivpu_gem_userptr_dmabuf_ops; + exp_info.size = size; + exp_info.flags = O_RDWR | O_CLOEXEC; + exp_info.priv = sgt; + + dma_buf = dma_buf_export(&exp_info); + if (IS_ERR(dma_buf)) { + ret = PTR_ERR(dma_buf); + ivpu_dbg(vdev, IOCTL, "Failed to export userptr dma-buf: %d\n", ret); + goto free_sg_table; + } + + kvfree(pages); + return dma_buf; + +free_sg_table: + sg_free_table(sgt); +free_sgt: + kfree(sgt); +unpin_pages: + for (i = 0; i < pinned; i++) + unpin_user_page(pages[i]); +free_pages_array: + kvfree(pages); + return ERR_PTR(ret); +} + +static struct ivpu_bo * +ivpu_bo_create_from_userptr(struct ivpu_device *vdev, void __user *user_ptr, + size_t size, uint32_t flags) +{ + struct dma_buf *dma_buf; + struct drm_gem_object *obj; + struct ivpu_bo *bo; + + dma_buf = ivpu_create_userptr_dmabuf(vdev, user_ptr, size, flags); + if (IS_ERR(dma_buf)) + return ERR_CAST(dma_buf); + + obj = ivpu_gem_prime_import(&vdev->drm, dma_buf); + if (IS_ERR(obj)) { + dma_buf_put(dma_buf); + return ERR_CAST(obj); + } + + dma_buf_put(dma_buf); + + bo = to_ivpu_bo(obj); + bo->flags = flags; + + return bo; +} + +int ivpu_bo_create_from_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file) +{ + struct drm_ivpu_bo_create_from_userptr *args = data; + struct ivpu_file_priv *file_priv = file->driver_priv; + struct ivpu_device *vdev = to_ivpu_device(dev); + void __user *user_ptr = u64_to_user_ptr(args->user_ptr); + struct ivpu_bo *bo; + int ret; + + if (args->flags & ~(DRM_IVPU_BO_HIGH_MEM | DRM_IVPU_BO_DMA_MEM | DRM_IVPU_BO_READ_ONLY)) { + ivpu_dbg(vdev, IOCTL, "Invalid BO flags: 0x%x\n", args->flags); + return -EINVAL; + } + + if (!args->user_ptr || !args->size) { + ivpu_dbg(vdev, IOCTL, "Userptr or size are zero: ptr %llx size %llu\n", + args->user_ptr, args->size); + return -EINVAL; + } + + if (!PAGE_ALIGNED(args->user_ptr) || !PAGE_ALIGNED(args->size)) { + ivpu_dbg(vdev, IOCTL, "Userptr or size not page aligned: ptr %llx size %llu\n", + args->user_ptr, args->size); + return -EINVAL; + } + + if (!access_ok(user_ptr, args->size)) { + ivpu_dbg(vdev, IOCTL, "Userptr is not accessible: ptr %llx size %llu\n", + args->user_ptr, args->size); + return -EFAULT; + } + + bo = ivpu_bo_create_from_userptr(vdev, user_ptr, args->size, args->flags); + if (IS_ERR(bo)) + return PTR_ERR(bo); + + ret = drm_gem_handle_create(file, &bo->base.base, &args->handle); + if (ret) { + ivpu_dbg(vdev, IOCTL, "Failed to create handle for BO: %pe ctx %u size %llu flags 0x%x\n", + bo, file_priv->ctx.id, args->size, args->flags); + } else { + ivpu_dbg(vdev, BO, "Created userptr BO: handle=%u vpu_addr=0x%llx size=%llu flags=0x%x\n", + args->handle, bo->vpu_addr, args->size, bo->flags); + args->vpu_addr = bo->vpu_addr; + } + + drm_gem_object_put(&bo->base.base); + + return ret; +} diff --git a/drivers/accel/ivpu/ivpu_hw.c b/drivers/accel/ivpu/ivpu_hw.c index 08dcc31b56f4..d69cd0d93569 100644 --- a/drivers/accel/ivpu/ivpu_hw.c +++ b/drivers/accel/ivpu/ivpu_hw.c @@ -8,6 +8,8 @@ #include "ivpu_hw_btrs.h" #include "ivpu_hw_ip.h" +#include <asm/msr-index.h> +#include <asm/msr.h> #include <linux/dmi.h> #include <linux/fault-inject.h> #include <linux/pm_runtime.h> @@ -20,6 +22,10 @@ module_param_named_unsafe(fail_hw, ivpu_fail_hw, charp, 0444); MODULE_PARM_DESC(fail_hw, 
"<interval>,<probability>,<space>,<times>"); #endif +#define FW_SHARED_MEM_ALIGNMENT SZ_512K /* VPU MTRR limitation */ + +#define ECC_MCA_SIGNAL_ENABLE_MASK 0xff + static char *platform_to_str(u32 platform) { switch (platform) { @@ -147,19 +153,39 @@ static void priority_bands_init(struct ivpu_device *vdev) vdev->hw->hws.process_quantum[VPU_JOB_SCHEDULING_PRIORITY_BAND_REALTIME] = 200000; } +int ivpu_hw_range_init(struct ivpu_device *vdev, struct ivpu_addr_range *range, u64 start, u64 size) +{ + u64 end; + + if (!range || check_add_overflow(start, size, &end)) { + ivpu_err(vdev, "Invalid range: start 0x%llx size %llu\n", start, size); + return -EINVAL; + } + + range->start = start; + range->end = end; + + return 0; +} + static void memory_ranges_init(struct ivpu_device *vdev) { if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) { - ivpu_hw_range_init(&vdev->hw->ranges.global, 0x80000000, SZ_512M); - ivpu_hw_range_init(&vdev->hw->ranges.user, 0x88000000, 511 * SZ_1M); - ivpu_hw_range_init(&vdev->hw->ranges.shave, 0x180000000, SZ_2G); - ivpu_hw_range_init(&vdev->hw->ranges.dma, 0x200000000, SZ_128G); + ivpu_hw_range_init(vdev, &vdev->hw->ranges.runtime, 0x84800000, SZ_64M); + ivpu_hw_range_init(vdev, &vdev->hw->ranges.global, 0x90000000, SZ_256M); + ivpu_hw_range_init(vdev, &vdev->hw->ranges.user, 0xa0000000, 511 * SZ_1M); + ivpu_hw_range_init(vdev, &vdev->hw->ranges.shave, 0x180000000, SZ_2G); + ivpu_hw_range_init(vdev, &vdev->hw->ranges.dma, 0x200000000, SZ_128G); } else { - ivpu_hw_range_init(&vdev->hw->ranges.global, 0x80000000, SZ_512M); - ivpu_hw_range_init(&vdev->hw->ranges.shave, 0x80000000, SZ_2G); - ivpu_hw_range_init(&vdev->hw->ranges.user, 0x100000000, SZ_256G); + ivpu_hw_range_init(vdev, &vdev->hw->ranges.runtime, 0x80000000, SZ_64M); + ivpu_hw_range_init(vdev, &vdev->hw->ranges.global, 0x90000000, SZ_256M); + ivpu_hw_range_init(vdev, &vdev->hw->ranges.shave, 0x80000000, SZ_2G); + ivpu_hw_range_init(vdev, &vdev->hw->ranges.user, 0x100000000, SZ_256G); vdev->hw->ranges.dma = vdev->hw->ranges.user; } + + drm_WARN_ON(&vdev->drm, !IS_ALIGNED(vdev->hw->ranges.global.start, + FW_SHARED_MEM_ALIGNMENT)); } static int wp_enable(struct ivpu_device *vdev) @@ -373,3 +399,22 @@ irqreturn_t ivpu_hw_irq_handler(int irq, void *ptr) pm_runtime_mark_last_busy(vdev->drm.dev); return IRQ_HANDLED; } + +bool ivpu_hw_uses_ecc_mca_signal(struct ivpu_device *vdev) +{ + unsigned long long msr_integrity_caps; + int ret; + + if (ivpu_hw_ip_gen(vdev) < IVPU_HW_IP_50XX) + return false; + + ret = rdmsrq_safe(MSR_INTEGRITY_CAPS, &msr_integrity_caps); + if (ret) { + ivpu_warn(vdev, "Error reading MSR_INTEGRITY_CAPS: %d", ret); + return false; + } + + ivpu_dbg(vdev, MISC, "MSR_INTEGRITY_CAPS: 0x%llx\n", msr_integrity_caps); + + return msr_integrity_caps & ECC_MCA_SIGNAL_ENABLE_MASK; +} diff --git a/drivers/accel/ivpu/ivpu_hw.h b/drivers/accel/ivpu/ivpu_hw.h index d79668fe1609..b6d0f0d0dccc 100644 --- a/drivers/accel/ivpu/ivpu_hw.h +++ b/drivers/accel/ivpu/ivpu_hw.h @@ -21,6 +21,7 @@ struct ivpu_hw_info { bool (*ip_irq_handler)(struct ivpu_device *vdev, int irq); } irq; struct { + struct ivpu_addr_range runtime; struct ivpu_addr_range global; struct ivpu_addr_range user; struct ivpu_addr_range shave; @@ -51,6 +52,8 @@ struct ivpu_hw_info { }; int ivpu_hw_init(struct ivpu_device *vdev); +int ivpu_hw_range_init(struct ivpu_device *vdev, struct ivpu_addr_range *range, u64 start, + u64 size); int ivpu_hw_power_up(struct ivpu_device *vdev); int ivpu_hw_power_down(struct ivpu_device *vdev); int ivpu_hw_reset(struct 
ivpu_device *vdev); @@ -60,6 +63,7 @@ void ivpu_irq_handlers_init(struct ivpu_device *vdev); void ivpu_hw_irq_enable(struct ivpu_device *vdev); void ivpu_hw_irq_disable(struct ivpu_device *vdev); irqreturn_t ivpu_hw_irq_handler(int irq, void *ptr); +bool ivpu_hw_uses_ecc_mca_signal(struct ivpu_device *vdev); static inline u32 ivpu_hw_btrs_irq_handler(struct ivpu_device *vdev, int irq) { @@ -71,12 +75,6 @@ static inline u32 ivpu_hw_ip_irq_handler(struct ivpu_device *vdev, int irq) return vdev->hw->irq.ip_irq_handler(vdev, irq); } -static inline void ivpu_hw_range_init(struct ivpu_addr_range *range, u64 start, u64 size) -{ - range->start = start; - range->end = start + size; -} - static inline u64 ivpu_hw_range_size(const struct ivpu_addr_range *range) { return range->end - range->start; diff --git a/drivers/accel/ivpu/ivpu_hw_btrs.c b/drivers/accel/ivpu/ivpu_hw_btrs.c index afdb3b2aa72a..06e65c592618 100644 --- a/drivers/accel/ivpu/ivpu_hw_btrs.c +++ b/drivers/accel/ivpu/ivpu_hw_btrs.c @@ -321,6 +321,14 @@ static int wait_for_pll_lock(struct ivpu_device *vdev, bool enable) return REGB_POLL_FLD(VPU_HW_BTRS_MTL_PLL_STATUS, LOCK, exp_val, PLL_TIMEOUT_US); } +static int wait_for_cdyn_deassert(struct ivpu_device *vdev) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + return 0; + + return REGB_POLL_FLD(VPU_HW_BTRS_LNL_CDYN, CDYN, 0, PLL_TIMEOUT_US); +} + int ivpu_hw_btrs_wp_drive(struct ivpu_device *vdev, bool enable) { struct wp_request wp; @@ -354,6 +362,14 @@ int ivpu_hw_btrs_wp_drive(struct ivpu_device *vdev, bool enable) return ret; } + if (!enable) { + ret = wait_for_cdyn_deassert(vdev); + if (ret) { + ivpu_err(vdev, "Timed out waiting for CDYN deassert\n"); + return ret; + } + } + return 0; } @@ -673,7 +689,7 @@ bool ivpu_hw_btrs_irq_handler_lnl(struct ivpu_device *vdev, int irq) if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, SURV_ERR, status)) { ivpu_dbg(vdev, IRQ, "Survivability IRQ\n"); - queue_work(system_wq, &vdev->irq_dct_work); + queue_work(system_percpu_wq, &vdev->irq_dct_work); } if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, FREQ_CHANGE, status)) { @@ -752,7 +768,7 @@ int ivpu_hw_btrs_dct_get_request(struct ivpu_device *vdev, bool *enable) } } -void ivpu_hw_btrs_dct_set_status(struct ivpu_device *vdev, bool enable, u32 active_percent) +void ivpu_hw_btrs_dct_set_status(struct ivpu_device *vdev, bool enable, u8 active_percent) { u32 val = 0; u32 cmd = enable ? 
DCT_ENABLE : DCT_DISABLE; diff --git a/drivers/accel/ivpu/ivpu_hw_btrs.h b/drivers/accel/ivpu/ivpu_hw_btrs.h index 032c384ac3d4..c4c10e22f30f 100644 --- a/drivers/accel/ivpu/ivpu_hw_btrs.h +++ b/drivers/accel/ivpu/ivpu_hw_btrs.h @@ -36,7 +36,7 @@ u32 ivpu_hw_btrs_dpu_freq_get(struct ivpu_device *vdev); bool ivpu_hw_btrs_irq_handler_mtl(struct ivpu_device *vdev, int irq); bool ivpu_hw_btrs_irq_handler_lnl(struct ivpu_device *vdev, int irq); int ivpu_hw_btrs_dct_get_request(struct ivpu_device *vdev, bool *enable); -void ivpu_hw_btrs_dct_set_status(struct ivpu_device *vdev, bool enable, u32 active_percent); +void ivpu_hw_btrs_dct_set_status(struct ivpu_device *vdev, bool enable, u8 active_percent); u32 ivpu_hw_btrs_telemetry_offset_get(struct ivpu_device *vdev); u32 ivpu_hw_btrs_telemetry_size_get(struct ivpu_device *vdev); u32 ivpu_hw_btrs_telemetry_enable_get(struct ivpu_device *vdev); diff --git a/drivers/accel/ivpu/ivpu_hw_btrs_lnl_reg.h b/drivers/accel/ivpu/ivpu_hw_btrs_lnl_reg.h index fff2ef2cada6..a81a9ba540fa 100644 --- a/drivers/accel/ivpu/ivpu_hw_btrs_lnl_reg.h +++ b/drivers/accel/ivpu/ivpu_hw_btrs_lnl_reg.h @@ -74,6 +74,9 @@ #define VPU_HW_BTRS_LNL_PLL_FREQ 0x00000148u #define VPU_HW_BTRS_LNL_PLL_FREQ_RATIO_MASK GENMASK(15, 0) +#define VPU_HW_BTRS_LNL_CDYN 0x0000014cu +#define VPU_HW_BTRS_LNL_CDYN_CDYN_MASK GENMASK(15, 0) + #define VPU_HW_BTRS_LNL_TILE_FUSE 0x00000150u #define VPU_HW_BTRS_LNL_TILE_FUSE_VALID_MASK BIT_MASK(0) #define VPU_HW_BTRS_LNL_TILE_FUSE_CONFIG_MASK GENMASK(6, 1) diff --git a/drivers/accel/ivpu/ivpu_hw_ip.c b/drivers/accel/ivpu/ivpu_hw_ip.c index 2bf9882ab52e..06aa1e7dc50b 100644 --- a/drivers/accel/ivpu/ivpu_hw_ip.c +++ b/drivers/accel/ivpu/ivpu_hw_ip.c @@ -691,6 +691,13 @@ static void pwr_island_delay_set(struct ivpu_device *vdev) status = high ? 46 : 3; break; + case PCI_DEVICE_ID_NVL: + post = high ? 198 : 17; + post1 = 0; + post2 = high ? 
198 : 17; + status = 0; + break; + default: dump_stack(); ivpu_err(vdev, "Unknown device ID\n"); @@ -889,6 +896,9 @@ static int soc_cpu_drive_40xx(struct ivpu_device *vdev, bool enable) static int soc_cpu_enable(struct ivpu_device *vdev) { + if (ivpu_hw_ip_gen(vdev) >= IVPU_HW_IP_60XX) + return 0; + return soc_cpu_drive_40xx(vdev, true); } diff --git a/drivers/accel/ivpu/ivpu_ipc.c b/drivers/accel/ivpu/ivpu_ipc.c index 5f00809d448a..1f13bf95b2b3 100644 --- a/drivers/accel/ivpu/ivpu_ipc.c +++ b/drivers/accel/ivpu/ivpu_ipc.c @@ -459,7 +459,7 @@ void ivpu_ipc_irq_handler(struct ivpu_device *vdev) } } - queue_work(system_wq, &vdev->irq_ipc_work); + queue_work(system_percpu_wq, &vdev->irq_ipc_work); } void ivpu_ipc_irq_work_fn(struct work_struct *work) diff --git a/drivers/accel/ivpu/ivpu_job.c b/drivers/accel/ivpu/ivpu_job.c index 060f1fc031d3..4f8564e2878a 100644 --- a/drivers/accel/ivpu/ivpu_job.c +++ b/drivers/accel/ivpu/ivpu_job.c @@ -34,22 +34,20 @@ static void ivpu_cmdq_ring_db(struct ivpu_device *vdev, struct ivpu_cmdq *cmdq) static int ivpu_preemption_buffers_create(struct ivpu_device *vdev, struct ivpu_file_priv *file_priv, struct ivpu_cmdq *cmdq) { - u64 primary_size = ALIGN(vdev->fw->primary_preempt_buf_size, PAGE_SIZE); - u64 secondary_size = ALIGN(vdev->fw->secondary_preempt_buf_size, PAGE_SIZE); - - if (vdev->fw->sched_mode != VPU_SCHEDULING_MODE_HW || - ivpu_test_mode & IVPU_TEST_MODE_MIP_DISABLE) + if (ivpu_fw_preempt_buf_size(vdev) == 0) return 0; cmdq->primary_preempt_buf = ivpu_bo_create(vdev, &file_priv->ctx, &vdev->hw->ranges.user, - primary_size, DRM_IVPU_BO_WC); + vdev->fw->primary_preempt_buf_size, + DRM_IVPU_BO_WC); if (!cmdq->primary_preempt_buf) { ivpu_err(vdev, "Failed to create primary preemption buffer\n"); return -ENOMEM; } cmdq->secondary_preempt_buf = ivpu_bo_create(vdev, &file_priv->ctx, &vdev->hw->ranges.dma, - secondary_size, DRM_IVPU_BO_WC); + vdev->fw->secondary_preempt_buf_size, + DRM_IVPU_BO_WC); if (!cmdq->secondary_preempt_buf) { ivpu_err(vdev, "Failed to create secondary preemption buffer\n"); goto err_free_primary; @@ -66,20 +64,39 @@ err_free_primary: static void ivpu_preemption_buffers_free(struct ivpu_device *vdev, struct ivpu_file_priv *file_priv, struct ivpu_cmdq *cmdq) { - if (vdev->fw->sched_mode != VPU_SCHEDULING_MODE_HW) - return; - if (cmdq->primary_preempt_buf) ivpu_bo_free(cmdq->primary_preempt_buf); if (cmdq->secondary_preempt_buf) ivpu_bo_free(cmdq->secondary_preempt_buf); } +static int ivpu_preemption_job_init(struct ivpu_device *vdev, struct ivpu_file_priv *file_priv, + struct ivpu_cmdq *cmdq, struct ivpu_job *job) +{ + int ret; + + /* Use preemption buffer provided by the user space */ + if (job->primary_preempt_buf) + return 0; + + if (!cmdq->primary_preempt_buf) { + /* Allocate per command queue preemption buffers */ + ret = ivpu_preemption_buffers_create(vdev, file_priv, cmdq); + if (ret) + return ret; + } + + /* Use preemption buffers allocated by the kernel */ + job->primary_preempt_buf = cmdq->primary_preempt_buf; + job->secondary_preempt_buf = cmdq->secondary_preempt_buf; + + return 0; +} + static struct ivpu_cmdq *ivpu_cmdq_alloc(struct ivpu_file_priv *file_priv) { struct ivpu_device *vdev = file_priv->vdev; struct ivpu_cmdq *cmdq; - int ret; cmdq = kzalloc(sizeof(*cmdq), GFP_KERNEL); if (!cmdq) @@ -89,10 +106,6 @@ static struct ivpu_cmdq *ivpu_cmdq_alloc(struct ivpu_file_priv *file_priv) if (!cmdq->mem) goto err_free_cmdq; - ret = ivpu_preemption_buffers_create(vdev, file_priv, cmdq); - if (ret) - ivpu_warn(vdev, "Failed to 
allocate preemption buffers, preemption limited\n"); - return cmdq; err_free_cmdq: @@ -219,11 +232,13 @@ static int ivpu_register_db(struct ivpu_file_priv *file_priv, struct ivpu_cmdq * ret = ivpu_jsm_register_db(vdev, file_priv->ctx.id, cmdq->db_id, cmdq->mem->vpu_addr, ivpu_bo_size(cmdq->mem)); - if (!ret) + if (!ret) { ivpu_dbg(vdev, JOB, "DB %d registered to cmdq %d ctx %d priority %d\n", cmdq->db_id, cmdq->id, file_priv->ctx.id, cmdq->priority); - else + } else { xa_erase(&vdev->db_xa, cmdq->db_id); + cmdq->db_id = 0; + } return ret; } @@ -333,7 +348,7 @@ static struct ivpu_cmdq *ivpu_cmdq_acquire(struct ivpu_file_priv *file_priv, u32 cmdq = xa_load(&file_priv->cmdq_xa, cmdq_id); if (!cmdq) { - ivpu_warn_ratelimited(vdev, "Failed to find command queue with ID: %u\n", cmdq_id); + ivpu_dbg(vdev, IOCTL, "Failed to find command queue with ID: %u\n", cmdq_id); return NULL; } @@ -427,17 +442,14 @@ static int ivpu_cmdq_push_job(struct ivpu_cmdq *cmdq, struct ivpu_job *job) if (unlikely(ivpu_test_mode & IVPU_TEST_MODE_NULL_SUBMISSION)) entry->flags = VPU_JOB_FLAGS_NULL_SUBMISSION_MASK; - if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW) { - if (cmdq->primary_preempt_buf) { - entry->primary_preempt_buf_addr = cmdq->primary_preempt_buf->vpu_addr; - entry->primary_preempt_buf_size = ivpu_bo_size(cmdq->primary_preempt_buf); - } + if (job->primary_preempt_buf) { + entry->primary_preempt_buf_addr = job->primary_preempt_buf->vpu_addr; + entry->primary_preempt_buf_size = ivpu_bo_size(job->primary_preempt_buf); + } - if (cmdq->secondary_preempt_buf) { - entry->secondary_preempt_buf_addr = cmdq->secondary_preempt_buf->vpu_addr; - entry->secondary_preempt_buf_size = - ivpu_bo_size(cmdq->secondary_preempt_buf); - } + if (job->secondary_preempt_buf) { + entry->secondary_preempt_buf_addr = job->secondary_preempt_buf->vpu_addr; + entry->secondary_preempt_buf_size = ivpu_bo_size(job->secondary_preempt_buf); } wmb(); /* Ensure that tail is updated after filling entry */ @@ -522,7 +534,7 @@ ivpu_job_create(struct ivpu_file_priv *file_priv, u32 engine_idx, u32 bo_count) job->bo_count = bo_count; job->done_fence = ivpu_fence_create(vdev); if (!job->done_fence) { - ivpu_warn_ratelimited(vdev, "Failed to create a fence\n"); + ivpu_err(vdev, "Failed to create a fence\n"); goto err_free_job; } @@ -552,21 +564,26 @@ static struct ivpu_job *ivpu_job_remove_from_submitted_jobs(struct ivpu_device * return job; } -static int ivpu_job_signal_and_destroy(struct ivpu_device *vdev, u32 job_id, u32 job_status) +bool ivpu_job_handle_engine_error(struct ivpu_device *vdev, u32 job_id, u32 job_status) { - struct ivpu_job *job; - lockdep_assert_held(&vdev->submitted_jobs_lock); - job = xa_load(&vdev->submitted_jobs_xa, job_id); - if (!job) - return -ENOENT; + switch (job_status) { + case VPU_JSM_STATUS_PROCESSING_ERR: + case VPU_JSM_STATUS_ENGINE_RESET_REQUIRED_MIN ... 
VPU_JSM_STATUS_ENGINE_RESET_REQUIRED_MAX: + { + struct ivpu_job *job = xa_load(&vdev->submitted_jobs_xa, job_id); - if (job_status == VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW) { + if (!job) + return false; + + /* Trigger an engine reset */ guard(mutex)(&job->file_priv->lock); + job->job_status = job_status; + if (job->file_priv->has_mmu_faults) - return 0; + return false; /* * Mark context as faulty and defer destruction of the job to jobs abort thread @@ -574,23 +591,43 @@ static int ivpu_job_signal_and_destroy(struct ivpu_device *vdev, u32 job_id, u32 * status and ensure both are handled in the same way */ job->file_priv->has_mmu_faults = true; - queue_work(system_wq, &vdev->context_abort_work); - return 0; + queue_work(system_percpu_wq, &vdev->context_abort_work); + return true; } + default: + /* Complete job with error status, engine reset not required */ + break; + } + + return false; +} - job = ivpu_job_remove_from_submitted_jobs(vdev, job_id); +static int ivpu_job_signal_and_destroy(struct ivpu_device *vdev, u32 job_id, u32 job_status) +{ + struct ivpu_job *job; + + lockdep_assert_held(&vdev->submitted_jobs_lock); + + job = xa_load(&vdev->submitted_jobs_xa, job_id); if (!job) return -ENOENT; - if (job->file_priv->has_mmu_faults) - job_status = DRM_IVPU_JOB_STATUS_ABORTED; + ivpu_job_remove_from_submitted_jobs(vdev, job_id); - job->bos[CMD_BUF_IDX]->job_status = job_status; + if (job->job_status == VPU_JSM_STATUS_SUCCESS) { + if (job->file_priv->has_mmu_faults) + job->job_status = DRM_IVPU_JOB_STATUS_ABORTED; + else + job->job_status = job_status; + } + + job->bos[CMD_BUF_IDX]->job_status = job->job_status; dma_fence_signal(job->done_fence); trace_job("done", job); ivpu_dbg(vdev, JOB, "Job complete: id %3u ctx %2d cmdq_id %u engine %d status 0x%x\n", - job->job_id, job->file_priv->ctx.id, job->cmdq_id, job->engine_idx, job_status); + job->job_id, job->file_priv->ctx.id, job->cmdq_id, job->engine_idx, + job->job_status); ivpu_job_destroy(job); ivpu_stop_job_timeout_detection(vdev); @@ -650,7 +687,6 @@ static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id) else cmdq = ivpu_cmdq_acquire(file_priv, cmdq_id); if (!cmdq) { - ivpu_warn_ratelimited(vdev, "Failed to get job queue, ctx %d\n", file_priv->ctx.id); ret = -EINVAL; goto err_unlock; } @@ -661,6 +697,13 @@ static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id) goto err_unlock; } + ret = ivpu_preemption_job_init(vdev, file_priv, cmdq, job); + if (ret) { + ivpu_err(vdev, "Failed to initialize preemption buffers for job %d: %d\n", + job->job_id, ret); + goto err_unlock; + } + job->cmdq_id = cmdq->id; is_first_job = xa_empty(&vdev->submitted_jobs_xa); @@ -714,7 +757,7 @@ err_unlock: static int ivpu_job_prepare_bos_for_submit(struct drm_file *file, struct ivpu_job *job, u32 *buf_handles, - u32 buf_count, u32 commands_offset) + u32 buf_count, u32 commands_offset, u32 preempt_buffer_index) { struct ivpu_file_priv *file_priv = job->file_priv; struct ivpu_device *vdev = file_priv->vdev; @@ -727,40 +770,58 @@ ivpu_job_prepare_bos_for_submit(struct drm_file *file, struct ivpu_job *job, u32 for (i = 0; i < buf_count; i++) { struct drm_gem_object *obj = drm_gem_object_lookup(file, buf_handles[i]); - if (!obj) + if (!obj) { + ivpu_dbg(vdev, IOCTL, "Failed to lookup GEM object with handle %u\n", + buf_handles[i]); return -ENOENT; + } job->bos[i] = to_ivpu_bo(obj); - ret = ivpu_bo_pin(job->bos[i]); + ret = ivpu_bo_bind(job->bos[i]); if (ret) return ret; } bo = job->bos[CMD_BUF_IDX]; if 
(!dma_resv_test_signaled(bo->base.base.resv, DMA_RESV_USAGE_READ)) { - ivpu_warn(vdev, "Buffer is already in use\n"); + ivpu_dbg(vdev, IOCTL, "Buffer is already in use by another job\n"); return -EBUSY; } if (commands_offset >= ivpu_bo_size(bo)) { - ivpu_warn(vdev, "Invalid command buffer offset %u\n", commands_offset); + ivpu_dbg(vdev, IOCTL, "Invalid commands offset %u for buffer size %zu\n", + commands_offset, ivpu_bo_size(bo)); return -EINVAL; } job->cmd_buf_vpu_addr = bo->vpu_addr + commands_offset; + if (preempt_buffer_index) { + struct ivpu_bo *preempt_bo = job->bos[preempt_buffer_index]; + + if (ivpu_bo_size(preempt_bo) < ivpu_fw_preempt_buf_size(vdev)) { + ivpu_dbg(vdev, IOCTL, "Preemption buffer is too small\n"); + return -EINVAL; + } + if (ivpu_bo_is_mappable(preempt_bo)) { + ivpu_dbg(vdev, IOCTL, "Preemption buffer cannot be mappable\n"); + return -EINVAL; + } + job->primary_preempt_buf = preempt_bo; + } + ret = drm_gem_lock_reservations((struct drm_gem_object **)job->bos, buf_count, &acquire_ctx); if (ret) { - ivpu_warn(vdev, "Failed to lock reservations: %d\n", ret); + ivpu_warn_ratelimited(vdev, "Failed to lock reservations: %d\n", ret); return ret; } for (i = 0; i < buf_count; i++) { ret = dma_resv_reserve_fences(job->bos[i]->base.base.resv, 1); if (ret) { - ivpu_warn(vdev, "Failed to reserve fences: %d\n", ret); + ivpu_warn_ratelimited(vdev, "Failed to reserve fences: %d\n", ret); goto unlock_reservations; } } @@ -780,7 +841,7 @@ unlock_reservations: static int ivpu_submit(struct drm_file *file, struct ivpu_file_priv *file_priv, u32 cmdq_id, u32 buffer_count, u32 engine, void __user *buffers_ptr, u32 cmds_offset, - u8 priority) + u32 preempt_buffer_index, u8 priority) { struct ivpu_device *vdev = file_priv->vdev; struct ivpu_job *job; @@ -807,16 +868,14 @@ static int ivpu_submit(struct drm_file *file, struct ivpu_file_priv *file_priv, job = ivpu_job_create(file_priv, engine, buffer_count); if (!job) { - ivpu_err(vdev, "Failed to create job\n"); ret = -ENOMEM; goto err_exit_dev; } - ret = ivpu_job_prepare_bos_for_submit(file, job, buf_handles, buffer_count, cmds_offset); - if (ret) { - ivpu_err(vdev, "Failed to prepare job: %d\n", ret); + ret = ivpu_job_prepare_bos_for_submit(file, job, buf_handles, buffer_count, cmds_offset, + preempt_buffer_index); + if (ret) goto err_destroy_job; - } down_read(&vdev->pm->reset_lock); ret = ivpu_job_submit(job, priority, cmdq_id); @@ -842,58 +901,91 @@ err_free_handles: int ivpu_submit_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct ivpu_file_priv *file_priv = file->driver_priv; + struct ivpu_device *vdev = file_priv->vdev; struct drm_ivpu_submit *args = data; u8 priority; - if (args->engine != DRM_IVPU_ENGINE_COMPUTE) + if (args->engine != DRM_IVPU_ENGINE_COMPUTE) { + ivpu_dbg(vdev, IOCTL, "Invalid engine %d\n", args->engine); return -EINVAL; + } - if (args->priority > DRM_IVPU_JOB_PRIORITY_REALTIME) + if (args->priority > DRM_IVPU_JOB_PRIORITY_REALTIME) { + ivpu_dbg(vdev, IOCTL, "Invalid priority %d\n", args->priority); return -EINVAL; + } - if (args->buffer_count == 0 || args->buffer_count > JOB_MAX_BUFFER_COUNT) + if (args->buffer_count == 0 || args->buffer_count > JOB_MAX_BUFFER_COUNT) { + ivpu_dbg(vdev, IOCTL, "Invalid buffer count %u\n", args->buffer_count); return -EINVAL; + } - if (!IS_ALIGNED(args->commands_offset, 8)) + if (!IS_ALIGNED(args->commands_offset, 8)) { + ivpu_dbg(vdev, IOCTL, "Invalid commands offset %u\n", args->commands_offset); return -EINVAL; + } - if (!file_priv->ctx.id) + if 
(!file_priv->ctx.id) { + ivpu_dbg(vdev, IOCTL, "Context not initialized\n"); return -EINVAL; + } - if (file_priv->has_mmu_faults) + if (file_priv->has_mmu_faults) { + ivpu_dbg(vdev, IOCTL, "Context %u has MMU faults\n", file_priv->ctx.id); return -EBADFD; + } priority = ivpu_job_to_jsm_priority(args->priority); return ivpu_submit(file, file_priv, 0, args->buffer_count, args->engine, - (void __user *)args->buffers_ptr, args->commands_offset, priority); + (void __user *)args->buffers_ptr, args->commands_offset, 0, priority); } int ivpu_cmdq_submit_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct ivpu_file_priv *file_priv = file->driver_priv; + struct ivpu_device *vdev = file_priv->vdev; struct drm_ivpu_cmdq_submit *args = data; - if (!ivpu_is_capable(file_priv->vdev, DRM_IVPU_CAP_MANAGE_CMDQ)) + if (!ivpu_is_capable(file_priv->vdev, DRM_IVPU_CAP_MANAGE_CMDQ)) { + ivpu_dbg(vdev, IOCTL, "Command queue management not supported\n"); return -ENODEV; + } - if (args->cmdq_id < IVPU_CMDQ_MIN_ID || args->cmdq_id > IVPU_CMDQ_MAX_ID) + if (args->cmdq_id < IVPU_CMDQ_MIN_ID || args->cmdq_id > IVPU_CMDQ_MAX_ID) { + ivpu_dbg(vdev, IOCTL, "Invalid command queue ID %u\n", args->cmdq_id); return -EINVAL; + } - if (args->buffer_count == 0 || args->buffer_count > JOB_MAX_BUFFER_COUNT) + if (args->buffer_count == 0 || args->buffer_count > JOB_MAX_BUFFER_COUNT) { + ivpu_dbg(vdev, IOCTL, "Invalid buffer count %u\n", args->buffer_count); return -EINVAL; + } - if (!IS_ALIGNED(args->commands_offset, 8)) + if (args->preempt_buffer_index >= args->buffer_count) { + ivpu_dbg(vdev, IOCTL, "Invalid preemption buffer index %u\n", + args->preempt_buffer_index); return -EINVAL; + } - if (!file_priv->ctx.id) + if (!IS_ALIGNED(args->commands_offset, 8)) { + ivpu_dbg(vdev, IOCTL, "Invalid commands offset %u\n", args->commands_offset); return -EINVAL; + } - if (file_priv->has_mmu_faults) + if (!file_priv->ctx.id) { + ivpu_dbg(vdev, IOCTL, "Context not initialized\n"); + return -EINVAL; + } + + if (file_priv->has_mmu_faults) { + ivpu_dbg(vdev, IOCTL, "Context %u has MMU faults\n", file_priv->ctx.id); return -EBADFD; + } return ivpu_submit(file, file_priv, args->cmdq_id, args->buffer_count, VPU_ENGINE_COMPUTE, - (void __user *)args->buffers_ptr, args->commands_offset, 0); + (void __user *)args->buffers_ptr, args->commands_offset, + args->preempt_buffer_index, 0); } int ivpu_cmdq_create_ioctl(struct drm_device *dev, void *data, struct drm_file *file) @@ -904,11 +996,15 @@ int ivpu_cmdq_create_ioctl(struct drm_device *dev, void *data, struct drm_file * struct ivpu_cmdq *cmdq; int ret; - if (!ivpu_is_capable(vdev, DRM_IVPU_CAP_MANAGE_CMDQ)) + if (!ivpu_is_capable(vdev, DRM_IVPU_CAP_MANAGE_CMDQ)) { + ivpu_dbg(vdev, IOCTL, "Command queue management not supported\n"); return -ENODEV; + } - if (args->priority > DRM_IVPU_JOB_PRIORITY_REALTIME) + if (args->priority > DRM_IVPU_JOB_PRIORITY_REALTIME) { + ivpu_dbg(vdev, IOCTL, "Invalid priority %d\n", args->priority); return -EINVAL; + } ret = ivpu_rpm_get(vdev); if (ret < 0) @@ -936,8 +1032,10 @@ int ivpu_cmdq_destroy_ioctl(struct drm_device *dev, void *data, struct drm_file u32 cmdq_id = 0; int ret; - if (!ivpu_is_capable(vdev, DRM_IVPU_CAP_MANAGE_CMDQ)) + if (!ivpu_is_capable(vdev, DRM_IVPU_CAP_MANAGE_CMDQ)) { + ivpu_dbg(vdev, IOCTL, "Command queue management not supported\n"); return -ENODEV; + } ret = ivpu_rpm_get(vdev); if (ret < 0) @@ -984,7 +1082,9 @@ ivpu_job_done_callback(struct ivpu_device *vdev, struct ivpu_ipc_hdr *ipc_hdr, payload = (struct 
vpu_ipc_msg_payload_job_done *)&jsm_msg->payload; mutex_lock(&vdev->submitted_jobs_lock); - ivpu_job_signal_and_destroy(vdev, payload->job_id, payload->job_status); + if (!ivpu_job_handle_engine_error(vdev, payload->job_id, payload->job_status)) + /* No engine error, complete the job normally */ + ivpu_job_signal_and_destroy(vdev, payload->job_id, payload->job_status); mutex_unlock(&vdev->submitted_jobs_lock); } @@ -1012,7 +1112,7 @@ void ivpu_context_abort_work_fn(struct work_struct *work) if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW) if (ivpu_jsm_reset_engine(vdev, 0)) - return; + goto runtime_put; mutex_lock(&vdev->context_list_lock); xa_for_each(&vdev->context_xa, ctx_id, file_priv) { @@ -1036,7 +1136,7 @@ void ivpu_context_abort_work_fn(struct work_struct *work) goto runtime_put; if (ivpu_jsm_hws_resume_engine(vdev, 0)) - return; + goto runtime_put; /* * In hardware scheduling mode NPU already has stopped processing jobs * and won't send us any further notifications, thus we have to free job related resources @@ -1049,6 +1149,5 @@ void ivpu_context_abort_work_fn(struct work_struct *work) mutex_unlock(&vdev->submitted_jobs_lock); runtime_put: - pm_runtime_mark_last_busy(vdev->drm.dev); pm_runtime_put_autosuspend(vdev->drm.dev); } diff --git a/drivers/accel/ivpu/ivpu_job.h b/drivers/accel/ivpu/ivpu_job.h index 2e301c2eea7b..3ab61e6a5616 100644 --- a/drivers/accel/ivpu/ivpu_job.h +++ b/drivers/accel/ivpu/ivpu_job.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation */ #ifndef __IVPU_JOB_H__ @@ -15,12 +15,17 @@ struct ivpu_device; struct ivpu_file_priv; /** - * struct ivpu_cmdq - Object representing device queue used to send jobs. - * @jobq: Pointer to job queue memory shared with the device - * @mem: Memory allocated for the job queue, shared with device - * @entry_count Number of job entries in the queue - * @db_id: Doorbell assigned to this job queue - * @db_registered: True if doorbell is registered in device + * struct ivpu_cmdq - Represents a command queue for submitting jobs to the VPU. + * Tracks queue memory, preemption buffers, and metadata for job management. + * @jobq: Pointer to job queue memory shared with the device + * @primary_preempt_buf: Primary preemption buffer for this queue (optional) + * @secondary_preempt_buf: Secondary preemption buffer for this queue (optional) + * @mem: Memory allocated for the job queue, shared with device + * @entry_count: Number of job entries in the queue + * @id: Unique command queue ID + * @db_id: Doorbell ID assigned to this job queue + * @priority: Priority level of the command queue + * @is_legacy: True if this is a legacy command queue */ struct ivpu_cmdq { struct vpu_job_queue *jobq; @@ -35,16 +40,22 @@ struct ivpu_cmdq { }; /** - * struct ivpu_job - KMD object that represents batchbuffer / DMA buffer. - * Each batch / DMA buffer is a job to be submitted and executed by the VPU FW. - * This is a unit of execution, and be tracked by the job_id for - * any status reporting from VPU FW through IPC JOB RET/DONE message. 
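 *
 * Illustrative note on the ivpu_job_done_callback() change above (sketch only;
 * names other than the VPU_JSM_STATUS_* constants are hypothetical): job-done
 * notifications are now split by firmware status, and only statuses outside
 * the engine-reset range complete the job directly:
 *
 *	static bool status_needs_engine_reset(u32 status)
 *	{
 *		return status == VPU_JSM_STATUS_PROCESSING_ERR ||
 *		       (status >= VPU_JSM_STATUS_ENGINE_RESET_REQUIRED_MIN &&
 *			status <= VPU_JSM_STATUS_ENGINE_RESET_REQUIRED_MAX);
 *	}
 *
 * When such a status is seen, ivpu_job_handle_engine_error() marks the context
 * as faulty and defers cleanup to ivpu_context_abort_work_fn(); otherwise
 * ivpu_job_signal_and_destroy() signals the done fence with the reported
 * status.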
- * @file_priv: The client that submitted this job - * @job_id: Job ID for KMD tracking and job status reporting from VPU FW - * @status: Status of the Job from IPC JOB RET/DONE message - * @batch_buffer: CPU vaddr points to the batch buffer memory allocated for the job - * @submit_status_offset: Offset within batch buffer where job completion handler - will update the job status + * struct ivpu_job - Representing a batch or DMA buffer submitted to the VPU. + * Each job is a unit of execution, tracked by job_id for status reporting from VPU FW. + * The structure holds all resources and metadata needed for job submission, execution, + * and completion handling. + * @vdev: Pointer to the VPU device + * @file_priv: The client context that submitted this job + * @done_fence: Fence signaled when job completes + * @cmd_buf_vpu_addr: VPU address of the command buffer for this job + * @cmdq_id: Command queue ID used for submission + * @job_id: Unique job ID for tracking and status reporting + * @engine_idx: Engine index for job execution + * @job_status: Status reported by firmware for this job + * @primary_preempt_buf: Primary preemption buffer for job + * @secondary_preempt_buf: Secondary preemption buffer for job (optional) + * @bo_count: Number of buffer objects associated with this job + * @bos: Array of buffer objects used by the job (batch buffer is at index 0) */ struct ivpu_job { struct ivpu_device *vdev; @@ -54,6 +65,9 @@ struct ivpu_job { u32 cmdq_id; u32 job_id; u32 engine_idx; + u32 job_status; + struct ivpu_bo *primary_preempt_buf; + struct ivpu_bo *secondary_preempt_buf; size_t bo_count; struct ivpu_bo *bos[] __counted_by(bo_count); }; @@ -71,6 +85,7 @@ void ivpu_cmdq_abort_all_jobs(struct ivpu_device *vdev, u32 ctx_id, u32 cmdq_id) void ivpu_job_done_consumer_init(struct ivpu_device *vdev); void ivpu_job_done_consumer_fini(struct ivpu_device *vdev); +bool ivpu_job_handle_engine_error(struct ivpu_device *vdev, u32 job_id, u32 job_status); void ivpu_context_abort_work_fn(struct work_struct *work); void ivpu_jobs_abort_all(struct ivpu_device *vdev); diff --git a/drivers/accel/ivpu/ivpu_mmu.c b/drivers/accel/ivpu/ivpu_mmu.c index 5ea010568faa..e1baf6b64935 100644 --- a/drivers/accel/ivpu/ivpu_mmu.c +++ b/drivers/accel/ivpu/ivpu_mmu.c @@ -970,7 +970,7 @@ void ivpu_mmu_irq_evtq_handler(struct ivpu_device *vdev) } } - queue_work(system_wq, &vdev->context_abort_work); + queue_work(system_percpu_wq, &vdev->context_abort_work); } void ivpu_mmu_evtq_dump(struct ivpu_device *vdev) diff --git a/drivers/accel/ivpu/ivpu_mmu_context.c b/drivers/accel/ivpu/ivpu_mmu_context.c index f0267efa55aa..87ad593ef47d 100644 --- a/drivers/accel/ivpu/ivpu_mmu_context.c +++ b/drivers/accel/ivpu/ivpu_mmu_context.c @@ -430,7 +430,7 @@ static void ivpu_mmu_context_unmap_pages(struct ivpu_mmu_context *ctx, u64 vpu_a int ivpu_mmu_context_map_sgt(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, - u64 vpu_addr, struct sg_table *sgt, bool llc_coherent) + u64 vpu_addr, struct sg_table *sgt, bool llc_coherent, bool read_only) { size_t start_vpu_addr = vpu_addr; struct scatterlist *sg; @@ -450,6 +450,8 @@ ivpu_mmu_context_map_sgt(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, prot = IVPU_MMU_ENTRY_MAPPED; if (llc_coherent) prot |= IVPU_MMU_ENTRY_FLAG_LLC_COHERENT; + if (read_only) + prot |= IVPU_MMU_ENTRY_FLAG_RO; mutex_lock(&ctx->lock); @@ -527,7 +529,8 @@ ivpu_mmu_context_unmap_sgt(struct ivpu_device *vdev, struct ivpu_mmu_context *ct ret = ivpu_mmu_invalidate_tlb(vdev, ctx->id); if (ret) - 
ivpu_warn(vdev, "Failed to invalidate TLB for ctx %u: %d\n", ctx->id, ret); + ivpu_warn_ratelimited(vdev, "Failed to invalidate TLB for ctx %u: %d\n", + ctx->id, ret); } int @@ -568,7 +571,7 @@ void ivpu_mmu_context_init(struct ivpu_device *vdev, struct ivpu_mmu_context *ct mutex_init(&ctx->lock); if (!context_id) { - start = vdev->hw->ranges.global.start; + start = vdev->hw->ranges.runtime.start; end = vdev->hw->ranges.shave.end; } else { start = min_t(u64, vdev->hw->ranges.user.start, vdev->hw->ranges.shave.start); diff --git a/drivers/accel/ivpu/ivpu_mmu_context.h b/drivers/accel/ivpu/ivpu_mmu_context.h index f255310968cf..663a11a9db11 100644 --- a/drivers/accel/ivpu/ivpu_mmu_context.h +++ b/drivers/accel/ivpu/ivpu_mmu_context.h @@ -42,7 +42,7 @@ int ivpu_mmu_context_insert_node(struct ivpu_mmu_context *ctx, const struct ivpu void ivpu_mmu_context_remove_node(struct ivpu_mmu_context *ctx, struct drm_mm_node *node); int ivpu_mmu_context_map_sgt(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, - u64 vpu_addr, struct sg_table *sgt, bool llc_coherent); + u64 vpu_addr, struct sg_table *sgt, bool llc_coherent, bool read_only); void ivpu_mmu_context_unmap_sgt(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, u64 vpu_addr, struct sg_table *sgt); int ivpu_mmu_context_set_pages_ro(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, diff --git a/drivers/accel/ivpu/ivpu_ms.c b/drivers/accel/ivpu/ivpu_ms.c index 2a043baf10ca..1d9c1cb17924 100644 --- a/drivers/accel/ivpu/ivpu_ms.c +++ b/drivers/accel/ivpu/ivpu_ms.c @@ -8,6 +8,7 @@ #include "ivpu_drv.h" #include "ivpu_gem.h" +#include "ivpu_hw.h" #include "ivpu_jsm_msg.h" #include "ivpu_ms.h" #include "ivpu_pm.h" @@ -37,8 +38,8 @@ int ivpu_ms_start_ioctl(struct drm_device *dev, void *data, struct drm_file *fil struct drm_ivpu_metric_streamer_start *args = data; struct ivpu_device *vdev = file_priv->vdev; struct ivpu_ms_instance *ms; - u64 single_buff_size; u32 sample_size; + u64 buf_size; int ret; if (!args->metric_group_mask || !args->read_period_samples || @@ -52,7 +53,8 @@ int ivpu_ms_start_ioctl(struct drm_device *dev, void *data, struct drm_file *fil mutex_lock(&file_priv->ms_lock); if (get_instance_by_mask(file_priv, args->metric_group_mask)) { - ivpu_err(vdev, "Instance already exists (mask %#llx)\n", args->metric_group_mask); + ivpu_dbg(vdev, IOCTL, "Instance already exists (mask %#llx)\n", + args->metric_group_mask); ret = -EALREADY; goto unlock; } @@ -69,12 +71,18 @@ int ivpu_ms_start_ioctl(struct drm_device *dev, void *data, struct drm_file *fil if (ret) goto err_free_ms; - single_buff_size = sample_size * - ((u64)args->read_period_samples * MS_READ_PERIOD_MULTIPLIER); - ms->bo = ivpu_bo_create_global(vdev, PAGE_ALIGN(single_buff_size * MS_NUM_BUFFERS), - DRM_IVPU_BO_CACHED | DRM_IVPU_BO_MAPPABLE); + buf_size = PAGE_ALIGN((u64)args->read_period_samples * sample_size * + MS_READ_PERIOD_MULTIPLIER * MS_NUM_BUFFERS); + if (buf_size > ivpu_hw_range_size(&vdev->hw->ranges.global)) { + ivpu_dbg(vdev, IOCTL, "Requested MS buffer size %llu exceeds range size %llu\n", + buf_size, ivpu_hw_range_size(&vdev->hw->ranges.global)); + ret = -EINVAL; + goto err_free_ms; + } + + ms->bo = ivpu_bo_create_global(vdev, buf_size, DRM_IVPU_BO_CACHED | DRM_IVPU_BO_MAPPABLE); if (!ms->bo) { - ivpu_err(vdev, "Failed to allocate MS buffer (size %llu)\n", single_buff_size); + ivpu_dbg(vdev, IOCTL, "Failed to allocate MS buffer (size %llu)\n", buf_size); ret = -ENOMEM; goto err_free_ms; } @@ -175,7 +183,8 @@ int ivpu_ms_get_data_ioctl(struct drm_device 
*dev, void *data, struct drm_file * ms = get_instance_by_mask(file_priv, args->metric_group_mask); if (!ms) { - ivpu_err(vdev, "Instance doesn't exist for mask: %#llx\n", args->metric_group_mask); + ivpu_dbg(vdev, IOCTL, "Instance doesn't exist for mask: %#llx\n", + args->metric_group_mask); ret = -EINVAL; goto unlock; } diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index 475ddc94f1cf..480c075d87f6 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -54,7 +54,7 @@ static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev) static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev) { struct ivpu_fw_info *fw = vdev->fw; - struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem); + struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem_bp); if (!bp->save_restore_ret_address) { ivpu_pm_prepare_cold_boot(vdev); @@ -186,7 +186,7 @@ void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason) if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) { ivpu_hw_diagnose_failure(vdev); ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */ - queue_work(system_unbound_wq, &vdev->pm->recovery_work); + queue_work(system_dfl_wq, &vdev->pm->recovery_work); } } @@ -226,7 +226,8 @@ void ivpu_start_job_timeout_detection(struct ivpu_device *vdev) unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr; /* No-op if already queued */ - queue_delayed_work(system_wq, &vdev->pm->job_timeout_work, msecs_to_jiffies(timeout_ms)); + queue_delayed_work(system_percpu_wq, &vdev->pm->job_timeout_work, + msecs_to_jiffies(timeout_ms)); } void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev) @@ -359,7 +360,6 @@ int ivpu_rpm_get(struct ivpu_device *vdev) void ivpu_rpm_put(struct ivpu_device *vdev) { - pm_runtime_mark_last_busy(vdev->drm.dev); pm_runtime_put_autosuspend(vdev->drm.dev); } @@ -428,7 +428,6 @@ void ivpu_pm_enable(struct ivpu_device *vdev) struct device *dev = vdev->drm.dev; pm_runtime_allow(dev); - pm_runtime_mark_last_busy(dev); pm_runtime_put_autosuspend(dev); } @@ -502,6 +501,11 @@ void ivpu_pm_irq_dct_work_fn(struct work_struct *work) else ret = ivpu_pm_dct_disable(vdev); - if (!ret) - ivpu_hw_btrs_dct_set_status(vdev, enable, vdev->pm->dct_active_percent); + if (!ret) { + /* Convert percent to U1.7 format */ + u8 val = DIV_ROUND_CLOSEST(vdev->pm->dct_active_percent * 128, 100); + + ivpu_hw_btrs_dct_set_status(vdev, enable, val); + } + } diff --git a/drivers/accel/ivpu/ivpu_sysfs.c b/drivers/accel/ivpu/ivpu_sysfs.c index 268ab7744a8b..d250a10caca9 100644 --- a/drivers/accel/ivpu/ivpu_sysfs.c +++ b/drivers/accel/ivpu/ivpu_sysfs.c @@ -63,7 +63,8 @@ npu_memory_utilization_show(struct device *dev, struct device_attribute *attr, c mutex_lock(&vdev->bo_list_lock); list_for_each_entry(bo, &vdev->bo_list, bo_list_node) - total_npu_memory += bo->base.base.size; + if (ivpu_bo_is_resident(bo)) + total_npu_memory += ivpu_bo_size(bo); mutex_unlock(&vdev->bo_list_lock); return sysfs_emit(buf, "%lld\n", total_npu_memory); diff --git a/drivers/accel/ivpu/vpu_jsm_api.h b/drivers/accel/ivpu/vpu_jsm_api.h index 4b6b2b3d2583..bca6a44dc041 100644 --- a/drivers/accel/ivpu/vpu_jsm_api.h +++ b/drivers/accel/ivpu/vpu_jsm_api.h @@ -1,15 +1,16 @@ /* SPDX-License-Identifier: MIT */ /* - * Copyright (c) 2020-2024, Intel Corporation. + * Copyright (c) 2020-2025, Intel Corporation. 
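 *
 * Illustrative note on the DCT change in ivpu_pm_irq_dct_work_fn() above (not
 * part of this header): ivpu_hw_btrs_dct_set_status() now takes the active
 * duty cycle as a U1.7 fixed-point value, so the percentage is scaled by
 * 128/100 before being passed down, e.g.:
 *
 *	u8 val = DIV_ROUND_CLOSEST(percent * 128, 100);
 *
 * which maps 25% -> 32 (0x20), 50% -> 64 (0x40) and 100% -> 128 (0x80).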
+ */ + +/** + * @addtogroup Jsm + * @{ */ /** * @file * @brief JSM shared definitions - * - * @ingroup Jsm - * @brief JSM shared definitions - * @{ */ #ifndef VPU_JSM_API_H #define VPU_JSM_API_H @@ -22,7 +23,7 @@ /* * Minor version changes when API backward compatibility is preserved. */ -#define VPU_JSM_API_VER_MINOR 29 +#define VPU_JSM_API_VER_MINOR 33 /* * API header changed (field names, documentation, formatting) but API itself has not been changed @@ -71,9 +72,15 @@ #define VPU_JSM_STATUS_MVNCI_OUT_OF_RESOURCES 0xAU #define VPU_JSM_STATUS_MVNCI_NOT_IMPLEMENTED 0xBU #define VPU_JSM_STATUS_MVNCI_INTERNAL_ERROR 0xCU -/* Job status returned when the job was preempted mid-inference */ +/* @deprecated (use VPU_JSM_STATUS_PREEMPTED_MID_COMMAND instead) */ #define VPU_JSM_STATUS_PREEMPTED_MID_INFERENCE 0xDU +/* Job status returned when the job was preempted mid-command */ +#define VPU_JSM_STATUS_PREEMPTED_MID_COMMAND 0xDU +/* Range of status codes that require engine reset */ +#define VPU_JSM_STATUS_ENGINE_RESET_REQUIRED_MIN 0xEU #define VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW 0xEU +#define VPU_JSM_STATUS_MVNCI_PREEMPTION_TIMED_OUT 0xFU +#define VPU_JSM_STATUS_ENGINE_RESET_REQUIRED_MAX 0x1FU /* * Host <-> VPU IPC channels. @@ -134,11 +141,21 @@ enum { * 2. Native fence queues are only supported on VPU 40xx onwards. */ VPU_JOB_QUEUE_FLAGS_USE_NATIVE_FENCE_MASK = (1 << 1U), - /* * Enable turbo mode for testing NPU performance; not recommended for regular usage. */ - VPU_JOB_QUEUE_FLAGS_TURBO_MODE = (1 << 2U) + VPU_JOB_QUEUE_FLAGS_TURBO_MODE = (1 << 2U), + /* + * Queue error detection mode flag + * For 'interactive' queues (this bit not set), the FW will identify queues that have not + * completed a job inside the TDR timeout as in error as part of engine reset sequence. + * For 'non-interactive' queues (this bit set), the FW will identify queues that have not + * progressed the heartbeat inside the non-interactive no-progress timeout as in error as + * part of engine reset sequence. Additionally, there is an upper limit applied to these + * queues: even if they progress the heartbeat, if they run longer than non-interactive + * timeout, then the FW will also identify them as in error. + */ + VPU_JOB_QUEUE_FLAGS_NON_INTERACTIVE = (1 << 3U) }; /* @@ -209,7 +226,7 @@ enum { */ #define VPU_INLINE_CMD_TYPE_FENCE_SIGNAL 0x2 -/* +/** * Job scheduling priority bands for both hardware scheduling and OS scheduling. */ enum vpu_job_scheduling_priority_band { @@ -220,16 +237,16 @@ enum vpu_job_scheduling_priority_band { VPU_JOB_SCHEDULING_PRIORITY_BAND_COUNT = 4, }; -/* +/** * Job format. * Jobs defines the actual workloads to be executed by a given engine. */ struct vpu_job_queue_entry { - /**< Address of VPU commands batch buffer */ + /** Address of VPU commands batch buffer */ u64 batch_buf_addr; - /**< Job ID */ + /** Job ID */ u32 job_id; - /**< Flags bit field, see VPU_JOB_FLAGS_* above */ + /** Flags bit field, see VPU_JOB_FLAGS_* above */ u32 flags; /** * Doorbell ring timestamp taken by KMD from SoC's global system clock, in @@ -237,20 +254,20 @@ struct vpu_job_queue_entry { * to match other profiling timestamps. 
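 *
 * Illustrative note (sketch only): with the preempt_buffer_index UAPI handled
 * in ivpu_job.c above, ivpu_cmdq_push_job() now fills the preemption fields of
 * this entry from the per-job buffers instead of the per-queue ones, roughly:
 *
 *	entry->primary_preempt_buf_addr = job->primary_preempt_buf->vpu_addr;
 *	entry->primary_preempt_buf_size = ivpu_bo_size(job->primary_preempt_buf);
 *
 * with the secondary buffer handled the same way when present.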
*/ u64 doorbell_timestamp; - /**< Extra id for job tracking, used only in the firmware perf traces */ + /** Extra id for job tracking, used only in the firmware perf traces */ u64 host_tracking_id; - /**< Address of the primary preemption buffer to use for this job */ + /** Address of the primary preemption buffer to use for this job */ u64 primary_preempt_buf_addr; - /**< Size of the primary preemption buffer to use for this job */ + /** Size of the primary preemption buffer to use for this job */ u32 primary_preempt_buf_size; - /**< Size of secondary preemption buffer to use for this job */ + /** Size of secondary preemption buffer to use for this job */ u32 secondary_preempt_buf_size; - /**< Address of secondary preemption buffer to use for this job */ + /** Address of secondary preemption buffer to use for this job */ u64 secondary_preempt_buf_addr; u64 reserved_0; }; -/* +/** * Inline command format. * Inline commands are the commands executed at scheduler level (typically, * synchronization directives). Inline command and job objects must be of @@ -258,34 +275,36 @@ struct vpu_job_queue_entry { */ struct vpu_inline_cmd { u64 reserved_0; - /* Inline command type, see VPU_INLINE_CMD_TYPE_* defines. */ + /** Inline command type, see VPU_INLINE_CMD_TYPE_* defines. */ u32 type; - /* Flags bit field, see VPU_JOB_FLAGS_* above. */ + /** Flags bit field, see VPU_JOB_FLAGS_* above. */ u32 flags; - /* Inline command payload. Depends on inline command type. */ - union { - /* Fence (wait and signal) commands' payload. */ - struct { - /* Fence object handle. */ + /** Inline command payload. Depends on inline command type. */ + union payload { + /** Fence (wait and signal) commands' payload. */ + struct fence { + /** Fence object handle. */ u64 fence_handle; - /* User VA of the current fence value. */ + /** User VA of the current fence value. */ u64 current_value_va; - /* User VA of the monitored fence value (read-only). */ + /** User VA of the monitored fence value (read-only). */ u64 monitored_value_va; - /* Value to wait for or write in fence location. */ + /** Value to wait for or write in fence location. */ u64 value; - /* User VA of the log buffer in which to add log entry on completion. */ + /** User VA of the log buffer in which to add log entry on completion. */ u64 log_buffer_va; - /* NPU private data. */ + /** NPU private data. */ u64 npu_private_data; } fence; - /* Other commands do not have a payload. */ - /* Payload definition for future inline commands can be inserted here. */ + /** + * Other commands do not have a payload: + * Payload definition for future inline commands can be inserted here. + */ u64 reserved_1[6]; } payload; }; -/* +/** * Job queue slots can be populated either with job objects or inline command objects. */ union vpu_jobq_slot { @@ -293,7 +312,7 @@ union vpu_jobq_slot { struct vpu_inline_cmd inline_cmd; }; -/* +/** * Job queue control registers. */ struct vpu_job_queue_header { @@ -301,18 +320,18 @@ struct vpu_job_queue_header { u32 head; u32 tail; u32 flags; - /* Set to 1 to indicate priority_band field is valid */ + /** Set to 1 to indicate priority_band field is valid */ u32 priority_band_valid; - /* + /** * Priority for the work of this job queue, valid only if the HWS is NOT used - * and the `priority_band_valid` is set to 1. It is applied only during - * the VPU_JSM_MSG_REGISTER_DB message processing. - * The device firmware might use the `priority_band` to optimize the power + * and the @ref priority_band_valid is set to 1. 
It is applied only during + * the @ref VPU_JSM_MSG_REGISTER_DB message processing. + * The device firmware might use the priority_band to optimize the power * management logic, but it will not affect the order of jobs. * Available priority bands: @see enum vpu_job_scheduling_priority_band */ u32 priority_band; - /* Inside realtime band assigns a further priority, limited to 0..31 range */ + /** Inside realtime band assigns a further priority, limited to 0..31 range */ u32 realtime_priority_level; u32 reserved_0[9]; }; @@ -337,16 +356,16 @@ enum vpu_trace_entity_type { VPU_TRACE_ENTITY_TYPE_HW_COMPONENT = 2, }; -/* +/** * HWS specific log buffer header details. * Total size is 32 bytes. */ struct vpu_hws_log_buffer_header { - /* Written by VPU after adding a log entry. Initialised by host to 0. */ + /** Written by VPU after adding a log entry. Initialised by host to 0. */ u32 first_free_entry_index; - /* Incremented by VPU every time the VPU writes the 0th entry; initialised by host to 0. */ + /** Incremented by VPU every time the VPU writes the 0th entry; initialised by host to 0. */ u32 wraparound_count; - /* + /** * This is the number of buffers that can be stored in the log buffer provided by the host. * It is written by host before passing buffer to VPU. VPU should consider it read-only. */ @@ -354,14 +373,14 @@ struct vpu_hws_log_buffer_header { u64 reserved[2]; }; -/* +/** * HWS specific log buffer entry details. * Total size is 32 bytes. */ struct vpu_hws_log_buffer_entry { - /* VPU timestamp must be an invariant timer tick (not impacted by DVFS) */ + /** VPU timestamp must be an invariant timer tick (not impacted by DVFS) */ u64 vpu_timestamp; - /* + /** * Operation type: * 0 - context state change * 1 - queue new work @@ -371,7 +390,7 @@ struct vpu_hws_log_buffer_entry { */ u32 operation_type; u32 reserved; - /* Operation data depends on operation type */ + /** Operation data depends on operation type */ u64 operation_data[2]; }; @@ -381,51 +400,54 @@ enum vpu_hws_native_fence_log_type { VPU_HWS_NATIVE_FENCE_LOG_TYPE_SIGNALS = 2 }; -/* HWS native fence log buffer header. */ +/** HWS native fence log buffer header. */ struct vpu_hws_native_fence_log_header { union { struct { - /* Index of the first free entry in buffer. */ + /** Index of the first free entry in buffer. */ u32 first_free_entry_idx; - /* Incremented each time NPU wraps around the buffer to write next entry. */ + /** + * Incremented whenever the NPU wraps around the buffer and writes + * to the first entry again. + */ u32 wraparound_count; }; - /* Field allowing atomic update of both fields above. */ + /** Field allowing atomic update of both fields above. */ u64 atomic_wraparound_and_entry_idx; }; - /* Log buffer type, see enum vpu_hws_native_fence_log_type. */ + /** Log buffer type, see enum vpu_hws_native_fence_log_type. */ u64 type; - /* Allocated number of entries in the log buffer. */ + /** Allocated number of entries in the log buffer. */ u64 entry_nb; u64 reserved[2]; }; -/* Native fence log operation types. */ +/** Native fence log operation types. */ enum vpu_hws_native_fence_log_op { VPU_HWS_NATIVE_FENCE_LOG_OP_SIGNAL_EXECUTED = 0, VPU_HWS_NATIVE_FENCE_LOG_OP_WAIT_UNBLOCKED = 1 }; -/* HWS native fence log entry. */ +/** HWS native fence log entry. */ struct vpu_hws_native_fence_log_entry { - /* Newly signaled/unblocked fence value. */ + /** Newly signaled/unblocked fence value. */ u64 fence_value; - /* Native fence object handle to which this operation belongs. 
*/ + /** Native fence object handle to which this operation belongs. */ u64 fence_handle; - /* Operation type, see enum vpu_hws_native_fence_log_op. */ + /** Operation type, see enum vpu_hws_native_fence_log_op. */ u64 op_type; u64 reserved_0; - /* + /** * VPU_HWS_NATIVE_FENCE_LOG_OP_WAIT_UNBLOCKED only: Timestamp at which fence * wait was started (in NPU SysTime). */ u64 fence_wait_start_ts; u64 reserved_1; - /* Timestamp at which fence operation was completed (in NPU SysTime). */ + /** Timestamp at which fence operation was completed (in NPU SysTime). */ u64 fence_end_ts; }; -/* Native fence log buffer. */ +/** Native fence log buffer. */ struct vpu_hws_native_fence_log_buffer { struct vpu_hws_native_fence_log_header header; struct vpu_hws_native_fence_log_entry entry[]; @@ -435,10 +457,17 @@ struct vpu_hws_native_fence_log_buffer { * Host <-> VPU IPC messages types. */ enum vpu_ipc_msg_type { + /** Unsupported command */ VPU_JSM_MSG_UNKNOWN = 0xFFFFFFFF, - /* IPC Host -> Device, Async commands */ + /** IPC Host -> Device, base id for async commands */ VPU_JSM_MSG_ASYNC_CMD = 0x1100, + /** + * Reset engine. The NPU cancels all the jobs currently executing on the target + * engine making the engine become idle and then does a HW reset, before returning + * to the host. + * @see struct vpu_ipc_msg_payload_engine_reset + */ VPU_JSM_MSG_ENGINE_RESET = VPU_JSM_MSG_ASYNC_CMD, /** * Preempt engine. The NPU stops (preempts) all the jobs currently @@ -448,10 +477,24 @@ enum vpu_ipc_msg_type { * the target engine, but it stops processing them (until the queue doorbell * is rung again); the host is responsible to reset the job queue, either * after preemption or when resubmitting jobs to the queue. + * @see vpu_ipc_msg_payload_engine_preempt */ VPU_JSM_MSG_ENGINE_PREEMPT = 0x1101, + /** + * OS scheduling doorbell register command + * @see vpu_ipc_msg_payload_register_db + */ VPU_JSM_MSG_REGISTER_DB = 0x1102, + /** + * OS scheduling doorbell unregister command + * @see vpu_ipc_msg_payload_unregister_db + */ VPU_JSM_MSG_UNREGISTER_DB = 0x1103, + /** + * Query engine heartbeat. Heartbeat is expected to increase monotonically + * and increase while work is being progressed by NPU. + * @see vpu_ipc_msg_payload_query_engine_hb + */ VPU_JSM_MSG_QUERY_ENGINE_HB = 0x1104, VPU_JSM_MSG_GET_POWER_LEVEL_COUNT = 0x1105, VPU_JSM_MSG_GET_POWER_LEVEL = 0x1106, @@ -477,6 +520,7 @@ enum vpu_ipc_msg_type { * aborted and removed from internal scheduling queues. All doorbells assigned * to the host_ssid are unregistered and any internal FW resources belonging to * the host_ssid are released. 
+ * @see vpu_ipc_msg_payload_ssid_release */ VPU_JSM_MSG_SSID_RELEASE = 0x110e, /** @@ -504,43 +548,78 @@ enum vpu_ipc_msg_type { * @see vpu_jsm_metric_streamer_start */ VPU_JSM_MSG_METRIC_STREAMER_INFO = 0x1112, - /** Control command: Priority band setup */ + /** + * Control command: Priority band setup + * @see vpu_ipc_msg_payload_hws_priority_band_setup + */ VPU_JSM_MSG_SET_PRIORITY_BAND_SETUP = 0x1113, - /** Control command: Create command queue */ + /** + * Control command: Create command queue + * @see vpu_ipc_msg_payload_hws_create_cmdq + */ VPU_JSM_MSG_CREATE_CMD_QUEUE = 0x1114, - /** Control command: Destroy command queue */ + /** + * Control command: Destroy command queue + * @see vpu_ipc_msg_payload_hws_destroy_cmdq + */ VPU_JSM_MSG_DESTROY_CMD_QUEUE = 0x1115, - /** Control command: Set context scheduling properties */ + /** + * Control command: Set context scheduling properties + * @see vpu_ipc_msg_payload_hws_set_context_sched_properties + */ VPU_JSM_MSG_SET_CONTEXT_SCHED_PROPERTIES = 0x1116, - /* + /** * Register a doorbell to notify VPU of new work. The doorbell may later be * deallocated or reassigned to another context. + * @see vpu_jsm_hws_register_db */ VPU_JSM_MSG_HWS_REGISTER_DB = 0x1117, - /** Control command: Log buffer setting */ + /** + * Control command: Log buffer setting + * @see vpu_ipc_msg_payload_hws_set_scheduling_log + */ VPU_JSM_MSG_HWS_SET_SCHEDULING_LOG = 0x1118, - /* Control command: Suspend command queue. */ + /** + * Control command: Suspend command queue. + * @see vpu_ipc_msg_payload_hws_suspend_cmdq + */ VPU_JSM_MSG_HWS_SUSPEND_CMDQ = 0x1119, - /* Control command: Resume command queue */ + /** + * Control command: Resume command queue + * @see vpu_ipc_msg_payload_hws_resume_cmdq + */ VPU_JSM_MSG_HWS_RESUME_CMDQ = 0x111a, - /* Control command: Resume engine after reset */ + /** + * Control command: Resume engine after reset + * @see vpu_ipc_msg_payload_hws_resume_engine + */ VPU_JSM_MSG_HWS_ENGINE_RESUME = 0x111b, - /* Control command: Enable survivability/DCT mode */ + /** + * Control command: Enable survivability/DCT mode + * @see vpu_ipc_msg_payload_pwr_dct_control + */ VPU_JSM_MSG_DCT_ENABLE = 0x111c, - /* Control command: Disable survivability/DCT mode */ + /** + * Control command: Disable survivability/DCT mode + * This command has no payload + */ VPU_JSM_MSG_DCT_DISABLE = 0x111d, /** * Dump VPU state. To be used for debug purposes only. - * NOTE: Please introduce new ASYNC commands before this one. * + * This command has no payload. + * NOTE: Please introduce new ASYNC commands before this one. */ VPU_JSM_MSG_STATE_DUMP = 0x11FF, - /* IPC Host -> Device, General commands */ + /** IPC Host -> Device, base id for general commands */ VPU_JSM_MSG_GENERAL_CMD = 0x1200, + /** Unsupported command */ VPU_JSM_MSG_BLOB_DEINIT_DEPRECATED = VPU_JSM_MSG_GENERAL_CMD, /** * Control dyndbg behavior by executing a dyndbg command; equivalent to - * Linux command: `echo '<dyndbg_cmd>' > <debugfs>/dynamic_debug/control`. 
+ * Linux command: + * @verbatim echo '<dyndbg_cmd>' > <debugfs>/dynamic_debug/control @endverbatim + * @see vpu_ipc_msg_payload_dyndbg_control */ VPU_JSM_MSG_DYNDBG_CONTROL = 0x1201, /** @@ -548,17 +627,35 @@ enum vpu_ipc_msg_type { */ VPU_JSM_MSG_PWR_D0I3_ENTER = 0x1202, - /* IPC Device -> Host, Job completion */ + /** + * IPC Device -> Host, Job completion + * @see struct vpu_ipc_msg_payload_job_done + */ VPU_JSM_MSG_JOB_DONE = 0x2100, - /* IPC Device -> Host, Fence signalled */ + /** + * IPC Device -> Host, Fence signalled + * @see vpu_ipc_msg_payload_native_fence_signalled + */ VPU_JSM_MSG_NATIVE_FENCE_SIGNALLED = 0x2101, /* IPC Device -> Host, Async command completion */ VPU_JSM_MSG_ASYNC_CMD_DONE = 0x2200, + /** + * IPC Device -> Host, engine reset complete + * @see vpu_ipc_msg_payload_engine_reset_done + */ VPU_JSM_MSG_ENGINE_RESET_DONE = VPU_JSM_MSG_ASYNC_CMD_DONE, + /** + * Preempt complete message + * @see vpu_ipc_msg_payload_engine_preempt_done + */ VPU_JSM_MSG_ENGINE_PREEMPT_DONE = 0x2201, VPU_JSM_MSG_REGISTER_DB_DONE = 0x2202, VPU_JSM_MSG_UNREGISTER_DB_DONE = 0x2203, + /** + * Response to query engine heartbeat. + * @see vpu_ipc_msg_payload_query_engine_hb_done + */ VPU_JSM_MSG_QUERY_ENGINE_HB_DONE = 0x2204, VPU_JSM_MSG_GET_POWER_LEVEL_COUNT_DONE = 0x2205, VPU_JSM_MSG_GET_POWER_LEVEL_DONE = 0x2206, @@ -575,7 +672,10 @@ enum vpu_ipc_msg_type { VPU_JSM_MSG_TRACE_GET_CAPABILITY_RSP = 0x220c, /** Response to VPU_JSM_MSG_TRACE_GET_NAME. */ VPU_JSM_MSG_TRACE_GET_NAME_RSP = 0x220d, - /** Response to VPU_JSM_MSG_SSID_RELEASE. */ + /** + * Response to VPU_JSM_MSG_SSID_RELEASE. + * @see vpu_ipc_msg_payload_ssid_release + */ VPU_JSM_MSG_SSID_RELEASE_DONE = 0x220e, /** * Response to VPU_JSM_MSG_METRIC_STREAMER_START. @@ -605,37 +705,71 @@ enum vpu_ipc_msg_type { /** * Asynchronous event sent from the VPU to the host either when the current * metric buffer is full or when the VPU has collected a multiple of - * @notify_sample_count samples as indicated through the start command - * (VPU_JSM_MSG_METRIC_STREAMER_START). Returns information about collected - * metric data. + * @ref vpu_jsm_metric_streamer_start::notify_sample_count samples as indicated + * through the start command (VPU_JSM_MSG_METRIC_STREAMER_START). Returns + * information about collected metric data. 
* @see vpu_jsm_metric_streamer_done */ VPU_JSM_MSG_METRIC_STREAMER_NOTIFICATION = 0x2213, - /** Response to control command: Priority band setup */ + /** + * Response to control command: Priority band setup + * @see vpu_ipc_msg_payload_hws_priority_band_setup + */ VPU_JSM_MSG_SET_PRIORITY_BAND_SETUP_RSP = 0x2214, - /** Response to control command: Create command queue */ + /** + * Response to control command: Create command queue + * @see vpu_ipc_msg_payload_hws_create_cmdq_rsp + */ VPU_JSM_MSG_CREATE_CMD_QUEUE_RSP = 0x2215, - /** Response to control command: Destroy command queue */ + /** + * Response to control command: Destroy command queue + * @see vpu_ipc_msg_payload_hws_destroy_cmdq + */ VPU_JSM_MSG_DESTROY_CMD_QUEUE_RSP = 0x2216, - /** Response to control command: Set context scheduling properties */ + /** + * Response to control command: Set context scheduling properties + * @see vpu_ipc_msg_payload_hws_set_context_sched_properties + */ VPU_JSM_MSG_SET_CONTEXT_SCHED_PROPERTIES_RSP = 0x2217, - /** Response to control command: Log buffer setting */ + /** + * Response to control command: Log buffer setting + * @see vpu_ipc_msg_payload_hws_set_scheduling_log + */ VPU_JSM_MSG_HWS_SET_SCHEDULING_LOG_RSP = 0x2218, - /* IPC Device -> Host, HWS notify index entry of log buffer written */ + /** + * IPC Device -> Host, HWS notify index entry of log buffer written + * @see vpu_ipc_msg_payload_hws_scheduling_log_notification + */ VPU_JSM_MSG_HWS_SCHEDULING_LOG_NOTIFICATION = 0x2219, - /* IPC Device -> Host, HWS completion of a context suspend request */ + /** + * IPC Device -> Host, HWS completion of a context suspend request + * @see vpu_ipc_msg_payload_hws_suspend_cmdq + */ VPU_JSM_MSG_HWS_SUSPEND_CMDQ_DONE = 0x221a, - /* Response to control command: Resume command queue */ + /** + * Response to control command: Resume command queue + * @see vpu_ipc_msg_payload_hws_resume_cmdq + */ VPU_JSM_MSG_HWS_RESUME_CMDQ_RSP = 0x221b, - /* Response to control command: Resume engine command response */ + /** + * Response to control command: Resume engine command response + * @see vpu_ipc_msg_payload_hws_resume_engine + */ VPU_JSM_MSG_HWS_RESUME_ENGINE_DONE = 0x221c, - /* Response to control command: Enable survivability/DCT mode */ + /** + * Response to control command: Enable survivability/DCT mode + * This command has no payload + */ VPU_JSM_MSG_DCT_ENABLE_DONE = 0x221d, - /* Response to control command: Disable survivability/DCT mode */ + /** + * Response to control command: Disable survivability/DCT mode + * This command has no payload + */ VPU_JSM_MSG_DCT_DISABLE_DONE = 0x221e, /** * Response to state dump control command. - * NOTE: Please introduce new ASYNC responses before this one. * + * This command has no payload. + * NOTE: Please introduce new ASYNC responses before this one. */ VPU_JSM_MSG_STATE_DUMP_RSP = 0x22FF, @@ -653,57 +787,66 @@ enum vpu_ipc_msg_type { enum vpu_ipc_msg_status { VPU_JSM_MSG_FREE, VPU_JSM_MSG_ALLOCATED }; -/* - * Host <-> LRT IPC message payload definitions +/** + * Engine reset request payload + * @see VPU_JSM_MSG_ENGINE_RESET */ struct vpu_ipc_msg_payload_engine_reset { - /* Engine to be reset. */ + /** Engine to be reset. */ u32 engine_idx; - /* Reserved */ + /** Reserved */ u32 reserved_0; }; +/** + * Engine preemption request struct + * @see VPU_JSM_MSG_ENGINE_PREEMPT + */ struct vpu_ipc_msg_payload_engine_preempt { - /* Engine to be preempted. */ + /** Engine to be preempted. */ u32 engine_idx; - /* ID of the preemption request. 
*/ + /** ID of the preemption request. */ u32 preempt_id; }; -/* - * @brief Register doorbell command structure. +/** + * Register doorbell command structure. * This structure supports doorbell registration for only OS scheduling. * @see VPU_JSM_MSG_REGISTER_DB */ struct vpu_ipc_msg_payload_register_db { - /* Index of the doorbell to register. */ + /** Index of the doorbell to register. */ u32 db_idx; - /* Reserved */ + /** Reserved */ u32 reserved_0; - /* Virtual address in Global GTT pointing to the start of job queue. */ + /** Virtual address in Global GTT pointing to the start of job queue. */ u64 jobq_base; - /* Size of the job queue in bytes. */ + /** Size of the job queue in bytes. */ u32 jobq_size; - /* Host sub-stream ID for the context assigned to the doorbell. */ + /** Host sub-stream ID for the context assigned to the doorbell. */ u32 host_ssid; }; /** - * @brief Unregister doorbell command structure. + * Unregister doorbell command structure. * Request structure to unregister a doorbell for both HW and OS scheduling. * @see VPU_JSM_MSG_UNREGISTER_DB */ struct vpu_ipc_msg_payload_unregister_db { - /* Index of the doorbell to unregister. */ + /** Index of the doorbell to unregister. */ u32 db_idx; - /* Reserved */ + /** Reserved */ u32 reserved_0; }; +/** + * Heartbeat request structure + * @see VPU_JSM_MSG_QUERY_ENGINE_HB + */ struct vpu_ipc_msg_payload_query_engine_hb { - /* Engine to return heartbeat value. */ + /** Engine to return heartbeat value. */ u32 engine_idx; - /* Reserved */ + /** Reserved */ u32 reserved_0; }; @@ -723,10 +866,14 @@ struct vpu_ipc_msg_payload_power_level { u32 reserved_0; }; +/** + * Structure for requesting ssid release + * @see VPU_JSM_MSG_SSID_RELEASE + */ struct vpu_ipc_msg_payload_ssid_release { - /* Host sub-stream ID for the context to be released. */ + /** Host sub-stream ID for the context to be released. */ u32 host_ssid; - /* Reserved */ + /** Reserved */ u32 reserved_0; }; @@ -752,7 +899,7 @@ struct vpu_jsm_metric_streamer_start { u64 sampling_rate; /** * If > 0 the VPU will send a VPU_JSM_MSG_METRIC_STREAMER_NOTIFICATION message - * after every @notify_sample_count samples is collected or dropped by the VPU. + * after every @ref notify_sample_count samples is collected or dropped by the VPU. * If set to UINT_MAX the VPU will only generate a notification when the metric * buffer is full. If set to 0 the VPU will never generate a notification. */ @@ -762,9 +909,9 @@ struct vpu_jsm_metric_streamer_start { * Address and size of the buffer where the VPU will write metric data. The * VPU writes all counters from enabled metric groups one after another. If * there is no space left to write data at the next sample period the VPU - * will switch to the next buffer (@see next_buffer_addr) and will optionally - * send a notification to the host driver if @notify_sample_count is non-zero. - * If @next_buffer_addr is NULL the VPU will stop collecting metric data. + * will switch to the next buffer (@ref next_buffer_addr) and will optionally + * send a notification to the host driver if @ref notify_sample_count is non-zero. + * If @ref next_buffer_addr is NULL the VPU will stop collecting metric data. */ u64 buffer_addr; u64 buffer_size; @@ -827,63 +974,80 @@ struct vpu_jsm_metric_streamer_update { u64 next_buffer_size; }; +/** + * Device -> host job completion message. + * @see VPU_JSM_MSG_JOB_DONE + */ struct vpu_ipc_msg_payload_job_done { - /* Engine to which the job was submitted. */ + /** Engine to which the job was submitted. 
*/ u32 engine_idx; - /* Index of the doorbell to which the job was submitted */ + /** Index of the doorbell to which the job was submitted */ u32 db_idx; - /* ID of the completed job */ + /** ID of the completed job */ u32 job_id; - /* Status of the completed job */ + /** Status of the completed job */ u32 job_status; - /* Host SSID */ + /** Host SSID */ u32 host_ssid; - /* Zero Padding */ + /** Zero Padding */ u32 reserved_0; - /* Command queue id */ + /** Command queue id */ u64 cmdq_id; }; -/* +/** * Notification message upon native fence signalling. * @see VPU_JSM_MSG_NATIVE_FENCE_SIGNALLED */ struct vpu_ipc_msg_payload_native_fence_signalled { - /* Engine ID. */ + /** Engine ID. */ u32 engine_idx; - /* Host SSID. */ + /** Host SSID. */ u32 host_ssid; - /* CMDQ ID */ + /** CMDQ ID */ u64 cmdq_id; - /* Fence object handle. */ + /** Fence object handle. */ u64 fence_handle; }; +/** + * vpu_ipc_msg_payload_engine_reset_done will contain an array of this structure + * which contains which queues caused reset if FW was able to detect any error. + * @see vpu_ipc_msg_payload_engine_reset_done + */ struct vpu_jsm_engine_reset_context { - /* Host SSID */ + /** Host SSID */ u32 host_ssid; - /* Zero Padding */ + /** Zero Padding */ u32 reserved_0; - /* Command queue id */ + /** Command queue id */ u64 cmdq_id; - /* See VPU_ENGINE_RESET_CONTEXT_* defines */ + /** See VPU_ENGINE_RESET_CONTEXT_* defines */ u64 flags; }; +/** + * Engine reset response. + * @see VPU_JSM_MSG_ENGINE_RESET_DONE + */ struct vpu_ipc_msg_payload_engine_reset_done { - /* Engine ordinal */ + /** Engine ordinal */ u32 engine_idx; - /* Number of impacted contexts */ + /** Number of impacted contexts */ u32 num_impacted_contexts; - /* Array of impacted command queue ids and their flags */ + /** Array of impacted command queue ids and their flags */ struct vpu_jsm_engine_reset_context impacted_contexts[VPU_MAX_ENGINE_RESET_IMPACTED_CONTEXTS]; }; +/** + * Preemption response struct + * @see VPU_JSM_MSG_ENGINE_PREEMPT_DONE + */ struct vpu_ipc_msg_payload_engine_preempt_done { - /* Engine preempted. */ + /** Engine preempted. */ u32 engine_idx; - /* ID of the preemption request. */ + /** ID of the preemption request. */ u32 preempt_id; }; @@ -912,12 +1076,16 @@ struct vpu_ipc_msg_payload_unregister_db_done { u32 reserved_0; }; +/** + * Structure for heartbeat response + * @see VPU_JSM_MSG_QUERY_ENGINE_HB_DONE + */ struct vpu_ipc_msg_payload_query_engine_hb_done { - /* Engine returning heartbeat value. */ + /** Engine returning heartbeat value. */ u32 engine_idx; - /* Reserved */ + /** Reserved */ u32 reserved_0; - /* Heartbeat value. */ + /** Heartbeat value. */ u64 heartbeat; }; @@ -937,7 +1105,10 @@ struct vpu_ipc_msg_payload_get_power_level_count_done { u8 power_limit[16]; }; -/* HWS priority band setup request / response */ +/** + * HWS priority band setup request / response + * @see VPU_JSM_MSG_SET_PRIORITY_BAND_SETUP + */ struct vpu_ipc_msg_payload_hws_priority_band_setup { /* * Grace period in 100ns units when preempting another priority band for @@ -964,15 +1135,23 @@ struct vpu_ipc_msg_payload_hws_priority_band_setup { * TDR timeout value in milliseconds. Default value of 0 meaning no timeout. */ u32 tdr_timeout; + /* Non-interactive queue timeout for no progress of heartbeat in milliseconds. + * Default value of 0 meaning no timeout. + */ + u32 non_interactive_no_progress_timeout; + /* + * Non-interactive queue upper limit timeout value in milliseconds. Default + * value of 0 meaning no timeout. 
+ */ + u32 non_interactive_timeout; }; -/* +/** * @brief HWS create command queue request. * Host will create a command queue via this command. * Note: Cmdq group is a handle of an object which * may contain one or more command queues. * @see VPU_JSM_MSG_CREATE_CMD_QUEUE - * @see VPU_JSM_MSG_CREATE_CMD_QUEUE_RSP */ struct vpu_ipc_msg_payload_hws_create_cmdq { /* Process id */ @@ -993,66 +1172,73 @@ struct vpu_ipc_msg_payload_hws_create_cmdq { u32 reserved_0; }; -/* - * @brief HWS create command queue response. - * @see VPU_JSM_MSG_CREATE_CMD_QUEUE +/** + * HWS create command queue response. * @see VPU_JSM_MSG_CREATE_CMD_QUEUE_RSP */ struct vpu_ipc_msg_payload_hws_create_cmdq_rsp { - /* Process id */ + /** Process id */ u64 process_id; - /* Host SSID */ + /** Host SSID */ u32 host_ssid; - /* Engine for which queue is being created */ + /** Engine for which queue is being created */ u32 engine_idx; - /* Command queue group */ + /** Command queue group */ u64 cmdq_group; - /* Command queue id */ + /** Command queue id */ u64 cmdq_id; }; -/* HWS destroy command queue request / response */ +/** + * HWS destroy command queue request / response + * @see VPU_JSM_MSG_DESTROY_CMD_QUEUE + * @see VPU_JSM_MSG_DESTROY_CMD_QUEUE_RSP + */ struct vpu_ipc_msg_payload_hws_destroy_cmdq { - /* Host SSID */ + /** Host SSID */ u32 host_ssid; - /* Zero Padding */ + /** Zero Padding */ u32 reserved; - /* Command queue id */ + /** Command queue id */ u64 cmdq_id; }; -/* HWS set context scheduling properties request / response */ +/** + * HWS set context scheduling properties request / response + * @see VPU_JSM_MSG_SET_CONTEXT_SCHED_PROPERTIES + * @see VPU_JSM_MSG_SET_CONTEXT_SCHED_PROPERTIES_RSP + */ struct vpu_ipc_msg_payload_hws_set_context_sched_properties { - /* Host SSID */ + /** Host SSID */ u32 host_ssid; - /* Zero Padding */ + /** Zero Padding */ u32 reserved_0; - /* Command queue id */ + /** Command queue id */ u64 cmdq_id; - /* + /** * Priority band to assign to work of this context. * Available priority bands: @see enum vpu_job_scheduling_priority_band */ u32 priority_band; - /* Inside realtime band assigns a further priority */ + /** Inside realtime band assigns a further priority */ u32 realtime_priority_level; - /* Priority relative to other contexts in the same process */ + /** Priority relative to other contexts in the same process */ s32 in_process_priority; - /* Zero padding / Reserved */ + /** Zero padding / Reserved */ u32 reserved_1; - /* + /** * Context quantum relative to other contexts of same priority in the same process * Minimum value supported by NPU is 1ms (10000 in 100ns units). */ u64 context_quantum; - /* Grace period when preempting context of the same priority within the same process */ + /** Grace period when preempting context of the same priority within the same process */ u64 grace_period_same_priority; - /* Grace period when preempting context of a lower priority within the same process */ + /** Grace period when preempting context of a lower priority within the same process */ u64 grace_period_lower_priority; }; -/* - * @brief Register doorbell command structure. +/** + * Register doorbell command structure. * This structure supports doorbell registration for both HW and OS scheduling. * Note: Queue base and size are added here so that the same structure can be used for * OS scheduling and HW scheduling. 
For OS scheduling, cmdq_id will be ignored @@ -1061,27 +1247,27 @@ struct vpu_ipc_msg_payload_hws_set_context_sched_properties { * @see VPU_JSM_MSG_HWS_REGISTER_DB */ struct vpu_jsm_hws_register_db { - /* Index of the doorbell to register. */ + /** Index of the doorbell to register. */ u32 db_id; - /* Host sub-stream ID for the context assigned to the doorbell. */ + /** Host sub-stream ID for the context assigned to the doorbell. */ u32 host_ssid; - /* ID of the command queue associated with the doorbell. */ + /** ID of the command queue associated with the doorbell. */ u64 cmdq_id; - /* Virtual address pointing to the start of command queue. */ + /** Virtual address pointing to the start of command queue. */ u64 cmdq_base; - /* Size of the command queue in bytes. */ + /** Size of the command queue in bytes. */ u64 cmdq_size; }; -/* - * @brief Structure to set another buffer to be used for scheduling-related logging. +/** + * Structure to set another buffer to be used for scheduling-related logging. * The size of the logging buffer and the number of entries is defined as part of the * buffer itself as described next. * The log buffer received from the host is made up of; - * - header: 32 bytes in size, as shown in 'struct vpu_hws_log_buffer_header'. + * - header: 32 bytes in size, as shown in @ref vpu_hws_log_buffer_header. * The header contains the number of log entries in the buffer. * - log entry: 0 to n-1, each log entry is 32 bytes in size, as shown in - * 'struct vpu_hws_log_buffer_entry'. + * @ref vpu_hws_log_buffer_entry. * The entry contains the VPU timestamp, operation type and data. * The host should provide the notify index value of log buffer to VPU. This is a * value defined within the log buffer and when written to will generate the @@ -1095,30 +1281,30 @@ struct vpu_jsm_hws_register_db { * @see VPU_JSM_MSG_HWS_SCHEDULING_LOG_NOTIFICATION */ struct vpu_ipc_msg_payload_hws_set_scheduling_log { - /* Engine ordinal */ + /** Engine ordinal */ u32 engine_idx; - /* Host SSID */ + /** Host SSID */ u32 host_ssid; - /* + /** * VPU log buffer virtual address. * Set to 0 to disable logging for this engine. */ u64 vpu_log_buffer_va; - /* + /** * Notify index of log buffer. VPU_JSM_MSG_HWS_SCHEDULING_LOG_NOTIFICATION * is generated when an event log is written to this index. */ u64 notify_index; - /* + /** * Field is now deprecated, will be removed when KMD is updated to support removal */ u32 enable_extra_events; - /* Zero Padding */ + /** Zero Padding */ u32 reserved_0; }; -/* - * @brief The scheduling log notification is generated by VPU when it writes +/** + * The scheduling log notification is generated by VPU when it writes * an event into the log buffer at the notify_index. VPU notifies host with * VPU_JSM_MSG_HWS_SCHEDULING_LOG_NOTIFICATION. This is an asynchronous * message from VPU to host. @@ -1126,14 +1312,14 @@ struct vpu_ipc_msg_payload_hws_set_scheduling_log { * @see VPU_JSM_MSG_HWS_SET_SCHEDULING_LOG */ struct vpu_ipc_msg_payload_hws_scheduling_log_notification { - /* Engine ordinal */ + /** Engine ordinal */ u32 engine_idx; - /* Zero Padding */ + /** Zero Padding */ u32 reserved_0; }; -/* - * @brief HWS suspend command queue request and done structure. +/** + * HWS suspend command queue request and done structure. 
* Host will request the suspend of contexts and VPU will; * - Suspend all work on this context * - Preempt any running work @@ -1152,21 +1338,21 @@ struct vpu_ipc_msg_payload_hws_scheduling_log_notification { * @see VPU_JSM_MSG_HWS_SUSPEND_CMDQ_DONE */ struct vpu_ipc_msg_payload_hws_suspend_cmdq { - /* Host SSID */ + /** Host SSID */ u32 host_ssid; - /* Zero Padding */ + /** Zero Padding */ u32 reserved_0; - /* Command queue id */ + /** Command queue id */ u64 cmdq_id; - /* + /** * Suspend fence value - reported by the VPU suspend context * completed once suspend is complete. */ u64 suspend_fence_value; }; -/* - * @brief HWS Resume command queue request / response structure. +/** + * HWS Resume command queue request / response structure. * Host will request the resume of a context; * - VPU will resume all work on this context * - Scheduler will allow this context to be scheduled @@ -1174,25 +1360,25 @@ struct vpu_ipc_msg_payload_hws_suspend_cmdq { * @see VPU_JSM_MSG_HWS_RESUME_CMDQ_RSP */ struct vpu_ipc_msg_payload_hws_resume_cmdq { - /* Host SSID */ + /** Host SSID */ u32 host_ssid; - /* Zero Padding */ + /** Zero Padding */ u32 reserved_0; - /* Command queue id */ + /** Command queue id */ u64 cmdq_id; }; -/* - * @brief HWS Resume engine request / response structure. - * After a HWS engine reset, all scheduling is stopped on VPU until a engine resume. +/** + * HWS Resume engine request / response structure. + * After a HWS engine reset, all scheduling is stopped on VPU until an engine resume. * Host shall send this command to resume scheduling of any valid queue. - * @see VPU_JSM_MSG_HWS_RESUME_ENGINE + * @see VPU_JSM_MSG_HWS_ENGINE_RESUME * @see VPU_JSM_MSG_HWS_RESUME_ENGINE_DONE */ struct vpu_ipc_msg_payload_hws_resume_engine { - /* Engine to be resumed */ + /** Engine to be resumed */ u32 engine_idx; - /* Reserved */ + /** Reserved */ u32 reserved_0; }; @@ -1326,7 +1512,7 @@ struct vpu_jsm_metric_streamer_done { /** * Metric group description placed in the metric buffer after successful completion * of the VPU_JSM_MSG_METRIC_STREAMER_INFO command. This is followed by one or more - * @vpu_jsm_metric_counter_descriptor records. + * @ref vpu_jsm_metric_counter_descriptor records. * @see VPU_JSM_MSG_METRIC_STREAMER_INFO */ struct vpu_jsm_metric_group_descriptor { @@ -1413,29 +1599,24 @@ struct vpu_jsm_metric_counter_descriptor { }; /** - * Payload for VPU_JSM_MSG_DYNDBG_CONTROL requests. + * Payload for @ref VPU_JSM_MSG_DYNDBG_CONTROL requests. * - * VPU_JSM_MSG_DYNDBG_CONTROL are used to control the VPU FW Dynamic Debug - * feature, which allows developers to selectively enable / disable MVLOG_DEBUG - * messages. This is equivalent to the Dynamic Debug functionality provided by - * Linux - * (https://www.kernel.org/doc/html/latest/admin-guide/dynamic-debug-howto.html) - * The host can control Dynamic Debug behavior by sending dyndbg commands, which - * have the same syntax as Linux - * dyndbg commands. + * VPU_JSM_MSG_DYNDBG_CONTROL requests are used to control the VPU FW dynamic debug + * feature, which allows developers to selectively enable/disable code to obtain + * additional FW information. This is equivalent to the dynamic debug functionality + * provided by Linux. The host can control dynamic debug behavior by sending dyndbg + * commands, using the same syntax as for Linux dynamic debug commands. * - * NOTE: in order for MVLOG_DEBUG messages to be actually printed, the host - * still has to set the logging level to MVLOG_DEBUG, using the - * VPU_JSM_MSG_TRACE_SET_CONFIG command. 
+ * @see https://www.kernel.org/doc/html/latest/admin-guide/dynamic-debug-howto.html. * - * The host can see the current dynamic debug configuration by executing a - * special 'show' command. The dyndbg configuration will be printed to the - * configured logging destination using MVLOG_INFO logging level. + * NOTE: + * As the dynamic debug feature uses MVLOG messages to provide information, the host + * must first set the logging level to MVLOG_DEBUG, using the @ref VPU_JSM_MSG_TRACE_SET_CONFIG + * command. */ struct vpu_ipc_msg_payload_dyndbg_control { /** - * Dyndbg command (same format as Linux dyndbg); must be a NULL-terminated - * string. + * Dyndbg command to be executed. */ char dyndbg_cmd[VPU_DYNDBG_CMD_MAX_LEN]; }; @@ -1456,7 +1637,7 @@ struct vpu_ipc_msg_payload_pwr_d0i3_enter { }; /** - * Payload for VPU_JSM_MSG_DCT_ENABLE message. + * Payload for @ref VPU_JSM_MSG_DCT_ENABLE message. * * Default values for DCT active/inactive times are 5.3ms and 30ms respectively, * corresponding to a 85% duty cycle. This payload allows the host to tune these @@ -1513,28 +1694,28 @@ union vpu_ipc_msg_payload { struct vpu_ipc_msg_payload_pwr_dct_control pwr_dct_control; }; -/* - * Host <-> LRT IPC message base structure. +/** + * Host <-> NPU IPC message base structure. * * NOTE: All instances of this object must be aligned on a 64B boundary * to allow proper handling of VPU cache operations. */ struct vpu_jsm_msg { - /* Reserved */ + /** Reserved */ u64 reserved_0; - /* Message type, see vpu_ipc_msg_type enum. */ + /** Message type, see @ref vpu_ipc_msg_type. */ u32 type; - /* Buffer status, see vpu_ipc_msg_status enum. */ + /** Buffer status, see @ref vpu_ipc_msg_status. */ u32 status; - /* + /** * Request ID, provided by the host in a request message and passed * back by VPU in the response message. */ u32 request_id; - /* Request return code set by the VPU, see VPU_JSM_STATUS_* defines. */ + /** Request return code set by the VPU, see VPU_JSM_STATUS_* defines. */ u32 result; u64 reserved_1; - /* Message payload depending on message type, see vpu_ipc_msg_payload union. */ + /** Message payload depending on message type, see vpu_ipc_msg_payload union. */ union vpu_ipc_msg_payload payload; }; diff --git a/drivers/accel/qaic/Kconfig b/drivers/accel/qaic/Kconfig index 5e405a19c157..116e42d152ca 100644 --- a/drivers/accel/qaic/Kconfig +++ b/drivers/accel/qaic/Kconfig @@ -9,6 +9,7 @@ config DRM_ACCEL_QAIC depends on PCI && HAS_IOMEM depends on MHI_BUS select CRC32 + select WANT_DEV_COREDUMP help Enables driver for Qualcomm's Cloud AI accelerator PCIe cards that are designed to accelerate Deep Learning inference workloads. 
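The "select WANT_DEV_COREDUMP" added above is there so the new qaic SSR path (qaic_ssr.c, introduced later in this series) can hand subsystem crashdumps to the devcoredump framework. Below is a minimal sketch of that generic pattern, not the driver's actual SSR code: the helper name and the way the snapshot is produced are assumptions; only dev_coredumpv() and its ownership rules are real kernel API. The framework takes ownership of a vmalloc'd buffer, exposes it under /sys/class/devcoredump, and vfree()s it once user space reads or discards the dump.

#include <linux/device.h>
#include <linux/devcoredump.h>
#include <linux/string.h>
#include <linux/vmalloc.h>

/* Illustrative only: hand a copy of a crash snapshot to devcoredump. */
static void example_report_crashdump(struct device *dev,
				     const void *snapshot, size_t len)
{
	void *dump;

	/* devcoredump requires a vmalloc'd buffer; it frees it with vfree() */
	dump = vmalloc(len);
	if (!dump)
		return;

	memcpy(dump, snapshot, len);

	/* Ownership of 'dump' passes to the devcoredump core here */
	dev_coredumpv(dev, dump, len, GFP_KERNEL);
}

Selecting WANT_DEV_COREDUMP rather than depending on DEV_COREDUMP directly keeps the feature user-disableable; when CONFIG_DEV_COREDUMP is not set, dev_coredumpv() degrades to an inline stub that simply frees the buffer, so callers like the sketch above need no #ifdef.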
diff --git a/drivers/accel/qaic/Makefile b/drivers/accel/qaic/Makefile index 1106b876f737..71f727b74da3 100644 --- a/drivers/accel/qaic/Makefile +++ b/drivers/accel/qaic/Makefile @@ -11,6 +11,8 @@ qaic-y := \ qaic_data.o \ qaic_drv.o \ qaic_ras.o \ + qaic_ssr.o \ + qaic_sysfs.o \ qaic_timesync.o \ sahara.o diff --git a/drivers/accel/qaic/qaic.h b/drivers/accel/qaic/qaic.h index 820d133236dd..fa7a8155658c 100644 --- a/drivers/accel/qaic/qaic.h +++ b/drivers/accel/qaic/qaic.h @@ -21,6 +21,7 @@ #define QAIC_DBC_BASE SZ_128K #define QAIC_DBC_SIZE SZ_4K +#define QAIC_SSR_DBC_SENTINEL U32_MAX /* No ongoing SSR sentinel */ #define QAIC_NO_PARTITION -1 @@ -47,6 +48,22 @@ enum __packed dev_states { QAIC_ONLINE, }; +enum dbc_states { + /* DBC is free and can be activated */ + DBC_STATE_IDLE, + /* DBC is activated and a workload is running on device */ + DBC_STATE_ASSIGNED, + /* Sub-system associated with this workload has crashed and it will shutdown soon */ + DBC_STATE_BEFORE_SHUTDOWN, + /* Sub-system associated with this workload has crashed and it has shutdown */ + DBC_STATE_AFTER_SHUTDOWN, + /* Sub-system associated with this workload is shutdown and it will be powered up soon */ + DBC_STATE_BEFORE_POWER_UP, + /* Sub-system associated with this workload is now powered up */ + DBC_STATE_AFTER_POWER_UP, + DBC_STATE_MAX, +}; + extern bool datapath_polling; struct qaic_user { @@ -114,6 +131,8 @@ struct dma_bridge_chan { unsigned int irq; /* Polling work item to simulate interrupts */ struct work_struct poll_work; + /* Represents various states of this DBC from enum dbc_states */ + unsigned int state; }; struct qaic_device { @@ -161,6 +180,8 @@ struct qaic_device { struct mhi_device *qts_ch; /* Work queue for tasks related to MHI "QAIC_TIMESYNC" channel */ struct workqueue_struct *qts_wq; + /* MHI "QAIC_TIMESYNC_PERIODIC" channel device */ + struct mhi_device *mqts_ch; /* Head of list of page allocated by MHI bootlog device */ struct list_head bootlog; /* MHI bootlog channel device */ @@ -177,6 +198,14 @@ struct qaic_device { unsigned int ue_count; /* Un-correctable non-fatal error count */ unsigned int ue_nf_count; + /* MHI SSR channel device */ + struct mhi_device *ssr_ch; + /* Work queue for tasks related to MHI SSR device */ + struct workqueue_struct *ssr_wq; + /* Buffer to collect SSR crashdump via SSR MHI channel */ + void *ssr_mhi_buf; + /* DBC which is under SSR. 
Sentinel U32_MAX would mean that no SSR in progress */ + u32 ssr_dbc; }; struct qaic_drm_device { @@ -195,6 +224,8 @@ struct qaic_drm_device { struct list_head users; /* Synchronizes access to users list */ struct mutex users_mutex; + /* Pointer to array of DBC sysfs attributes */ + void *sysfs_attrs; }; struct qaic_bo { @@ -317,6 +348,13 @@ int qaic_partial_execute_bo_ioctl(struct drm_device *dev, void *data, struct drm int qaic_wait_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int qaic_perf_stats_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int qaic_detach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); -void irq_polling_work(struct work_struct *work); +void qaic_irq_polling_work(struct work_struct *work); +void qaic_dbc_enter_ssr(struct qaic_device *qdev, u32 dbc_id); +void qaic_dbc_exit_ssr(struct qaic_device *qdev); + +/* qaic_sysfs.c */ +int qaic_sysfs_init(struct qaic_drm_device *qddev); +void qaic_sysfs_remove(struct qaic_drm_device *qddev); +void set_dbc_state(struct qaic_device *qdev, u32 dbc_id, unsigned int state); #endif /* _QAIC_H_ */ diff --git a/drivers/accel/qaic/qaic_control.c b/drivers/accel/qaic/qaic_control.c index b86a8e48e731..428d8f65bff3 100644 --- a/drivers/accel/qaic/qaic_control.c +++ b/drivers/accel/qaic/qaic_control.c @@ -17,6 +17,7 @@ #include <linux/overflow.h> #include <linux/pci.h> #include <linux/scatterlist.h> +#include <linux/sched/signal.h> #include <linux/types.h> #include <linux/uaccess.h> #include <linux/workqueue.h> @@ -30,7 +31,7 @@ #define MANAGE_MAGIC_NUMBER ((__force __le32)0x43494151) /* "QAIC" in little endian */ #define QAIC_DBC_Q_GAP SZ_256 #define QAIC_DBC_Q_BUF_ALIGN SZ_4K -#define QAIC_MANAGE_EXT_MSG_LENGTH SZ_64K /* Max DMA message length */ +#define QAIC_MANAGE_WIRE_MSG_LENGTH SZ_64K /* Max DMA message length */ #define QAIC_WRAPPER_MAX_SIZE SZ_4K #define QAIC_MHI_RETRY_WAIT_MS 100 #define QAIC_MHI_RETRY_MAX 20 @@ -309,6 +310,7 @@ static void save_dbc_buf(struct qaic_device *qdev, struct ioctl_resources *resou enable_dbc(qdev, dbc_id, usr); qdev->dbc[dbc_id].in_use = true; resources->buf = NULL; + set_dbc_state(qdev, dbc_id, DBC_STATE_ASSIGNED); } } @@ -367,7 +369,7 @@ static int encode_passthrough(struct qaic_device *qdev, void *trans, struct wrap if (in_trans->hdr.len % 8 != 0) return -EINVAL; - if (size_add(msg_hdr_len, in_trans->hdr.len) > QAIC_MANAGE_EXT_MSG_LENGTH) + if (size_add(msg_hdr_len, in_trans->hdr.len) > QAIC_MANAGE_WIRE_MSG_LENGTH) return -ENOSPC; trans_wrapper = add_wrapper(wrappers, @@ -495,7 +497,7 @@ static int encode_addr_size_pairs(struct dma_xfer *xfer, struct wrapper_list *wr nents = sgt->nents; nents_dma = nents; - *size = QAIC_MANAGE_EXT_MSG_LENGTH - msg_hdr_len - sizeof(**out_trans); + *size = QAIC_MANAGE_WIRE_MSG_LENGTH - msg_hdr_len - sizeof(**out_trans); for_each_sgtable_dma_sg(sgt, sg, i) { *size -= sizeof(*asp); /* Save 1K for possible follow-up transactions. */ @@ -576,7 +578,7 @@ static int encode_dma(struct qaic_device *qdev, void *trans, struct wrapper_list /* There should be enough space to hold at least one ASP entry. 
*/ if (size_add(msg_hdr_len, sizeof(*out_trans) + sizeof(struct wire_addr_size_pair)) > - QAIC_MANAGE_EXT_MSG_LENGTH) + QAIC_MANAGE_WIRE_MSG_LENGTH) return -ENOMEM; xfer = kmalloc(sizeof(*xfer), GFP_KERNEL); @@ -645,7 +647,7 @@ static int encode_activate(struct qaic_device *qdev, void *trans, struct wrapper msg = &wrapper->msg; msg_hdr_len = le32_to_cpu(msg->hdr.len); - if (size_add(msg_hdr_len, sizeof(*out_trans)) > QAIC_MANAGE_MAX_MSG_LENGTH) + if (size_add(msg_hdr_len, sizeof(*out_trans)) > QAIC_MANAGE_WIRE_MSG_LENGTH) return -ENOSPC; if (!in_trans->queue_size) @@ -655,8 +657,9 @@ static int encode_activate(struct qaic_device *qdev, void *trans, struct wrapper return -EINVAL; nelem = in_trans->queue_size; - size = (get_dbc_req_elem_size() + get_dbc_rsp_elem_size()) * nelem; - if (size / nelem != get_dbc_req_elem_size() + get_dbc_rsp_elem_size()) + if (check_mul_overflow((u32)(get_dbc_req_elem_size() + get_dbc_rsp_elem_size()), + nelem, + &size)) return -EINVAL; if (size + QAIC_DBC_Q_GAP + QAIC_DBC_Q_BUF_ALIGN < size) @@ -729,7 +732,7 @@ static int encode_status(struct qaic_device *qdev, void *trans, struct wrapper_l msg = &wrapper->msg; msg_hdr_len = le32_to_cpu(msg->hdr.len); - if (size_add(msg_hdr_len, in_trans->hdr.len) > QAIC_MANAGE_MAX_MSG_LENGTH) + if (size_add(msg_hdr_len, in_trans->hdr.len) > QAIC_MANAGE_WIRE_MSG_LENGTH) return -ENOSPC; trans_wrapper = add_wrapper(wrappers, sizeof(*trans_wrapper)); @@ -810,7 +813,7 @@ static int encode_message(struct qaic_device *qdev, struct manage_msg *user_msg, } if (ret) - break; + goto out; } if (user_len != user_msg->len) @@ -921,6 +924,7 @@ static int decode_deactivate(struct qaic_device *qdev, void *trans, u32 *msg_len } release_dbc(qdev, dbc_id); + set_dbc_state(qdev, dbc_id, DBC_STATE_IDLE); *msg_len += sizeof(*in_trans); return 0; @@ -1052,7 +1056,7 @@ static void *msg_xfer(struct qaic_device *qdev, struct wrapper_list *wrappers, u init_completion(&elem.xfer_done); if (likely(!qdev->cntl_lost_buf)) { /* - * The max size of request to device is QAIC_MANAGE_EXT_MSG_LENGTH. + * The max size of request to device is QAIC_MANAGE_WIRE_MSG_LENGTH. * The max size of response from device is QAIC_MANAGE_MAX_MSG_LENGTH. */ out_buf = kmalloc(QAIC_MANAGE_MAX_MSG_LENGTH, GFP_KERNEL); @@ -1079,7 +1083,6 @@ static void *msg_xfer(struct qaic_device *qdev, struct wrapper_list *wrappers, u list_for_each_entry(w, &wrappers->list, list) { kref_get(&w->ref_count); - retry_count = 0; ret = mhi_queue_buf(qdev->cntl_ch, DMA_TO_DEVICE, &w->msg, w->len, list_is_last(&w->list, &wrappers->list) ? 
MHI_EOT : MHI_CHAIN); if (ret) { diff --git a/drivers/accel/qaic/qaic_data.c b/drivers/accel/qaic/qaic_data.c index c4f117edb266..60cb4d65d48e 100644 --- a/drivers/accel/qaic/qaic_data.c +++ b/drivers/accel/qaic/qaic_data.c @@ -18,6 +18,7 @@ #include <linux/scatterlist.h> #include <linux/spinlock.h> #include <linux/srcu.h> +#include <linux/string.h> #include <linux/types.h> #include <linux/uaccess.h> #include <linux/wait.h> @@ -165,7 +166,7 @@ static void free_slice(struct kref *kref) drm_gem_object_put(&slice->bo->base); sg_free_table(slice->sgt); kfree(slice->sgt); - kfree(slice->reqs); + kvfree(slice->reqs); kfree(slice); } @@ -404,7 +405,7 @@ static int qaic_map_one_slice(struct qaic_device *qdev, struct qaic_bo *bo, goto free_sgt; } - slice->reqs = kcalloc(sgt->nents, sizeof(*slice->reqs), GFP_KERNEL); + slice->reqs = kvcalloc(sgt->nents, sizeof(*slice->reqs), GFP_KERNEL); if (!slice->reqs) { ret = -ENOMEM; goto free_slice; @@ -430,7 +431,7 @@ static int qaic_map_one_slice(struct qaic_device *qdev, struct qaic_bo *bo, return 0; free_req: - kfree(slice->reqs); + kvfree(slice->reqs); free_slice: kfree(slice); free_sgt: @@ -643,8 +644,36 @@ static void qaic_free_object(struct drm_gem_object *obj) kfree(bo); } +static struct sg_table *qaic_get_sg_table(struct drm_gem_object *obj) +{ + struct qaic_bo *bo = to_qaic_bo(obj); + struct scatterlist *sg, *sg_in; + struct sg_table *sgt, *sgt_in; + int i; + + sgt_in = bo->sgt; + + sgt = kmalloc(sizeof(*sgt), GFP_KERNEL); + if (!sgt) + return ERR_PTR(-ENOMEM); + + if (sg_alloc_table(sgt, sgt_in->orig_nents, GFP_KERNEL)) { + kfree(sgt); + return ERR_PTR(-ENOMEM); + } + + sg = sgt->sgl; + for_each_sgtable_sg(sgt_in, sg_in, i) { + memcpy(sg, sg_in, sizeof(*sg)); + sg = sg_next(sg); + } + + return sgt; +} + static const struct drm_gem_object_funcs qaic_gem_funcs = { .free = qaic_free_object, + .get_sg_table = qaic_get_sg_table, .print_info = qaic_gem_print_info, .mmap = qaic_gem_object_mmap, .vm_ops = &drm_vm_ops, @@ -953,8 +982,9 @@ int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi if (args->hdr.count == 0) return -EINVAL; - arg_size = args->hdr.count * sizeof(*slice_ent); - if (arg_size / args->hdr.count != sizeof(*slice_ent)) + if (check_mul_overflow((unsigned long)args->hdr.count, + (unsigned long)sizeof(*slice_ent), + &arg_size)) return -EINVAL; if (!(args->hdr.dir == DMA_TO_DEVICE || args->hdr.dir == DMA_FROM_DEVICE)) @@ -984,18 +1014,12 @@ int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi user_data = u64_to_user_ptr(args->data); - slice_ent = kzalloc(arg_size, GFP_KERNEL); - if (!slice_ent) { - ret = -EINVAL; + slice_ent = memdup_user(user_data, arg_size); + if (IS_ERR(slice_ent)) { + ret = PTR_ERR(slice_ent); goto unlock_dev_srcu; } - ret = copy_from_user(slice_ent, user_data, arg_size); - if (ret) { - ret = -EFAULT; - goto free_slice_ent; - } - obj = drm_gem_object_lookup(file_priv, args->hdr.handle); if (!obj) { ret = -ENOENT; @@ -1023,6 +1047,11 @@ int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi goto unlock_ch_srcu; } + if (dbc->id == qdev->ssr_dbc) { + ret = -EPIPE; + goto unlock_ch_srcu; + } + ret = qaic_prepare_bo(qdev, bo, &args->hdr); if (ret) goto unlock_ch_srcu; @@ -1300,8 +1329,6 @@ static int __qaic_execute_bo_ioctl(struct drm_device *dev, void *data, struct dr int usr_rcu_id, qdev_rcu_id; struct qaic_device *qdev; struct qaic_user *usr; - u8 __user *user_data; - unsigned long n; u64 received_ts; u32 queue_level; u64 submit_ts; @@ -1314,20 
+1341,12 @@ static int __qaic_execute_bo_ioctl(struct drm_device *dev, void *data, struct dr received_ts = ktime_get_ns(); size = is_partial ? sizeof(struct qaic_partial_execute_entry) : sizeof(*exec); - n = (unsigned long)size * args->hdr.count; - if (args->hdr.count == 0 || n / args->hdr.count != size) + if (args->hdr.count == 0) return -EINVAL; - user_data = u64_to_user_ptr(args->data); - - exec = kcalloc(args->hdr.count, size, GFP_KERNEL); - if (!exec) - return -ENOMEM; - - if (copy_from_user(exec, user_data, n)) { - ret = -EFAULT; - goto free_exec; - } + exec = memdup_array_user(u64_to_user_ptr(args->data), args->hdr.count, size); + if (IS_ERR(exec)) + return PTR_ERR(exec); usr = file_priv->driver_priv; usr_rcu_id = srcu_read_lock(&usr->qddev_lock); @@ -1356,6 +1375,11 @@ static int __qaic_execute_bo_ioctl(struct drm_device *dev, void *data, struct dr goto release_ch_rcu; } + if (dbc->id == qdev->ssr_dbc) { + ret = -EPIPE; + goto release_ch_rcu; + } + ret = mutex_lock_interruptible(&dbc->req_lock); if (ret) goto release_ch_rcu; @@ -1396,7 +1420,6 @@ unlock_dev_srcu: srcu_read_unlock(&qdev->dev_lock, qdev_rcu_id); unlock_usr_srcu: srcu_read_unlock(&usr->qddev_lock, usr_rcu_id); -free_exec: kfree(exec); return ret; } @@ -1491,7 +1514,7 @@ irqreturn_t dbc_irq_handler(int irq, void *data) return IRQ_WAKE_THREAD; } -void irq_polling_work(struct work_struct *work) +void qaic_irq_polling_work(struct work_struct *work) { struct dma_bridge_chan *dbc = container_of(work, struct dma_bridge_chan, poll_work); unsigned long flags; @@ -1709,6 +1732,11 @@ int qaic_wait_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file goto unlock_ch_srcu; } + if (dbc->id == qdev->ssr_dbc) { + ret = -EPIPE; + goto unlock_ch_srcu; + } + obj = drm_gem_object_lookup(file_priv, args->handle); if (!obj) { ret = -ENOENT; @@ -1729,6 +1757,9 @@ int qaic_wait_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file if (!dbc->usr) ret = -EPERM; + if (dbc->id == qdev->ssr_dbc) + ret = -EPIPE; + put_obj: drm_gem_object_put(obj); unlock_ch_srcu: @@ -1749,7 +1780,8 @@ int qaic_perf_stats_bo_ioctl(struct drm_device *dev, void *data, struct drm_file struct qaic_device *qdev; struct qaic_user *usr; struct qaic_bo *bo; - int ret, i; + int ret = 0; + int i; usr = file_priv->driver_priv; usr_rcu_id = srcu_read_lock(&usr->qddev_lock); @@ -1770,18 +1802,12 @@ int qaic_perf_stats_bo_ioctl(struct drm_device *dev, void *data, struct drm_file goto unlock_dev_srcu; } - ent = kcalloc(args->hdr.count, sizeof(*ent), GFP_KERNEL); - if (!ent) { - ret = -EINVAL; + ent = memdup_array_user(u64_to_user_ptr(args->data), args->hdr.count, sizeof(*ent)); + if (IS_ERR(ent)) { + ret = PTR_ERR(ent); goto unlock_dev_srcu; } - ret = copy_from_user(ent, u64_to_user_ptr(args->data), args->hdr.count * sizeof(*ent)); - if (ret) { - ret = -EFAULT; - goto free_ent; - } - for (i = 0; i < args->hdr.count; i++) { obj = drm_gem_object_lookup(file_priv, ent[i].handle); if (!obj) { @@ -1789,6 +1815,16 @@ int qaic_perf_stats_bo_ioctl(struct drm_device *dev, void *data, struct drm_file goto free_ent; } bo = to_qaic_bo(obj); + if (!bo->sliced) { + drm_gem_object_put(obj); + ret = -EINVAL; + goto free_ent; + } + if (bo->dbc->id != args->hdr.dbc_id) { + drm_gem_object_put(obj); + ret = -EINVAL; + goto free_ent; + } /* * perf stats ioctl is called before wait ioctl is complete then * the latency information is invalid. 
@@ -1927,6 +1963,17 @@ static void empty_xfer_list(struct qaic_device *qdev, struct dma_bridge_chan *db spin_unlock_irqrestore(&dbc->xfer_lock, flags); } +static void sync_empty_xfer_list(struct qaic_device *qdev, struct dma_bridge_chan *dbc) +{ + empty_xfer_list(qdev, dbc); + synchronize_srcu(&dbc->ch_lock); + /* + * Threads holding channel lock, may add more elements in the xfer_list. + * Flush out these elements from xfer_list. + */ + empty_xfer_list(qdev, dbc); +} + int disable_dbc(struct qaic_device *qdev, u32 dbc_id, struct qaic_user *usr) { if (!qdev->dbc[dbc_id].usr || qdev->dbc[dbc_id].usr->handle != usr->handle) @@ -1941,7 +1988,7 @@ int disable_dbc(struct qaic_device *qdev, u32 dbc_id, struct qaic_user *usr) * enable_dbc - Enable the DBC. DBCs are disabled by removing the context of * user. Add user context back to DBC to enable it. This function trusts the * DBC ID passed and expects the DBC to be disabled. - * @qdev: Qranium device handle + * @qdev: qaic device handle * @dbc_id: ID of the DBC * @usr: User context */ @@ -1955,13 +2002,7 @@ void wakeup_dbc(struct qaic_device *qdev, u32 dbc_id) struct dma_bridge_chan *dbc = &qdev->dbc[dbc_id]; dbc->usr = NULL; - empty_xfer_list(qdev, dbc); - synchronize_srcu(&dbc->ch_lock); - /* - * Threads holding channel lock, may add more elements in the xfer_list. - * Flush out these elements from xfer_list. - */ - empty_xfer_list(qdev, dbc); + sync_empty_xfer_list(qdev, dbc); } void release_dbc(struct qaic_device *qdev, u32 dbc_id) @@ -2002,3 +2043,30 @@ void qaic_data_get_fifo_info(struct dma_bridge_chan *dbc, u32 *head, u32 *tail) *head = readl(dbc->dbc_base + REQHP_OFF); *tail = readl(dbc->dbc_base + REQTP_OFF); } + +/* + * qaic_dbc_enter_ssr - Prepare to enter in sub system reset(SSR) for given DBC ID. + * @qdev: qaic device handle + * @dbc_id: ID of the DBC which will enter SSR + * + * The device will automatically deactivate the workload as not + * all errors can be silently recovered. The user will be + * notified and will need to decide the required recovery + * action to take. + */ +void qaic_dbc_enter_ssr(struct qaic_device *qdev, u32 dbc_id) +{ + qdev->ssr_dbc = dbc_id; + release_dbc(qdev, dbc_id); +} + +/* + * qaic_dbc_exit_ssr - Prepare to exit from sub system reset(SSR) for given DBC ID. + * @qdev: qaic device handle + * + * The DBC returns to an operational state and begins accepting work after exiting SSR. 
+ */ +void qaic_dbc_exit_ssr(struct qaic_device *qdev) +{ + qdev->ssr_dbc = QAIC_SSR_DBC_SENTINEL; +} diff --git a/drivers/accel/qaic/qaic_drv.c b/drivers/accel/qaic/qaic_drv.c index e162f4b8a262..4c70bd949d53 100644 --- a/drivers/accel/qaic/qaic_drv.c +++ b/drivers/accel/qaic/qaic_drv.c @@ -30,6 +30,7 @@ #include "qaic.h" #include "qaic_debugfs.h" #include "qaic_ras.h" +#include "qaic_ssr.h" #include "qaic_timesync.h" #include "sahara.h" @@ -270,6 +271,13 @@ static int qaic_create_drm_device(struct qaic_device *qdev, s32 partition_id) return ret; } + ret = qaic_sysfs_init(qddev); + if (ret) { + drm_dev_unregister(drm); + pci_dbg(qdev->pdev, "qaic_sysfs_init failed %d\n", ret); + return ret; + } + qaic_debugfs_init(qddev); return ret; @@ -281,6 +289,7 @@ static void qaic_destroy_drm_device(struct qaic_device *qdev, s32 partition_id) struct drm_device *drm = to_drm(qddev); struct qaic_user *usr; + qaic_sysfs_remove(qddev); drm_dev_unregister(drm); qddev->partition_id = 0; /* @@ -382,6 +391,7 @@ void qaic_dev_reset_clean_local_state(struct qaic_device *qdev) qaic_notify_reset(qdev); /* start tearing things down */ + qaic_clean_up_ssr(qdev); for (i = 0; i < qdev->num_dbc; ++i) release_dbc(qdev, i); } @@ -431,11 +441,18 @@ static struct qaic_device *create_qdev(struct pci_dev *pdev, qdev->qts_wq = qaicm_wq_init(drm, "qaic_ts"); if (IS_ERR(qdev->qts_wq)) return NULL; + qdev->ssr_wq = qaicm_wq_init(drm, "qaic_ssr"); + if (IS_ERR(qdev->ssr_wq)) + return NULL; ret = qaicm_srcu_init(drm, &qdev->dev_lock); if (ret) return NULL; + ret = qaic_ssr_init(qdev, drm); + if (ret) + pci_info(pdev, "QAIC SSR crashdump collection not supported.\n"); + qdev->qddev = qddev; qdev->pdev = pdev; qddev->qdev = qdev; @@ -545,7 +562,7 @@ static int init_msi(struct qaic_device *qdev, struct pci_dev *pdev) qdev->dbc[i].irq = pci_irq_vector(pdev, qdev->single_msi ? 0 : i + 1); if (!qdev->single_msi) disable_irq_nosync(qdev->dbc[i].irq); - INIT_WORK(&qdev->dbc[i].poll_work, irq_polling_work); + INIT_WORK(&qdev->dbc[i].poll_work, qaic_irq_polling_work); } } @@ -660,6 +677,92 @@ static const struct pci_error_handlers qaic_pci_err_handler = { .reset_done = qaic_pci_reset_done, }; +static bool qaic_is_under_reset(struct qaic_device *qdev) +{ + int rcu_id; + bool ret; + + rcu_id = srcu_read_lock(&qdev->dev_lock); + ret = qdev->dev_state != QAIC_ONLINE; + srcu_read_unlock(&qdev->dev_lock, rcu_id); + return ret; +} + +static bool qaic_data_path_busy(struct qaic_device *qdev) +{ + bool ret = false; + int dev_rcu_id; + int i; + + dev_rcu_id = srcu_read_lock(&qdev->dev_lock); + if (qdev->dev_state != QAIC_ONLINE) { + srcu_read_unlock(&qdev->dev_lock, dev_rcu_id); + return false; + } + for (i = 0; i < qdev->num_dbc; i++) { + struct dma_bridge_chan *dbc = &qdev->dbc[i]; + unsigned long flags; + int ch_rcu_id; + + ch_rcu_id = srcu_read_lock(&dbc->ch_lock); + if (!dbc->usr || !dbc->in_use) { + srcu_read_unlock(&dbc->ch_lock, ch_rcu_id); + continue; + } + spin_lock_irqsave(&dbc->xfer_lock, flags); + ret = !list_empty(&dbc->xfer_list); + spin_unlock_irqrestore(&dbc->xfer_lock, flags); + srcu_read_unlock(&dbc->ch_lock, ch_rcu_id); + if (ret) + break; + } + srcu_read_unlock(&qdev->dev_lock, dev_rcu_id); + return ret; +} + +static int qaic_pm_suspend(struct device *dev) +{ + struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(dev)); + + dev_dbg(dev, "Suspending..\n"); + if (qaic_data_path_busy(qdev)) { + dev_dbg(dev, "Device's datapath is busy. 
Aborting suspend..\n"); + return -EBUSY; + } + if (qaic_is_under_reset(qdev)) { + dev_dbg(dev, "Device is under reset. Aborting suspend..\n"); + return -EBUSY; + } + qaic_mqts_ch_stop_timer(qdev->mqts_ch); + qaic_pci_reset_prepare(qdev->pdev); + pci_save_state(qdev->pdev); + pci_disable_device(qdev->pdev); + pci_set_power_state(qdev->pdev, PCI_D3hot); + return 0; +} + +static int qaic_pm_resume(struct device *dev) +{ + struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(dev)); + int ret; + + dev_dbg(dev, "Resuming..\n"); + pci_set_power_state(qdev->pdev, PCI_D0); + pci_restore_state(qdev->pdev); + ret = pci_enable_device(qdev->pdev); + if (ret) { + dev_err(dev, "pci_enable_device failed on resume %d\n", ret); + return ret; + } + pci_set_master(qdev->pdev); + qaic_pci_reset_done(qdev->pdev); + return 0; +} + +static const struct dev_pm_ops qaic_pm_ops = { + SYSTEM_SLEEP_PM_OPS(qaic_pm_suspend, qaic_pm_resume) +}; + static struct pci_driver qaic_pci_driver = { .name = QAIC_NAME, .id_table = qaic_ids, @@ -667,6 +770,9 @@ static struct pci_driver qaic_pci_driver = { .remove = qaic_pci_remove, .shutdown = qaic_pci_shutdown, .err_handler = &qaic_pci_err_handler, + .driver = { + .pm = pm_sleep_ptr(&qaic_pm_ops), + }, }; static int __init qaic_init(void) @@ -702,9 +808,16 @@ static int __init qaic_init(void) ret = qaic_ras_register(); if (ret) pr_debug("qaic: qaic_ras_register failed %d\n", ret); + ret = qaic_ssr_register(); + if (ret) { + pr_debug("qaic: qaic_ssr_register failed %d\n", ret); + goto free_bootlog; + } return 0; +free_bootlog: + qaic_bootlog_unregister(); free_mhi: mhi_driver_unregister(&qaic_mhi_driver); free_pci: @@ -730,6 +843,7 @@ static void __exit qaic_exit(void) * reinitializing the link_up state after the cleanup is done. */ link_up = true; + qaic_ssr_unregister(); qaic_ras_unregister(); qaic_bootlog_unregister(); qaic_timesync_deinit(); diff --git a/drivers/accel/qaic/qaic_ras.c b/drivers/accel/qaic/qaic_ras.c index 914ffc4a9970..f1d52a710136 100644 --- a/drivers/accel/qaic/qaic_ras.c +++ b/drivers/accel/qaic/qaic_ras.c @@ -514,21 +514,21 @@ static ssize_t ce_count_show(struct device *dev, struct device_attribute *attr, { struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(dev)); - return snprintf(buf, PAGE_SIZE, "%d\n", qdev->ce_count); + return sysfs_emit(buf, "%d\n", qdev->ce_count); } static ssize_t ue_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(dev)); - return snprintf(buf, PAGE_SIZE, "%d\n", qdev->ue_count); + return sysfs_emit(buf, "%d\n", qdev->ue_count); } static ssize_t ue_nonfatal_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(dev)); - return snprintf(buf, PAGE_SIZE, "%d\n", qdev->ue_nf_count); + return sysfs_emit(buf, "%d\n", qdev->ue_nf_count); } static DEVICE_ATTR_RO(ce_count); diff --git a/drivers/accel/qaic/qaic_ssr.c b/drivers/accel/qaic/qaic_ssr.c new file mode 100644 index 000000000000..9b662d690371 --- /dev/null +++ b/drivers/accel/qaic/qaic_ssr.c @@ -0,0 +1,815 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* Copyright (c) 2020-2021, The Linux Foundation. All rights reserved. */ +/* Copyright (c) 2021-2024 Qualcomm Innovation Center, Inc. All rights reserved. 
*/ + +#include <asm/byteorder.h> +#include <drm/drm_file.h> +#include <drm/drm_managed.h> +#include <linux/devcoredump.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/mhi.h> +#include <linux/workqueue.h> + +#include "qaic.h" +#include "qaic_ssr.h" + +#define SSR_RESP_MSG_SZ 32 +#define SSR_MHI_BUF_SIZE SZ_64K +#define SSR_MEM_READ_DATA_SIZE ((u64)SSR_MHI_BUF_SIZE - sizeof(struct ssr_crashdump)) +#define SSR_MEM_READ_CHUNK_SIZE ((u64)SSR_MEM_READ_DATA_SIZE - sizeof(struct ssr_memory_read_rsp)) + +#define DEBUG_TRANSFER_INFO BIT(0) +#define DEBUG_TRANSFER_INFO_RSP BIT(1) +#define MEMORY_READ BIT(2) +#define MEMORY_READ_RSP BIT(3) +#define DEBUG_TRANSFER_DONE BIT(4) +#define DEBUG_TRANSFER_DONE_RSP BIT(5) +#define SSR_EVENT BIT(8) +#define SSR_EVENT_RSP BIT(9) + +#define SSR_EVENT_NACK BIT(0) +#define BEFORE_SHUTDOWN BIT(1) +#define AFTER_SHUTDOWN BIT(2) +#define BEFORE_POWER_UP BIT(3) +#define AFTER_POWER_UP BIT(4) + +struct debug_info_table { + /* Save preferences. Default is mandatory */ + u64 save_perf; + /* Base address of the debug region */ + u64 mem_base; + /* Size of debug region in bytes */ + u64 len; + /* Description */ + char desc[20]; + /* Filename of debug region */ + char filename[20]; +}; + +struct _ssr_hdr { + __le32 cmd; + __le32 len; + __le32 dbc_id; +}; + +struct ssr_hdr { + u32 cmd; + u32 len; + u32 dbc_id; +}; + +struct ssr_debug_transfer_info { + struct ssr_hdr hdr; + u32 resv; + u64 tbl_addr; + u64 tbl_len; +} __packed; + +struct ssr_debug_transfer_info_rsp { + struct _ssr_hdr hdr; + __le32 ret; +} __packed; + +struct ssr_memory_read { + struct _ssr_hdr hdr; + __le32 resv; + __le64 addr; + __le64 len; +} __packed; + +struct ssr_memory_read_rsp { + struct _ssr_hdr hdr; + __le32 resv; + u8 data[]; +} __packed; + +struct ssr_debug_transfer_done { + struct _ssr_hdr hdr; + __le32 resv; +} __packed; + +struct ssr_debug_transfer_done_rsp { + struct _ssr_hdr hdr; + __le32 ret; +} __packed; + +struct ssr_event { + struct ssr_hdr hdr; + u32 event; +} __packed; + +struct ssr_event_rsp { + struct _ssr_hdr hdr; + __le32 event; +} __packed; + +struct ssr_resp { + /* Work struct to schedule work coming on QAIC_SSR channel */ + struct work_struct work; + /* Root struct of device, used to access device resources */ + struct qaic_device *qdev; + /* Buffer used by MHI for transfer requests */ + u8 data[] __aligned(8); +}; + +/* SSR crashdump book keeping structure */ +struct ssr_dump_info { + /* DBC associated with this SSR crashdump */ + struct dma_bridge_chan *dbc; + /* + * It will be used when we complete the crashdump download and switch + * to waiting on SSR events + */ + struct ssr_resp *resp; + /* MEMORY READ request MHI buffer.*/ + struct ssr_memory_read *read_buf_req; + /* TRUE: ->read_buf_req is queued for MHI transaction. 
FALSE: Otherwise */ + bool read_buf_req_queued; + /* Address of table in host */ + void *tbl_addr; + /* Total size of table */ + u64 tbl_len; + /* Offset of table(->tbl_addr) where the new chunk will be dumped */ + u64 tbl_off; + /* Address of table in device/target */ + u64 tbl_addr_dev; + /* Ptr to the entire dump */ + void *dump_addr; + /* Entire crashdump size */ + u64 dump_sz; + /* Offset of crashdump(->dump_addr) where the new chunk will be dumped */ + u64 dump_off; + /* Points to the table entry we are currently downloading */ + struct debug_info_table *tbl_ent; + /* Offset in the current table entry(->tbl_ent) for the next chunk */ + u64 tbl_ent_off; +}; + +struct ssr_crashdump { + /* + * Points to a bookkeeping struct maintained by the MHI SSR device while + * downloading an SSR crashdump. It is NULL when a crashdump download is + * not in progress. + */ + struct ssr_dump_info *dump_info; + /* Work struct to schedule work coming on QAIC_SSR channel */ + struct work_struct work; + /* Root struct of device, used to access device resources */ + struct qaic_device *qdev; + /* Buffer used by MHI for transfer requests */ + u8 data[]; +}; + +#define QAIC_SSR_DUMP_V1_MAGIC 0x1234567890abcdef +#define QAIC_SSR_DUMP_V1_VER 1 +struct dump_file_meta { + u64 magic; + u64 version; + u64 size; /* Total size of the entire dump */ + u64 tbl_len; /* Length of the table in bytes */ +}; + +/* + * Layout of crashdump + * +------------------------------------------+ + * | Crashdump Meta structure | + * | type: struct dump_file_meta | + * +------------------------------------------+ + * | Crashdump Table | + * | type: array of struct debug_info_table | + * | | + * | | + * | | + * +------------------------------------------+ + * | Crashdump | + * | | + * | | + * | | + * | | + * | | + * +------------------------------------------+ + */ + +static void free_ssr_dump_info(struct ssr_crashdump *ssr_crash) +{ + struct ssr_dump_info *dump_info = ssr_crash->dump_info; + + ssr_crash->dump_info = NULL; + if (!dump_info) + return; + if (!dump_info->read_buf_req_queued) + kfree(dump_info->read_buf_req); + vfree(dump_info->tbl_addr); + vfree(dump_info->dump_addr); + kfree(dump_info); +} + +void qaic_clean_up_ssr(struct qaic_device *qdev) +{ + struct ssr_crashdump *ssr_crash = qdev->ssr_mhi_buf; + + if (!ssr_crash) + return; + + qaic_dbc_exit_ssr(qdev); + free_ssr_dump_info(ssr_crash); +} + +static int alloc_dump(struct ssr_dump_info *dump_info) +{ + struct debug_info_table *tbl_ent = dump_info->tbl_addr; + struct dump_file_meta *dump_meta; + u64 tbl_sz_lp = 0; + u64 dump_size = 0; + + while (tbl_sz_lp < dump_info->tbl_len) { + le64_to_cpus(&tbl_ent->save_perf); + le64_to_cpus(&tbl_ent->mem_base); + le64_to_cpus(&tbl_ent->len); + + if (tbl_ent->len == 0) + return -EINVAL; + + dump_size += tbl_ent->len; + tbl_ent++; + tbl_sz_lp += sizeof(*tbl_ent); + } + + dump_info->dump_sz = dump_size + dump_info->tbl_len + sizeof(*dump_meta); + dump_info->dump_addr = vzalloc(dump_info->dump_sz); + if (!dump_info->dump_addr) + return -ENOMEM; + + /* Copy crashdump meta and table */ + dump_meta = dump_info->dump_addr; + dump_meta->magic = QAIC_SSR_DUMP_V1_MAGIC; + dump_meta->version = QAIC_SSR_DUMP_V1_VER; + dump_meta->size = dump_info->dump_sz; + dump_meta->tbl_len = dump_info->tbl_len; + memcpy(dump_info->dump_addr + sizeof(*dump_meta), dump_info->tbl_addr, dump_info->tbl_len); + /* Offset by crashdump meta and table (copied above) */ + dump_info->dump_off = dump_info->tbl_len + sizeof(*dump_meta); + + return 0; +} + +static int
send_xfer_done(struct qaic_device *qdev, void *resp, u32 dbc_id) +{ + struct ssr_debug_transfer_done *xfer_done; + int ret; + + xfer_done = kmalloc(sizeof(*xfer_done), GFP_KERNEL); + if (!xfer_done) { + ret = -ENOMEM; + goto out; + } + + ret = mhi_queue_buf(qdev->ssr_ch, DMA_FROM_DEVICE, resp, SSR_RESP_MSG_SZ, MHI_EOT); + if (ret) + goto free_xfer_done; + + xfer_done->hdr.cmd = cpu_to_le32(DEBUG_TRANSFER_DONE); + xfer_done->hdr.len = cpu_to_le32(sizeof(*xfer_done)); + xfer_done->hdr.dbc_id = cpu_to_le32(dbc_id); + + ret = mhi_queue_buf(qdev->ssr_ch, DMA_TO_DEVICE, xfer_done, sizeof(*xfer_done), MHI_EOT); + if (ret) + goto free_xfer_done; + + return 0; + +free_xfer_done: + kfree(xfer_done); +out: + return ret; +} + +static int mem_read_req(struct qaic_device *qdev, u64 dest_addr, u64 dest_len) +{ + struct ssr_crashdump *ssr_crash = qdev->ssr_mhi_buf; + struct ssr_memory_read *read_buf_req; + struct ssr_dump_info *dump_info; + int ret; + + dump_info = ssr_crash->dump_info; + ret = mhi_queue_buf(qdev->ssr_ch, DMA_FROM_DEVICE, ssr_crash->data, SSR_MEM_READ_DATA_SIZE, + MHI_EOT); + if (ret) + goto out; + + read_buf_req = dump_info->read_buf_req; + read_buf_req->hdr.cmd = cpu_to_le32(MEMORY_READ); + read_buf_req->hdr.len = cpu_to_le32(sizeof(*read_buf_req)); + read_buf_req->hdr.dbc_id = cpu_to_le32(qdev->ssr_dbc); + read_buf_req->addr = cpu_to_le64(dest_addr); + read_buf_req->len = cpu_to_le64(dest_len); + + ret = mhi_queue_buf(qdev->ssr_ch, DMA_TO_DEVICE, read_buf_req, sizeof(*read_buf_req), + MHI_EOT); + if (!ret) + dump_info->read_buf_req_queued = true; + +out: + return ret; +} + +static int ssr_copy_table(struct ssr_dump_info *dump_info, void *data, u64 len) +{ + if (len > dump_info->tbl_len - dump_info->tbl_off) + return -EINVAL; + + memcpy(dump_info->tbl_addr + dump_info->tbl_off, data, len); + dump_info->tbl_off += len; + + /* Entire table has been downloaded, alloc dump memory */ + if (dump_info->tbl_off == dump_info->tbl_len) { + dump_info->tbl_ent = dump_info->tbl_addr; + return alloc_dump(dump_info); + } + + return 0; +} + +static int ssr_copy_dump(struct ssr_dump_info *dump_info, void *data, u64 len) +{ + struct debug_info_table *tbl_ent; + + tbl_ent = dump_info->tbl_ent; + + if (len > tbl_ent->len - dump_info->tbl_ent_off) + return -EINVAL; + + memcpy(dump_info->dump_addr + dump_info->dump_off, data, len); + dump_info->dump_off += len; + dump_info->tbl_ent_off += len; + + /* + * Current segment (a entry in table) of the crashdump is complete, + * move to next one + */ + if (tbl_ent->len == dump_info->tbl_ent_off) { + dump_info->tbl_ent++; + dump_info->tbl_ent_off = 0; + } + + return 0; +} + +static void ssr_dump_worker(struct work_struct *work) +{ + struct ssr_crashdump *ssr_crash = container_of(work, struct ssr_crashdump, work); + struct qaic_device *qdev = ssr_crash->qdev; + struct ssr_memory_read_rsp *mem_rd_resp; + struct debug_info_table *tbl_ent; + struct ssr_dump_info *dump_info; + u64 dest_addr, dest_len; + struct _ssr_hdr *_hdr; + struct ssr_hdr hdr; + u64 data_len; + int ret; + + mem_rd_resp = (struct ssr_memory_read_rsp *)ssr_crash->data; + _hdr = &mem_rd_resp->hdr; + hdr.cmd = le32_to_cpu(_hdr->cmd); + hdr.len = le32_to_cpu(_hdr->len); + hdr.dbc_id = le32_to_cpu(_hdr->dbc_id); + + if (hdr.dbc_id != qdev->ssr_dbc) + goto reset_device; + + dump_info = ssr_crash->dump_info; + if (!dump_info) + goto reset_device; + + if (hdr.cmd != MEMORY_READ_RSP) + goto free_dump_info; + + if (hdr.len > SSR_MEM_READ_DATA_SIZE) + goto free_dump_info; + + data_len = hdr.len - 
sizeof(*mem_rd_resp); + + if (dump_info->tbl_off < dump_info->tbl_len) /* Chunk belongs to table */ + ret = ssr_copy_table(dump_info, mem_rd_resp->data, data_len); + else /* Chunk belongs to crashdump */ + ret = ssr_copy_dump(dump_info, mem_rd_resp->data, data_len); + + if (ret) + goto free_dump_info; + + if (dump_info->tbl_off < dump_info->tbl_len) { + /* Continue downloading table */ + dest_addr = dump_info->tbl_addr_dev + dump_info->tbl_off; + dest_len = min(SSR_MEM_READ_CHUNK_SIZE, dump_info->tbl_len - dump_info->tbl_off); + ret = mem_read_req(qdev, dest_addr, dest_len); + } else if (dump_info->dump_off < dump_info->dump_sz) { + /* Continue downloading crashdump */ + tbl_ent = dump_info->tbl_ent; + dest_addr = tbl_ent->mem_base + dump_info->tbl_ent_off; + dest_len = min(SSR_MEM_READ_CHUNK_SIZE, tbl_ent->len - dump_info->tbl_ent_off); + ret = mem_read_req(qdev, dest_addr, dest_len); + } else { + /* Crashdump download complete */ + ret = send_xfer_done(qdev, dump_info->resp->data, hdr.dbc_id); + } + + /* Most likely an MHI xfer has failed */ + if (ret) + goto free_dump_info; + + return; + +free_dump_info: + /* Free the allocated memory */ + free_ssr_dump_info(ssr_crash); +reset_device: + /* + * After a subsystem crash on the device, crashdump collection begins. If + * something goes wrong while collecting the crashdump, instead of handling + * the error we simply reset the device, since the best effort has already + * been made. + */ + mhi_soc_reset(qdev->mhi_cntrl); +} + +static struct ssr_dump_info *alloc_dump_info(struct qaic_device *qdev, + struct ssr_debug_transfer_info *debug_info) +{ + struct ssr_dump_info *dump_info; + int ret; + + le64_to_cpus(&debug_info->tbl_len); + le64_to_cpus(&debug_info->tbl_addr); + + if (debug_info->tbl_len == 0 || + debug_info->tbl_len % sizeof(struct debug_info_table) != 0) { + ret = -EINVAL; + goto out; + } + + /* Allocate the SSR crashdump bookkeeping structure */ + dump_info = kzalloc(sizeof(*dump_info), GFP_KERNEL); + if (!dump_info) { + ret = -ENOMEM; + goto out; + } + + /* Buffer used to send MEMORY READ request to device via MHI */ + dump_info->read_buf_req = kzalloc(sizeof(*dump_info->read_buf_req), GFP_KERNEL); + if (!dump_info->read_buf_req) { + ret = -ENOMEM; + goto free_dump_info; + } + + /* Crashdump meta table buffer */ + dump_info->tbl_addr = vzalloc(debug_info->tbl_len); + if (!dump_info->tbl_addr) { + ret = -ENOMEM; + goto free_read_buf_req; + } + + dump_info->tbl_addr_dev = debug_info->tbl_addr; + dump_info->tbl_len = debug_info->tbl_len; + + return dump_info; + +free_read_buf_req: + kfree(dump_info->read_buf_req); +free_dump_info: + kfree(dump_info); +out: + return ERR_PTR(ret); +} + +static int dbg_xfer_info_rsp(struct qaic_device *qdev, struct dma_bridge_chan *dbc, + struct ssr_debug_transfer_info *debug_info) +{ + struct ssr_debug_transfer_info_rsp *debug_rsp; + struct ssr_crashdump *ssr_crash = NULL; + int ret = 0, ret2; + + debug_rsp = kmalloc(sizeof(*debug_rsp), GFP_KERNEL); + if (!debug_rsp) + return -ENOMEM; + + if (!qdev->ssr_mhi_buf) { + ret = -ENOMEM; + goto send_rsp; + } + + if (dbc->state != DBC_STATE_BEFORE_POWER_UP) { + ret = -EINVAL; + goto send_rsp; + } + + ssr_crash = qdev->ssr_mhi_buf; + ssr_crash->dump_info = alloc_dump_info(qdev, debug_info); + if (IS_ERR(ssr_crash->dump_info)) { + ret = PTR_ERR(ssr_crash->dump_info); + ssr_crash->dump_info = NULL; + } + +send_rsp: + debug_rsp->hdr.cmd = cpu_to_le32(DEBUG_TRANSFER_INFO_RSP); + debug_rsp->hdr.len = cpu_to_le32(sizeof(*debug_rsp)); + debug_rsp->hdr.dbc_id = cpu_to_le32(dbc->id); + /* + *
0 = Return an ACK confirming the host is ready to download crashdump + * 1 = Return an NACK confirming the host is not ready to download crashdump + */ + debug_rsp->ret = cpu_to_le32(ret ? 1 : 0); + + ret2 = mhi_queue_buf(qdev->ssr_ch, DMA_TO_DEVICE, debug_rsp, sizeof(*debug_rsp), MHI_EOT); + if (ret2) { + free_ssr_dump_info(ssr_crash); + kfree(debug_rsp); + return ret2; + } + + return ret; +} + +static void dbg_xfer_done_rsp(struct qaic_device *qdev, struct dma_bridge_chan *dbc, + struct ssr_debug_transfer_done_rsp *xfer_rsp) +{ + struct ssr_crashdump *ssr_crash = qdev->ssr_mhi_buf; + u32 status = le32_to_cpu(xfer_rsp->ret); + struct device *dev = &qdev->pdev->dev; + struct ssr_dump_info *dump_info; + + dump_info = ssr_crash->dump_info; + if (!dump_info) + return; + + if (status) { + free_ssr_dump_info(ssr_crash); + return; + } + + dev_coredumpv(dev, dump_info->dump_addr, dump_info->dump_sz, GFP_KERNEL); + /* dev_coredumpv will free dump_info->dump_addr */ + dump_info->dump_addr = NULL; + free_ssr_dump_info(ssr_crash); +} + +static void ssr_worker(struct work_struct *work) +{ + struct ssr_resp *resp = container_of(work, struct ssr_resp, work); + struct ssr_hdr *hdr = (struct ssr_hdr *)resp->data; + struct ssr_dump_info *dump_info = NULL; + struct qaic_device *qdev = resp->qdev; + struct ssr_crashdump *ssr_crash; + struct ssr_event_rsp *event_rsp; + struct dma_bridge_chan *dbc; + struct ssr_event *event; + u32 ssr_event_ack; + int ret; + + le32_to_cpus(&hdr->cmd); + le32_to_cpus(&hdr->len); + le32_to_cpus(&hdr->dbc_id); + + if (hdr->len > SSR_RESP_MSG_SZ) + goto out; + + if (hdr->dbc_id >= qdev->num_dbc) + goto out; + + dbc = &qdev->dbc[hdr->dbc_id]; + + switch (hdr->cmd) { + case DEBUG_TRANSFER_INFO: + ret = dbg_xfer_info_rsp(qdev, dbc, (struct ssr_debug_transfer_info *)resp->data); + if (ret) + break; + + ssr_crash = qdev->ssr_mhi_buf; + dump_info = ssr_crash->dump_info; + dump_info->dbc = dbc; + dump_info->resp = resp; + + /* Start by downloading debug table */ + ret = mem_read_req(qdev, dump_info->tbl_addr_dev, + min(dump_info->tbl_len, SSR_MEM_READ_CHUNK_SIZE)); + if (ret) { + free_ssr_dump_info(ssr_crash); + break; + } + + /* + * Till now everything went fine, which means that we will be + * collecting crashdump chunk by chunk. Do not queue a response + * buffer for SSR cmds till the crashdump is complete. + */ + return; + case SSR_EVENT: + event = (struct ssr_event *)hdr; + le32_to_cpus(&event->event); + ssr_event_ack = event->event; + ssr_crash = qdev->ssr_mhi_buf; + + switch (event->event) { + case BEFORE_SHUTDOWN: + set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_BEFORE_SHUTDOWN); + qaic_dbc_enter_ssr(qdev, hdr->dbc_id); + break; + case AFTER_SHUTDOWN: + set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_AFTER_SHUTDOWN); + break; + case BEFORE_POWER_UP: + set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_BEFORE_POWER_UP); + break; + case AFTER_POWER_UP: + /* + * If dump info is a non NULL value it means that we + * have received this SSR event while downloading a + * crashdump for this DBC is still in progress. 
NACK + * the SSR event + */ + if (ssr_crash && ssr_crash->dump_info) { + free_ssr_dump_info(ssr_crash); + ssr_event_ack = SSR_EVENT_NACK; + break; + } + + set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_AFTER_POWER_UP); + break; + default: + break; + } + + event_rsp = kmalloc(sizeof(*event_rsp), GFP_KERNEL); + if (!event_rsp) + break; + + event_rsp->hdr.cmd = cpu_to_le32(SSR_EVENT_RSP); + event_rsp->hdr.len = cpu_to_le32(sizeof(*event_rsp)); + event_rsp->hdr.dbc_id = cpu_to_le32(hdr->dbc_id); + event_rsp->event = cpu_to_le32(ssr_event_ack); + + ret = mhi_queue_buf(qdev->ssr_ch, DMA_TO_DEVICE, event_rsp, sizeof(*event_rsp), + MHI_EOT); + if (ret) + kfree(event_rsp); + + if (event->event == AFTER_POWER_UP && ssr_event_ack != SSR_EVENT_NACK) { + qaic_dbc_exit_ssr(qdev); + set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_IDLE); + } + + break; + case DEBUG_TRANSFER_DONE_RSP: + dbg_xfer_done_rsp(qdev, dbc, (struct ssr_debug_transfer_done_rsp *)hdr); + break; + default: + break; + } + +out: + ret = mhi_queue_buf(qdev->ssr_ch, DMA_FROM_DEVICE, resp->data, SSR_RESP_MSG_SZ, MHI_EOT); + if (ret) + kfree(resp); +} + +static int qaic_ssr_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_id *id) +{ + struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(mhi_dev->mhi_cntrl->cntrl_dev)); + struct ssr_resp *resp; + int ret; + + ret = mhi_prepare_for_transfer(mhi_dev); + if (ret) + return ret; + + resp = kzalloc(sizeof(*resp) + SSR_RESP_MSG_SZ, GFP_KERNEL); + if (!resp) { + mhi_unprepare_from_transfer(mhi_dev); + return -ENOMEM; + } + + resp->qdev = qdev; + INIT_WORK(&resp->work, ssr_worker); + + ret = mhi_queue_buf(mhi_dev, DMA_FROM_DEVICE, resp->data, SSR_RESP_MSG_SZ, MHI_EOT); + if (ret) { + kfree(resp); + mhi_unprepare_from_transfer(mhi_dev); + return ret; + } + + dev_set_drvdata(&mhi_dev->dev, qdev); + qdev->ssr_ch = mhi_dev; + + return 0; +} + +static void qaic_ssr_mhi_remove(struct mhi_device *mhi_dev) +{ + struct qaic_device *qdev; + + qdev = dev_get_drvdata(&mhi_dev->dev); + mhi_unprepare_from_transfer(qdev->ssr_ch); + qdev->ssr_ch = NULL; +} + +static void qaic_ssr_mhi_ul_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result) +{ + struct qaic_device *qdev = dev_get_drvdata(&mhi_dev->dev); + struct ssr_crashdump *ssr_crash = qdev->ssr_mhi_buf; + struct _ssr_hdr *hdr = mhi_result->buf_addr; + struct ssr_dump_info *dump_info; + + if (mhi_result->transaction_status) { + kfree(mhi_result->buf_addr); + return; + } + + /* + * MEMORY READ is used to download crashdump. And crashdump is + * downloaded chunk by chunk in a series of MEMORY READ SSR commands. + * Hence to avoid too many kmalloc() and kfree() of the same MEMORY READ + * request buffer, we allocate only one such buffer and free it only + * once. 
+ */ + if (le32_to_cpu(hdr->cmd) == MEMORY_READ) { + dump_info = ssr_crash->dump_info; + if (dump_info) { + dump_info->read_buf_req_queued = false; + return; + } + } + + kfree(mhi_result->buf_addr); +} + +static void qaic_ssr_mhi_dl_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result) +{ + struct ssr_resp *resp = container_of(mhi_result->buf_addr, struct ssr_resp, data); + struct qaic_device *qdev = dev_get_drvdata(&mhi_dev->dev); + struct ssr_crashdump *ssr_crash = qdev->ssr_mhi_buf; + bool memory_read_rsp = false; + + if (ssr_crash && ssr_crash->data == mhi_result->buf_addr) + memory_read_rsp = true; + + if (mhi_result->transaction_status) { + /* Do not free SSR crashdump buffer as it allocated via managed APIs */ + if (!memory_read_rsp) + kfree(resp); + return; + } + + if (memory_read_rsp) + queue_work(qdev->ssr_wq, &ssr_crash->work); + else + queue_work(qdev->ssr_wq, &resp->work); +} + +static const struct mhi_device_id qaic_ssr_mhi_match_table[] = { + { .chan = "QAIC_SSR", }, + {}, +}; + +static struct mhi_driver qaic_ssr_mhi_driver = { + .id_table = qaic_ssr_mhi_match_table, + .remove = qaic_ssr_mhi_remove, + .probe = qaic_ssr_mhi_probe, + .ul_xfer_cb = qaic_ssr_mhi_ul_xfer_cb, + .dl_xfer_cb = qaic_ssr_mhi_dl_xfer_cb, + .driver = { + .name = "qaic_ssr", + }, +}; + +int qaic_ssr_init(struct qaic_device *qdev, struct drm_device *drm) +{ + struct ssr_crashdump *ssr_crash; + + qdev->ssr_dbc = QAIC_SSR_DBC_SENTINEL; + + /* + * Device requests only one SSR at a time. So allocating only one + * buffer to download crashdump is good enough. + */ + ssr_crash = drmm_kzalloc(drm, SSR_MHI_BUF_SIZE, GFP_KERNEL); + if (!ssr_crash) + return -ENOMEM; + + ssr_crash->qdev = qdev; + INIT_WORK(&ssr_crash->work, ssr_dump_worker); + qdev->ssr_mhi_buf = ssr_crash; + + return 0; +} + +int qaic_ssr_register(void) +{ + return mhi_driver_register(&qaic_ssr_mhi_driver); +} + +void qaic_ssr_unregister(void) +{ + mhi_driver_unregister(&qaic_ssr_mhi_driver); +} diff --git a/drivers/accel/qaic/qaic_ssr.h b/drivers/accel/qaic/qaic_ssr.h new file mode 100644 index 000000000000..97ccff305750 --- /dev/null +++ b/drivers/accel/qaic/qaic_ssr.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * + * Copyright (c) 2020, The Linux Foundation. All rights reserved. + * Copyright (c) 2021, 2024 Qualcomm Innovation Center, Inc. All rights reserved. + */ + +#ifndef __QAIC_SSR_H__ +#define __QAIC_SSR_H__ + +struct drm_device; +struct qaic_device; + +int qaic_ssr_register(void); +void qaic_ssr_unregister(void); +void qaic_clean_up_ssr(struct qaic_device *qdev); +int qaic_ssr_init(struct qaic_device *qdev, struct drm_device *drm); +#endif /* __QAIC_SSR_H__ */ diff --git a/drivers/accel/qaic/qaic_sysfs.c b/drivers/accel/qaic/qaic_sysfs.c new file mode 100644 index 000000000000..e0afb0ffb589 --- /dev/null +++ b/drivers/accel/qaic/qaic_sysfs.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* Copyright (c) 2020-2025, The Linux Foundation. All rights reserved. 
*/ + +#include <drm/drm_file.h> +#include <drm/drm_managed.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/kobject.h> +#include <linux/mutex.h> +#include <linux/sysfs.h> + +#include "qaic.h" + +#define NAME_LEN 14 + +struct dbc_attribute { + struct device_attribute dev_attr; + u32 dbc_id; + char name[NAME_LEN]; +}; + +static ssize_t dbc_state_show(struct device *dev, struct device_attribute *a, char *buf) +{ + struct dbc_attribute *dbc_attr = container_of(a, struct dbc_attribute, dev_attr); + struct drm_minor *minor = dev_get_drvdata(dev); + struct qaic_device *qdev; + + qdev = to_qaic_device(minor->dev); + return sysfs_emit(buf, "%d\n", qdev->dbc[dbc_attr->dbc_id].state); +} + +void set_dbc_state(struct qaic_device *qdev, u32 dbc_id, unsigned int state) +{ + struct device *kdev = to_accel_kdev(qdev->qddev); + char *envp[3] = {}; + char state_str[16]; + char id_str[12]; + + envp[0] = id_str; + envp[1] = state_str; + + if (state >= DBC_STATE_MAX) + return; + if (dbc_id >= qdev->num_dbc) + return; + if (state == qdev->dbc[dbc_id].state) + return; + + scnprintf(id_str, ARRAY_SIZE(id_str), "DBC_ID=%d", dbc_id); + scnprintf(state_str, ARRAY_SIZE(state_str), "DBC_STATE=%d", state); + + qdev->dbc[dbc_id].state = state; + kobject_uevent_env(&kdev->kobj, KOBJ_CHANGE, envp); +} + +int qaic_sysfs_init(struct qaic_drm_device *qddev) +{ + struct device *kdev = to_accel_kdev(qddev); + struct drm_device *drm = to_drm(qddev); + u32 num_dbc = qddev->qdev->num_dbc; + struct dbc_attribute *dbc_attrs; + int i, ret; + + dbc_attrs = drmm_kcalloc(drm, num_dbc, sizeof(*dbc_attrs), GFP_KERNEL); + if (!dbc_attrs) + return -ENOMEM; + + for (i = 0; i < num_dbc; ++i) { + struct dbc_attribute *dbc_attr = &dbc_attrs[i]; + + sysfs_attr_init(&dbc_attr->dev_attr.attr); + dbc_attr->dbc_id = i; + scnprintf(dbc_attr->name, NAME_LEN, "dbc%d_state", i); + dbc_attr->dev_attr.attr.name = dbc_attr->name; + dbc_attr->dev_attr.attr.mode = 0444; + dbc_attr->dev_attr.show = dbc_state_show; + ret = sysfs_create_file(&kdev->kobj, &dbc_attr->dev_attr.attr); + if (ret) { + int j; + + for (j = 0; j < i; ++j) { + dbc_attr = &dbc_attrs[j]; + sysfs_remove_file(&kdev->kobj, &dbc_attr->dev_attr.attr); + } + drmm_kfree(drm, dbc_attrs); + return ret; + } + } + + qddev->sysfs_attrs = dbc_attrs; + return 0; +} + +void qaic_sysfs_remove(struct qaic_drm_device *qddev) +{ + struct dbc_attribute *dbc_attrs = qddev->sysfs_attrs; + struct device *kdev = to_accel_kdev(qddev); + u32 num_dbc = qddev->qdev->num_dbc; + int i; + + if (!dbc_attrs) + return; + + qddev->sysfs_attrs = NULL; + for (i = 0; i < num_dbc; ++i) + sysfs_remove_file(&kdev->kobj, &dbc_attrs[i].dev_attr.attr); + drmm_kfree(to_drm(qddev), dbc_attrs); +} diff --git a/drivers/accel/qaic/qaic_timesync.c b/drivers/accel/qaic/qaic_timesync.c index 3fac540f8e03..8af2475f4f36 100644 --- a/drivers/accel/qaic/qaic_timesync.c +++ b/drivers/accel/qaic/qaic_timesync.c @@ -171,6 +171,13 @@ mod_timer: dev_err(mqtsdev->dev, "%s mod_timer error:%d\n", __func__, ret); } +void qaic_mqts_ch_stop_timer(struct mhi_device *mhi_dev) +{ + struct mqts_dev *mqtsdev = dev_get_drvdata(&mhi_dev->dev); + + timer_delete_sync(&mqtsdev->timer); +} + static int qaic_timesync_probe(struct mhi_device *mhi_dev, const struct mhi_device_id *id) { struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(mhi_dev->mhi_cntrl->cntrl_dev)); @@ -206,6 +213,7 @@ static int qaic_timesync_probe(struct mhi_device *mhi_dev, const struct mhi_devi timer->expires = jiffies + msecs_to_jiffies(timesync_delay_ms); 
add_timer(timer); dev_set_drvdata(&mhi_dev->dev, mqtsdev); + qdev->mqts_ch = mhi_dev; return 0; @@ -221,6 +229,7 @@ static void qaic_timesync_remove(struct mhi_device *mhi_dev) { struct mqts_dev *mqtsdev = dev_get_drvdata(&mhi_dev->dev); + mqtsdev->qdev->mqts_ch = NULL; timer_delete_sync(&mqtsdev->timer); mhi_unprepare_from_transfer(mqtsdev->mhi_dev); kfree(mqtsdev->sync_msg); diff --git a/drivers/accel/qaic/qaic_timesync.h b/drivers/accel/qaic/qaic_timesync.h index 851b7acd43bb..77b9c2b55057 100644 --- a/drivers/accel/qaic/qaic_timesync.h +++ b/drivers/accel/qaic/qaic_timesync.h @@ -6,6 +6,9 @@ #ifndef __QAIC_TIMESYNC_H__ #define __QAIC_TIMESYNC_H__ +#include <linux/mhi.h> + int qaic_timesync_init(void); void qaic_timesync_deinit(void); +void qaic_mqts_ch_stop_timer(struct mhi_device *mhi_dev); #endif /* __QAIC_TIMESYNC_H__ */ diff --git a/drivers/accel/qaic/sahara.c b/drivers/accel/qaic/sahara.c index 3ebcc1f7ff58..fd3c3b2d1fd3 100644 --- a/drivers/accel/qaic/sahara.c +++ b/drivers/accel/qaic/sahara.c @@ -159,6 +159,7 @@ struct sahara_context { struct sahara_packet *rx; struct work_struct fw_work; struct work_struct dump_work; + struct work_struct read_data_work; struct mhi_device *mhi_dev; const char * const *image_table; u32 table_size; @@ -174,7 +175,10 @@ struct sahara_context { u64 dump_image_offset; void *mem_dump_freespace; u64 dump_images_left; + u32 read_data_offset; + u32 read_data_length; bool is_mem_dump_mode; + bool non_streaming; }; static const char * const aic100_image_table[] = { @@ -194,6 +198,7 @@ static const char * const aic200_image_table[] = { [23] = "qcom/aic200/aop.mbn", [32] = "qcom/aic200/tz.mbn", [33] = "qcom/aic200/hypvm.mbn", + [38] = "qcom/aic200/xbl_config.elf", [39] = "qcom/aic200/aic200_abl.elf", [40] = "qcom/aic200/apdp.mbn", [41] = "qcom/aic200/devcfg.mbn", @@ -202,6 +207,7 @@ static const char * const aic200_image_table[] = { [49] = "qcom/aic200/shrm.elf", [50] = "qcom/aic200/cpucp.elf", [51] = "qcom/aic200/aop_devcfg.mbn", + [54] = "qcom/aic200/qupv3fw.elf", [57] = "qcom/aic200/cpucp_dtbs.elf", [62] = "qcom/aic200/uefi_dtbs.elf", [63] = "qcom/aic200/xbl_ac_config.mbn", @@ -213,9 +219,15 @@ static const char * const aic200_image_table[] = { [69] = "qcom/aic200/dcd.mbn", [73] = "qcom/aic200/gearvm.mbn", [74] = "qcom/aic200/sti.bin", - [75] = "qcom/aic200/pvs.bin", + [76] = "qcom/aic200/tz_qti_config.mbn", + [78] = "qcom/aic200/pvs.bin", }; +static bool is_streaming(struct sahara_context *context) +{ + return !context->non_streaming; +} + static int sahara_find_image(struct sahara_context *context, u32 image_id) { int ret; @@ -265,6 +277,8 @@ static void sahara_send_reset(struct sahara_context *context) int ret; context->is_mem_dump_mode = false; + context->read_data_offset = 0; + context->read_data_length = 0; context->tx[0]->cmd = cpu_to_le32(SAHARA_RESET_CMD); context->tx[0]->length = cpu_to_le32(SAHARA_RESET_LENGTH); @@ -319,9 +333,39 @@ static void sahara_hello(struct sahara_context *context) dev_err(&context->mhi_dev->dev, "Unable to send hello response %d\n", ret); } +static int read_data_helper(struct sahara_context *context, int buf_index) +{ + enum mhi_flags mhi_flag; + u32 pkt_data_len; + int ret; + + pkt_data_len = min(context->read_data_length, SAHARA_PACKET_MAX_SIZE); + + memcpy(context->tx[buf_index], + &context->firmware->data[context->read_data_offset], + pkt_data_len); + + context->read_data_offset += pkt_data_len; + context->read_data_length -= pkt_data_len; + + if (is_streaming(context) || !context->read_data_length) + mhi_flag = 
MHI_EOT; + else + mhi_flag = MHI_CHAIN; + + ret = mhi_queue_buf(context->mhi_dev, DMA_TO_DEVICE, + context->tx[buf_index], pkt_data_len, mhi_flag); + if (ret) { + dev_err(&context->mhi_dev->dev, "Unable to send read_data response %d\n", ret); + return ret; + } + + return 0; +} + static void sahara_read_data(struct sahara_context *context) { - u32 image_id, data_offset, data_len, pkt_data_len; + u32 image_id, data_offset, data_len; int ret; int i; @@ -357,7 +401,7 @@ static void sahara_read_data(struct sahara_context *context) * and is not needed here on error. */ - if (data_len > SAHARA_TRANSFER_MAX_SIZE) { + if (context->non_streaming && data_len > SAHARA_TRANSFER_MAX_SIZE) { dev_err(&context->mhi_dev->dev, "Malformed read_data packet - data len %d exceeds max xfer size %d\n", data_len, SAHARA_TRANSFER_MAX_SIZE); sahara_send_reset(context); @@ -378,22 +422,18 @@ static void sahara_read_data(struct sahara_context *context) return; } - for (i = 0; i < SAHARA_NUM_TX_BUF && data_len; ++i) { - pkt_data_len = min(data_len, SAHARA_PACKET_MAX_SIZE); - - memcpy(context->tx[i], &context->firmware->data[data_offset], pkt_data_len); + context->read_data_offset = data_offset; + context->read_data_length = data_len; - data_offset += pkt_data_len; - data_len -= pkt_data_len; + if (is_streaming(context)) { + schedule_work(&context->read_data_work); + return; + } - ret = mhi_queue_buf(context->mhi_dev, DMA_TO_DEVICE, - context->tx[i], pkt_data_len, - !data_len ? MHI_EOT : MHI_CHAIN); - if (ret) { - dev_err(&context->mhi_dev->dev, "Unable to send read_data response %d\n", - ret); - return; - } + for (i = 0; i < SAHARA_NUM_TX_BUF && context->read_data_length; ++i) { + ret = read_data_helper(context, i); + if (ret) + break; } } @@ -538,6 +578,7 @@ static void sahara_parse_dump_table(struct sahara_context *context) struct sahara_memory_dump_meta_v1 *dump_meta; u64 table_nents; u64 dump_length; + u64 mul_bytes; int ret; u64 i; @@ -551,8 +592,9 @@ static void sahara_parse_dump_table(struct sahara_context *context) dev_table[i].description[SAHARA_TABLE_ENTRY_STR_LEN - 1] = 0; dev_table[i].filename[SAHARA_TABLE_ENTRY_STR_LEN - 1] = 0; - dump_length = size_add(dump_length, le64_to_cpu(dev_table[i].length)); - if (dump_length == SIZE_MAX) { + if (check_add_overflow(dump_length, + le64_to_cpu(dev_table[i].length), + &dump_length)) { /* Discard the dump */ sahara_send_reset(context); return; @@ -568,14 +610,17 @@ static void sahara_parse_dump_table(struct sahara_context *context) dev_table[i].filename); } - dump_length = size_add(dump_length, sizeof(*dump_meta)); - if (dump_length == SIZE_MAX) { + if (check_add_overflow(dump_length, (u64)sizeof(*dump_meta), &dump_length)) { /* Discard the dump */ sahara_send_reset(context); return; } - dump_length = size_add(dump_length, size_mul(sizeof(*image_out_table), table_nents)); - if (dump_length == SIZE_MAX) { + if (check_mul_overflow((u64)sizeof(*image_out_table), table_nents, &mul_bytes)) { + /* Discard the dump */ + sahara_send_reset(context); + return; + } + if (check_add_overflow(dump_length, mul_bytes, &dump_length)) { /* Discard the dump */ sahara_send_reset(context); return; @@ -615,7 +660,7 @@ static void sahara_parse_dump_table(struct sahara_context *context) /* Request the first chunk of the first image */ context->dump_image = &image_out_table[0]; - dump_length = min(context->dump_image->length, SAHARA_READ_MAX_SIZE); + dump_length = min_t(u64, context->dump_image->length, SAHARA_READ_MAX_SIZE); /* Avoid requesting EOI sized data so that we can identify errors */ 
if (dump_length == SAHARA_END_OF_IMAGE_LENGTH) dump_length = SAHARA_END_OF_IMAGE_LENGTH / 2; @@ -663,7 +708,7 @@ static void sahara_parse_dump_image(struct sahara_context *context) /* Get next image chunk */ dump_length = context->dump_image->length - context->dump_image_offset; - dump_length = min(dump_length, SAHARA_READ_MAX_SIZE); + dump_length = min_t(u64, dump_length, SAHARA_READ_MAX_SIZE); /* Avoid requesting EOI sized data so that we can identify errors */ if (dump_length == SAHARA_END_OF_IMAGE_LENGTH) dump_length = SAHARA_END_OF_IMAGE_LENGTH / 2; @@ -742,6 +787,13 @@ error: sahara_send_reset(context); } +static void sahara_read_data_processing(struct work_struct *work) +{ + struct sahara_context *context = container_of(work, struct sahara_context, read_data_work); + + read_data_helper(context, 0); +} + static int sahara_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_id *id) { struct sahara_context *context; @@ -756,34 +808,56 @@ static int sahara_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_ if (!context->rx) return -ENOMEM; + if (!strcmp(mhi_dev->mhi_cntrl->name, "AIC200")) { + context->image_table = aic200_image_table; + context->table_size = ARRAY_SIZE(aic200_image_table); + } else { + context->image_table = aic100_image_table; + context->table_size = ARRAY_SIZE(aic100_image_table); + context->non_streaming = true; + } + /* - * AIC100 defines SAHARA_TRANSFER_MAX_SIZE as the largest value it - * will request for READ_DATA. This is larger than - * SAHARA_PACKET_MAX_SIZE, and we need 9x SAHARA_PACKET_MAX_SIZE to - * cover SAHARA_TRANSFER_MAX_SIZE. When the remote side issues a - * READ_DATA, it requires a transfer of the exact size requested. We - * can use MHI_CHAIN to link multiple buffers into a single transfer - * but the remote side will not consume the buffers until it sees an - * EOT, thus we need to allocate enough buffers to put in the tx fifo - * to cover an entire READ_DATA request of the max size. + * There are two firmware implementations for READ_DATA handling. + * The older "SBL" implementation defines a Sahara transfer size, and + * expects that the response is a single transport transfer. If the + * FW wants to transfer a file that is larger than the transfer size, + * the FW will issue multiple READ_DATA commands. For this + * implementation, we need to allocate enough buffers to contain the + * entire Sahara transfer size. + * + * The newer "XBL" implementation does not define a maximum transfer + * size and instead expects the data to be streamed over using the + * transport level MTU. The FW will issue a single READ_DATA command + * of whatever size, and consume multiple transport level transfers + * until the expected amount of data is consumed. For this + * implementation we only need a single buffer of the transport MTU + * but we'll need to be able to use it multiple times for a single + * READ_DATA request. + * + * AIC100 is the SBL implementation and defines SAHARA_TRANSFER_MAX_SIZE + * and we need 9x SAHARA_PACKET_MAX_SIZE to cover that. We can use + * MHI_CHAIN to link multiple buffers into a single transfer but the + * remote side will not consume the buffers until it sees an EOT, thus + * we need to allocate enough buffers to put in the tx fifo to cover an + * entire READ_DATA request of the max size. + * + * AIC200 is the XBL implementation, and so a single buffer will work. 
*/ for (i = 0; i < SAHARA_NUM_TX_BUF; ++i) { - context->tx[i] = devm_kzalloc(&mhi_dev->dev, SAHARA_PACKET_MAX_SIZE, GFP_KERNEL); + context->tx[i] = devm_kzalloc(&mhi_dev->dev, + SAHARA_PACKET_MAX_SIZE, + GFP_KERNEL); if (!context->tx[i]) return -ENOMEM; + if (is_streaming(context)) + break; } context->mhi_dev = mhi_dev; INIT_WORK(&context->fw_work, sahara_processing); INIT_WORK(&context->dump_work, sahara_dump_processing); - - if (!strcmp(mhi_dev->mhi_cntrl->name, "AIC200")) { - context->image_table = aic200_image_table; - context->table_size = ARRAY_SIZE(aic200_image_table); - } else { - context->image_table = aic100_image_table; - context->table_size = ARRAY_SIZE(aic100_image_table); - } + INIT_WORK(&context->read_data_work, sahara_read_data_processing); context->active_image_id = SAHARA_IMAGE_ID_NONE; dev_set_drvdata(&mhi_dev->dev, context); @@ -814,6 +888,10 @@ static void sahara_mhi_remove(struct mhi_device *mhi_dev) static void sahara_mhi_ul_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result) { + struct sahara_context *context = dev_get_drvdata(&mhi_dev->dev); + + if (!mhi_result->transaction_status && context->read_data_length && is_streaming(context)) + schedule_work(&context->read_data_work); } static void sahara_mhi_dl_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result) diff --git a/drivers/accel/rocket/rocket_gem.c b/drivers/accel/rocket/rocket_gem.c index 0551e11cc184..624c4ecf5a34 100644 --- a/drivers/accel/rocket/rocket_gem.c +++ b/drivers/accel/rocket/rocket_gem.c @@ -2,6 +2,7 @@ /* Copyright 2024-2025 Tomeu Vizoso <tomeu@tomeuvizoso.net> */ #include <drm/drm_device.h> +#include <drm/drm_print.h> #include <drm/drm_utils.h> #include <drm/rocket_accel.h> #include <linux/dma-mapping.h> |
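Note on the SSR devcoredump layout: the blob handed to dev_coredumpv() above is a struct dump_file_meta header, followed by the converted debug_info_table entries, followed by the raw dump data. The sketch below is a minimal host-side parser under stated assumptions: the devcoredump "data" path is illustrative, and fields are assumed to be in host endianness since the driver converts the table with le64_to_cpus() before copying it into the dump.

/*
 * Illustrative only: parse the header and table of a v1 QAIC SSR devcoredump.
 * Struct layouts mirror dump_file_meta and debug_info_table in qaic_ssr.c.
 */
#include <stdint.h>
#include <stdio.h>

struct dump_file_meta {
	uint64_t magic;    /* QAIC_SSR_DUMP_V1_MAGIC */
	uint64_t version;  /* QAIC_SSR_DUMP_V1_VER */
	uint64_t size;     /* total size of the entire dump */
	uint64_t tbl_len;  /* length of the table in bytes */
};

struct debug_info_table {
	uint64_t save_perf;
	uint64_t mem_base;
	uint64_t len;
	char desc[20];
	char filename[20];
};

int main(int argc, char **argv)
{
	/* e.g. /sys/class/devcoredump/devcd1/data (path is an assumption) */
	FILE *f = fopen(argc > 1 ? argv[1] : "data", "rb");
	struct dump_file_meta meta;
	struct debug_info_table ent;
	uint64_t off;

	if (!f || fread(&meta, sizeof(meta), 1, f) != 1)
		return 1;
	if (meta.magic != 0x1234567890abcdefULL || meta.version != 1) {
		fprintf(stderr, "not a v1 QAIC SSR dump\n");
		return 1;
	}
	/* Table entries follow the header; the raw dump data follows the table. */
	for (off = 0; off + sizeof(ent) <= meta.tbl_len; off += sizeof(ent)) {
		if (fread(&ent, sizeof(ent), 1, f) != 1)
			return 1;
		printf("%-20.20s %-20.20s base=0x%llx len=%llu\n",
		       ent.desc, ent.filename,
		       (unsigned long long)ent.mem_base,
		       (unsigned long long)ent.len);
	}
	fclose(f);
	return 0;
}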

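The per-DBC state attribute added in qaic_sysfs.c can be consumed from user space with a plain sysfs read. The attribute name (dbcN_state) and the DBC_ID=/DBC_STATE= uevent keys come from the patch itself; the /sys/class/accel/accel0 path in this sketch is an assumption about where the accel device node appears on a given system.

/* Illustrative only: read the dbc0 state exposed by the qaic sysfs attribute. */
#include <stdio.h>

int main(void)
{
	/* Assumed location of the accel device's sysfs directory. */
	FILE *f = fopen("/sys/class/accel/accel0/dbc0_state", "r");
	int state;

	if (!f || fscanf(f, "%d", &state) != 1)
		return 1;
	/*
	 * State transitions are also announced via KOBJ_CHANGE uevents
	 * carrying DBC_ID=<id> and DBC_STATE=<state>, so a daemon could
	 * instead watch "udevadm monitor --property" for the same events.
	 */
	printf("dbc0 state: %d\n", state);
	fclose(f);
	return 0;
}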