Diffstat (limited to 'drivers/accel')
75 files changed, 6070 insertions, 1034 deletions
diff --git a/drivers/accel/Kconfig b/drivers/accel/Kconfig index bb01cebc42bf..bdf48ccafcf2 100644 --- a/drivers/accel/Kconfig +++ b/drivers/accel/Kconfig @@ -25,6 +25,7 @@ menuconfig DRM_ACCEL and debugfs). source "drivers/accel/amdxdna/Kconfig" +source "drivers/accel/ethosu/Kconfig" source "drivers/accel/habanalabs/Kconfig" source "drivers/accel/ivpu/Kconfig" source "drivers/accel/qaic/Kconfig" diff --git a/drivers/accel/Makefile b/drivers/accel/Makefile index ffc3fa588666..1d3a7251b950 100644 --- a/drivers/accel/Makefile +++ b/drivers/accel/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_DRM_ACCEL_AMDXDNA) += amdxdna/ +obj-$(CONFIG_DRM_ACCEL_ARM_ETHOSU) += ethosu/ obj-$(CONFIG_DRM_ACCEL_HABANALABS) += habanalabs/ obj-$(CONFIG_DRM_ACCEL_IVPU) += ivpu/ obj-$(CONFIG_DRM_ACCEL_QAIC) += qaic/ diff --git a/drivers/accel/amdxdna/Makefile b/drivers/accel/amdxdna/Makefile index 6797dac65efa..6344aaf523fa 100644 --- a/drivers/accel/amdxdna/Makefile +++ b/drivers/accel/amdxdna/Makefile @@ -14,6 +14,7 @@ amdxdna-y := \ amdxdna_mailbox.o \ amdxdna_mailbox_helper.o \ amdxdna_pci_drv.o \ + amdxdna_pm.o \ amdxdna_sysfs.o \ amdxdna_ubuf.o \ npu1_regs.o \ diff --git a/drivers/accel/amdxdna/TODO b/drivers/accel/amdxdna/TODO index ad8ac6e315b6..0e4bbebeaedf 100644 --- a/drivers/accel/amdxdna/TODO +++ b/drivers/accel/amdxdna/TODO @@ -1,2 +1 @@ - Add debugfs support -- Add debug BO support diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c index e9f9b1fa5dc1..42d876a427c5 100644 --- a/drivers/accel/amdxdna/aie2_ctx.c +++ b/drivers/accel/amdxdna/aie2_ctx.c @@ -21,6 +21,7 @@ #include "amdxdna_gem.h" #include "amdxdna_mailbox.h" #include "amdxdna_pci_drv.h" +#include "amdxdna_pm.h" static bool force_cmdlist; module_param(force_cmdlist, bool, 0600); @@ -88,7 +89,7 @@ static int aie2_hwctx_restart(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hw goto out; } - ret = aie2_config_cu(hwctx); + ret = aie2_config_cu(hwctx, NULL); if (ret) { XDNA_ERR(xdna, "Config cu failed, ret %d", ret); goto out; @@ -167,14 +168,11 @@ static int aie2_hwctx_resume_cb(struct amdxdna_hwctx *hwctx, void *arg) int aie2_hwctx_resume(struct amdxdna_client *client) { - struct amdxdna_dev *xdna = client->xdna; - /* * The resume path cannot guarantee that mailbox channel can be * regenerated. If this happen, when submit message to this * mailbox channel, error will return. 
*/ - drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); return amdxdna_hwctx_walk(client, NULL, aie2_hwctx_resume_cb); } @@ -184,12 +182,13 @@ aie2_sched_notify(struct amdxdna_sched_job *job) struct dma_fence *fence = job->fence; trace_xdna_job(&job->base, job->hwctx->name, "signaled fence", job->seq); + + amdxdna_pm_suspend_put(job->hwctx->client->xdna); job->hwctx->priv->completed++; dma_fence_signal(fence); up(&job->hwctx->priv->job_sem); job->job_done = true; - dma_fence_put(fence); mmput_async(job->mm); aie2_job_put(job); } @@ -204,10 +203,13 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size) cmd_abo = job->cmd_bo; - if (unlikely(!data)) + if (unlikely(job->job_timeout)) { + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_TIMEOUT); + ret = -EINVAL; goto out; + } - if (unlikely(size != sizeof(u32))) { + if (unlikely(!data) || unlikely(size != sizeof(u32))) { amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT); ret = -EINVAL; goto out; @@ -226,11 +228,10 @@ out: } static int -aie2_sched_nocmd_resp_handler(void *handle, void __iomem *data, size_t size) +aie2_sched_drvcmd_resp_handler(void *handle, void __iomem *data, size_t size) { struct amdxdna_sched_job *job = handle; int ret = 0; - u32 status; if (unlikely(!data)) goto out; @@ -240,8 +241,7 @@ aie2_sched_nocmd_resp_handler(void *handle, void __iomem *data, size_t size) goto out; } - status = readl(data); - XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status); + job->drv_cmd->result = readl(data); out: aie2_sched_notify(job); @@ -260,6 +260,13 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size) int ret = 0; cmd_abo = job->cmd_bo; + + if (unlikely(job->job_timeout)) { + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_TIMEOUT); + ret = -EINVAL; + goto out; + } + if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) { amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT); ret = -EINVAL; @@ -314,8 +321,18 @@ aie2_sched_job_run(struct drm_sched_job *sched_job) kref_get(&job->refcnt); fence = dma_fence_get(job->fence); - if (unlikely(!cmd_abo)) { - ret = aie2_sync_bo(hwctx, job, aie2_sched_nocmd_resp_handler); + if (job->drv_cmd) { + switch (job->drv_cmd->opcode) { + case SYNC_DEBUG_BO: + ret = aie2_sync_bo(hwctx, job, aie2_sched_drvcmd_resp_handler); + break; + case ATTACH_DEBUG_BO: + ret = aie2_config_debug_bo(hwctx, job, aie2_sched_drvcmd_resp_handler); + break; + default: + ret = -EINVAL; + break; + } goto out; } @@ -362,6 +379,7 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job) xdna = hwctx->client->xdna; trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq); + job->job_timeout = true; mutex_lock(&xdna->dev_lock); aie2_hwctx_stop(xdna, hwctx, sched_job); @@ -531,13 +549,12 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx) .num_rqs = DRM_SCHED_PRIORITY_COUNT, .credit_limit = HWCTX_MAX_CMDS, .timeout = msecs_to_jiffies(HWCTX_MAX_TIMEOUT), - .name = hwctx->name, + .name = "amdxdna_js", .dev = xdna->ddev.dev, }; struct drm_gpu_scheduler *sched; struct amdxdna_hwctx_priv *priv; struct amdxdna_gem_obj *heap; - struct amdxdna_dev_hdl *ndev; int i, ret; priv = kzalloc(sizeof(*hwctx->priv), GFP_KERNEL); @@ -610,10 +627,14 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx) goto free_entity; } + ret = amdxdna_pm_resume_get(xdna); + if (ret) + goto free_col_list; + ret = aie2_alloc_resource(hwctx); if (ret) { XDNA_ERR(xdna, "Alloc hw resource failed, ret %d", ret); - goto free_col_list; + goto suspend_put; } ret = aie2_map_host_buf(xdna->dev_handle, 
hwctx->fw_ctx_id, @@ -628,10 +649,9 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx) XDNA_ERR(xdna, "Create syncobj failed, ret %d", ret); goto release_resource; } + amdxdna_pm_suspend_put(xdna); hwctx->status = HWCTX_STAT_INIT; - ndev = xdna->dev_handle; - ndev->hwctx_num++; init_waitqueue_head(&priv->job_free_wq); XDNA_DBG(xdna, "hwctx %s init completed", hwctx->name); @@ -640,6 +660,8 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx) release_resource: aie2_release_resource(hwctx); +suspend_put: + amdxdna_pm_suspend_put(xdna); free_col_list: kfree(hwctx->col_list); free_entity: @@ -662,26 +684,25 @@ free_priv: void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx) { - struct amdxdna_dev_hdl *ndev; struct amdxdna_dev *xdna; int idx; xdna = hwctx->client->xdna; - ndev = xdna->dev_handle; - ndev->hwctx_num--; XDNA_DBG(xdna, "%s sequence number %lld", hwctx->name, hwctx->priv->seq); - drm_sched_entity_destroy(&hwctx->priv->entity); - aie2_hwctx_wait_for_idle(hwctx); /* Request fw to destroy hwctx and cancel the rest pending requests */ aie2_release_resource(hwctx); + mutex_unlock(&xdna->dev_lock); + drm_sched_entity_destroy(&hwctx->priv->entity); + /* Wait for all submitted jobs to be completed or canceled */ wait_event(hwctx->priv->job_free_wq, atomic64_read(&hwctx->job_submit_cnt) == atomic64_read(&hwctx->job_free_cnt)); + mutex_lock(&xdna->dev_lock); drm_sched_fini(&hwctx->priv->sched); aie2_ctx_syncobj_destroy(hwctx); @@ -697,6 +718,14 @@ void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx) kfree(hwctx->cus); } +static int aie2_config_cu_resp_handler(void *handle, void __iomem *data, size_t size) +{ + struct amdxdna_hwctx *hwctx = handle; + + amdxdna_pm_suspend_put(hwctx->client->xdna); + return 0; +} + static int aie2_hwctx_cu_config(struct amdxdna_hwctx *hwctx, void *buf, u32 size) { struct amdxdna_hwctx_param_config_cu *config = buf; @@ -728,10 +757,14 @@ static int aie2_hwctx_cu_config(struct amdxdna_hwctx *hwctx, void *buf, u32 size if (!hwctx->cus) return -ENOMEM; - ret = aie2_config_cu(hwctx); + ret = amdxdna_pm_resume_get(xdna); + if (ret) + goto free_cus; + + ret = aie2_config_cu(hwctx, aie2_config_cu_resp_handler); if (ret) { XDNA_ERR(xdna, "Config CU to firmware failed, ret %d", ret); - goto free_cus; + goto pm_suspend_put; } wmb(); /* To avoid locking in command submit when check status */ @@ -739,12 +772,82 @@ static int aie2_hwctx_cu_config(struct amdxdna_hwctx *hwctx, void *buf, u32 size return 0; +pm_suspend_put: + amdxdna_pm_suspend_put(xdna); free_cus: kfree(hwctx->cus); hwctx->cus = NULL; return ret; } +static void aie2_cmd_wait(struct amdxdna_hwctx *hwctx, u64 seq) +{ + struct dma_fence *out_fence = aie2_cmd_get_out_fence(hwctx, seq); + + if (!out_fence) { + XDNA_ERR(hwctx->client->xdna, "Failed to get fence"); + return; + } + + dma_fence_wait_timeout(out_fence, false, MAX_SCHEDULE_TIMEOUT); + dma_fence_put(out_fence); +} + +static int aie2_hwctx_cfg_debug_bo(struct amdxdna_hwctx *hwctx, u32 bo_hdl, + bool attach) +{ + struct amdxdna_client *client = hwctx->client; + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_drv_cmd cmd = { 0 }; + struct amdxdna_gem_obj *abo; + u64 seq; + int ret; + + abo = amdxdna_gem_get_obj(client, bo_hdl, AMDXDNA_BO_DEV); + if (!abo) { + XDNA_ERR(xdna, "Get bo %d failed", bo_hdl); + return -EINVAL; + } + + if (attach) { + if (abo->assigned_hwctx != AMDXDNA_INVALID_CTX_HANDLE) { + ret = -EBUSY; + goto put_obj; + } + cmd.opcode = ATTACH_DEBUG_BO; + } else { + if (abo->assigned_hwctx != hwctx->id) { + ret = -EINVAL; + goto put_obj; + } + 
cmd.opcode = DETACH_DEBUG_BO; + } + + ret = amdxdna_cmd_submit(client, &cmd, AMDXDNA_INVALID_BO_HANDLE, + &bo_hdl, 1, hwctx->id, &seq); + if (ret) { + XDNA_ERR(xdna, "Submit command failed"); + goto put_obj; + } + + aie2_cmd_wait(hwctx, seq); + if (cmd.result) { + XDNA_ERR(xdna, "Response failure 0x%x", cmd.result); + goto put_obj; + } + + if (attach) + abo->assigned_hwctx = hwctx->id; + else + abo->assigned_hwctx = AMDXDNA_INVALID_CTX_HANDLE; + + XDNA_DBG(xdna, "Config debug BO %d to %s", bo_hdl, hwctx->name); + +put_obj: + amdxdna_gem_put_obj(abo); + return ret; +} + int aie2_hwctx_config(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size) { struct amdxdna_dev *xdna = hwctx->client->xdna; @@ -754,14 +857,40 @@ int aie2_hwctx_config(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *bu case DRM_AMDXDNA_HWCTX_CONFIG_CU: return aie2_hwctx_cu_config(hwctx, buf, size); case DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF: + return aie2_hwctx_cfg_debug_bo(hwctx, (u32)value, true); case DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF: - return -EOPNOTSUPP; + return aie2_hwctx_cfg_debug_bo(hwctx, (u32)value, false); default: XDNA_DBG(xdna, "Not supported type %d", type); return -EOPNOTSUPP; } } +int aie2_hwctx_sync_debug_bo(struct amdxdna_hwctx *hwctx, u32 debug_bo_hdl) +{ + struct amdxdna_client *client = hwctx->client; + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_drv_cmd cmd = { 0 }; + u64 seq; + int ret; + + cmd.opcode = SYNC_DEBUG_BO; + ret = amdxdna_cmd_submit(client, &cmd, AMDXDNA_INVALID_BO_HANDLE, + &debug_bo_hdl, 1, hwctx->id, &seq); + if (ret) { + XDNA_ERR(xdna, "Submit command failed"); + return ret; + } + + aie2_cmd_wait(hwctx, seq); + if (cmd.result) { + XDNA_ERR(xdna, "Response failure 0x%x", cmd.result); + return -EINVAL; + } + + return 0; +} + static int aie2_populate_range(struct amdxdna_gem_obj *abo) { struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev); @@ -862,11 +991,15 @@ int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, goto free_chain; } + ret = amdxdna_pm_resume_get(xdna); + if (ret) + goto cleanup_job; + retry: ret = drm_gem_lock_reservations(job->bos, job->bo_cnt, &acquire_ctx); if (ret) { XDNA_WARN(xdna, "Failed to lock BOs, ret %d", ret); - goto cleanup_job; + goto suspend_put; } for (i = 0; i < job->bo_cnt; i++) { @@ -874,7 +1007,7 @@ retry: if (ret) { XDNA_WARN(xdna, "Failed to reserve fences %d", ret); drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx); - goto cleanup_job; + goto suspend_put; } } @@ -889,12 +1022,12 @@ retry: msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); } else if (time_after(jiffies, timeout)) { ret = -ETIME; - goto cleanup_job; + goto suspend_put; } ret = aie2_populate_range(abo); if (ret) - goto cleanup_job; + goto suspend_put; goto retry; } } @@ -920,6 +1053,8 @@ retry: return 0; +suspend_put: + amdxdna_pm_suspend_put(xdna); cleanup_job: drm_sched_job_cleanup(&job->base); free_chain: diff --git a/drivers/accel/amdxdna/aie2_error.c b/drivers/accel/amdxdna/aie2_error.c index 5ee905632a39..d452008ec4f4 100644 --- a/drivers/accel/amdxdna/aie2_error.c +++ b/drivers/accel/amdxdna/aie2_error.c @@ -13,6 +13,7 @@ #include "aie2_msg_priv.h" #include "aie2_pci.h" +#include "amdxdna_error.h" #include "amdxdna_mailbox.h" #include "amdxdna_pci_drv.h" @@ -46,6 +47,7 @@ enum aie_module_type { AIE_MEM_MOD = 0, AIE_CORE_MOD, AIE_PL_MOD, + AIE_UNKNOWN_MOD, }; enum aie_error_category { @@ -143,6 +145,31 @@ static const struct aie_event_category aie_ml_shim_tile_event_cat[] = { EVENT_CATEGORY(74U, 
AIE_ERROR_LOCK), }; +static const enum amdxdna_error_num aie_cat_err_num_map[] = { + [AIE_ERROR_SATURATION] = AMDXDNA_ERROR_NUM_AIE_SATURATION, + [AIE_ERROR_FP] = AMDXDNA_ERROR_NUM_AIE_FP, + [AIE_ERROR_STREAM] = AMDXDNA_ERROR_NUM_AIE_STREAM, + [AIE_ERROR_ACCESS] = AMDXDNA_ERROR_NUM_AIE_ACCESS, + [AIE_ERROR_BUS] = AMDXDNA_ERROR_NUM_AIE_BUS, + [AIE_ERROR_INSTRUCTION] = AMDXDNA_ERROR_NUM_AIE_INSTRUCTION, + [AIE_ERROR_ECC] = AMDXDNA_ERROR_NUM_AIE_ECC, + [AIE_ERROR_LOCK] = AMDXDNA_ERROR_NUM_AIE_LOCK, + [AIE_ERROR_DMA] = AMDXDNA_ERROR_NUM_AIE_DMA, + [AIE_ERROR_MEM_PARITY] = AMDXDNA_ERROR_NUM_AIE_MEM_PARITY, + [AIE_ERROR_UNKNOWN] = AMDXDNA_ERROR_NUM_UNKNOWN, +}; + +static_assert(ARRAY_SIZE(aie_cat_err_num_map) == AIE_ERROR_UNKNOWN + 1); + +static const enum amdxdna_error_module aie_err_mod_map[] = { + [AIE_MEM_MOD] = AMDXDNA_ERROR_MODULE_AIE_MEMORY, + [AIE_CORE_MOD] = AMDXDNA_ERROR_MODULE_AIE_CORE, + [AIE_PL_MOD] = AMDXDNA_ERROR_MODULE_AIE_PL, + [AIE_UNKNOWN_MOD] = AMDXDNA_ERROR_MODULE_UNKNOWN, +}; + +static_assert(ARRAY_SIZE(aie_err_mod_map) == AIE_UNKNOWN_MOD + 1); + static enum aie_error_category aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type) { @@ -176,12 +203,40 @@ aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type) if (event_id != lut[i].event_id) continue; + if (lut[i].category > AIE_ERROR_UNKNOWN) + return AIE_ERROR_UNKNOWN; + return lut[i].category; } return AIE_ERROR_UNKNOWN; } +static void aie2_update_last_async_error(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err) +{ + struct aie_error *errs = err_info; + enum amdxdna_error_module err_mod; + enum aie_error_category aie_err; + enum amdxdna_error_num err_num; + struct aie_error *last_err; + + last_err = &errs[num_err - 1]; + if (last_err->mod_type >= AIE_UNKNOWN_MOD) { + err_num = aie_cat_err_num_map[AIE_ERROR_UNKNOWN]; + err_mod = aie_err_mod_map[AIE_UNKNOWN_MOD]; + } else { + aie_err = aie_get_error_category(last_err->row, + last_err->event_id, + last_err->mod_type); + err_num = aie_cat_err_num_map[aie_err]; + err_mod = aie_err_mod_map[last_err->mod_type]; + } + + ndev->last_async_err.err_code = AMDXDNA_ERROR_ENCODE(err_num, err_mod); + ndev->last_async_err.ts_us = ktime_to_us(ktime_get_real()); + ndev->last_async_err.ex_err_code = AMDXDNA_EXTRA_ERR_ENCODE(last_err->row, last_err->col); +} + static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err) { struct aie_error *errs = err_info; @@ -264,29 +319,14 @@ static void aie2_error_worker(struct work_struct *err_work) } mutex_lock(&xdna->dev_lock); + aie2_update_last_async_error(e->ndev, info->payload, info->err_cnt); + /* Re-sent this event to firmware */ if (aie2_error_event_send(e)) XDNA_WARN(xdna, "Unable to register async event"); mutex_unlock(&xdna->dev_lock); } -int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev) -{ - struct amdxdna_dev *xdna = ndev->xdna; - struct async_event *e; - int i, ret; - - drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); - for (i = 0; i < ndev->async_events->event_cnt; i++) { - e = &ndev->async_events->event[i]; - ret = aie2_error_event_send(e); - if (ret) - return ret; - } - - return 0; -} - void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev) { struct amdxdna_dev *xdna = ndev->xdna; @@ -341,6 +381,10 @@ int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev) e->size = ASYNC_BUF_SIZE; e->resp.status = MAX_AIE2_STATUS_CODE; INIT_WORK(&e->work, aie2_error_worker); + + ret = aie2_error_event_send(e); + if (ret) + goto 
free_wq; } ndev->async_events = events; @@ -349,6 +393,8 @@ int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev) events->event_cnt, events->size); return 0; +free_wq: + destroy_workqueue(events->wq); free_buf: dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf, events->addr, DMA_FROM_DEVICE); @@ -356,3 +402,18 @@ free_events: kfree(events); return ret; } + +int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev, struct amdxdna_drm_get_array *args) +{ + struct amdxdna_dev *xdna = ndev->xdna; + + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + + args->num_element = 1; + args->element_size = sizeof(ndev->last_async_err); + if (copy_to_user(u64_to_user_ptr(args->buffer), + &ndev->last_async_err, args->element_size)) + return -EFAULT; + + return 0; +} diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c index 9caad083543d..d493bb1c3360 100644 --- a/drivers/accel/amdxdna/aie2_message.c +++ b/drivers/accel/amdxdna/aie2_message.c @@ -27,6 +27,8 @@ #define DECLARE_AIE2_MSG(name, op) \ DECLARE_XDNA_MSG_COMMON(name, op, MAX_AIE2_STATUS_CODE) +#define EXEC_MSG_OPS(xdna) ((xdna)->dev_handle->exec_msg_ops) + static int aie2_send_mgmt_msg_wait(struct amdxdna_dev_hdl *ndev, struct xdna_mailbox_msg *msg) { @@ -37,7 +39,7 @@ static int aie2_send_mgmt_msg_wait(struct amdxdna_dev_hdl *ndev, if (!ndev->mgmt_chann) return -ENODEV; - drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + drm_WARN_ON(&xdna->ddev, xdna->rpm_on && !mutex_is_locked(&xdna->dev_lock)); ret = xdna_send_msg_wait(xdna, ndev->mgmt_chann, msg); if (ret == -ETIME) { xdna_mailbox_stop_channel(ndev->mgmt_chann); @@ -45,7 +47,7 @@ static int aie2_send_mgmt_msg_wait(struct amdxdna_dev_hdl *ndev, ndev->mgmt_chann = NULL; } - if (!ret && *hdl->data != AIE2_STATUS_SUCCESS) { + if (!ret && *hdl->status != AIE2_STATUS_SUCCESS) { XDNA_ERR(xdna, "command opcode 0x%x failed, status 0x%x", msg->opcode, *hdl->data); ret = -EINVAL; @@ -208,6 +210,14 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct hwctx->fw_ctx_id = resp.context_id; WARN_ONCE(hwctx->fw_ctx_id == -1, "Unexpected context id"); + if (ndev->force_preempt_enabled) { + ret = aie2_runtime_cfg(ndev, AIE2_RT_CFG_FORCE_PREEMPT, &hwctx->fw_ctx_id); + if (ret) { + XDNA_ERR(xdna, "failed to enable force preempt %d", ret); + return ret; + } + } + cq_pair = &resp.cq_pair[0]; x2i.mb_head_ptr_reg = AIE2_MBOX_OFF(ndev, cq_pair->x2i_q.head_addr); x2i.mb_tail_ptr_reg = AIE2_MBOX_OFF(ndev, cq_pair->x2i_q.tail_addr); @@ -233,6 +243,7 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct ret = -EINVAL; goto out_destroy_context; } + ndev->hwctx_num++; XDNA_DBG(xdna, "%s mailbox channel irq: %d, msix_id: %d", hwctx->name, ret, resp.msix_id); @@ -267,6 +278,7 @@ int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwc hwctx->fw_ctx_id); hwctx->priv->mbox_chann = NULL; hwctx->fw_ctx_id = -1; + ndev->hwctx_num--; return ret; } @@ -332,11 +344,6 @@ int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf, goto fail; } - if (resp.status != AIE2_STATUS_SUCCESS) { - XDNA_ERR(xdna, "Query NPU status failed, status 0x%x", resp.status); - ret = -EINVAL; - goto fail; - } XDNA_DBG(xdna, "Query NPU status completed"); if (size < resp.size) { @@ -358,6 +365,55 @@ fail: return ret; } +int aie2_query_telemetry(struct amdxdna_dev_hdl *ndev, + char __user *buf, u32 size, + struct amdxdna_drm_query_telemetry_header *header) +{ + 
DECLARE_AIE2_MSG(get_telemetry, MSG_OP_GET_TELEMETRY); + struct amdxdna_dev *xdna = ndev->xdna; + dma_addr_t dma_addr; + u8 *addr; + int ret; + + if (header->type >= MAX_TELEMETRY_TYPE) + return -EINVAL; + + addr = dma_alloc_noncoherent(xdna->ddev.dev, size, &dma_addr, + DMA_FROM_DEVICE, GFP_KERNEL); + if (!addr) + return -ENOMEM; + + req.buf_addr = dma_addr; + req.buf_size = size; + req.type = header->type; + + drm_clflush_virt_range(addr, size); /* device can access */ + ret = aie2_send_mgmt_msg_wait(ndev, &msg); + if (ret) { + XDNA_ERR(xdna, "Query telemetry failed, status %d", ret); + goto free_buf; + } + + if (size < resp.size) { + ret = -EINVAL; + XDNA_ERR(xdna, "Bad buffer size. Available: %u. Needs: %u", size, resp.size); + goto free_buf; + } + + if (copy_to_user(buf, addr, resp.size)) { + ret = -EFAULT; + XDNA_ERR(xdna, "Failed to copy telemetry to user space"); + goto free_buf; + } + + header->major = resp.major; + header->minor = resp.minor; + +free_buf: + dma_free_noncoherent(xdna->ddev.dev, size, addr, dma_addr, DMA_FROM_DEVICE); + return ret; +} + int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, u32 size, void *handle, int (*cb)(void*, void __iomem *, size_t)) { @@ -377,15 +433,17 @@ int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, return xdna_mailbox_send_msg(ndev->mgmt_chann, &msg, TX_TIMEOUT); } -int aie2_config_cu(struct amdxdna_hwctx *hwctx) +int aie2_config_cu(struct amdxdna_hwctx *hwctx, + int (*notify_cb)(void *, void __iomem *, size_t)) { struct mailbox_channel *chann = hwctx->priv->mbox_chann; struct amdxdna_dev *xdna = hwctx->client->xdna; u32 shift = xdna->dev_info->dev_mem_buf_shift; - DECLARE_AIE2_MSG(config_cu, MSG_OP_CONFIG_CU); + struct config_cu_req req = { 0 }; + struct xdna_mailbox_msg msg; struct drm_gem_object *gobj; struct amdxdna_gem_obj *abo; - int ret, i; + int i; if (!chann) return -ENODEV; @@ -423,191 +481,386 @@ int aie2_config_cu(struct amdxdna_hwctx *hwctx) } req.num_cus = hwctx->cus->num_cus; - ret = xdna_send_msg_wait(xdna, chann, &msg); - if (ret == -ETIME) - aie2_destroy_context(xdna->dev_handle, hwctx); + msg.send_data = (u8 *)&req; + msg.send_size = sizeof(req); + msg.handle = hwctx; + msg.opcode = MSG_OP_CONFIG_CU; + msg.notify_cb = notify_cb; + return xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT); +} - if (resp.status == AIE2_STATUS_SUCCESS) { - XDNA_DBG(xdna, "Configure %d CUs, ret %d", req.num_cus, ret); - return 0; - } +static int aie2_init_exec_cu_req(struct amdxdna_gem_obj *cmd_bo, void *req, + size_t *size, u32 *msg_op) +{ + struct execute_buffer_req *cu_req = req; + u32 cmd_len; + void *cmd; - XDNA_ERR(xdna, "Command opcode 0x%x failed, status 0x%x ret %d", - msg.opcode, resp.status, ret); - return ret; + cmd = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); + if (cmd_len > sizeof(cu_req->payload)) + return -EINVAL; + + cu_req->cu_idx = amdxdna_cmd_get_cu_idx(cmd_bo); + if (cu_req->cu_idx == INVALID_CU_IDX) + return -EINVAL; + + memcpy(cu_req->payload, cmd, cmd_len); + + *size = sizeof(*cu_req); + *msg_op = MSG_OP_EXECUTE_BUFFER_CF; + return 0; } -int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, - int (*notify_cb)(void *, void __iomem *, size_t)) +static int aie2_init_exec_dpu_req(struct amdxdna_gem_obj *cmd_bo, void *req, + size_t *size, u32 *msg_op) { - struct mailbox_channel *chann = hwctx->priv->mbox_chann; - struct amdxdna_dev *xdna = hwctx->client->xdna; - struct amdxdna_gem_obj *cmd_abo = job->cmd_bo; - union { - struct execute_buffer_req 
ebuf; - struct exec_dpu_req dpu; - } req; - struct xdna_mailbox_msg msg; - u32 payload_len; - void *payload; - int cu_idx; - int ret; - u32 op; + struct exec_dpu_req *dpu_req = req; + struct amdxdna_cmd_start_npu *sn; + u32 cmd_len; - if (!chann) - return -ENODEV; + sn = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); + if (cmd_len - sizeof(*sn) > sizeof(dpu_req->payload)) + return -EINVAL; - payload = amdxdna_cmd_get_payload(cmd_abo, &payload_len); - if (!payload) { - XDNA_ERR(xdna, "Invalid command, cannot get payload"); + dpu_req->cu_idx = amdxdna_cmd_get_cu_idx(cmd_bo); + if (dpu_req->cu_idx == INVALID_CU_IDX) return -EINVAL; - } - cu_idx = amdxdna_cmd_get_cu_idx(cmd_abo); - if (cu_idx < 0) { - XDNA_DBG(xdna, "Invalid cu idx"); + dpu_req->inst_buf_addr = sn->buffer; + dpu_req->inst_size = sn->buffer_size; + dpu_req->inst_prop_cnt = sn->prop_count; + memcpy(dpu_req->payload, sn->prop_args, cmd_len - sizeof(*sn)); + + *size = sizeof(*dpu_req); + *msg_op = MSG_OP_EXEC_DPU; + return 0; +} + +static void aie2_init_exec_chain_req(void *req, u64 slot_addr, size_t size, u32 cmd_cnt) +{ + struct cmd_chain_req *chain_req = req; + + chain_req->buf_addr = slot_addr; + chain_req->buf_size = size; + chain_req->count = cmd_cnt; +} + +static void aie2_init_npu_chain_req(void *req, u64 slot_addr, size_t size, u32 cmd_cnt) +{ + struct cmd_chain_npu_req *npu_chain_req = req; + + npu_chain_req->flags = 0; + npu_chain_req->reserved = 0; + npu_chain_req->buf_addr = slot_addr; + npu_chain_req->buf_size = size; + npu_chain_req->count = cmd_cnt; +} + +static int +aie2_cmdlist_fill_cf(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size) +{ + struct cmd_chain_slot_execbuf_cf *cf_slot = slot; + u32 cmd_len; + void *cmd; + + cmd = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); + if (*size < sizeof(*cf_slot) + cmd_len) return -EINVAL; - } - op = amdxdna_cmd_get_op(cmd_abo); - switch (op) { + cf_slot->cu_idx = amdxdna_cmd_get_cu_idx(cmd_bo); + if (cf_slot->cu_idx == INVALID_CU_IDX) + return -EINVAL; + + cf_slot->arg_cnt = cmd_len / sizeof(u32); + memcpy(cf_slot->args, cmd, cmd_len); + /* Accurate slot size to hint firmware to do necessary copy */ + *size = sizeof(*cf_slot) + cmd_len; + return 0; +} + +static int +aie2_cmdlist_fill_dpu(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size) +{ + struct cmd_chain_slot_dpu *dpu_slot = slot; + struct amdxdna_cmd_start_npu *sn; + u32 cmd_len; + u32 arg_sz; + + sn = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); + arg_sz = cmd_len - sizeof(*sn); + if (cmd_len < sizeof(*sn) || arg_sz > MAX_DPU_ARGS_SIZE) + return -EINVAL; + + if (*size < sizeof(*dpu_slot) + arg_sz) + return -EINVAL; + + dpu_slot->cu_idx = amdxdna_cmd_get_cu_idx(cmd_bo); + if (dpu_slot->cu_idx == INVALID_CU_IDX) + return -EINVAL; + + dpu_slot->inst_buf_addr = sn->buffer; + dpu_slot->inst_size = sn->buffer_size; + dpu_slot->inst_prop_cnt = sn->prop_count; + dpu_slot->arg_cnt = arg_sz / sizeof(u32); + memcpy(dpu_slot->args, sn->prop_args, arg_sz); + + /* Accurate slot size to hint firmware to do necessary copy */ + *size = sizeof(*dpu_slot) + arg_sz; + return 0; +} + +static int aie2_cmdlist_unsupp(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size) +{ + return -EOPNOTSUPP; +} + +static u32 aie2_get_chain_msg_op(u32 cmd_op) +{ + switch (cmd_op) { case ERT_START_CU: - if (unlikely(payload_len > sizeof(req.ebuf.payload))) - XDNA_DBG(xdna, "Invalid ebuf payload len: %d", payload_len); - req.ebuf.cu_idx = cu_idx; - memcpy(req.ebuf.payload, payload, sizeof(req.ebuf.payload)); - msg.send_size = sizeof(req.ebuf); - 
msg.opcode = MSG_OP_EXECUTE_BUFFER_CF; - break; - case ERT_START_NPU: { - struct amdxdna_cmd_start_npu *sn = payload; - - if (unlikely(payload_len - sizeof(*sn) > sizeof(req.dpu.payload))) - XDNA_DBG(xdna, "Invalid dpu payload len: %d", payload_len); - req.dpu.inst_buf_addr = sn->buffer; - req.dpu.inst_size = sn->buffer_size; - req.dpu.inst_prop_cnt = sn->prop_count; - req.dpu.cu_idx = cu_idx; - memcpy(req.dpu.payload, sn->prop_args, sizeof(req.dpu.payload)); - msg.send_size = sizeof(req.dpu); - msg.opcode = MSG_OP_EXEC_DPU; + return MSG_OP_CHAIN_EXEC_BUFFER_CF; + case ERT_START_NPU: + return MSG_OP_CHAIN_EXEC_DPU; + default: break; } - default: - XDNA_DBG(xdna, "Invalid ERT cmd op code: %d", op); + + return MSG_OP_MAX_OPCODE; +} + +static struct aie2_exec_msg_ops legacy_exec_message_ops = { + .init_cu_req = aie2_init_exec_cu_req, + .init_dpu_req = aie2_init_exec_dpu_req, + .init_chain_req = aie2_init_exec_chain_req, + .fill_cf_slot = aie2_cmdlist_fill_cf, + .fill_dpu_slot = aie2_cmdlist_fill_dpu, + .fill_preempt_slot = aie2_cmdlist_unsupp, + .fill_elf_slot = aie2_cmdlist_unsupp, + .get_chain_msg_op = aie2_get_chain_msg_op, +}; + +static int +aie2_cmdlist_fill_npu_cf(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size) +{ + struct cmd_chain_slot_npu *npu_slot = slot; + u32 cmd_len; + void *cmd; + + cmd = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); + if (*size < sizeof(*npu_slot) + cmd_len) return -EINVAL; - } - msg.handle = job; - msg.notify_cb = notify_cb; - msg.send_data = (u8 *)&req; - print_hex_dump_debug("cmd: ", DUMP_PREFIX_OFFSET, 16, 4, &req, - 0x40, false); - ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT); - if (ret) { - XDNA_ERR(xdna, "Send message failed"); - return ret; - } + npu_slot->cu_idx = amdxdna_cmd_get_cu_idx(cmd_bo); + if (npu_slot->cu_idx == INVALID_CU_IDX) + return -EINVAL; + memset(npu_slot, 0, sizeof(*npu_slot)); + npu_slot->type = EXEC_NPU_TYPE_NON_ELF; + npu_slot->arg_cnt = cmd_len / sizeof(u32); + memcpy(npu_slot->args, cmd, cmd_len); + + *size = sizeof(*npu_slot) + cmd_len; return 0; } static int -aie2_cmdlist_fill_one_slot_cf(void *cmd_buf, u32 offset, - struct amdxdna_gem_obj *abo, u32 *size) +aie2_cmdlist_fill_npu_dpu(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size) { - struct cmd_chain_slot_execbuf_cf *buf = cmd_buf + offset; - int cu_idx = amdxdna_cmd_get_cu_idx(abo); - u32 payload_len; - void *payload; + struct cmd_chain_slot_npu *npu_slot = slot; + struct amdxdna_cmd_start_npu *sn; + u32 cmd_len; + u32 arg_sz; - if (cu_idx < 0) + sn = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); + arg_sz = cmd_len - sizeof(*sn); + if (cmd_len < sizeof(*sn) || arg_sz > MAX_NPU_ARGS_SIZE) return -EINVAL; - payload = amdxdna_cmd_get_payload(abo, &payload_len); - if (!payload) + if (*size < sizeof(*npu_slot) + arg_sz) return -EINVAL; - if (!slot_has_space(*buf, offset, payload_len)) - return -ENOSPC; + npu_slot->cu_idx = amdxdna_cmd_get_cu_idx(cmd_bo); + if (npu_slot->cu_idx == INVALID_CU_IDX) + return -EINVAL; + + memset(npu_slot, 0, sizeof(*npu_slot)); + npu_slot->type = EXEC_NPU_TYPE_PARTIAL_ELF; + npu_slot->inst_buf_addr = sn->buffer; + npu_slot->inst_size = sn->buffer_size; + npu_slot->inst_prop_cnt = sn->prop_count; + npu_slot->arg_cnt = arg_sz / sizeof(u32); + memcpy(npu_slot->args, sn->prop_args, arg_sz); - buf->cu_idx = cu_idx; - buf->arg_cnt = payload_len / sizeof(u32); - memcpy(buf->args, payload, payload_len); - /* Accurate buf size to hint firmware to do necessary copy */ - *size = sizeof(*buf) + payload_len; + *size = sizeof(*npu_slot) + 
arg_sz; return 0; } static int -aie2_cmdlist_fill_one_slot_dpu(void *cmd_buf, u32 offset, - struct amdxdna_gem_obj *abo, u32 *size) +aie2_cmdlist_fill_npu_preempt(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size) { - struct cmd_chain_slot_dpu *buf = cmd_buf + offset; - int cu_idx = amdxdna_cmd_get_cu_idx(abo); - struct amdxdna_cmd_start_npu *sn; - u32 payload_len; - void *payload; + struct cmd_chain_slot_npu *npu_slot = slot; + struct amdxdna_cmd_preempt_data *pd; + u32 cmd_len; u32 arg_sz; - if (cu_idx < 0) + pd = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); + arg_sz = cmd_len - sizeof(*pd); + if (cmd_len < sizeof(*pd) || arg_sz > MAX_NPU_ARGS_SIZE) return -EINVAL; - payload = amdxdna_cmd_get_payload(abo, &payload_len); - if (!payload) + if (*size < sizeof(*npu_slot) + arg_sz) return -EINVAL; - sn = payload; - arg_sz = payload_len - sizeof(*sn); - if (payload_len < sizeof(*sn) || arg_sz > MAX_DPU_ARGS_SIZE) + + npu_slot->cu_idx = amdxdna_cmd_get_cu_idx(cmd_bo); + if (npu_slot->cu_idx == INVALID_CU_IDX) return -EINVAL; - if (!slot_has_space(*buf, offset, arg_sz)) - return -ENOSPC; + memset(npu_slot, 0, sizeof(*npu_slot)); + npu_slot->type = EXEC_NPU_TYPE_PREEMPT; + npu_slot->inst_buf_addr = pd->inst_buf; + npu_slot->save_buf_addr = pd->save_buf; + npu_slot->restore_buf_addr = pd->restore_buf; + npu_slot->inst_size = pd->inst_size; + npu_slot->save_size = pd->save_size; + npu_slot->restore_size = pd->restore_size; + npu_slot->inst_prop_cnt = pd->inst_prop_cnt; + npu_slot->arg_cnt = arg_sz / sizeof(u32); + memcpy(npu_slot->args, pd->prop_args, arg_sz); + + *size = sizeof(*npu_slot) + arg_sz; + return 0; +} + +static int +aie2_cmdlist_fill_npu_elf(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size) +{ + struct cmd_chain_slot_npu *npu_slot = slot; + struct amdxdna_cmd_preempt_data *pd; + u32 cmd_len; + u32 arg_sz; + + pd = amdxdna_cmd_get_payload(cmd_bo, &cmd_len); + arg_sz = cmd_len - sizeof(*pd); + if (cmd_len < sizeof(*pd) || arg_sz > MAX_NPU_ARGS_SIZE) + return -EINVAL; - buf->inst_buf_addr = sn->buffer; - buf->inst_size = sn->buffer_size; - buf->inst_prop_cnt = sn->prop_count; - buf->cu_idx = cu_idx; - buf->arg_cnt = arg_sz / sizeof(u32); - memcpy(buf->args, sn->prop_args, arg_sz); + if (*size < sizeof(*npu_slot) + arg_sz) + return -EINVAL; - /* Accurate buf size to hint firmware to do necessary copy */ - *size = sizeof(*buf) + arg_sz; + memset(npu_slot, 0, sizeof(*npu_slot)); + npu_slot->type = EXEC_NPU_TYPE_ELF; + npu_slot->inst_buf_addr = pd->inst_buf; + npu_slot->save_buf_addr = pd->save_buf; + npu_slot->restore_buf_addr = pd->restore_buf; + npu_slot->inst_size = pd->inst_size; + npu_slot->save_size = pd->save_size; + npu_slot->restore_size = pd->restore_size; + npu_slot->inst_prop_cnt = pd->inst_prop_cnt; + npu_slot->arg_cnt = 1; + npu_slot->args[0] = AIE2_EXEC_BUFFER_KERNEL_OP_TXN; + + *size = struct_size(npu_slot, args, npu_slot->arg_cnt); return 0; } -static int -aie2_cmdlist_fill_one_slot(u32 op, struct amdxdna_gem_obj *cmdbuf_abo, u32 offset, - struct amdxdna_gem_obj *abo, u32 *size) +static u32 aie2_get_npu_chain_msg_op(u32 cmd_op) +{ + return MSG_OP_CHAIN_EXEC_NPU; +} + +static struct aie2_exec_msg_ops npu_exec_message_ops = { + .init_cu_req = aie2_init_exec_cu_req, + .init_dpu_req = aie2_init_exec_dpu_req, + .init_chain_req = aie2_init_npu_chain_req, + .fill_cf_slot = aie2_cmdlist_fill_npu_cf, + .fill_dpu_slot = aie2_cmdlist_fill_npu_dpu, + .fill_preempt_slot = aie2_cmdlist_fill_npu_preempt, + .fill_elf_slot = aie2_cmdlist_fill_npu_elf, + .get_chain_msg_op = 
aie2_get_npu_chain_msg_op, +}; + +static int aie2_init_exec_req(void *req, struct amdxdna_gem_obj *cmd_abo, + size_t *size, u32 *msg_op) { - u32 this_op = amdxdna_cmd_get_op(abo); - void *cmd_buf = cmdbuf_abo->mem.kva; + struct amdxdna_dev *xdna = cmd_abo->client->xdna; int ret; + u32 op; - if (this_op != op) { - ret = -EINVAL; - goto done; - } + op = amdxdna_cmd_get_op(cmd_abo); switch (op) { case ERT_START_CU: - ret = aie2_cmdlist_fill_one_slot_cf(cmd_buf, offset, abo, size); + ret = EXEC_MSG_OPS(xdna)->init_cu_req(cmd_abo, req, size, msg_op); + if (ret) { + XDNA_DBG(xdna, "Init CU req failed ret %d", ret); + return ret; + } break; case ERT_START_NPU: - ret = aie2_cmdlist_fill_one_slot_dpu(cmd_buf, offset, abo, size); + ret = EXEC_MSG_OPS(xdna)->init_dpu_req(cmd_abo, req, size, msg_op); + if (ret) { + XDNA_DBG(xdna, "Init DPU req failed ret %d", ret); + return ret; + } + break; default: + XDNA_ERR(xdna, "Unsupported op %d", op); ret = -EOPNOTSUPP; + break; } -done: - if (ret) { - XDNA_ERR(abo->client->xdna, "Can't fill slot for cmd op %d ret %d", - op, ret); + return ret; +} + +static int +aie2_cmdlist_fill_slot(void *slot, struct amdxdna_gem_obj *cmd_abo, + size_t *size, u32 *cmd_op) +{ + struct amdxdna_dev *xdna = cmd_abo->client->xdna; + int ret; + u32 op; + + op = amdxdna_cmd_get_op(cmd_abo); + if (*cmd_op == ERT_INVALID_CMD) + *cmd_op = op; + else if (op != *cmd_op) + return -EINVAL; + + switch (op) { + case ERT_START_CU: + ret = EXEC_MSG_OPS(xdna)->fill_cf_slot(cmd_abo, slot, size); + break; + case ERT_START_NPU: + ret = EXEC_MSG_OPS(xdna)->fill_dpu_slot(cmd_abo, slot, size); + break; + case ERT_START_NPU_PREEMPT: + if (!AIE2_FEATURE_ON(xdna->dev_handle, AIE2_PREEMPT)) + return -EOPNOTSUPP; + ret = EXEC_MSG_OPS(xdna)->fill_preempt_slot(cmd_abo, slot, size); + break; + case ERT_START_NPU_PREEMPT_ELF: + if (!AIE2_FEATURE_ON(xdna->dev_handle, AIE2_PREEMPT)) + return -EOPNOTSUPP; + ret = EXEC_MSG_OPS(xdna)->fill_elf_slot(cmd_abo, slot, size); + break; + default: + XDNA_INFO(xdna, "Unsupported op %d", op); + ret = -EOPNOTSUPP; + break; } + return ret; } +void aie2_msg_init(struct amdxdna_dev_hdl *ndev) +{ + if (AIE2_FEATURE_ON(ndev, AIE2_NPU_COMMAND)) + ndev->exec_msg_ops = &npu_exec_message_ops; + else + ndev->exec_msg_ops = &legacy_exec_message_ops; +} + static inline struct amdxdna_gem_obj * aie2_cmdlist_get_cmd_buf(struct amdxdna_sched_job *job) { @@ -616,29 +869,36 @@ aie2_cmdlist_get_cmd_buf(struct amdxdna_sched_job *job) return job->hwctx->priv->cmd_buf[idx]; } -static void -aie2_cmdlist_prepare_request(struct cmd_chain_req *req, - struct amdxdna_gem_obj *cmdbuf_abo, u32 size, u32 cnt) +int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, + int (*notify_cb)(void *, void __iomem *, size_t)) { - req->buf_addr = cmdbuf_abo->mem.dev_addr; - req->buf_size = size; - req->count = cnt; - drm_clflush_virt_range(cmdbuf_abo->mem.kva, size); - XDNA_DBG(cmdbuf_abo->client->xdna, "Command buf addr 0x%llx size 0x%x count %d", - req->buf_addr, size, cnt); -} + struct mailbox_channel *chann = hwctx->priv->mbox_chann; + struct amdxdna_dev *xdna = hwctx->client->xdna; + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo; + struct xdna_mailbox_msg msg; + union exec_req req; + int ret; -static inline u32 -aie2_cmd_op_to_msg_op(u32 op) -{ - switch (op) { - case ERT_START_CU: - return MSG_OP_CHAIN_EXEC_BUFFER_CF; - case ERT_START_NPU: - return MSG_OP_CHAIN_EXEC_DPU; - default: - return MSG_OP_MAX_OPCODE; + if (!chann) + return -ENODEV; + + ret = aie2_init_exec_req(&req, cmd_abo, 
&msg.send_size, &msg.opcode); + if (ret) + return ret; + + msg.handle = job; + msg.notify_cb = notify_cb; + msg.send_data = (u8 *)&req; + print_hex_dump_debug("cmd: ", DUMP_PREFIX_OFFSET, 16, 4, &req, + 0x40, false); + + ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT); + if (ret) { + XDNA_ERR(xdna, "Send message failed"); + return ret; } + + return 0; } int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx, @@ -649,12 +909,13 @@ int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx, struct mailbox_channel *chann = hwctx->priv->mbox_chann; struct amdxdna_client *client = hwctx->client; struct amdxdna_gem_obj *cmd_abo = job->cmd_bo; + struct amdxdna_dev *xdna = client->xdna; struct amdxdna_cmd_chain *payload; struct xdna_mailbox_msg msg; - struct cmd_chain_req req; + union exec_chain_req req; u32 payload_len; u32 offset = 0; - u32 size; + size_t size; int ret; u32 op; u32 i; @@ -665,41 +926,42 @@ int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx, payload_len < struct_size(payload, data, payload->command_count)) return -EINVAL; + op = ERT_INVALID_CMD; for (i = 0; i < payload->command_count; i++) { u32 boh = (u32)(payload->data[i]); struct amdxdna_gem_obj *abo; abo = amdxdna_gem_get_obj(client, boh, AMDXDNA_BO_CMD); if (!abo) { - XDNA_ERR(client->xdna, "Failed to find cmd BO %d", boh); + XDNA_ERR(xdna, "Failed to find cmd BO %d", boh); return -ENOENT; } - /* All sub-cmd should have same op, use the first one. */ - if (i == 0) - op = amdxdna_cmd_get_op(abo); - - ret = aie2_cmdlist_fill_one_slot(op, cmdbuf_abo, offset, abo, &size); + size = cmdbuf_abo->mem.size - offset; + ret = aie2_cmdlist_fill_slot(cmdbuf_abo->mem.kva + offset, + abo, &size, &op); amdxdna_gem_put_obj(abo); if (ret) - return -EINVAL; + return ret; offset += size; } + msg.opcode = EXEC_MSG_OPS(xdna)->get_chain_msg_op(op); + if (msg.opcode == MSG_OP_MAX_OPCODE) + return -EOPNOTSUPP; /* The offset is the accumulated total size of the cmd buffer */ - aie2_cmdlist_prepare_request(&req, cmdbuf_abo, offset, payload->command_count); + EXEC_MSG_OPS(xdna)->init_chain_req(&req, cmdbuf_abo->mem.dev_addr, + offset, payload->command_count); + drm_clflush_virt_range(cmdbuf_abo->mem.kva, offset); - msg.opcode = aie2_cmd_op_to_msg_op(op); - if (msg.opcode == MSG_OP_MAX_OPCODE) - return -EOPNOTSUPP; msg.handle = job; msg.notify_cb = notify_cb; msg.send_data = (u8 *)&req; msg.send_size = sizeof(req); ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT); if (ret) { - XDNA_ERR(hwctx->client->xdna, "Send message failed"); + XDNA_ERR(xdna, "Send message failed"); return ret; } @@ -712,23 +974,27 @@ int aie2_cmdlist_single_execbuf(struct amdxdna_hwctx *hwctx, { struct amdxdna_gem_obj *cmdbuf_abo = aie2_cmdlist_get_cmd_buf(job); struct mailbox_channel *chann = hwctx->priv->mbox_chann; + struct amdxdna_dev *xdna = hwctx->client->xdna; struct amdxdna_gem_obj *cmd_abo = job->cmd_bo; struct xdna_mailbox_msg msg; - struct cmd_chain_req req; - u32 size; + union exec_chain_req req; + u32 op = ERT_INVALID_CMD; + size_t size; int ret; - u32 op; - op = amdxdna_cmd_get_op(cmd_abo); - ret = aie2_cmdlist_fill_one_slot(op, cmdbuf_abo, 0, cmd_abo, &size); + size = cmdbuf_abo->mem.size; + ret = aie2_cmdlist_fill_slot(cmdbuf_abo->mem.kva, cmd_abo, &size, &op); if (ret) return ret; - aie2_cmdlist_prepare_request(&req, cmdbuf_abo, size, 1); - - msg.opcode = aie2_cmd_op_to_msg_op(op); + msg.opcode = EXEC_MSG_OPS(xdna)->get_chain_msg_op(op); if (msg.opcode == MSG_OP_MAX_OPCODE) return -EOPNOTSUPP; + + EXEC_MSG_OPS(xdna)->init_chain_req(&req, 
cmdbuf_abo->mem.dev_addr, + size, 1); + drm_clflush_virt_range(cmdbuf_abo->mem.kva, size); + msg.handle = job; msg.notify_cb = notify_cb; msg.send_data = (u8 *)&req; @@ -753,7 +1019,7 @@ int aie2_sync_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, int ret = 0; req.src_addr = 0; - req.dst_addr = abo->mem.dev_addr - hwctx->client->dev_heap->mem.dev_addr; + req.dst_addr = amdxdna_dev_bo_offset(abo); req.size = abo->mem.size; /* Device to Host */ @@ -777,3 +1043,32 @@ int aie2_sync_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, return 0; } + +int aie2_config_debug_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, + int (*notify_cb)(void *, void __iomem *, size_t)) +{ + struct mailbox_channel *chann = hwctx->priv->mbox_chann; + struct amdxdna_gem_obj *abo = to_xdna_obj(job->bos[0]); + struct amdxdna_dev *xdna = hwctx->client->xdna; + struct config_debug_bo_req req; + struct xdna_mailbox_msg msg; + + if (job->drv_cmd->opcode == ATTACH_DEBUG_BO) + req.config = DEBUG_BO_REGISTER; + else + req.config = DEBUG_BO_UNREGISTER; + + req.offset = amdxdna_dev_bo_offset(abo); + req.size = abo->mem.size; + + XDNA_DBG(xdna, "offset 0x%llx size 0x%llx config %d", + req.offset, req.size, req.config); + + msg.handle = job; + msg.notify_cb = notify_cb; + msg.send_data = (u8 *)&req; + msg.send_size = sizeof(req); + msg.opcode = MSG_OP_CONFIG_DEBUG_BO; + + return xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT); +} diff --git a/drivers/accel/amdxdna/aie2_msg_priv.h b/drivers/accel/amdxdna/aie2_msg_priv.h index 6df9065b13f6..1c957a6298d3 100644 --- a/drivers/accel/amdxdna/aie2_msg_priv.h +++ b/drivers/accel/amdxdna/aie2_msg_priv.h @@ -9,7 +9,8 @@ enum aie2_msg_opcode { MSG_OP_CREATE_CONTEXT = 0x2, MSG_OP_DESTROY_CONTEXT = 0x3, - MSG_OP_SYNC_BO = 0x7, + MSG_OP_GET_TELEMETRY = 0x4, + MSG_OP_SYNC_BO = 0x7, MSG_OP_EXECUTE_BUFFER_CF = 0xC, MSG_OP_QUERY_COL_STATUS = 0xD, MSG_OP_QUERY_AIE_TILE_INFO = 0xE, @@ -18,6 +19,8 @@ enum aie2_msg_opcode { MSG_OP_CONFIG_CU = 0x11, MSG_OP_CHAIN_EXEC_BUFFER_CF = 0x12, MSG_OP_CHAIN_EXEC_DPU = 0x13, + MSG_OP_CONFIG_DEBUG_BO = 0x14, + MSG_OP_CHAIN_EXEC_NPU = 0x18, MSG_OP_MAX_XRT_OPCODE, MSG_OP_SUSPEND = 0x101, MSG_OP_RESUME = 0x102, @@ -135,6 +138,28 @@ struct destroy_ctx_resp { enum aie2_msg_status status; } __packed; +enum telemetry_type { + TELEMETRY_TYPE_DISABLED, + TELEMETRY_TYPE_HEALTH, + TELEMETRY_TYPE_ERROR_INFO, + TELEMETRY_TYPE_PROFILING, + TELEMETRY_TYPE_DEBUG, + MAX_TELEMETRY_TYPE +}; + +struct get_telemetry_req { + enum telemetry_type type; + __u64 buf_addr; + __u32 buf_size; +} __packed; + +struct get_telemetry_resp { + __u32 major; + __u32 minor; + __u32 size; + enum aie2_msg_status status; +} __packed; + struct execute_buffer_req { __u32 cu_idx; __u32 payload[19]; @@ -148,6 +173,18 @@ struct exec_dpu_req { __u32 payload[35]; } __packed; +enum exec_npu_type { + EXEC_NPU_TYPE_NON_ELF = 0x1, + EXEC_NPU_TYPE_PARTIAL_ELF = 0x2, + EXEC_NPU_TYPE_PREEMPT = 0x3, + EXEC_NPU_TYPE_ELF = 0x4, +}; + +union exec_req { + struct execute_buffer_req ebuf; + struct exec_dpu_req dpu_req; +}; + struct execute_buffer_resp { enum aie2_msg_status status; } __packed; @@ -319,9 +356,6 @@ struct async_event_msg_resp { } __packed; #define MAX_CHAIN_CMDBUF_SIZE SZ_4K -#define slot_has_space(slot, offset, payload_size) \ - (MAX_CHAIN_CMDBUF_SIZE >= (offset) + (payload_size) + \ - sizeof(typeof(slot))) struct cmd_chain_slot_execbuf_cf { __u32 cu_idx; @@ -339,12 +373,41 @@ struct cmd_chain_slot_dpu { __u32 args[] __counted_by(arg_cnt); }; +#define 
MAX_NPU_ARGS_SIZE (26 * sizeof(__u32)) +#define AIE2_EXEC_BUFFER_KERNEL_OP_TXN 3 +struct cmd_chain_slot_npu { + enum exec_npu_type type; + u64 inst_buf_addr; + u64 save_buf_addr; + u64 restore_buf_addr; + u32 inst_size; + u32 save_size; + u32 restore_size; + u32 inst_prop_cnt; + u32 cu_idx; + u32 arg_cnt; + u32 args[] __counted_by(arg_cnt); +} __packed; + struct cmd_chain_req { __u64 buf_addr; __u32 buf_size; __u32 count; } __packed; +struct cmd_chain_npu_req { + u32 flags; + u32 reserved; + u64 buf_addr; + u32 buf_size; + u32 count; +} __packed; + +union exec_chain_req { + struct cmd_chain_npu_req npu_req; + struct cmd_chain_req req; +}; + struct cmd_chain_resp { enum aie2_msg_status status; __u32 fail_cmd_idx; @@ -365,4 +428,21 @@ struct sync_bo_req { struct sync_bo_resp { enum aie2_msg_status status; } __packed; + +#define DEBUG_BO_UNREGISTER 0 +#define DEBUG_BO_REGISTER 1 +struct config_debug_bo_req { + __u64 offset; + __u64 size; + /* + * config operations. + * DEBUG_BO_REGISTER: Register debug buffer + * DEBUG_BO_UNREGISTER: Unregister debug buffer + */ + __u32 config; +} __packed; + +struct config_debug_bo_resp { + enum aie2_msg_status status; +} __packed; #endif /* _AIE2_MSG_PRIV_H_ */ diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c index 87c425e3d2b9..ceef1c502e9e 100644 --- a/drivers/accel/amdxdna/aie2_pci.c +++ b/drivers/accel/amdxdna/aie2_pci.c @@ -25,6 +25,7 @@ #include "amdxdna_gem.h" #include "amdxdna_mailbox.h" #include "amdxdna_pci_drv.h" +#include "amdxdna_pm.h" static int aie2_max_col = XRS_MAX_COL; module_param(aie2_max_col, uint, 0600); @@ -54,6 +55,7 @@ struct mgmt_mbox_chann_info { static int aie2_check_protocol(struct amdxdna_dev_hdl *ndev, u32 fw_major, u32 fw_minor) { + const struct aie2_fw_feature_tbl *feature; struct amdxdna_dev *xdna = ndev->xdna; /* @@ -77,6 +79,17 @@ static int aie2_check_protocol(struct amdxdna_dev_hdl *ndev, u32 fw_major, u32 f XDNA_ERR(xdna, "Firmware minor version smaller than supported"); return -EINVAL; } + + for (feature = ndev->priv->fw_feature_tbl; feature && feature->min_minor; + feature++) { + if (fw_minor < feature->min_minor) + continue; + if (feature->max_minor > 0 && fw_minor > feature->max_minor) + continue; + + set_bit(feature->feature, &ndev->feature_mask); + } + return 0; } @@ -170,6 +183,10 @@ int aie2_runtime_cfg(struct amdxdna_dev_hdl *ndev, if (cfg->category != category) continue; + if (cfg->feature_mask && + bitmap_subset(&cfg->feature_mask, &ndev->feature_mask, AIE2_FEATURE_MAX)) + continue; + value = val ? 
*val : cfg->value; ret = aie2_set_runtime_cfg(ndev, cfg->type, value); if (ret) { @@ -223,15 +240,6 @@ static int aie2_mgmt_fw_init(struct amdxdna_dev_hdl *ndev) return ret; } - if (!ndev->async_events) - return 0; - - ret = aie2_error_async_events_send(ndev); - if (ret) { - XDNA_ERR(ndev->xdna, "Send async events failed"); - return ret; - } - return 0; } @@ -257,6 +265,8 @@ static int aie2_mgmt_fw_query(struct amdxdna_dev_hdl *ndev) return ret; } + ndev->total_col = min(aie2_max_col, ndev->metadata.cols); + return 0; } @@ -338,6 +348,7 @@ static void aie2_hw_stop(struct amdxdna_dev *xdna) ndev->mbox = NULL; aie2_psp_stop(ndev->psp_hdl); aie2_smu_fini(ndev); + aie2_error_async_events_free(ndev); pci_disable_device(pdev); ndev->dev_status = AIE2_DEV_INIT; @@ -424,6 +435,18 @@ static int aie2_hw_start(struct amdxdna_dev *xdna) goto destroy_mgmt_chann; } + ret = aie2_mgmt_fw_query(ndev); + if (ret) { + XDNA_ERR(xdna, "failed to query fw, ret %d", ret); + goto destroy_mgmt_chann; + } + + ret = aie2_error_async_events_alloc(ndev); + if (ret) { + XDNA_ERR(xdna, "Allocate async events failed, ret %d", ret); + goto destroy_mgmt_chann; + } + ndev->dev_status = AIE2_DEV_START; return 0; @@ -459,7 +482,6 @@ static int aie2_hw_resume(struct amdxdna_dev *xdna) struct amdxdna_client *client; int ret; - guard(mutex)(&xdna->dev_lock); ret = aie2_hw_start(xdna); if (ret) { XDNA_ERR(xdna, "Start hardware failed, %d", ret); @@ -565,13 +587,6 @@ static int aie2_init(struct amdxdna_dev *xdna) goto release_fw; } - ret = aie2_mgmt_fw_query(ndev); - if (ret) { - XDNA_ERR(xdna, "Query firmware failed, ret %d", ret); - goto stop_hw; - } - ndev->total_col = min(aie2_max_col, ndev->metadata.cols); - xrs_cfg.clk_list.num_levels = ndev->max_dpm_level + 1; for (i = 0; i < xrs_cfg.clk_list.num_levels; i++) xrs_cfg.clk_list.cu_clk_list[i] = ndev->priv->dpm_clk_tbl[i].hclk; @@ -587,30 +602,11 @@ static int aie2_init(struct amdxdna_dev *xdna) goto stop_hw; } - ret = aie2_error_async_events_alloc(ndev); - if (ret) { - XDNA_ERR(xdna, "Allocate async events failed, ret %d", ret); - goto stop_hw; - } - - ret = aie2_error_async_events_send(ndev); - if (ret) { - XDNA_ERR(xdna, "Send async events failed, ret %d", ret); - goto async_event_free; - } - - /* Issue a command to make sure firmware handled async events */ - ret = aie2_query_firmware_version(ndev, &ndev->xdna->fw_ver); - if (ret) { - XDNA_ERR(xdna, "Re-query firmware version failed"); - goto async_event_free; - } - release_firmware(fw); + aie2_msg_init(ndev); + amdxdna_pm_init(xdna); return 0; -async_event_free: - aie2_error_async_events_free(ndev); stop_hw: aie2_hw_stop(xdna); release_fw: @@ -621,10 +617,8 @@ release_fw: static void aie2_fini(struct amdxdna_dev *xdna) { - struct amdxdna_dev_hdl *ndev = xdna->dev_handle; - + amdxdna_pm_fini(xdna); aie2_hw_stop(xdna); - aie2_error_async_events_free(ndev); } static int aie2_get_aie_status(struct amdxdna_client *client, @@ -845,7 +839,120 @@ static int aie2_get_hwctx_status(struct amdxdna_client *client, } args->buffer_size -= (u32)(array_args.buffer - args->buffer); - return ret; + return 0; +} + +static int aie2_query_resource_info(struct amdxdna_client *client, + struct amdxdna_drm_get_info *args) +{ + struct amdxdna_drm_get_resource_info res_info; + const struct amdxdna_dev_priv *priv; + struct amdxdna_dev_hdl *ndev; + struct amdxdna_dev *xdna; + + xdna = client->xdna; + ndev = xdna->dev_handle; + priv = ndev->priv; + + res_info.npu_clk_max = priv->dpm_clk_tbl[ndev->max_dpm_level].hclk; + res_info.npu_tops_max = 
ndev->max_tops; + res_info.npu_task_max = priv->hwctx_limit; + res_info.npu_tops_curr = ndev->curr_tops; + res_info.npu_task_curr = ndev->hwctx_num; + + if (copy_to_user(u64_to_user_ptr(args->buffer), &res_info, sizeof(res_info))) + return -EFAULT; + + return 0; +} + +static int aie2_fill_hwctx_map(struct amdxdna_hwctx *hwctx, void *arg) +{ + struct amdxdna_dev *xdna = hwctx->client->xdna; + u32 *map = arg; + + if (hwctx->fw_ctx_id >= xdna->dev_handle->priv->hwctx_limit) { + XDNA_ERR(xdna, "Invalid fw ctx id %d/%d ", hwctx->fw_ctx_id, + xdna->dev_handle->priv->hwctx_limit); + return -EINVAL; + } + + map[hwctx->fw_ctx_id] = hwctx->id; + return 0; +} + +static int aie2_get_telemetry(struct amdxdna_client *client, + struct amdxdna_drm_get_info *args) +{ + struct amdxdna_drm_query_telemetry_header *header __free(kfree) = NULL; + u32 telemetry_data_sz, header_sz, elem_num; + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_client *tmp_client; + int ret; + + elem_num = xdna->dev_handle->priv->hwctx_limit; + header_sz = struct_size(header, map, elem_num); + if (args->buffer_size <= header_sz) { + XDNA_ERR(xdna, "Invalid buffer size"); + return -EINVAL; + } + + telemetry_data_sz = args->buffer_size - header_sz; + if (telemetry_data_sz > SZ_4M) { + XDNA_ERR(xdna, "Buffer size is too big, %d", telemetry_data_sz); + return -EINVAL; + } + + header = kzalloc(header_sz, GFP_KERNEL); + if (!header) + return -ENOMEM; + + if (copy_from_user(header, u64_to_user_ptr(args->buffer), sizeof(*header))) { + XDNA_ERR(xdna, "Failed to copy telemetry header from user"); + return -EFAULT; + } + + header->map_num_elements = elem_num; + list_for_each_entry(tmp_client, &xdna->client_list, node) { + ret = amdxdna_hwctx_walk(tmp_client, &header->map, + aie2_fill_hwctx_map); + if (ret) + return ret; + } + + ret = aie2_query_telemetry(xdna->dev_handle, + u64_to_user_ptr(args->buffer + header_sz), + telemetry_data_sz, header); + if (ret) { + XDNA_ERR(xdna, "Query telemetry failed ret %d", ret); + return ret; + } + + if (copy_to_user(u64_to_user_ptr(args->buffer), header, header_sz)) { + XDNA_ERR(xdna, "Copy header failed"); + return -EFAULT; + } + + return 0; +} + +static int aie2_get_preempt_state(struct amdxdna_client *client, + struct amdxdna_drm_get_info *args) +{ + struct amdxdna_drm_attribute_state state = {}; + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_dev_hdl *ndev; + + ndev = xdna->dev_handle; + if (args->param == DRM_AMDXDNA_GET_FORCE_PREEMPT_STATE) + state.state = ndev->force_preempt_enabled; + else if (args->param == DRM_AMDXDNA_GET_FRAME_BOUNDARY_PREEMPT_STATE) + state.state = ndev->frame_boundary_preempt; + + if (copy_to_user(u64_to_user_ptr(args->buffer), &state, sizeof(state))) + return -EFAULT; + + return 0; } static int aie2_get_info(struct amdxdna_client *client, struct amdxdna_drm_get_info *args) @@ -856,6 +963,10 @@ static int aie2_get_info(struct amdxdna_client *client, struct amdxdna_drm_get_i if (!drm_dev_enter(&xdna->ddev, &idx)) return -ENODEV; + ret = amdxdna_pm_resume_get(xdna); + if (ret) + goto dev_exit; + switch (args->param) { case DRM_AMDXDNA_QUERY_AIE_STATUS: ret = aie2_get_aie_status(client, args); @@ -878,12 +989,25 @@ static int aie2_get_info(struct amdxdna_client *client, struct amdxdna_drm_get_i case DRM_AMDXDNA_GET_POWER_MODE: ret = aie2_get_power_mode(client, args); break; + case DRM_AMDXDNA_QUERY_TELEMETRY: + ret = aie2_get_telemetry(client, args); + break; + case DRM_AMDXDNA_QUERY_RESOURCE_INFO: + ret = aie2_query_resource_info(client, args); + break; + case 
DRM_AMDXDNA_GET_FORCE_PREEMPT_STATE: + case DRM_AMDXDNA_GET_FRAME_BOUNDARY_PREEMPT_STATE: + ret = aie2_get_preempt_state(client, args); + break; default: XDNA_ERR(xdna, "Not supported request parameter %u", args->param); ret = -EOPNOTSUPP; } + + amdxdna_pm_suspend_put(xdna); XDNA_DBG(xdna, "Got param %d", args->param); +dev_exit: drm_dev_exit(idx); return ret; } @@ -898,6 +1022,12 @@ static int aie2_query_ctx_status_array(struct amdxdna_client *client, drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + if (args->element_size > SZ_4K || args->num_element > SZ_1K) { + XDNA_DBG(xdna, "Invalid element size %d or number of element %d", + args->element_size, args->num_element); + return -EINVAL; + } + array_args.element_size = min(args->element_size, sizeof(struct amdxdna_drm_hwctx_entry)); array_args.buffer = args->buffer; @@ -914,7 +1044,7 @@ static int aie2_query_ctx_status_array(struct amdxdna_client *client, args->num_element = (u32)((array_args.buffer - args->buffer) / args->element_size); - return ret; + return 0; } static int aie2_get_array(struct amdxdna_client *client, @@ -926,16 +1056,26 @@ static int aie2_get_array(struct amdxdna_client *client, if (!drm_dev_enter(&xdna->ddev, &idx)) return -ENODEV; + ret = amdxdna_pm_resume_get(xdna); + if (ret) + goto dev_exit; + switch (args->param) { case DRM_AMDXDNA_HW_CONTEXT_ALL: ret = aie2_query_ctx_status_array(client, args); break; + case DRM_AMDXDNA_HW_LAST_ASYNC_ERR: + ret = aie2_get_array_async_error(xdna->dev_handle, args); + break; default: XDNA_ERR(xdna, "Not supported request parameter %u", args->param); ret = -EOPNOTSUPP; } + + amdxdna_pm_suspend_put(xdna); XDNA_DBG(xdna, "Got param %d", args->param); +dev_exit: drm_dev_exit(idx); return ret; } @@ -965,6 +1105,38 @@ static int aie2_set_power_mode(struct amdxdna_client *client, return aie2_pm_set_mode(xdna->dev_handle, power_mode); } +static int aie2_set_preempt_state(struct amdxdna_client *client, + struct amdxdna_drm_set_state *args) +{ + struct amdxdna_dev_hdl *ndev = client->xdna->dev_handle; + struct amdxdna_drm_attribute_state state; + u32 val; + int ret; + + if (copy_from_user(&state, u64_to_user_ptr(args->buffer), sizeof(state))) + return -EFAULT; + + if (state.state > 1) + return -EINVAL; + + if (XDNA_MBZ_DBG(client->xdna, state.pad, sizeof(state.pad))) + return -EINVAL; + + if (args->param == DRM_AMDXDNA_SET_FORCE_PREEMPT) { + ndev->force_preempt_enabled = state.state; + } else if (args->param == DRM_AMDXDNA_SET_FRAME_BOUNDARY_PREEMPT) { + val = state.state; + ret = aie2_runtime_cfg(ndev, AIE2_RT_CFG_FRAME_BOUNDARY_PREEMPT, + &val); + if (ret) + return ret; + + ndev->frame_boundary_preempt = state.state; + } + + return 0; +} + static int aie2_set_state(struct amdxdna_client *client, struct amdxdna_drm_set_state *args) { @@ -974,16 +1146,26 @@ static int aie2_set_state(struct amdxdna_client *client, if (!drm_dev_enter(&xdna->ddev, &idx)) return -ENODEV; + ret = amdxdna_pm_resume_get(xdna); + if (ret) + goto dev_exit; + switch (args->param) { case DRM_AMDXDNA_SET_POWER_MODE: ret = aie2_set_power_mode(client, args); break; + case DRM_AMDXDNA_SET_FORCE_PREEMPT: + case DRM_AMDXDNA_SET_FRAME_BOUNDARY_PREEMPT: + ret = aie2_set_preempt_state(client, args); + break; default: XDNA_ERR(xdna, "Not supported request parameter %u", args->param); ret = -EOPNOTSUPP; break; } + amdxdna_pm_suspend_put(xdna); +dev_exit: drm_dev_exit(idx); return ret; } @@ -998,6 +1180,7 @@ const struct amdxdna_dev_ops aie2_ops = { .hwctx_init = aie2_hwctx_init, .hwctx_fini = aie2_hwctx_fini, 
.hwctx_config = aie2_hwctx_config, + .hwctx_sync_debug_bo = aie2_hwctx_sync_debug_bo, .cmd_submit = aie2_cmd_submit, .hmm_invalidate = aie2_hmm_invalidate, .get_array = aie2_get_array, diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h index 91a8e948f82a..a5f9c42155d1 100644 --- a/drivers/accel/amdxdna/aie2_pci.h +++ b/drivers/accel/amdxdna/aie2_pci.h @@ -110,12 +110,15 @@ struct aie_metadata { enum rt_config_category { AIE2_RT_CFG_INIT, AIE2_RT_CFG_CLK_GATING, + AIE2_RT_CFG_FORCE_PREEMPT, + AIE2_RT_CFG_FRAME_BOUNDARY_PREEMPT, }; struct rt_config { u32 type; u32 value; u32 category; + unsigned long feature_mask; }; struct dpm_clk_freq { @@ -156,6 +159,19 @@ enum aie2_dev_status { AIE2_DEV_START, }; +struct aie2_exec_msg_ops { + int (*init_cu_req)(struct amdxdna_gem_obj *cmd_bo, void *req, + size_t *size, u32 *msg_op); + int (*init_dpu_req)(struct amdxdna_gem_obj *cmd_bo, void *req, + size_t *size, u32 *msg_op); + void (*init_chain_req)(void *req, u64 slot_addr, size_t size, u32 cmd_cnt); + int (*fill_cf_slot)(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size); + int (*fill_dpu_slot)(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size); + int (*fill_preempt_slot)(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size); + int (*fill_elf_slot)(struct amdxdna_gem_obj *cmd_bo, void *slot, size_t *size); + u32 (*get_chain_msg_op)(u32 cmd_op); +}; + struct amdxdna_dev_hdl { struct amdxdna_dev *xdna; const struct amdxdna_dev_priv *priv; @@ -173,6 +189,8 @@ struct amdxdna_dev_hdl { u32 total_col; struct aie_version version; struct aie_metadata metadata; + unsigned long feature_mask; + struct aie2_exec_msg_ops *exec_msg_ops; /* power management and clock*/ enum amdxdna_power_mode_type pw_mode; @@ -182,6 +200,10 @@ struct amdxdna_dev_hdl { u32 clk_gating; u32 npuclk_freq; u32 hclk_freq; + u32 max_tops; + u32 curr_tops; + u32 force_preempt_enabled; + u32 frame_boundary_preempt; /* Mailbox and the management channel */ struct mailbox *mbox; @@ -190,6 +212,8 @@ struct amdxdna_dev_hdl { enum aie2_dev_status dev_status; u32 hwctx_num; + + struct amdxdna_async_error last_async_err; }; #define DEFINE_BAR_OFFSET(reg_name, bar, reg_addr) \ @@ -204,12 +228,27 @@ struct aie2_hw_ops { int (*set_dpm)(struct amdxdna_dev_hdl *ndev, u32 dpm_level); }; +enum aie2_fw_feature { + AIE2_NPU_COMMAND, + AIE2_PREEMPT, + AIE2_FEATURE_MAX +}; + +struct aie2_fw_feature_tbl { + enum aie2_fw_feature feature; + u32 max_minor; + u32 min_minor; +}; + +#define AIE2_FEATURE_ON(ndev, feature) test_bit(feature, &(ndev)->feature_mask) + struct amdxdna_dev_priv { const char *fw_path; u64 protocol_major; u64 protocol_minor; const struct rt_config *rt_config; const struct dpm_clk_freq *dpm_clk_tbl; + const struct aie2_fw_feature_tbl *fw_feature_tbl; #define COL_ALIGN_NONE 0 #define COL_ALIGN_NATURE 1 @@ -217,6 +256,7 @@ struct amdxdna_dev_priv { u32 mbox_dev_addr; /* If mbox_size is 0, use BAR size. 
See MBOX_SIZE macro */ u32 mbox_size; + u32 hwctx_limit; u32 sram_dev_addr; struct aie2_bar_off_pair sram_offs[SRAM_MAX_INDEX]; struct aie2_bar_off_pair psp_regs_off[PSP_MAX_REGS]; @@ -234,6 +274,7 @@ extern const struct dpm_clk_freq npu1_dpm_clk_table[]; extern const struct dpm_clk_freq npu4_dpm_clk_table[]; extern const struct rt_config npu1_default_rt_cfg[]; extern const struct rt_config npu4_default_rt_cfg[]; +extern const struct aie2_fw_feature_tbl npu4_fw_feature_table[]; /* aie2_smu.c */ int aie2_smu_init(struct amdxdna_dev_hdl *ndev); @@ -253,10 +294,12 @@ void aie2_psp_stop(struct psp_device *psp); /* aie2_error.c */ int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev); void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev); -int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev); int aie2_error_async_msg_thread(void *data); +int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev, + struct amdxdna_drm_get_array *args); /* aie2_message.c */ +void aie2_msg_init(struct amdxdna_dev_hdl *ndev); int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev); int aie2_resume_fw(struct amdxdna_dev_hdl *ndev); int aie2_set_runtime_cfg(struct amdxdna_dev_hdl *ndev, u32 type, u64 value); @@ -270,9 +313,13 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx); int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size); int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf, u32 size, u32 *cols_filled); +int aie2_query_telemetry(struct amdxdna_dev_hdl *ndev, + char __user *buf, u32 size, + struct amdxdna_drm_query_telemetry_header *header); int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, u32 size, void *handle, int (*cb)(void*, void __iomem *, size_t)); -int aie2_config_cu(struct amdxdna_hwctx *hwctx); +int aie2_config_cu(struct amdxdna_hwctx *hwctx, + int (*notify_cb)(void *, void __iomem *, size_t)); int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, int (*notify_cb)(void *, void __iomem *, size_t)); int aie2_cmdlist_single_execbuf(struct amdxdna_hwctx *hwctx, @@ -283,11 +330,14 @@ int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx, int (*notify_cb)(void *, void __iomem *, size_t)); int aie2_sync_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, int (*notify_cb)(void *, void __iomem *, size_t)); +int aie2_config_debug_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, + int (*notify_cb)(void *, void __iomem *, size_t)); /* aie2_hwctx.c */ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx); void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx); int aie2_hwctx_config(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size); +int aie2_hwctx_sync_debug_bo(struct amdxdna_hwctx *hwctx, u32 debug_bo_hdl); void aie2_hwctx_suspend(struct amdxdna_client *client); int aie2_hwctx_resume(struct amdxdna_client *client); int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq); diff --git a/drivers/accel/amdxdna/aie2_smu.c b/drivers/accel/amdxdna/aie2_smu.c index d303701b0ded..bd94ee96c2bc 100644 --- a/drivers/accel/amdxdna/aie2_smu.c +++ b/drivers/accel/amdxdna/aie2_smu.c @@ -11,6 +11,7 @@ #include "aie2_pci.h" #include "amdxdna_pci_drv.h" +#include "amdxdna_pm.h" #define SMU_RESULT_OK 1 @@ -22,6 +23,13 @@ #define AIE2_SMU_SET_SOFT_DPMLEVEL 0x7 #define AIE2_SMU_SET_HARD_DPMLEVEL 0x8 +#define 
NPU4_DPM_TOPS(ndev, dpm_level) \ +({ \ + typeof(ndev) _ndev = ndev; \ + (4096 * (_ndev)->total_col * \ + (_ndev)->priv->dpm_clk_tbl[dpm_level].hclk / 1000000); \ +}) + static int aie2_smu_exec(struct amdxdna_dev_hdl *ndev, u32 reg_cmd, u32 reg_arg, u32 *out) { @@ -59,12 +67,16 @@ int npu1_set_dpm(struct amdxdna_dev_hdl *ndev, u32 dpm_level) u32 freq; int ret; + ret = amdxdna_pm_resume_get(ndev->xdna); + if (ret) + return ret; + ret = aie2_smu_exec(ndev, AIE2_SMU_SET_MPNPUCLK_FREQ, ndev->priv->dpm_clk_tbl[dpm_level].npuclk, &freq); if (ret) { XDNA_ERR(ndev->xdna, "Set npu clock to %d failed, ret %d\n", ndev->priv->dpm_clk_tbl[dpm_level].npuclk, ret); - return ret; + goto suspend_put; } ndev->npuclk_freq = freq; @@ -73,49 +85,78 @@ int npu1_set_dpm(struct amdxdna_dev_hdl *ndev, u32 dpm_level) if (ret) { XDNA_ERR(ndev->xdna, "Set h clock to %d failed, ret %d\n", ndev->priv->dpm_clk_tbl[dpm_level].hclk, ret); - return ret; + goto suspend_put; } + + amdxdna_pm_suspend_put(ndev->xdna); ndev->hclk_freq = freq; ndev->dpm_level = dpm_level; + ndev->max_tops = 2 * ndev->total_col; + ndev->curr_tops = ndev->max_tops * freq / 1028; XDNA_DBG(ndev->xdna, "MP-NPU clock %d, H clock %d\n", ndev->npuclk_freq, ndev->hclk_freq); return 0; + +suspend_put: + amdxdna_pm_suspend_put(ndev->xdna); + return ret; } int npu4_set_dpm(struct amdxdna_dev_hdl *ndev, u32 dpm_level) { int ret; + ret = amdxdna_pm_resume_get(ndev->xdna); + if (ret) + return ret; + ret = aie2_smu_exec(ndev, AIE2_SMU_SET_HARD_DPMLEVEL, dpm_level, NULL); if (ret) { XDNA_ERR(ndev->xdna, "Set hard dpm level %d failed, ret %d ", dpm_level, ret); - return ret; + goto suspend_put; } ret = aie2_smu_exec(ndev, AIE2_SMU_SET_SOFT_DPMLEVEL, dpm_level, NULL); if (ret) { XDNA_ERR(ndev->xdna, "Set soft dpm level %d failed, ret %d", dpm_level, ret); - return ret; + goto suspend_put; } + amdxdna_pm_suspend_put(ndev->xdna); ndev->npuclk_freq = ndev->priv->dpm_clk_tbl[dpm_level].npuclk; ndev->hclk_freq = ndev->priv->dpm_clk_tbl[dpm_level].hclk; ndev->dpm_level = dpm_level; + ndev->max_tops = NPU4_DPM_TOPS(ndev, ndev->max_dpm_level); + ndev->curr_tops = NPU4_DPM_TOPS(ndev, dpm_level); XDNA_DBG(ndev->xdna, "MP-NPU clock %d, H clock %d\n", ndev->npuclk_freq, ndev->hclk_freq); return 0; + +suspend_put: + amdxdna_pm_suspend_put(ndev->xdna); + return ret; } int aie2_smu_init(struct amdxdna_dev_hdl *ndev) { int ret; + /* + * Failing to set power off indicates an unrecoverable hardware or + * firmware error. 
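A minimal standalone sketch of the NPU4_DPM_TOPS() arithmetic above, using hypothetical column and clock values (the 4096 constant is read here as operations per column per H-clock cycle, which the macro implies but does not state):

#include <stdio.h>

/*
 * Same arithmetic as NPU4_DPM_TOPS(): with hclk in MHz,
 * ops/s = 4096 * total_col * hclk * 1e6, so
 * TOPS  = 4096 * total_col * hclk / 1e6 (integer division, as in the macro).
 */
static unsigned int npu4_dpm_tops(unsigned int total_col, unsigned int hclk_mhz)
{
	return 4096u * total_col * hclk_mhz / 1000000u;
}

int main(void)
{
	/* Hypothetical example: 4 active columns at a 1800 MHz H-clock -> prints "29 TOPS". */
	printf("%u TOPS\n", npu4_dpm_tops(4, 1800));
	return 0;
}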
+ */ + ret = aie2_smu_exec(ndev, AIE2_SMU_POWER_OFF, 0, NULL); + if (ret) { + XDNA_ERR(ndev->xdna, "Access power failed, ret %d", ret); + return ret; + } + ret = aie2_smu_exec(ndev, AIE2_SMU_POWER_ON, 0, NULL); if (ret) { XDNA_ERR(ndev->xdna, "Power on failed, ret %d", ret); diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c index 4bfe4ef20550..d17aef89a0ad 100644 --- a/drivers/accel/amdxdna/amdxdna_ctx.c +++ b/drivers/accel/amdxdna/amdxdna_ctx.c @@ -113,14 +113,14 @@ void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size) return &cmd->data[num_masks]; } -int amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo) +u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo) { struct amdxdna_cmd *cmd = abo->mem.kva; u32 num_masks, i; u32 *cu_mask; if (amdxdna_cmd_get_op(abo) == ERT_CMD_CHAIN) - return -1; + return INVALID_CU_IDX; num_masks = 1 + FIELD_GET(AMDXDNA_CMD_EXTRA_CU_MASK, cmd->header); cu_mask = cmd->data; @@ -129,7 +129,7 @@ int amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo) return ffs(cu_mask[i]) - 1; } - return -1; + return INVALID_CU_IDX; } /* @@ -161,19 +161,14 @@ int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct dr if (args->ext || args->ext_flags) return -EINVAL; - if (!drm_dev_enter(dev, &idx)) - return -ENODEV; - hwctx = kzalloc(sizeof(*hwctx), GFP_KERNEL); - if (!hwctx) { - ret = -ENOMEM; - goto exit; - } + if (!hwctx) + return -ENOMEM; if (copy_from_user(&hwctx->qos, u64_to_user_ptr(args->qos_p), sizeof(hwctx->qos))) { XDNA_ERR(xdna, "Access QoS info failed"); - ret = -EFAULT; - goto free_hwctx; + kfree(hwctx); + return -EFAULT; } hwctx->client = client; @@ -181,30 +176,36 @@ int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct dr hwctx->num_tiles = args->num_tiles; hwctx->mem_size = args->mem_size; hwctx->max_opc = args->max_opc; - ret = xa_alloc_cyclic(&client->hwctx_xa, &hwctx->id, hwctx, - XA_LIMIT(AMDXDNA_INVALID_CTX_HANDLE + 1, MAX_HWCTX_ID), - &client->next_hwctxid, GFP_KERNEL); - if (ret < 0) { - XDNA_ERR(xdna, "Allocate hwctx ID failed, ret %d", ret); + + guard(mutex)(&xdna->dev_lock); + + if (!drm_dev_enter(dev, &idx)) { + ret = -ENODEV; goto free_hwctx; } - hwctx->name = kasprintf(GFP_KERNEL, "hwctx.%d.%d", client->pid, hwctx->id); + ret = xdna->dev_info->ops->hwctx_init(hwctx); + if (ret) { + XDNA_ERR(xdna, "Init hwctx failed, ret %d", ret); + goto dev_exit; + } + + hwctx->name = kasprintf(GFP_KERNEL, "hwctx.%d.%d", client->pid, hwctx->fw_ctx_id); if (!hwctx->name) { ret = -ENOMEM; - goto rm_id; + goto fini_hwctx; } - mutex_lock(&xdna->dev_lock); - ret = xdna->dev_info->ops->hwctx_init(hwctx); - if (ret) { - mutex_unlock(&xdna->dev_lock); - XDNA_ERR(xdna, "Init hwctx failed, ret %d", ret); + ret = xa_alloc_cyclic(&client->hwctx_xa, &hwctx->id, hwctx, + XA_LIMIT(AMDXDNA_INVALID_CTX_HANDLE + 1, MAX_HWCTX_ID), + &client->next_hwctxid, GFP_KERNEL); + if (ret < 0) { + XDNA_ERR(xdna, "Allocate hwctx ID failed, ret %d", ret); goto free_name; } + args->handle = hwctx->id; args->syncobj_handle = hwctx->syncobj_hdl; - mutex_unlock(&xdna->dev_lock); atomic64_set(&hwctx->job_submit_cnt, 0); atomic64_set(&hwctx->job_free_cnt, 0); @@ -214,12 +215,12 @@ int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct dr free_name: kfree(hwctx->name); -rm_id: - xa_erase(&client->hwctx_xa, hwctx->id); +fini_hwctx: + xdna->dev_info->ops->hwctx_fini(hwctx); +dev_exit: + drm_dev_exit(idx); free_hwctx: kfree(hwctx); -exit: - drm_dev_exit(idx); return ret; } @@ -327,6 
+328,38 @@ unlock_srcu: return ret; } +int amdxdna_hwctx_sync_debug_bo(struct amdxdna_client *client, u32 debug_bo_hdl) +{ + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_hwctx *hwctx; + struct amdxdna_gem_obj *abo; + struct drm_gem_object *gobj; + int ret, idx; + + if (!xdna->dev_info->ops->hwctx_sync_debug_bo) + return -EOPNOTSUPP; + + gobj = drm_gem_object_lookup(client->filp, debug_bo_hdl); + if (!gobj) + return -EINVAL; + + abo = to_xdna_obj(gobj); + guard(mutex)(&xdna->dev_lock); + idx = srcu_read_lock(&client->hwctx_srcu); + hwctx = xa_load(&client->hwctx_xa, abo->assigned_hwctx); + if (!hwctx) { + ret = -EINVAL; + goto unlock_srcu; + } + + ret = xdna->dev_info->ops->hwctx_sync_debug_bo(hwctx, debug_bo_hdl); + +unlock_srcu: + srcu_read_unlock(&client->hwctx_srcu, idx); + drm_gem_object_put(gobj); + return ret; +} + static void amdxdna_arg_bos_put(struct amdxdna_sched_job *job) { @@ -389,9 +422,11 @@ void amdxdna_sched_job_cleanup(struct amdxdna_sched_job *job) trace_amdxdna_debug_point(job->hwctx->name, job->seq, "job release"); amdxdna_arg_bos_put(job); amdxdna_gem_put_obj(job->cmd_bo); + dma_fence_put(job->fence); } int amdxdna_cmd_submit(struct amdxdna_client *client, + struct amdxdna_drv_cmd *drv_cmd, u32 cmd_bo_hdl, u32 *arg_bo_hdls, u32 arg_bo_cnt, u32 hwctx_hdl, u64 *seq) { @@ -405,6 +440,8 @@ int amdxdna_cmd_submit(struct amdxdna_client *client, if (!job) return -ENOMEM; + job->drv_cmd = drv_cmd; + if (cmd_bo_hdl != AMDXDNA_INVALID_BO_HANDLE) { job->cmd_bo = amdxdna_gem_get_obj(client, cmd_bo_hdl, AMDXDNA_BO_CMD); if (!job->cmd_bo) { @@ -412,8 +449,6 @@ int amdxdna_cmd_submit(struct amdxdna_client *client, ret = -EINVAL; goto free_job; } - } else { - job->cmd_bo = NULL; } ret = amdxdna_arg_bos_lookup(client, job, arg_bo_hdls, arg_bo_cnt); @@ -431,11 +466,6 @@ int amdxdna_cmd_submit(struct amdxdna_client *client, goto unlock_srcu; } - if (hwctx->status != HWCTX_STAT_READY) { - XDNA_ERR(xdna, "HW Context is not ready"); - ret = -EINVAL; - goto unlock_srcu; - } job->hwctx = hwctx; job->mm = current->mm; @@ -512,7 +542,7 @@ static int amdxdna_drm_submit_execbuf(struct amdxdna_client *client, } } - ret = amdxdna_cmd_submit(client, cmd_bo_hdl, arg_bo_hdls, + ret = amdxdna_cmd_submit(client, NULL, cmd_bo_hdl, arg_bo_hdls, args->arg_count, args->hwctx, &args->seq); if (ret) XDNA_DBG(xdna, "Submit cmds failed, ret %d", ret); diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h index 7cd7a55936f0..b6151244d64f 100644 --- a/drivers/accel/amdxdna/amdxdna_ctx.h +++ b/drivers/accel/amdxdna/amdxdna_ctx.h @@ -13,9 +13,12 @@ struct amdxdna_hwctx_priv; enum ert_cmd_opcode { - ERT_START_CU = 0, - ERT_CMD_CHAIN = 19, - ERT_START_NPU = 20, + ERT_START_CU = 0, + ERT_CMD_CHAIN = 19, + ERT_START_NPU = 20, + ERT_START_NPU_PREEMPT = 21, + ERT_START_NPU_PREEMPT_ELF = 22, + ERT_INVALID_CMD = ~0U, }; enum ert_cmd_state { @@ -54,6 +57,21 @@ struct amdxdna_cmd_chain { u64 data[] __counted_by(command_count); }; +/* + * Interpretation of the beginning of data payload for ERT_START_NPU_PREEMPT in + * amdxdna_cmd. The rest of the payload in amdxdna_cmd is regular kernel args. 
+ */ +struct amdxdna_cmd_preempt_data { + u64 inst_buf; /* instruction buffer address */ + u64 save_buf; /* save buffer address */ + u64 restore_buf; /* restore buffer address */ + u32 inst_size; /* size of instruction buffer in bytes */ + u32 save_size; /* size of save buffer in bytes */ + u32 restore_size; /* size of restore buffer in bytes */ + u32 inst_prop_cnt; /* properties count */ + u32 prop_args[]; /* properties and regular kernel arguments */ +}; + /* Exec buffer command header format */ #define AMDXDNA_CMD_STATE GENMASK(3, 0) #define AMDXDNA_CMD_EXTRA_CU_MASK GENMASK(11, 10) @@ -64,6 +82,8 @@ struct amdxdna_cmd { u32 data[]; }; +#define INVALID_CU_IDX (~0U) + struct amdxdna_hwctx { struct amdxdna_client *client; struct amdxdna_hwctx_priv *priv; @@ -95,6 +115,17 @@ struct amdxdna_hwctx { #define drm_job_to_xdna_job(j) \ container_of(j, struct amdxdna_sched_job, base) +enum amdxdna_job_opcode { + SYNC_DEBUG_BO, + ATTACH_DEBUG_BO, + DETACH_DEBUG_BO, +}; + +struct amdxdna_drv_cmd { + enum amdxdna_job_opcode opcode; + u32 result; +}; + struct amdxdna_sched_job { struct drm_sched_job base; struct kref refcnt; @@ -105,7 +136,9 @@ struct amdxdna_sched_job { /* user can wait on this fence */ struct dma_fence *out_fence; bool job_done; + bool job_timeout; u64 seq; + struct amdxdna_drv_cmd *drv_cmd; struct amdxdna_gem_obj *cmd_bo; size_t bo_cnt; struct drm_gem_object *bos[] __counted_by(bo_cnt); @@ -137,15 +170,17 @@ amdxdna_cmd_get_state(struct amdxdna_gem_obj *abo) } void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size); -int amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo); +u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo); void amdxdna_sched_job_cleanup(struct amdxdna_sched_job *job); void amdxdna_hwctx_remove_all(struct amdxdna_client *client); int amdxdna_hwctx_walk(struct amdxdna_client *client, void *arg, int (*walk)(struct amdxdna_hwctx *hwctx, void *arg)); +int amdxdna_hwctx_sync_debug_bo(struct amdxdna_client *client, u32 debug_bo_hdl); int amdxdna_cmd_submit(struct amdxdna_client *client, - u32 cmd_bo_hdls, u32 *arg_bo_hdls, u32 arg_bo_cnt, + struct amdxdna_drv_cmd *drv_cmd, u32 cmd_bo_hdls, + u32 *arg_bo_hdls, u32 arg_bo_cnt, u32 hwctx_hdl, u64 *seq); int amdxdna_cmd_wait(struct amdxdna_client *client, u32 hwctx_hdl, diff --git a/drivers/accel/amdxdna/amdxdna_error.h b/drivers/accel/amdxdna/amdxdna_error.h new file mode 100644 index 000000000000..c51de86ec12b --- /dev/null +++ b/drivers/accel/amdxdna/amdxdna_error.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025, Advanced Micro Devices, Inc. 
+ */ + +#ifndef _AMDXDNA_ERROR_H_ +#define _AMDXDNA_ERROR_H_ + +#include <linux/bitfield.h> +#include <linux/bits.h> + +#define AMDXDNA_ERR_DRV_AIE 4 +#define AMDXDNA_ERR_SEV_CRITICAL 3 +#define AMDXDNA_ERR_CLASS_AIE 2 + +#define AMDXDNA_ERR_NUM_MASK GENMASK_U64(15, 0) +#define AMDXDNA_ERR_DRV_MASK GENMASK_U64(23, 16) +#define AMDXDNA_ERR_SEV_MASK GENMASK_U64(31, 24) +#define AMDXDNA_ERR_MOD_MASK GENMASK_U64(39, 32) +#define AMDXDNA_ERR_CLASS_MASK GENMASK_U64(47, 40) + +enum amdxdna_error_num { + AMDXDNA_ERROR_NUM_AIE_SATURATION = 3, + AMDXDNA_ERROR_NUM_AIE_FP, + AMDXDNA_ERROR_NUM_AIE_STREAM, + AMDXDNA_ERROR_NUM_AIE_ACCESS, + AMDXDNA_ERROR_NUM_AIE_BUS, + AMDXDNA_ERROR_NUM_AIE_INSTRUCTION, + AMDXDNA_ERROR_NUM_AIE_ECC, + AMDXDNA_ERROR_NUM_AIE_LOCK, + AMDXDNA_ERROR_NUM_AIE_DMA, + AMDXDNA_ERROR_NUM_AIE_MEM_PARITY, + AMDXDNA_ERROR_NUM_UNKNOWN = 15, +}; + +enum amdxdna_error_module { + AMDXDNA_ERROR_MODULE_AIE_CORE = 3, + AMDXDNA_ERROR_MODULE_AIE_MEMORY, + AMDXDNA_ERROR_MODULE_AIE_SHIM, + AMDXDNA_ERROR_MODULE_AIE_NOC, + AMDXDNA_ERROR_MODULE_AIE_PL, + AMDXDNA_ERROR_MODULE_UNKNOWN = 8, +}; + +#define AMDXDNA_ERROR_ENCODE(err_num, err_mod) \ + (FIELD_PREP(AMDXDNA_ERR_NUM_MASK, err_num) | \ + FIELD_PREP_CONST(AMDXDNA_ERR_DRV_MASK, AMDXDNA_ERR_DRV_AIE) | \ + FIELD_PREP_CONST(AMDXDNA_ERR_SEV_MASK, AMDXDNA_ERR_SEV_CRITICAL) | \ + FIELD_PREP(AMDXDNA_ERR_MOD_MASK, err_mod) | \ + FIELD_PREP_CONST(AMDXDNA_ERR_CLASS_MASK, AMDXDNA_ERR_CLASS_AIE)) + +#define AMDXDNA_EXTRA_ERR_COL_MASK GENMASK_U64(7, 0) +#define AMDXDNA_EXTRA_ERR_ROW_MASK GENMASK_U64(15, 8) + +#define AMDXDNA_EXTRA_ERR_ENCODE(row, col) \ + (FIELD_PREP(AMDXDNA_EXTRA_ERR_COL_MASK, col) | \ + FIELD_PREP(AMDXDNA_EXTRA_ERR_ROW_MASK, row)) + +#endif /* _AMDXDNA_ERROR_H_ */ diff --git a/drivers/accel/amdxdna/amdxdna_gem.c b/drivers/accel/amdxdna/amdxdna_gem.c index d407a36eb412..dfa916eeb2d9 100644 --- a/drivers/accel/amdxdna/amdxdna_gem.c +++ b/drivers/accel/amdxdna/amdxdna_gem.c @@ -8,6 +8,7 @@ #include <drm/drm_device.h> #include <drm/drm_gem.h> #include <drm/drm_gem_shmem_helper.h> +#include <drm/drm_print.h> #include <drm/gpu_scheduler.h> #include <linux/dma-buf.h> #include <linux/dma-direct.h> @@ -392,35 +393,33 @@ static const struct dma_buf_ops amdxdna_dmabuf_ops = { .vunmap = drm_gem_dmabuf_vunmap, }; -static int amdxdna_gem_obj_vmap(struct drm_gem_object *obj, struct iosys_map *map) +static int amdxdna_gem_obj_vmap(struct amdxdna_gem_obj *abo, void **vaddr) { - struct amdxdna_gem_obj *abo = to_xdna_obj(obj); - - iosys_map_clear(map); - - dma_resv_assert_held(obj->resv); + struct iosys_map map = IOSYS_MAP_INIT_VADDR(NULL); + int ret; if (is_import_bo(abo)) - dma_buf_vmap(abo->dma_buf, map); + ret = dma_buf_vmap_unlocked(abo->dma_buf, &map); else - drm_gem_shmem_object_vmap(obj, map); + ret = drm_gem_vmap(to_gobj(abo), &map); - if (!map->vaddr) - return -ENOMEM; - - return 0; + *vaddr = map.vaddr; + return ret; } -static void amdxdna_gem_obj_vunmap(struct drm_gem_object *obj, struct iosys_map *map) +static void amdxdna_gem_obj_vunmap(struct amdxdna_gem_obj *abo) { - struct amdxdna_gem_obj *abo = to_xdna_obj(obj); + struct iosys_map map; + + if (!abo->mem.kva) + return; - dma_resv_assert_held(obj->resv); + iosys_map_set_vaddr(&map, abo->mem.kva); if (is_import_bo(abo)) - dma_buf_vunmap(abo->dma_buf, map); + dma_buf_vunmap_unlocked(abo->dma_buf, &map); else - drm_gem_shmem_object_vunmap(obj, map); + drm_gem_vunmap(to_gobj(abo), &map); } static struct dma_buf *amdxdna_gem_prime_export(struct drm_gem_object *gobj, int flags) @@ -455,7 +454,6 @@ 
static void amdxdna_gem_obj_free(struct drm_gem_object *gobj) { struct amdxdna_dev *xdna = to_xdna_dev(gobj->dev); struct amdxdna_gem_obj *abo = to_xdna_obj(gobj); - struct iosys_map map = IOSYS_MAP_INIT_VADDR(abo->mem.kva); XDNA_DBG(xdna, "BO type %d xdna_addr 0x%llx", abo->type, abo->mem.dev_addr); @@ -468,7 +466,7 @@ static void amdxdna_gem_obj_free(struct drm_gem_object *gobj) if (abo->type == AMDXDNA_BO_DEV_HEAP) drm_mm_takedown(&abo->mm); - drm_gem_vunmap(gobj, &map); + amdxdna_gem_obj_vunmap(abo); mutex_destroy(&abo->lock); if (is_import_bo(abo)) { @@ -489,8 +487,8 @@ static const struct drm_gem_object_funcs amdxdna_gem_shmem_funcs = { .pin = drm_gem_shmem_object_pin, .unpin = drm_gem_shmem_object_unpin, .get_sg_table = drm_gem_shmem_object_get_sg_table, - .vmap = amdxdna_gem_obj_vmap, - .vunmap = amdxdna_gem_obj_vunmap, + .vmap = drm_gem_shmem_object_vmap, + .vunmap = drm_gem_shmem_object_vunmap, .mmap = amdxdna_gem_obj_mmap, .vm_ops = &drm_gem_shmem_vm_ops, .export = amdxdna_gem_prime_export, @@ -663,7 +661,6 @@ amdxdna_drm_create_dev_heap(struct drm_device *dev, struct drm_file *filp) { struct amdxdna_client *client = filp->driver_priv; - struct iosys_map map = IOSYS_MAP_INIT_VADDR(NULL); struct amdxdna_dev *xdna = to_xdna_dev(dev); struct amdxdna_gem_obj *abo; int ret; @@ -692,12 +689,11 @@ amdxdna_drm_create_dev_heap(struct drm_device *dev, abo->mem.dev_addr = client->xdna->dev_info->dev_mem_base; drm_mm_init(&abo->mm, abo->mem.dev_addr, abo->mem.size); - ret = drm_gem_vmap(to_gobj(abo), &map); + ret = amdxdna_gem_obj_vmap(abo, &abo->mem.kva); if (ret) { XDNA_ERR(xdna, "Vmap heap bo failed, ret %d", ret); goto release_obj; } - abo->mem.kva = map.vaddr; client->dev_heap = abo; drm_gem_object_get(to_gobj(abo)); @@ -748,7 +744,6 @@ amdxdna_drm_create_cmd_bo(struct drm_device *dev, struct amdxdna_drm_create_bo *args, struct drm_file *filp) { - struct iosys_map map = IOSYS_MAP_INIT_VADDR(NULL); struct amdxdna_dev *xdna = to_xdna_dev(dev); struct amdxdna_gem_obj *abo; int ret; @@ -770,12 +765,11 @@ amdxdna_drm_create_cmd_bo(struct drm_device *dev, abo->type = AMDXDNA_BO_CMD; abo->client = filp->driver_priv; - ret = drm_gem_vmap(to_gobj(abo), &map); + ret = amdxdna_gem_obj_vmap(abo, &abo->mem.kva); if (ret) { XDNA_ERR(xdna, "Vmap cmd bo failed, ret %d", ret); goto release_obj; } - abo->mem.kva = map.vaddr; return abo; @@ -969,6 +963,9 @@ int amdxdna_drm_sync_bo_ioctl(struct drm_device *dev, XDNA_DBG(xdna, "Sync bo %d offset 0x%llx, size 0x%llx\n", args->handle, args->offset, args->size); + if (args->direction == SYNC_DIRECT_FROM_DEVICE) + ret = amdxdna_hwctx_sync_debug_bo(abo->client, args->handle); + put_obj: drm_gem_object_put(gobj); return ret; diff --git a/drivers/accel/amdxdna/amdxdna_gem.h b/drivers/accel/amdxdna/amdxdna_gem.h index ae29db94a9d3..f79fc7f3c93b 100644 --- a/drivers/accel/amdxdna/amdxdna_gem.h +++ b/drivers/accel/amdxdna/amdxdna_gem.h @@ -7,6 +7,7 @@ #define _AMDXDNA_GEM_H_ #include <linux/hmm.h> +#include "amdxdna_pci_drv.h" struct amdxdna_umap { struct vm_area_struct *vma; @@ -62,6 +63,11 @@ static inline void amdxdna_gem_put_obj(struct amdxdna_gem_obj *abo) drm_gem_object_put(to_gobj(abo)); } +static inline u64 amdxdna_dev_bo_offset(struct amdxdna_gem_obj *abo) +{ + return abo->mem.dev_addr - abo->client->dev_heap->mem.dev_addr; +} + void amdxdna_umap_put(struct amdxdna_umap *mapp); struct drm_gem_object * diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c b/drivers/accel/amdxdna/amdxdna_mailbox.c index da1ac89bb78f..858df97cd3fb 100644 --- 
a/drivers/accel/amdxdna/amdxdna_mailbox.c +++ b/drivers/accel/amdxdna/amdxdna_mailbox.c @@ -194,7 +194,8 @@ static void mailbox_release_msg(struct mailbox_channel *mb_chann, { MB_DBG(mb_chann, "msg_id 0x%x msg opcode 0x%x", mb_msg->pkg.header.id, mb_msg->pkg.header.opcode); - mb_msg->notify_cb(mb_msg->handle, NULL, 0); + if (mb_msg->notify_cb) + mb_msg->notify_cb(mb_msg->handle, NULL, 0); kfree(mb_msg); } @@ -248,7 +249,7 @@ mailbox_get_resp(struct mailbox_channel *mb_chann, struct xdna_msg_header *heade { struct mailbox_msg *mb_msg; int msg_id; - int ret; + int ret = 0; msg_id = header->id; if (!mailbox_validate_msgid(msg_id)) { @@ -265,9 +266,11 @@ mailbox_get_resp(struct mailbox_channel *mb_chann, struct xdna_msg_header *heade MB_DBG(mb_chann, "opcode 0x%x size %d id 0x%x", header->opcode, header->total_size, header->id); - ret = mb_msg->notify_cb(mb_msg->handle, data, header->total_size); - if (unlikely(ret)) - MB_ERR(mb_chann, "Message callback ret %d", ret); + if (mb_msg->notify_cb) { + ret = mb_msg->notify_cb(mb_msg->handle, data, header->total_size); + if (unlikely(ret)) + MB_ERR(mb_chann, "Message callback ret %d", ret); + } kfree(mb_msg); return ret; @@ -513,6 +516,7 @@ xdna_mailbox_create_channel(struct mailbox *mb, } mb_chann->bad_state = false; + mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0); MB_DBG(mb_chann, "Mailbox channel created (irq: %d)", mb_chann->msix_irq); return mb_chann; diff --git a/drivers/accel/amdxdna/amdxdna_mailbox_helper.h b/drivers/accel/amdxdna/amdxdna_mailbox_helper.h index 710ff8873d61..556c712cad0a 100644 --- a/drivers/accel/amdxdna/amdxdna_mailbox_helper.h +++ b/drivers/accel/amdxdna/amdxdna_mailbox_helper.h @@ -16,16 +16,18 @@ struct xdna_notify { u32 *data; size_t size; int error; + u32 *status; }; -#define DECLARE_XDNA_MSG_COMMON(name, op, status) \ +#define DECLARE_XDNA_MSG_COMMON(name, op, s) \ struct name##_req req = { 0 }; \ - struct name##_resp resp = { status }; \ + struct name##_resp resp = { .status = s }; \ struct xdna_notify hdl = { \ .error = 0, \ .data = (u32 *)&resp, \ .size = sizeof(resp), \ .comp = COMPLETION_INITIALIZER_ONSTACK(hdl.comp), \ + .status = (u32 *)&resp.status, \ }; \ struct xdna_mailbox_msg msg = { \ .send_data = (u8 *)&req, \ diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.c b/drivers/accel/amdxdna/amdxdna_pci_drv.c index 569cd703729d..1973ab67721b 100644 --- a/drivers/accel/amdxdna/amdxdna_pci_drv.c +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.c @@ -13,13 +13,11 @@ #include <drm/gpu_scheduler.h> #include <linux/iommu.h> #include <linux/pci.h> -#include <linux/pm_runtime.h> #include "amdxdna_ctx.h" #include "amdxdna_gem.h" #include "amdxdna_pci_drv.h" - -#define AMDXDNA_AUTOSUSPEND_DELAY 5000 /* milliseconds */ +#include "amdxdna_pm.h" MODULE_FIRMWARE("amdnpu/1502_00/npu.sbin"); MODULE_FIRMWARE("amdnpu/17f0_10/npu.sbin"); @@ -29,9 +27,14 @@ MODULE_FIRMWARE("amdnpu/17f0_20/npu.sbin"); /* * 0.0: Initial version * 0.1: Support getting all hardware contexts by DRM_IOCTL_AMDXDNA_GET_ARRAY + * 0.2: Support getting last error hardware error + * 0.3: Support firmware debug buffer + * 0.4: Support getting resource information + * 0.5: Support getting telemetry data + * 0.6: Support preemption */ #define AMDXDNA_DRIVER_MAJOR 0 -#define AMDXDNA_DRIVER_MINOR 1 +#define AMDXDNA_DRIVER_MINOR 6 /* * Bind the driver base on (vendor_id, device_id) pair and later use the @@ -61,17 +64,9 @@ static int amdxdna_drm_open(struct drm_device *ddev, struct drm_file *filp) struct amdxdna_client *client; int ret; - ret = 
pm_runtime_resume_and_get(ddev->dev); - if (ret) { - XDNA_ERR(xdna, "Failed to get rpm, ret %d", ret); - return ret; - } - client = kzalloc(sizeof(*client), GFP_KERNEL); - if (!client) { - ret = -ENOMEM; - goto put_rpm; - } + if (!client) + return -ENOMEM; client->pid = pid_nr(rcu_access_pointer(filp->pid)); client->xdna = xdna; @@ -106,9 +101,6 @@ unbind_sva: iommu_sva_unbind_device(client->sva); failed: kfree(client); -put_rpm: - pm_runtime_mark_last_busy(ddev->dev); - pm_runtime_put_autosuspend(ddev->dev); return ret; } @@ -130,8 +122,6 @@ static void amdxdna_drm_close(struct drm_device *ddev, struct drm_file *filp) XDNA_DBG(xdna, "pid %d closed", client->pid); kfree(client); - pm_runtime_mark_last_busy(ddev->dev); - pm_runtime_put_autosuspend(ddev->dev); } static int amdxdna_flush(struct file *f, fl_owner_t id) @@ -310,19 +300,12 @@ static int amdxdna_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto failed_dev_fini; } - pm_runtime_set_autosuspend_delay(dev, AMDXDNA_AUTOSUSPEND_DELAY); - pm_runtime_use_autosuspend(dev); - pm_runtime_allow(dev); - ret = drm_dev_register(&xdna->ddev, 0); if (ret) { XDNA_ERR(xdna, "DRM register failed, ret %d", ret); - pm_runtime_forbid(dev); goto failed_sysfs_fini; } - pm_runtime_mark_last_busy(dev); - pm_runtime_put_autosuspend(dev); return 0; failed_sysfs_fini: @@ -339,14 +322,10 @@ destroy_notifier_wq: static void amdxdna_remove(struct pci_dev *pdev) { struct amdxdna_dev *xdna = pci_get_drvdata(pdev); - struct device *dev = &pdev->dev; struct amdxdna_client *client; destroy_workqueue(xdna->notifier_wq); - pm_runtime_get_noresume(dev); - pm_runtime_forbid(dev); - drm_dev_unplug(&xdna->ddev); amdxdna_sysfs_fini(xdna); @@ -365,29 +344,9 @@ static void amdxdna_remove(struct pci_dev *pdev) mutex_unlock(&xdna->dev_lock); } -static int amdxdna_pmops_suspend(struct device *dev) -{ - struct amdxdna_dev *xdna = pci_get_drvdata(to_pci_dev(dev)); - - if (!xdna->dev_info->ops->suspend) - return -EOPNOTSUPP; - - return xdna->dev_info->ops->suspend(xdna); -} - -static int amdxdna_pmops_resume(struct device *dev) -{ - struct amdxdna_dev *xdna = pci_get_drvdata(to_pci_dev(dev)); - - if (!xdna->dev_info->ops->resume) - return -EOPNOTSUPP; - - return xdna->dev_info->ops->resume(xdna); -} - static const struct dev_pm_ops amdxdna_pm_ops = { - SYSTEM_SLEEP_PM_OPS(amdxdna_pmops_suspend, amdxdna_pmops_resume) - RUNTIME_PM_OPS(amdxdna_pmops_suspend, amdxdna_pmops_resume, NULL) + SYSTEM_SLEEP_PM_OPS(amdxdna_pm_suspend, amdxdna_pm_resume) + RUNTIME_PM_OPS(amdxdna_pm_suspend, amdxdna_pm_resume, NULL) }; static struct pci_driver amdxdna_pci_driver = { diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.h b/drivers/accel/amdxdna/amdxdna_pci_drv.h index 72d6696d49da..c99477f5e454 100644 --- a/drivers/accel/amdxdna/amdxdna_pci_drv.h +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.h @@ -6,6 +6,7 @@ #ifndef _AMDXDNA_PCI_DRV_H_ #define _AMDXDNA_PCI_DRV_H_ +#include <drm/drm_print.h> #include <linux/workqueue.h> #include <linux/xarray.h> @@ -54,6 +55,7 @@ struct amdxdna_dev_ops { int (*hwctx_init)(struct amdxdna_hwctx *hwctx); void (*hwctx_fini)(struct amdxdna_hwctx *hwctx); int (*hwctx_config)(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size); + int (*hwctx_sync_debug_bo)(struct amdxdna_hwctx *hwctx, u32 debug_bo_hdl); void (*hmm_invalidate)(struct amdxdna_gem_obj *abo, unsigned long cur_seq); int (*cmd_submit)(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq); int (*get_aie_info)(struct amdxdna_client *client, struct 
amdxdna_drm_get_info *args); @@ -99,6 +101,7 @@ struct amdxdna_dev { struct amdxdna_fw_ver fw_ver; struct rw_semaphore notifier_lock; /* for mmu notifier*/ struct workqueue_struct *notifier_wq; + bool rpm_on; }; /* diff --git a/drivers/accel/amdxdna/amdxdna_pm.c b/drivers/accel/amdxdna/amdxdna_pm.c new file mode 100644 index 000000000000..fa38e65d617c --- /dev/null +++ b/drivers/accel/amdxdna/amdxdna_pm.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025, Advanced Micro Devices, Inc. + */ + +#include <drm/amdxdna_accel.h> +#include <drm/drm_drv.h> +#include <linux/pm_runtime.h> + +#include "amdxdna_pm.h" + +#define AMDXDNA_AUTOSUSPEND_DELAY 5000 /* milliseconds */ + +int amdxdna_pm_suspend(struct device *dev) +{ + struct amdxdna_dev *xdna = to_xdna_dev(dev_get_drvdata(dev)); + int ret = -EOPNOTSUPP; + bool rpm; + + if (xdna->dev_info->ops->suspend) { + rpm = xdna->rpm_on; + xdna->rpm_on = false; + ret = xdna->dev_info->ops->suspend(xdna); + xdna->rpm_on = rpm; + } + + XDNA_DBG(xdna, "Suspend done ret %d", ret); + return ret; +} + +int amdxdna_pm_resume(struct device *dev) +{ + struct amdxdna_dev *xdna = to_xdna_dev(dev_get_drvdata(dev)); + int ret = -EOPNOTSUPP; + bool rpm; + + if (xdna->dev_info->ops->resume) { + rpm = xdna->rpm_on; + xdna->rpm_on = false; + ret = xdna->dev_info->ops->resume(xdna); + xdna->rpm_on = rpm; + } + + XDNA_DBG(xdna, "Resume done ret %d", ret); + return ret; +} + +int amdxdna_pm_resume_get(struct amdxdna_dev *xdna) +{ + struct device *dev = xdna->ddev.dev; + int ret; + + if (!xdna->rpm_on) + return 0; + + ret = pm_runtime_resume_and_get(dev); + if (ret) { + XDNA_ERR(xdna, "Resume failed: %d", ret); + pm_runtime_set_suspended(dev); + } + + return ret; +} + +void amdxdna_pm_suspend_put(struct amdxdna_dev *xdna) +{ + struct device *dev = xdna->ddev.dev; + + if (!xdna->rpm_on) + return; + + pm_runtime_put_autosuspend(dev); +} + +void amdxdna_pm_init(struct amdxdna_dev *xdna) +{ + struct device *dev = xdna->ddev.dev; + + pm_runtime_set_active(dev); + pm_runtime_set_autosuspend_delay(dev, AMDXDNA_AUTOSUSPEND_DELAY); + pm_runtime_use_autosuspend(dev); + pm_runtime_allow(dev); + pm_runtime_put_autosuspend(dev); + xdna->rpm_on = true; +} + +void amdxdna_pm_fini(struct amdxdna_dev *xdna) +{ + struct device *dev = xdna->ddev.dev; + + xdna->rpm_on = false; + pm_runtime_get_noresume(dev); + pm_runtime_forbid(dev); +} diff --git a/drivers/accel/amdxdna/amdxdna_pm.h b/drivers/accel/amdxdna/amdxdna_pm.h new file mode 100644 index 000000000000..77b2d6e45570 --- /dev/null +++ b/drivers/accel/amdxdna/amdxdna_pm.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025, Advanced Micro Devices, Inc. 
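amdxdna_pm_resume_get() and amdxdna_pm_suspend_put(), added just above in amdxdna_pm.c, are what the rest of the patch wraps around every hardware-touching path (aie2_get_info(), aie2_set_state(), npu1_set_dpm(), ...). A hedged sketch of a caller, where my_fw_request() is a made-up placeholder rather than a function from the patch:

#include "amdxdna_pm.h"

/* Placeholder for a real mailbox/firmware request -- not part of the patch. */
static int my_fw_request(struct amdxdna_dev *xdna)
{
	return 0;
}

/* Sketch only: bracketing mirrors what aie2_get_info() does after this change. */
static int my_query(struct amdxdna_dev *xdna)
{
	int ret;

	ret = amdxdna_pm_resume_get(xdna);	/* no-op until amdxdna_pm_init() sets rpm_on */
	if (ret)
		return ret;

	ret = my_fw_request(xdna);		/* hypothetical hardware access */

	amdxdna_pm_suspend_put(xdna);		/* re-arm runtime autosuspend */
	return ret;
}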
+ */ + +#ifndef _AMDXDNA_PM_H_ +#define _AMDXDNA_PM_H_ + +#include "amdxdna_pci_drv.h" + +int amdxdna_pm_suspend(struct device *dev); +int amdxdna_pm_resume(struct device *dev); +int amdxdna_pm_resume_get(struct amdxdna_dev *xdna); +void amdxdna_pm_suspend_put(struct amdxdna_dev *xdna); +void amdxdna_pm_init(struct amdxdna_dev *xdna); +void amdxdna_pm_fini(struct amdxdna_dev *xdna); + +#endif /* _AMDXDNA_PM_H_ */ diff --git a/drivers/accel/amdxdna/npu1_regs.c b/drivers/accel/amdxdna/npu1_regs.c index e4f6dac7d00f..ec407f3b48fc 100644 --- a/drivers/accel/amdxdna/npu1_regs.c +++ b/drivers/accel/amdxdna/npu1_regs.c @@ -46,6 +46,7 @@ const struct rt_config npu1_default_rt_cfg[] = { { 2, 1, AIE2_RT_CFG_INIT }, /* PDI APP LOAD MODE */ + { 4, 1, AIE2_RT_CFG_INIT }, /* Debug BO */ { 1, 1, AIE2_RT_CFG_CLK_GATING }, /* Clock gating on */ { 0 }, }; @@ -62,16 +63,23 @@ const struct dpm_clk_freq npu1_dpm_clk_table[] = { { 0 } }; +static const struct aie2_fw_feature_tbl npu1_fw_feature_table[] = { + { .feature = AIE2_NPU_COMMAND, .min_minor = 8 }, + { 0 } +}; + static const struct amdxdna_dev_priv npu1_dev_priv = { .fw_path = "amdnpu/1502_00/npu.sbin", .protocol_major = 0x5, .protocol_minor = 0x7, .rt_config = npu1_default_rt_cfg, .dpm_clk_tbl = npu1_dpm_clk_table, + .fw_feature_tbl = npu1_fw_feature_table, .col_align = COL_ALIGN_NONE, .mbox_dev_addr = NPU1_MBOX_BAR_BASE, .mbox_size = 0, /* Use BAR size */ .sram_dev_addr = NPU1_SRAM_BAR_BASE, + .hwctx_limit = 6, .sram_offs = { DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU1_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU1_SRAM, MPNPU_SRAM_I2X_MAILBOX_15), diff --git a/drivers/accel/amdxdna/npu2_regs.c b/drivers/accel/amdxdna/npu2_regs.c index a081cac75ee0..86f87d0d1354 100644 --- a/drivers/accel/amdxdna/npu2_regs.c +++ b/drivers/accel/amdxdna/npu2_regs.c @@ -67,10 +67,12 @@ static const struct amdxdna_dev_priv npu2_dev_priv = { .protocol_minor = 0x6, .rt_config = npu4_default_rt_cfg, .dpm_clk_tbl = npu4_dpm_clk_table, + .fw_feature_tbl = npu4_fw_feature_table, .col_align = COL_ALIGN_NATURE, .mbox_dev_addr = NPU2_MBOX_BAR_BASE, .mbox_size = 0, /* Use BAR size */ .sram_dev_addr = NPU2_SRAM_BAR_BASE, + .hwctx_limit = 16, .sram_offs = { DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU2_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU2_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), diff --git a/drivers/accel/amdxdna/npu4_regs.c b/drivers/accel/amdxdna/npu4_regs.c index 9f2e33182ec6..986a5f28ba24 100644 --- a/drivers/accel/amdxdna/npu4_regs.c +++ b/drivers/accel/amdxdna/npu4_regs.c @@ -63,10 +63,14 @@ const struct rt_config npu4_default_rt_cfg[] = { { 5, 1, AIE2_RT_CFG_INIT }, /* PDI APP LOAD MODE */ + { 10, 1, AIE2_RT_CFG_INIT }, /* DEBUG BUF */ + { 14, 0, AIE2_RT_CFG_INIT, BIT_U64(AIE2_PREEMPT) }, /* Frame boundary preemption */ { 1, 1, AIE2_RT_CFG_CLK_GATING }, /* Clock gating on */ { 2, 1, AIE2_RT_CFG_CLK_GATING }, /* Clock gating on */ { 3, 1, AIE2_RT_CFG_CLK_GATING }, /* Clock gating on */ { 4, 1, AIE2_RT_CFG_CLK_GATING }, /* Clock gating on */ + { 13, 0, AIE2_RT_CFG_FORCE_PREEMPT }, + { 14, 0, AIE2_RT_CFG_FRAME_BOUNDARY_PREEMPT }, { 0 }, }; @@ -82,16 +86,24 @@ const struct dpm_clk_freq npu4_dpm_clk_table[] = { { 0 } }; +const struct aie2_fw_feature_tbl npu4_fw_feature_table[] = { + { .feature = AIE2_NPU_COMMAND, .min_minor = 15 }, + { .feature = AIE2_PREEMPT, .min_minor = 12 }, + { 0 } +}; + static const struct amdxdna_dev_priv npu4_dev_priv = { .fw_path = "amdnpu/17f0_10/npu.sbin", .protocol_major = 0x6, .protocol_minor = 12, .rt_config = 
npu4_default_rt_cfg, .dpm_clk_tbl = npu4_dpm_clk_table, + .fw_feature_tbl = npu4_fw_feature_table, .col_align = COL_ALIGN_NATURE, .mbox_dev_addr = NPU4_MBOX_BAR_BASE, .mbox_size = 0, /* Use BAR size */ .sram_dev_addr = NPU4_SRAM_BAR_BASE, + .hwctx_limit = 16, .sram_offs = { DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU4_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU4_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), diff --git a/drivers/accel/amdxdna/npu5_regs.c b/drivers/accel/amdxdna/npu5_regs.c index 5f1cf83461c4..75ad97f0b937 100644 --- a/drivers/accel/amdxdna/npu5_regs.c +++ b/drivers/accel/amdxdna/npu5_regs.c @@ -67,10 +67,12 @@ static const struct amdxdna_dev_priv npu5_dev_priv = { .protocol_minor = 12, .rt_config = npu4_default_rt_cfg, .dpm_clk_tbl = npu4_dpm_clk_table, + .fw_feature_tbl = npu4_fw_feature_table, .col_align = COL_ALIGN_NATURE, .mbox_dev_addr = NPU5_MBOX_BAR_BASE, .mbox_size = 0, /* Use BAR size */ .sram_dev_addr = NPU5_SRAM_BAR_BASE, + .hwctx_limit = 16, .sram_offs = { DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU5_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU5_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), diff --git a/drivers/accel/amdxdna/npu6_regs.c b/drivers/accel/amdxdna/npu6_regs.c index 94a7005685a7..758dc013fe13 100644 --- a/drivers/accel/amdxdna/npu6_regs.c +++ b/drivers/accel/amdxdna/npu6_regs.c @@ -67,10 +67,12 @@ static const struct amdxdna_dev_priv npu6_dev_priv = { .protocol_minor = 12, .rt_config = npu4_default_rt_cfg, .dpm_clk_tbl = npu4_dpm_clk_table, + .fw_feature_tbl = npu4_fw_feature_table, .col_align = COL_ALIGN_NATURE, .mbox_dev_addr = NPU6_MBOX_BAR_BASE, .mbox_size = 0, /* Use BAR size */ .sram_dev_addr = NPU6_SRAM_BAR_BASE, + .hwctx_limit = 16, .sram_offs = { DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU6_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU6_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), diff --git a/drivers/accel/ethosu/Kconfig b/drivers/accel/ethosu/Kconfig new file mode 100644 index 000000000000..d25f9b3eb317 --- /dev/null +++ b/drivers/accel/ethosu/Kconfig @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config DRM_ACCEL_ARM_ETHOSU + tristate "Arm Ethos-U65/U85 NPU" + depends on HAS_IOMEM + depends on DRM_ACCEL + select DRM_GEM_DMA_HELPER + select DRM_SCHED + select GENERIC_ALLOCATOR + help + Enables driver for Arm Ethos-U65/U85 NPUs diff --git a/drivers/accel/ethosu/Makefile b/drivers/accel/ethosu/Makefile new file mode 100644 index 000000000000..17db5a600416 --- /dev/null +++ b/drivers/accel/ethosu/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_DRM_ACCEL_ARM_ETHOSU) := ethosu.o +ethosu-y += ethosu_drv.o ethosu_gem.o ethosu_job.o diff --git a/drivers/accel/ethosu/ethosu_device.h b/drivers/accel/ethosu/ethosu_device.h new file mode 100644 index 000000000000..b189fa783d6a --- /dev/null +++ b/drivers/accel/ethosu/ethosu_device.h @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: GPL-2.0-only or MIT */ +/* Copyright 2025 Arm, Ltd. 
*/ + +#ifndef __ETHOSU_DEVICE_H__ +#define __ETHOSU_DEVICE_H__ + +#include <linux/bitfield.h> +#include <linux/bits.h> +#include <linux/types.h> + +#include <drm/drm_device.h> +#include <drm/gpu_scheduler.h> + +#include <drm/ethosu_accel.h> + +struct clk; +struct gen_pool; + +#define NPU_REG_ID 0x0000 +#define NPU_REG_STATUS 0x0004 +#define NPU_REG_CMD 0x0008 +#define NPU_REG_RESET 0x000c +#define NPU_REG_QBASE 0x0010 +#define NPU_REG_QBASE_HI 0x0014 +#define NPU_REG_QREAD 0x0018 +#define NPU_REG_QCONFIG 0x001c +#define NPU_REG_QSIZE 0x0020 +#define NPU_REG_PROT 0x0024 +#define NPU_REG_CONFIG 0x0028 +#define NPU_REG_REGIONCFG 0x003c +#define NPU_REG_AXILIMIT0 0x0040 // U65 +#define NPU_REG_AXILIMIT1 0x0044 // U65 +#define NPU_REG_AXILIMIT2 0x0048 // U65 +#define NPU_REG_AXILIMIT3 0x004c // U65 +#define NPU_REG_MEM_ATTR0 0x0040 // U85 +#define NPU_REG_MEM_ATTR1 0x0044 // U85 +#define NPU_REG_MEM_ATTR2 0x0048 // U85 +#define NPU_REG_MEM_ATTR3 0x004c // U85 +#define NPU_REG_AXI_SRAM 0x0050 // U85 +#define NPU_REG_AXI_EXT 0x0054 // U85 + +#define NPU_REG_BASEP(x) (0x0080 + (x) * 8) +#define NPU_REG_BASEP_HI(x) (0x0084 + (x) * 8) +#define NPU_BASEP_REGION_MAX 8 + +#define ID_ARCH_MAJOR_MASK GENMASK(31, 28) +#define ID_ARCH_MINOR_MASK GENMASK(27, 20) +#define ID_ARCH_PATCH_MASK GENMASK(19, 16) +#define ID_VER_MAJOR_MASK GENMASK(11, 8) +#define ID_VER_MINOR_MASK GENMASK(7, 4) + +#define CONFIG_MACS_PER_CC_MASK GENMASK(3, 0) +#define CONFIG_CMD_STREAM_VER_MASK GENMASK(7, 4) + +#define STATUS_STATE_RUNNING BIT(0) +#define STATUS_IRQ_RAISED BIT(1) +#define STATUS_BUS_STATUS BIT(2) +#define STATUS_RESET_STATUS BIT(3) +#define STATUS_CMD_PARSE_ERR BIT(4) +#define STATUS_CMD_END_REACHED BIT(5) + +#define CMD_CLEAR_IRQ BIT(1) +#define CMD_TRANSITION_TO_RUN BIT(0) + +#define RESET_PENDING_CSL BIT(1) +#define RESET_PENDING_CPL BIT(0) + +#define PROT_ACTIVE_CSL BIT(1) + +enum ethosu_cmds { + NPU_OP_CONV = 0x2, + NPU_OP_DEPTHWISE = 0x3, + NPU_OP_POOL = 0x5, + NPU_OP_ELEMENTWISE = 0x6, + NPU_OP_RESIZE = 0x7, // U85 only + NPU_OP_DMA_START = 0x10, + NPU_SET_IFM_PAD_TOP = 0x100, + NPU_SET_IFM_PAD_LEFT = 0x101, + NPU_SET_IFM_PAD_RIGHT = 0x102, + NPU_SET_IFM_PAD_BOTTOM = 0x103, + NPU_SET_IFM_DEPTH_M1 = 0x104, + NPU_SET_IFM_PRECISION = 0x105, + NPU_SET_IFM_BROADCAST = 0x108, + NPU_SET_IFM_WIDTH0_M1 = 0x10a, + NPU_SET_IFM_HEIGHT0_M1 = 0x10b, + NPU_SET_IFM_HEIGHT1_M1 = 0x10c, + NPU_SET_IFM_REGION = 0x10f, + NPU_SET_OFM_WIDTH_M1 = 0x111, + NPU_SET_OFM_HEIGHT_M1 = 0x112, + NPU_SET_OFM_DEPTH_M1 = 0x113, + NPU_SET_OFM_PRECISION = 0x114, + NPU_SET_OFM_WIDTH0_M1 = 0x11a, + NPU_SET_OFM_HEIGHT0_M1 = 0x11b, + NPU_SET_OFM_HEIGHT1_M1 = 0x11c, + NPU_SET_OFM_REGION = 0x11f, + NPU_SET_KERNEL_WIDTH_M1 = 0x120, + NPU_SET_KERNEL_HEIGHT_M1 = 0x121, + NPU_SET_KERNEL_STRIDE = 0x122, + NPU_SET_WEIGHT_REGION = 0x128, + NPU_SET_SCALE_REGION = 0x129, + NPU_SET_DMA0_SRC_REGION = 0x130, + NPU_SET_DMA0_DST_REGION = 0x131, + NPU_SET_DMA0_SIZE0 = 0x132, + NPU_SET_DMA0_SIZE1 = 0x133, + NPU_SET_IFM2_BROADCAST = 0x180, + NPU_SET_IFM2_PRECISION = 0x185, + NPU_SET_IFM2_WIDTH0_M1 = 0x18a, + NPU_SET_IFM2_HEIGHT0_M1 = 0x18b, + NPU_SET_IFM2_HEIGHT1_M1 = 0x18c, + NPU_SET_IFM2_REGION = 0x18f, + NPU_SET_IFM_BASE0 = 0x4000, + NPU_SET_IFM_BASE1 = 0x4001, + NPU_SET_IFM_BASE2 = 0x4002, + NPU_SET_IFM_BASE3 = 0x4003, + NPU_SET_IFM_STRIDE_X = 0x4004, + NPU_SET_IFM_STRIDE_Y = 0x4005, + NPU_SET_IFM_STRIDE_C = 0x4006, + NPU_SET_OFM_BASE0 = 0x4010, + NPU_SET_OFM_BASE1 = 0x4011, + NPU_SET_OFM_BASE2 = 0x4012, + NPU_SET_OFM_BASE3 = 0x4013, + NPU_SET_OFM_STRIDE_X = 
0x4014, + NPU_SET_OFM_STRIDE_Y = 0x4015, + NPU_SET_OFM_STRIDE_C = 0x4016, + NPU_SET_WEIGHT_BASE = 0x4020, + NPU_SET_WEIGHT_LENGTH = 0x4021, + NPU_SET_SCALE_BASE = 0x4022, + NPU_SET_SCALE_LENGTH = 0x4023, + NPU_SET_DMA0_SRC = 0x4030, + NPU_SET_DMA0_DST = 0x4031, + NPU_SET_DMA0_LEN = 0x4032, + NPU_SET_DMA0_SRC_STRIDE0 = 0x4033, + NPU_SET_DMA0_SRC_STRIDE1 = 0x4034, + NPU_SET_DMA0_DST_STRIDE0 = 0x4035, + NPU_SET_DMA0_DST_STRIDE1 = 0x4036, + NPU_SET_IFM2_BASE0 = 0x4080, + NPU_SET_IFM2_BASE1 = 0x4081, + NPU_SET_IFM2_BASE2 = 0x4082, + NPU_SET_IFM2_BASE3 = 0x4083, + NPU_SET_IFM2_STRIDE_X = 0x4084, + NPU_SET_IFM2_STRIDE_Y = 0x4085, + NPU_SET_IFM2_STRIDE_C = 0x4086, + NPU_SET_WEIGHT1_BASE = 0x4090, + NPU_SET_WEIGHT1_LENGTH = 0x4091, + NPU_SET_SCALE1_BASE = 0x4092, + NPU_SET_WEIGHT2_BASE = 0x4092, + NPU_SET_SCALE1_LENGTH = 0x4093, + NPU_SET_WEIGHT2_LENGTH = 0x4093, + NPU_SET_WEIGHT3_BASE = 0x4094, + NPU_SET_WEIGHT3_LENGTH = 0x4095, +}; + +#define ETHOSU_SRAM_REGION 2 /* Matching Vela compiler */ + +/** + * struct ethosu_device - Ethosu device + */ +struct ethosu_device { + /** @base: Base drm_device. */ + struct drm_device base; + + /** @iomem: CPU mapping of the registers. */ + void __iomem *regs; + + void __iomem *sram; + struct gen_pool *srampool; + dma_addr_t sramphys; + + struct clk_bulk_data *clks; + int num_clks; + int irq; + + struct drm_ethosu_npu_info npu_info; + + struct ethosu_job *in_flight_job; + /* For in_flight_job and ethosu_job_hw_submit() */ + struct mutex job_lock; + + /* For dma_fence */ + spinlock_t fence_lock; + + struct drm_gpu_scheduler sched; + /* For ethosu_job_do_push() */ + struct mutex sched_lock; + u64 fence_context; + u64 emit_seqno; +}; + +#define to_ethosu_device(drm_dev) \ + ((struct ethosu_device *)container_of(drm_dev, struct ethosu_device, base)) + +static inline bool ethosu_is_u65(const struct ethosu_device *ethosudev) +{ + return FIELD_GET(ID_ARCH_MAJOR_MASK, ethosudev->npu_info.id) == 1; +} + +#endif diff --git a/drivers/accel/ethosu/ethosu_drv.c b/drivers/accel/ethosu/ethosu_drv.c new file mode 100644 index 000000000000..e05a69bf5574 --- /dev/null +++ b/drivers/accel/ethosu/ethosu_drv.c @@ -0,0 +1,403 @@ +// SPDX-License-Identifier: GPL-2.0-only or MIT +// Copyright (C) 2025 Arm, Ltd. 
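The dev_info() in ethosu_init() later in this file decodes NPU_REG_ID and NPU_REG_CONFIG with the FIELD_GET() masks from ethosu_device.h. A self-contained user-space sketch of the same decode, using invented register values purely for illustration:

#include <stdint.h>
#include <stdio.h>

/* Plain-C stand-in for the kernel's GENMASK()/FIELD_GET() pair. */
#define FIELD(val, hi, lo)  (((val) >> (lo)) & ((1u << ((hi) - (lo) + 1u)) - 1u))

int main(void)
{
	/* Invented values (field layout per ethosu_device.h); arch major 2 is "not U65" per ethosu_is_u65(). */
	uint32_t id = 0x20000100;	/* arch v2.0.0, rev r1p0 */
	uint32_t config = 0x18;		/* cmd stream ver 1, 2^8 = 256 MACs per clock cycle */

	printf("arch v%u.%u.%u, rev r%up%u, cmd stream ver %u, %u MACs\n",
	       FIELD(id, 31, 28), FIELD(id, 27, 20), FIELD(id, 19, 16),
	       FIELD(id, 11, 8), FIELD(id, 7, 4),
	       FIELD(config, 7, 4), 1u << FIELD(config, 3, 0));
	return 0;
}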
+ +#include <linux/bitfield.h> +#include <linux/clk.h> +#include <linux/genalloc.h> +#include <linux/io.h> +#include <linux/iopoll.h> +#include <linux/module.h> +#include <linux/mod_devicetable.h> +#include <linux/platform_device.h> +#include <linux/pm_runtime.h> + +#include <drm/drm_drv.h> +#include <drm/drm_ioctl.h> +#include <drm/drm_utils.h> +#include <drm/drm_gem.h> +#include <drm/drm_accel.h> +#include <drm/ethosu_accel.h> + +#include "ethosu_drv.h" +#include "ethosu_device.h" +#include "ethosu_gem.h" +#include "ethosu_job.h" + +static int ethosu_ioctl_dev_query(struct drm_device *ddev, void *data, + struct drm_file *file) +{ + struct ethosu_device *ethosudev = to_ethosu_device(ddev); + struct drm_ethosu_dev_query *args = data; + + if (!args->pointer) { + switch (args->type) { + case DRM_ETHOSU_DEV_QUERY_NPU_INFO: + args->size = sizeof(ethosudev->npu_info); + return 0; + default: + return -EINVAL; + } + } + + switch (args->type) { + case DRM_ETHOSU_DEV_QUERY_NPU_INFO: + if (args->size < offsetofend(struct drm_ethosu_npu_info, sram_size)) + return -EINVAL; + return copy_struct_to_user(u64_to_user_ptr(args->pointer), + args->size, + ðosudev->npu_info, + sizeof(ethosudev->npu_info), NULL); + default: + return -EINVAL; + } +} + +#define ETHOSU_BO_FLAGS DRM_ETHOSU_BO_NO_MMAP + +static int ethosu_ioctl_bo_create(struct drm_device *ddev, void *data, + struct drm_file *file) +{ + struct drm_ethosu_bo_create *args = data; + int cookie, ret; + + if (!drm_dev_enter(ddev, &cookie)) + return -ENODEV; + + if (!args->size || (args->flags & ~ETHOSU_BO_FLAGS)) { + ret = -EINVAL; + goto out_dev_exit; + } + + ret = ethosu_gem_create_with_handle(file, ddev, &args->size, + args->flags, &args->handle); + +out_dev_exit: + drm_dev_exit(cookie); + return ret; +} + +static int ethosu_ioctl_bo_wait(struct drm_device *ddev, void *data, + struct drm_file *file) +{ + struct drm_ethosu_bo_wait *args = data; + int cookie, ret; + unsigned long timeout = drm_timeout_abs_to_jiffies(args->timeout_ns); + + if (args->pad) + return -EINVAL; + + if (!drm_dev_enter(ddev, &cookie)) + return -ENODEV; + + ret = drm_gem_dma_resv_wait(file, args->handle, true, timeout); + + drm_dev_exit(cookie); + return ret; +} + +static int ethosu_ioctl_bo_mmap_offset(struct drm_device *ddev, void *data, + struct drm_file *file) +{ + struct drm_ethosu_bo_mmap_offset *args = data; + struct drm_gem_object *obj; + + if (args->pad) + return -EINVAL; + + obj = drm_gem_object_lookup(file, args->handle); + if (!obj) + return -ENOENT; + + args->offset = drm_vma_node_offset_addr(&obj->vma_node); + drm_gem_object_put(obj); + return 0; +} + +static int ethosu_ioctl_cmdstream_bo_create(struct drm_device *ddev, void *data, + struct drm_file *file) +{ + struct drm_ethosu_cmdstream_bo_create *args = data; + int cookie, ret; + + if (!drm_dev_enter(ddev, &cookie)) + return -ENODEV; + + if (!args->size || !args->data || args->pad || args->flags) { + ret = -EINVAL; + goto out_dev_exit; + } + + args->flags |= DRM_ETHOSU_BO_NO_MMAP; + + ret = ethosu_gem_cmdstream_create(file, ddev, args->size, args->data, + args->flags, &args->handle); + +out_dev_exit: + drm_dev_exit(cookie); + return ret; +} + +static int ethosu_open(struct drm_device *ddev, struct drm_file *file) +{ + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + + struct ethosu_file_priv __free(kfree) *priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { + ret = -ENOMEM; + goto err_put_mod; + } + priv->edev = to_ethosu_device(ddev); + + ret = ethosu_job_open(priv); + if (ret) + 
goto err_put_mod; + + file->driver_priv = no_free_ptr(priv); + return 0; + +err_put_mod: + module_put(THIS_MODULE); + return ret; +} + +static void ethosu_postclose(struct drm_device *ddev, struct drm_file *file) +{ + ethosu_job_close(file->driver_priv); + kfree(file->driver_priv); + module_put(THIS_MODULE); +} + +static const struct drm_ioctl_desc ethosu_drm_driver_ioctls[] = { +#define ETHOSU_IOCTL(n, func, flags) \ + DRM_IOCTL_DEF_DRV(ETHOSU_##n, ethosu_ioctl_##func, flags) + + ETHOSU_IOCTL(DEV_QUERY, dev_query, 0), + ETHOSU_IOCTL(BO_CREATE, bo_create, 0), + ETHOSU_IOCTL(BO_WAIT, bo_wait, 0), + ETHOSU_IOCTL(BO_MMAP_OFFSET, bo_mmap_offset, 0), + ETHOSU_IOCTL(CMDSTREAM_BO_CREATE, cmdstream_bo_create, 0), + ETHOSU_IOCTL(SUBMIT, submit, 0), +}; + +DEFINE_DRM_ACCEL_FOPS(ethosu_drm_driver_fops); + +/* + * Ethosu driver version: + * - 1.0 - initial interface + */ +static const struct drm_driver ethosu_drm_driver = { + .driver_features = DRIVER_COMPUTE_ACCEL | DRIVER_GEM, + .open = ethosu_open, + .postclose = ethosu_postclose, + .ioctls = ethosu_drm_driver_ioctls, + .num_ioctls = ARRAY_SIZE(ethosu_drm_driver_ioctls), + .fops = ðosu_drm_driver_fops, + .name = "ethosu", + .desc = "Arm Ethos-U Accel driver", + .major = 1, + .minor = 0, + + .gem_create_object = ethosu_gem_create_object, +}; + +#define U65_DRAM_AXI_LIMIT_CFG 0x1f3f0002 +#define U65_SRAM_AXI_LIMIT_CFG 0x1f3f00b0 +#define U85_AXI_EXT_CFG 0x00021f3f +#define U85_AXI_SRAM_CFG 0x00021f3f +#define U85_MEM_ATTR0_CFG 0x00000000 +#define U85_MEM_ATTR2_CFG 0x000000b7 + +static int ethosu_reset(struct ethosu_device *ethosudev) +{ + int ret; + u32 reg; + + writel_relaxed(RESET_PENDING_CSL, ethosudev->regs + NPU_REG_RESET); + ret = readl_poll_timeout(ethosudev->regs + NPU_REG_STATUS, reg, + !FIELD_GET(STATUS_RESET_STATUS, reg), + USEC_PER_MSEC, USEC_PER_SEC); + if (ret) + return ret; + + if (!FIELD_GET(PROT_ACTIVE_CSL, readl_relaxed(ethosudev->regs + NPU_REG_PROT))) { + dev_warn(ethosudev->base.dev, "Could not reset to non-secure mode (PROT = %x)\n", + readl_relaxed(ethosudev->regs + NPU_REG_PROT)); + } + + /* + * Assign region 2 (SRAM) to AXI M0 (AXILIMIT0), + * everything else to AXI M1 (AXILIMIT2) + */ + writel_relaxed(0x0000aa8a, ethosudev->regs + NPU_REG_REGIONCFG); + if (ethosu_is_u65(ethosudev)) { + writel_relaxed(U65_SRAM_AXI_LIMIT_CFG, ethosudev->regs + NPU_REG_AXILIMIT0); + writel_relaxed(U65_DRAM_AXI_LIMIT_CFG, ethosudev->regs + NPU_REG_AXILIMIT2); + } else { + writel_relaxed(U85_AXI_SRAM_CFG, ethosudev->regs + NPU_REG_AXI_SRAM); + writel_relaxed(U85_AXI_EXT_CFG, ethosudev->regs + NPU_REG_AXI_EXT); + writel_relaxed(U85_MEM_ATTR0_CFG, ethosudev->regs + NPU_REG_MEM_ATTR0); // SRAM + writel_relaxed(U85_MEM_ATTR2_CFG, ethosudev->regs + NPU_REG_MEM_ATTR2); // DRAM + } + + if (ethosudev->sram) + memset_io(ethosudev->sram, 0, ethosudev->npu_info.sram_size); + + return 0; +} + +static int ethosu_device_resume(struct device *dev) +{ + struct ethosu_device *ethosudev = dev_get_drvdata(dev); + int ret; + + ret = clk_bulk_prepare_enable(ethosudev->num_clks, ethosudev->clks); + if (ret) + return ret; + + ret = ethosu_reset(ethosudev); + if (!ret) + return 0; + + clk_bulk_disable_unprepare(ethosudev->num_clks, ethosudev->clks); + return ret; +} + +static int ethosu_device_suspend(struct device *dev) +{ + struct ethosu_device *ethosudev = dev_get_drvdata(dev); + + clk_bulk_disable_unprepare(ethosudev->num_clks, ethosudev->clks); + return 0; +} + +static int ethosu_sram_init(struct ethosu_device *ethosudev) +{ + ethosudev->npu_info.sram_size = 0; + + 
ethosudev->srampool = of_gen_pool_get(ethosudev->base.dev->of_node, "sram", 0); + if (!ethosudev->srampool) + return 0; + + ethosudev->npu_info.sram_size = gen_pool_size(ethosudev->srampool); + + ethosudev->sram = (void __iomem *)gen_pool_dma_alloc(ethosudev->srampool, + ethosudev->npu_info.sram_size, + ðosudev->sramphys); + if (!ethosudev->sram) { + dev_err(ethosudev->base.dev, "failed to allocate from SRAM pool\n"); + return -ENOMEM; + } + + return 0; +} + +static int ethosu_init(struct ethosu_device *ethosudev) +{ + int ret; + u32 id, config; + + ret = ethosu_device_resume(ethosudev->base.dev); + if (ret) + return ret; + + pm_runtime_set_autosuspend_delay(ethosudev->base.dev, 50); + pm_runtime_use_autosuspend(ethosudev->base.dev); + ret = devm_pm_runtime_set_active_enabled(ethosudev->base.dev); + if (ret) + return ret; + pm_runtime_get_noresume(ethosudev->base.dev); + + ethosudev->npu_info.id = id = readl_relaxed(ethosudev->regs + NPU_REG_ID); + ethosudev->npu_info.config = config = readl_relaxed(ethosudev->regs + NPU_REG_CONFIG); + + ethosu_sram_init(ethosudev); + + dev_info(ethosudev->base.dev, + "Ethos-U NPU, arch v%ld.%ld.%ld, rev r%ldp%ld, cmd stream ver%ld, %d MACs, %dKB SRAM\n", + FIELD_GET(ID_ARCH_MAJOR_MASK, id), + FIELD_GET(ID_ARCH_MINOR_MASK, id), + FIELD_GET(ID_ARCH_PATCH_MASK, id), + FIELD_GET(ID_VER_MAJOR_MASK, id), + FIELD_GET(ID_VER_MINOR_MASK, id), + FIELD_GET(CONFIG_CMD_STREAM_VER_MASK, config), + 1 << FIELD_GET(CONFIG_MACS_PER_CC_MASK, config), + ethosudev->npu_info.sram_size / 1024); + + return 0; +} + +static int ethosu_probe(struct platform_device *pdev) +{ + int ret; + struct ethosu_device *ethosudev; + + ethosudev = devm_drm_dev_alloc(&pdev->dev, ðosu_drm_driver, + struct ethosu_device, base); + if (IS_ERR(ethosudev)) + return -ENOMEM; + platform_set_drvdata(pdev, ethosudev); + + dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(40)); + + ethosudev->regs = devm_platform_ioremap_resource(pdev, 0); + + ethosudev->num_clks = devm_clk_bulk_get_all(&pdev->dev, ðosudev->clks); + if (ethosudev->num_clks < 0) + return ethosudev->num_clks; + + ret = ethosu_job_init(ethosudev); + if (ret) + return ret; + + ret = ethosu_init(ethosudev); + if (ret) + return ret; + + ret = drm_dev_register(ðosudev->base, 0); + if (ret) + pm_runtime_dont_use_autosuspend(ethosudev->base.dev); + + pm_runtime_put_autosuspend(ethosudev->base.dev); + return ret; +} + +static void ethosu_remove(struct platform_device *pdev) +{ + struct ethosu_device *ethosudev = dev_get_drvdata(&pdev->dev); + + drm_dev_unregister(ðosudev->base); + ethosu_job_fini(ethosudev); + if (ethosudev->sram) + gen_pool_free(ethosudev->srampool, (unsigned long)ethosudev->sram, + ethosudev->npu_info.sram_size); +} + +static const struct of_device_id dt_match[] = { + { .compatible = "arm,ethos-u65" }, + { .compatible = "arm,ethos-u85" }, + {} +}; +MODULE_DEVICE_TABLE(of, dt_match); + +static DEFINE_RUNTIME_DEV_PM_OPS(ethosu_pm_ops, + ethosu_device_suspend, + ethosu_device_resume, + NULL); + +static struct platform_driver ethosu_driver = { + .probe = ethosu_probe, + .remove = ethosu_remove, + .driver = { + .name = "ethosu", + .pm = pm_ptr(ðosu_pm_ops), + .of_match_table = dt_match, + }, +}; +module_platform_driver(ethosu_driver); + +MODULE_AUTHOR("Rob Herring <robh@kernel.org>"); +MODULE_DESCRIPTION("Arm Ethos-U Accel Driver"); +MODULE_LICENSE("Dual MIT/GPL"); diff --git a/drivers/accel/ethosu/ethosu_drv.h b/drivers/accel/ethosu/ethosu_drv.h new file mode 100644 index 000000000000..9e21dfe94184 --- /dev/null +++ 
b/drivers/accel/ethosu/ethosu_drv.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR MIT */ +/* Copyright 2025 Arm, Ltd. */ +#ifndef __ETHOSU_DRV_H__ +#define __ETHOSU_DRV_H__ + +#include <drm/gpu_scheduler.h> + +struct ethosu_device; + +struct ethosu_file_priv { + struct ethosu_device *edev; + struct drm_sched_entity sched_entity; +}; + +#endif diff --git a/drivers/accel/ethosu/ethosu_gem.c b/drivers/accel/ethosu/ethosu_gem.c new file mode 100644 index 000000000000..473b5f5d7514 --- /dev/null +++ b/drivers/accel/ethosu/ethosu_gem.c @@ -0,0 +1,704 @@ +// SPDX-License-Identifier: GPL-2.0-only or MIT +/* Copyright 2025 Arm, Ltd. */ + +#include <linux/err.h> +#include <linux/slab.h> + +#include <drm/ethosu_accel.h> + +#include "ethosu_device.h" +#include "ethosu_gem.h" + +static void ethosu_gem_free_object(struct drm_gem_object *obj) +{ + struct ethosu_gem_object *bo = to_ethosu_bo(obj); + + kfree(bo->info); + drm_gem_free_mmap_offset(&bo->base.base); + drm_gem_dma_free(&bo->base); +} + +static int ethosu_gem_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma) +{ + struct ethosu_gem_object *bo = to_ethosu_bo(obj); + + /* Don't allow mmap on objects that have the NO_MMAP flag set. */ + if (bo->flags & DRM_ETHOSU_BO_NO_MMAP) + return -EINVAL; + + return drm_gem_dma_object_mmap(obj, vma); +} + +static const struct drm_gem_object_funcs ethosu_gem_funcs = { + .free = ethosu_gem_free_object, + .print_info = drm_gem_dma_object_print_info, + .get_sg_table = drm_gem_dma_object_get_sg_table, + .vmap = drm_gem_dma_object_vmap, + .mmap = ethosu_gem_mmap, + .vm_ops = &drm_gem_dma_vm_ops, +}; + +/** + * ethosu_gem_create_object - Implementation of driver->gem_create_object. + * @ddev: DRM device + * @size: Size in bytes of the memory the object will reference + * + * This lets the GEM helpers allocate object structs for us, and keep + * our BO stats correct. + */ +struct drm_gem_object *ethosu_gem_create_object(struct drm_device *ddev, size_t size) +{ + struct ethosu_gem_object *obj; + + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return ERR_PTR(-ENOMEM); + + obj->base.base.funcs = ðosu_gem_funcs; + return &obj->base.base; +} + +/** + * ethosu_gem_create_with_handle() - Create a GEM object and attach it to a handle. + * @file: DRM file. + * @ddev: DRM device. + * @size: Size of the GEM object to allocate. + * @flags: Combination of drm_ethosu_bo_flags flags. + * @handle: Pointer holding the handle pointing to the new GEM object. + * + * Return: Zero on success + */ +int ethosu_gem_create_with_handle(struct drm_file *file, + struct drm_device *ddev, + u64 *size, u32 flags, u32 *handle) +{ + struct drm_gem_dma_object *mem; + struct ethosu_gem_object *bo; + int ret; + + mem = drm_gem_dma_create(ddev, *size); + if (IS_ERR(mem)) + return PTR_ERR(mem); + + bo = to_ethosu_bo(&mem->base); + bo->flags = flags; + + /* + * Allocate an id of idr table where the obj is registered + * and handle has the id what user can see. + */ + ret = drm_gem_handle_create(file, &mem->base, handle); + if (!ret) + *size = bo->base.base.size; + + /* drop reference from allocate - handle holds it now. 
*/ + drm_gem_object_put(&mem->base); + + return ret; +} + +struct dma { + s8 region; + u64 len; + u64 offset; + s64 stride[2]; +}; + +struct dma_state { + u16 size0; + u16 size1; + s8 mode; + struct dma src; + struct dma dst; +}; + +struct buffer { + u64 base; + u32 length; + s8 region; +}; + +struct feat_matrix { + u64 base[4]; + s64 stride_x; + s64 stride_y; + s64 stride_c; + s8 region; + u8 broadcast; + u16 stride_kernel; + u16 precision; + u16 depth; + u16 width; + u16 width0; + u16 height[3]; + u8 pad_top; + u8 pad_left; + u8 pad_bottom; + u8 pad_right; +}; + +struct cmd_state { + struct dma_state dma; + struct buffer scale[2]; + struct buffer weight[4]; + struct feat_matrix ofm; + struct feat_matrix ifm; + struct feat_matrix ifm2; +}; + +static void cmd_state_init(struct cmd_state *st) +{ + /* Initialize to all 1s to detect missing setup */ + memset(st, 0xff, sizeof(*st)); +} + +static u64 cmd_to_addr(u32 *cmd) +{ + return ((u64)((cmd[0] & 0xff0000) << 16)) | cmd[1]; +} + +static u64 dma_length(struct ethosu_validated_cmdstream_info *info, + struct dma_state *dma_st, struct dma *dma) +{ + s8 mode = dma_st->mode; + u64 len = dma->len; + + if (mode >= 1) { + len += dma->stride[0]; + len *= dma_st->size0; + } + if (mode == 2) { + len += dma->stride[1]; + len *= dma_st->size1; + } + if (dma->region >= 0) + info->region_size[dma->region] = max(info->region_size[dma->region], + len + dma->offset); + + return len; +} + +static u64 feat_matrix_length(struct ethosu_validated_cmdstream_info *info, + struct feat_matrix *fm, + u32 x, u32 y, u32 c) +{ + u32 element_size, storage = fm->precision >> 14; + int tile = 0; + u64 addr; + + if (fm->region < 0) + return U64_MAX; + + switch (storage) { + case 0: + if (x >= fm->width0 + 1) { + x -= fm->width0 + 1; + tile += 1; + } + if (y >= fm->height[tile] + 1) { + y -= fm->height[tile] + 1; + tile += 2; + } + break; + case 1: + if (y >= fm->height[1] + 1) { + y -= fm->height[1] + 1; + tile = 2; + } else if (y >= fm->height[0] + 1) { + y -= fm->height[0] + 1; + tile = 1; + } + break; + } + if (fm->base[tile] == U64_MAX) + return U64_MAX; + + addr = fm->base[tile] + y * fm->stride_y; + + switch ((fm->precision >> 6) & 0x3) { // format + case 0: //nhwc: + addr += x * fm->stride_x + c; + break; + case 1: //nhcwb16: + element_size = BIT((fm->precision >> 1) & 0x3); + + addr += (c / 16) * fm->stride_c + (16 * x + (c & 0xf)) * element_size; + break; + } + + info->region_size[fm->region] = max(info->region_size[fm->region], addr + 1); + + return addr; +} + +static int calc_sizes(struct drm_device *ddev, + struct ethosu_validated_cmdstream_info *info, + u16 op, struct cmd_state *st, + bool ifm, bool ifm2, bool weight, bool scale) +{ + u64 len; + + if (ifm) { + if (st->ifm.stride_kernel == U16_MAX) + return -EINVAL; + u32 stride_y = ((st->ifm.stride_kernel >> 8) & 0x2) + + ((st->ifm.stride_kernel >> 1) & 0x1) + 1; + u32 stride_x = ((st->ifm.stride_kernel >> 5) & 0x2) + + (st->ifm.stride_kernel & 0x1) + 1; + u32 ifm_height = st->ofm.height[2] * stride_y + + st->ifm.height[2] - (st->ifm.pad_top + st->ifm.pad_bottom); + u32 ifm_width = st->ofm.width * stride_x + + st->ifm.width - (st->ifm.pad_left + st->ifm.pad_right); + + len = feat_matrix_length(info, &st->ifm, ifm_width, + ifm_height, st->ifm.depth); + dev_dbg(ddev->dev, "op %d: IFM:%d:0x%llx-0x%llx\n", + op, st->ifm.region, st->ifm.base[0], len); + if (len == U64_MAX) + return -EINVAL; + } + + if (ifm2) { + len = feat_matrix_length(info, &st->ifm2, st->ifm.depth, + 0, st->ofm.depth); + dev_dbg(ddev->dev, "op %d: 
IFM2:%d:0x%llx-0x%llx\n", + op, st->ifm2.region, st->ifm2.base[0], len); + if (len == U64_MAX) + return -EINVAL; + } + + if (weight) { + dev_dbg(ddev->dev, "op %d: W:%d:0x%llx-0x%llx\n", + op, st->weight[0].region, st->weight[0].base, + st->weight[0].base + st->weight[0].length - 1); + if (st->weight[0].region < 0 || st->weight[0].base == U64_MAX || + st->weight[0].length == U32_MAX) + return -EINVAL; + info->region_size[st->weight[0].region] = + max(info->region_size[st->weight[0].region], + st->weight[0].base + st->weight[0].length); + } + + if (scale) { + dev_dbg(ddev->dev, "op %d: S:%d:0x%llx-0x%llx\n", + op, st->scale[0].region, st->scale[0].base, + st->scale[0].base + st->scale[0].length - 1); + if (st->scale[0].region < 0 || st->scale[0].base == U64_MAX || + st->scale[0].length == U32_MAX) + return -EINVAL; + info->region_size[st->scale[0].region] = + max(info->region_size[st->scale[0].region], + st->scale[0].base + st->scale[0].length); + } + + len = feat_matrix_length(info, &st->ofm, st->ofm.width, + st->ofm.height[2], st->ofm.depth); + dev_dbg(ddev->dev, "op %d: OFM:%d:0x%llx-0x%llx\n", + op, st->ofm.region, st->ofm.base[0], len); + if (len == U64_MAX) + return -EINVAL; + info->output_region[st->ofm.region] = true; + + return 0; +} + +static int calc_sizes_elemwise(struct drm_device *ddev, + struct ethosu_validated_cmdstream_info *info, + u16 op, struct cmd_state *st, + bool ifm, bool ifm2) +{ + u32 height, width, depth; + u64 len; + + if (ifm) { + height = st->ifm.broadcast & 0x1 ? 0 : st->ofm.height[2]; + width = st->ifm.broadcast & 0x2 ? 0 : st->ofm.width; + depth = st->ifm.broadcast & 0x4 ? 0 : st->ofm.depth; + + len = feat_matrix_length(info, &st->ifm, width, + height, depth); + dev_dbg(ddev->dev, "op %d: IFM:%d:0x%llx-0x%llx\n", + op, st->ifm.region, st->ifm.base[0], len); + if (len == U64_MAX) + return -EINVAL; + } + + if (ifm2) { + height = st->ifm2.broadcast & 0x1 ? 0 : st->ofm.height[2]; + width = st->ifm2.broadcast & 0x2 ? 0 : st->ofm.width; + depth = st->ifm2.broadcast & 0x4 ? 
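+			/* broadcast bit set: only element 0 of this axis contributes to the size bound */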
0 : st->ofm.depth; + + len = feat_matrix_length(info, &st->ifm2, width, + height, depth); + dev_dbg(ddev->dev, "op %d: IFM2:%d:0x%llx-0x%llx\n", + op, st->ifm2.region, st->ifm2.base[0], len); + if (len == U64_MAX) + return -EINVAL; + } + + len = feat_matrix_length(info, &st->ofm, st->ofm.width, + st->ofm.height[2], st->ofm.depth); + dev_dbg(ddev->dev, "op %d: OFM:%d:0x%llx-0x%llx\n", + op, st->ofm.region, st->ofm.base[0], len); + if (len == U64_MAX) + return -EINVAL; + info->output_region[st->ofm.region] = true; + + return 0; +} + +static int ethosu_gem_cmdstream_copy_and_validate(struct drm_device *ddev, + u32 __user *ucmds, + struct ethosu_gem_object *bo, + u32 size) +{ + struct ethosu_validated_cmdstream_info __free(kfree) *info = kzalloc(sizeof(*info), GFP_KERNEL); + struct ethosu_device *edev = to_ethosu_device(ddev); + u32 *bocmds = bo->base.vaddr; + struct cmd_state st; + int i, ret; + + if (!info) + return -ENOMEM; + info->cmd_size = size; + + cmd_state_init(&st); + + for (i = 0; i < size / 4; i++) { + bool use_ifm, use_ifm2, use_scale; + u64 dstlen, srclen; + u16 cmd, param; + u32 cmds[2]; + u64 addr; + + if (get_user(cmds[0], ucmds++)) + return -EFAULT; + + bocmds[i] = cmds[0]; + + cmd = cmds[0]; + param = cmds[0] >> 16; + + if (cmd & 0x4000) { + if (get_user(cmds[1], ucmds++)) + return -EFAULT; + + i++; + bocmds[i] = cmds[1]; + addr = cmd_to_addr(cmds); + } + + switch (cmd) { + case NPU_OP_DMA_START: + srclen = dma_length(info, &st.dma, &st.dma.src); + dstlen = dma_length(info, &st.dma, &st.dma.dst); + + if (st.dma.dst.region >= 0) + info->output_region[st.dma.dst.region] = true; + dev_dbg(ddev->dev, "cmd: DMA SRC:%d:0x%llx+0x%llx DST:%d:0x%llx+0x%llx\n", + st.dma.src.region, st.dma.src.offset, srclen, + st.dma.dst.region, st.dma.dst.offset, dstlen); + break; + case NPU_OP_CONV: + case NPU_OP_DEPTHWISE: + use_ifm2 = param & 0x1; // weights_ifm2 + use_scale = !(st.ofm.precision & 0x100); + ret = calc_sizes(ddev, info, cmd, &st, true, use_ifm2, + !use_ifm2, use_scale); + if (ret) + return ret; + break; + case NPU_OP_POOL: + use_ifm = param != 0x4; // pooling mode + use_scale = !(st.ofm.precision & 0x100); + ret = calc_sizes(ddev, info, cmd, &st, use_ifm, false, + false, use_scale); + if (ret) + return ret; + break; + case NPU_OP_ELEMENTWISE: + use_ifm2 = !((st.ifm2.broadcast == 8) || (param == 5) || + (param == 6) || (param == 7) || (param == 0x24)); + use_ifm = st.ifm.broadcast != 8; + ret = calc_sizes_elemwise(ddev, info, cmd, &st, use_ifm, use_ifm2); + if (ret) + return ret; + break; + case NPU_OP_RESIZE: // U85 only + WARN_ON(1); // TODO + break; + case NPU_SET_KERNEL_WIDTH_M1: + st.ifm.width = param; + break; + case NPU_SET_KERNEL_HEIGHT_M1: + st.ifm.height[2] = param; + break; + case NPU_SET_KERNEL_STRIDE: + st.ifm.stride_kernel = param; + break; + case NPU_SET_IFM_PAD_TOP: + st.ifm.pad_top = param & 0x7f; + break; + case NPU_SET_IFM_PAD_LEFT: + st.ifm.pad_left = param & 0x7f; + break; + case NPU_SET_IFM_PAD_RIGHT: + st.ifm.pad_right = param & 0xff; + break; + case NPU_SET_IFM_PAD_BOTTOM: + st.ifm.pad_bottom = param & 0xff; + break; + case NPU_SET_IFM_DEPTH_M1: + st.ifm.depth = param; + break; + case NPU_SET_IFM_PRECISION: + st.ifm.precision = param; + break; + case NPU_SET_IFM_BROADCAST: + st.ifm.broadcast = param; + break; + case NPU_SET_IFM_REGION: + st.ifm.region = param & 0x7f; + break; + case NPU_SET_IFM_WIDTH0_M1: + st.ifm.width0 = param; + break; + case NPU_SET_IFM_HEIGHT0_M1: + st.ifm.height[0] = param; + break; + case NPU_SET_IFM_HEIGHT1_M1: + st.ifm.height[1] = 
param; + break; + case NPU_SET_IFM_BASE0: + case NPU_SET_IFM_BASE1: + case NPU_SET_IFM_BASE2: + case NPU_SET_IFM_BASE3: + st.ifm.base[cmd & 0x3] = addr; + break; + case NPU_SET_IFM_STRIDE_X: + st.ifm.stride_x = addr; + break; + case NPU_SET_IFM_STRIDE_Y: + st.ifm.stride_y = addr; + break; + case NPU_SET_IFM_STRIDE_C: + st.ifm.stride_c = addr; + break; + + case NPU_SET_OFM_WIDTH_M1: + st.ofm.width = param; + break; + case NPU_SET_OFM_HEIGHT_M1: + st.ofm.height[2] = param; + break; + case NPU_SET_OFM_DEPTH_M1: + st.ofm.depth = param; + break; + case NPU_SET_OFM_PRECISION: + st.ofm.precision = param; + break; + case NPU_SET_OFM_REGION: + st.ofm.region = param & 0x7; + break; + case NPU_SET_OFM_WIDTH0_M1: + st.ofm.width0 = param; + break; + case NPU_SET_OFM_HEIGHT0_M1: + st.ofm.height[0] = param; + break; + case NPU_SET_OFM_HEIGHT1_M1: + st.ofm.height[1] = param; + break; + case NPU_SET_OFM_BASE0: + case NPU_SET_OFM_BASE1: + case NPU_SET_OFM_BASE2: + case NPU_SET_OFM_BASE3: + st.ofm.base[cmd & 0x3] = addr; + break; + case NPU_SET_OFM_STRIDE_X: + st.ofm.stride_x = addr; + break; + case NPU_SET_OFM_STRIDE_Y: + st.ofm.stride_y = addr; + break; + case NPU_SET_OFM_STRIDE_C: + st.ofm.stride_c = addr; + break; + + case NPU_SET_IFM2_BROADCAST: + st.ifm2.broadcast = param; + break; + case NPU_SET_IFM2_PRECISION: + st.ifm2.precision = param; + break; + case NPU_SET_IFM2_REGION: + st.ifm2.region = param & 0x7; + break; + case NPU_SET_IFM2_WIDTH0_M1: + st.ifm2.width0 = param; + break; + case NPU_SET_IFM2_HEIGHT0_M1: + st.ifm2.height[0] = param; + break; + case NPU_SET_IFM2_HEIGHT1_M1: + st.ifm2.height[1] = param; + break; + case NPU_SET_IFM2_BASE0: + case NPU_SET_IFM2_BASE1: + case NPU_SET_IFM2_BASE2: + case NPU_SET_IFM2_BASE3: + st.ifm2.base[cmd & 0x3] = addr; + break; + case NPU_SET_IFM2_STRIDE_X: + st.ifm2.stride_x = addr; + break; + case NPU_SET_IFM2_STRIDE_Y: + st.ifm2.stride_y = addr; + break; + case NPU_SET_IFM2_STRIDE_C: + st.ifm2.stride_c = addr; + break; + + case NPU_SET_WEIGHT_REGION: + st.weight[0].region = param & 0x7; + break; + case NPU_SET_SCALE_REGION: + st.scale[0].region = param & 0x7; + break; + case NPU_SET_WEIGHT_BASE: + st.weight[0].base = addr; + break; + case NPU_SET_WEIGHT_LENGTH: + st.weight[0].length = cmds[1]; + break; + case NPU_SET_SCALE_BASE: + st.scale[0].base = addr; + break; + case NPU_SET_SCALE_LENGTH: + st.scale[0].length = cmds[1]; + break; + case NPU_SET_WEIGHT1_BASE: + st.weight[1].base = addr; + break; + case NPU_SET_WEIGHT1_LENGTH: + st.weight[1].length = cmds[1]; + break; + case NPU_SET_SCALE1_BASE: // NPU_SET_WEIGHT2_BASE (U85) + if (ethosu_is_u65(edev)) + st.scale[1].base = addr; + else + st.weight[2].base = addr; + break; + case NPU_SET_SCALE1_LENGTH: // NPU_SET_WEIGHT2_LENGTH (U85) + if (ethosu_is_u65(edev)) + st.scale[1].length = cmds[1]; + else + st.weight[1].length = cmds[1]; + break; + case NPU_SET_WEIGHT3_BASE: + st.weight[3].base = addr; + break; + case NPU_SET_WEIGHT3_LENGTH: + st.weight[3].length = cmds[1]; + break; + + case NPU_SET_DMA0_SRC_REGION: + if (param & 0x100) + st.dma.src.region = -1; + else + st.dma.src.region = param & 0x7; + st.dma.mode = (param >> 9) & 0x3; + break; + case NPU_SET_DMA0_DST_REGION: + if (param & 0x100) + st.dma.dst.region = -1; + else + st.dma.dst.region = param & 0x7; + break; + case NPU_SET_DMA0_SIZE0: + st.dma.size0 = param; + break; + case NPU_SET_DMA0_SIZE1: + st.dma.size1 = param; + break; + case NPU_SET_DMA0_SRC_STRIDE0: + st.dma.src.stride[0] = ((s64)addr << 24) >> 24; + break; + case NPU_SET_DMA0_SRC_STRIDE1: + 
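+			/*
+			 * Strides are 40-bit two's complement values: shifting
+			 * up and back down sign-extends bit 39, e.g.
+			 * 0xff_ffff_fff0 becomes -16.
+			 */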
+			st.dma.src.stride[1] = ((s64)addr << 24) >> 24;
+			break;
+		case NPU_SET_DMA0_DST_STRIDE0:
+			st.dma.dst.stride[0] = ((s64)addr << 24) >> 24;
+			break;
+		case NPU_SET_DMA0_DST_STRIDE1:
+			st.dma.dst.stride[1] = ((s64)addr << 24) >> 24;
+			break;
+		case NPU_SET_DMA0_SRC:
+			st.dma.src.offset = addr;
+			break;
+		case NPU_SET_DMA0_DST:
+			st.dma.dst.offset = addr;
+			break;
+		case NPU_SET_DMA0_LEN:
+			st.dma.src.len = st.dma.dst.len = addr;
+			break;
+		default:
+			break;
+		}
+	}
+
+	for (i = 0; i < NPU_BASEP_REGION_MAX; i++) {
+		if (!info->region_size[i])
+			continue;
+		dev_dbg(ddev->dev, "region %d max size: 0x%llx\n",
+			i, info->region_size[i]);
+	}
+
+	bo->info = no_free_ptr(info);
+	return 0;
+}
+
+/**
+ * ethosu_gem_cmdstream_create() - Create a command stream GEM object and attach it to a handle.
+ * @file: DRM file.
+ * @ddev: DRM device.
+ * @size: Size of the GEM object to allocate.
+ * @data: Userspace pointer to the command stream to copy and validate.
+ * @flags: Combination of drm_ethosu_bo_flags flags.
+ * @handle: Pointer holding the handle pointing to the new GEM object.
+ *
+ * Return: Zero on success.
+ */
+int ethosu_gem_cmdstream_create(struct drm_file *file,
+				struct drm_device *ddev,
+				u32 size, u64 data, u32 flags, u32 *handle)
+{
+	int ret;
+	struct drm_gem_dma_object *mem;
+	struct ethosu_gem_object *bo;
+
+	mem = drm_gem_dma_create(ddev, size);
+	if (IS_ERR(mem))
+		return PTR_ERR(mem);
+
+	bo = to_ethosu_bo(&mem->base);
+	bo->flags = flags;
+
+	ret = ethosu_gem_cmdstream_copy_and_validate(ddev,
+						     (void __user *)(uintptr_t)data,
+						     bo, size);
+	if (ret)
+		goto fail;
+
+	/*
+	 * Create a handle for the object; the handle is the ID that
+	 * userspace uses to refer to it.
+	 */
+	ret = drm_gem_handle_create(file, &mem->base, handle);
+
+fail:
+	/* drop reference from allocate - handle holds it now. */
+	drm_gem_object_put(&mem->base);
+
+	return ret;
+}
diff --git a/drivers/accel/ethosu/ethosu_gem.h b/drivers/accel/ethosu/ethosu_gem.h
new file mode 100644
index 000000000000..3922895a60fb
--- /dev/null
+++ b/drivers/accel/ethosu/ethosu_gem.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-only OR MIT */
+/* Copyright 2025 Arm, Ltd. */
+
+#ifndef __ETHOSU_GEM_H__
+#define __ETHOSU_GEM_H__
+
+#include "ethosu_device.h"
+#include <drm/drm_gem_dma_helper.h>
+
+struct ethosu_validated_cmdstream_info {
+	u32 cmd_size;
+	u64 region_size[NPU_BASEP_REGION_MAX];
+	bool output_region[NPU_BASEP_REGION_MAX];
+};
+
+/**
+ * struct ethosu_gem_object - Driver specific GEM object.
+ */
+struct ethosu_gem_object {
+	/** @base: Inherit from drm_gem_dma_object. */
+	struct drm_gem_dma_object base;
+
+	struct ethosu_validated_cmdstream_info *info;
+
+	/** @flags: Combination of drm_ethosu_bo_flags flags.
*/ + u32 flags; +}; + +static inline +struct ethosu_gem_object *to_ethosu_bo(struct drm_gem_object *obj) +{ + return container_of(to_drm_gem_dma_obj(obj), struct ethosu_gem_object, base); +} + +struct drm_gem_object *ethosu_gem_create_object(struct drm_device *ddev, + size_t size); + +int ethosu_gem_create_with_handle(struct drm_file *file, + struct drm_device *ddev, + u64 *size, u32 flags, uint32_t *handle); + +int ethosu_gem_cmdstream_create(struct drm_file *file, + struct drm_device *ddev, + u32 size, u64 data, u32 flags, u32 *handle); + +#endif /* __ETHOSU_GEM_H__ */ diff --git a/drivers/accel/ethosu/ethosu_job.c b/drivers/accel/ethosu/ethosu_job.c new file mode 100644 index 000000000000..26e7a2f64d71 --- /dev/null +++ b/drivers/accel/ethosu/ethosu_job.c @@ -0,0 +1,497 @@ +// SPDX-License-Identifier: GPL-2.0-only OR MIT +/* Copyright 2024-2025 Tomeu Vizoso <tomeu@tomeuvizoso.net> */ +/* Copyright 2025 Arm, Ltd. */ + +#include <linux/bitfield.h> +#include <linux/genalloc.h> +#include <linux/interrupt.h> +#include <linux/iopoll.h> +#include <linux/platform_device.h> +#include <linux/pm_runtime.h> + +#include <drm/drm_file.h> +#include <drm/drm_gem.h> +#include <drm/drm_gem_dma_helper.h> +#include <drm/drm_print.h> +#include <drm/ethosu_accel.h> + +#include "ethosu_device.h" +#include "ethosu_drv.h" +#include "ethosu_gem.h" +#include "ethosu_job.h" + +#define JOB_TIMEOUT_MS 500 + +static struct ethosu_job *to_ethosu_job(struct drm_sched_job *sched_job) +{ + return container_of(sched_job, struct ethosu_job, base); +} + +static const char *ethosu_fence_get_driver_name(struct dma_fence *fence) +{ + return "ethosu"; +} + +static const char *ethosu_fence_get_timeline_name(struct dma_fence *fence) +{ + return "ethosu-npu"; +} + +static const struct dma_fence_ops ethosu_fence_ops = { + .get_driver_name = ethosu_fence_get_driver_name, + .get_timeline_name = ethosu_fence_get_timeline_name, +}; + +static void ethosu_job_hw_submit(struct ethosu_device *dev, struct ethosu_job *job) +{ + struct drm_gem_dma_object *cmd_bo = to_drm_gem_dma_obj(job->cmd_bo); + struct ethosu_validated_cmdstream_info *cmd_info = to_ethosu_bo(job->cmd_bo)->info; + + for (int i = 0; i < job->region_cnt; i++) { + struct drm_gem_dma_object *bo; + int region = job->region_bo_num[i]; + + bo = to_drm_gem_dma_obj(job->region_bo[i]); + writel_relaxed(lower_32_bits(bo->dma_addr), dev->regs + NPU_REG_BASEP(region)); + writel_relaxed(upper_32_bits(bo->dma_addr), dev->regs + NPU_REG_BASEP_HI(region)); + dev_dbg(dev->base.dev, "Region %d base addr = %pad\n", region, &bo->dma_addr); + } + + if (job->sram_size) { + writel_relaxed(lower_32_bits(dev->sramphys), + dev->regs + NPU_REG_BASEP(ETHOSU_SRAM_REGION)); + writel_relaxed(upper_32_bits(dev->sramphys), + dev->regs + NPU_REG_BASEP_HI(ETHOSU_SRAM_REGION)); + dev_dbg(dev->base.dev, "Region %d base addr = %pad (SRAM)\n", + ETHOSU_SRAM_REGION, &dev->sramphys); + } + + writel_relaxed(lower_32_bits(cmd_bo->dma_addr), dev->regs + NPU_REG_QBASE); + writel_relaxed(upper_32_bits(cmd_bo->dma_addr), dev->regs + NPU_REG_QBASE_HI); + writel_relaxed(cmd_info->cmd_size, dev->regs + NPU_REG_QSIZE); + + writel(CMD_TRANSITION_TO_RUN, dev->regs + NPU_REG_CMD); + + dev_dbg(dev->base.dev, + "Submitted cmd at %pad to core\n", &cmd_bo->dma_addr); +} + +static int ethosu_acquire_object_fences(struct ethosu_job *job) +{ + int i, ret; + struct drm_gem_object **bos = job->region_bo; + struct ethosu_validated_cmdstream_info *info = to_ethosu_bo(job->cmd_bo)->info; + + for (i = 0; i < job->region_cnt; i++) { + bool 
is_write; + + if (!bos[i]) + break; + + ret = dma_resv_reserve_fences(bos[i]->resv, 1); + if (ret) + return ret; + + is_write = info->output_region[job->region_bo_num[i]]; + ret = drm_sched_job_add_implicit_dependencies(&job->base, bos[i], + is_write); + if (ret) + return ret; + } + + return 0; +} + +static void ethosu_attach_object_fences(struct ethosu_job *job) +{ + int i; + struct dma_fence *fence = job->inference_done_fence; + struct drm_gem_object **bos = job->region_bo; + struct ethosu_validated_cmdstream_info *info = to_ethosu_bo(job->cmd_bo)->info; + + for (i = 0; i < job->region_cnt; i++) + if (info->output_region[job->region_bo_num[i]]) + dma_resv_add_fence(bos[i]->resv, fence, DMA_RESV_USAGE_WRITE); +} + +static int ethosu_job_push(struct ethosu_job *job) +{ + struct ww_acquire_ctx acquire_ctx; + int ret; + + ret = drm_gem_lock_reservations(job->region_bo, job->region_cnt, &acquire_ctx); + if (ret) + return ret; + + ret = ethosu_acquire_object_fences(job); + if (ret) + goto out; + + ret = pm_runtime_resume_and_get(job->dev->base.dev); + if (!ret) { + guard(mutex)(&job->dev->sched_lock); + + drm_sched_job_arm(&job->base); + job->inference_done_fence = dma_fence_get(&job->base.s_fence->finished); + kref_get(&job->refcount); /* put by scheduler job completion */ + drm_sched_entity_push_job(&job->base); + ethosu_attach_object_fences(job); + } + +out: + drm_gem_unlock_reservations(job->region_bo, job->region_cnt, &acquire_ctx); + return ret; +} + +static void ethosu_job_cleanup(struct kref *ref) +{ + struct ethosu_job *job = container_of(ref, struct ethosu_job, + refcount); + unsigned int i; + + pm_runtime_put_autosuspend(job->dev->base.dev); + + dma_fence_put(job->done_fence); + dma_fence_put(job->inference_done_fence); + + for (i = 0; i < job->region_cnt; i++) + drm_gem_object_put(job->region_bo[i]); + + drm_gem_object_put(job->cmd_bo); + + kfree(job); +} + +static void ethosu_job_put(struct ethosu_job *job) +{ + kref_put(&job->refcount, ethosu_job_cleanup); +} + +static void ethosu_job_free(struct drm_sched_job *sched_job) +{ + struct ethosu_job *job = to_ethosu_job(sched_job); + + drm_sched_job_cleanup(sched_job); + ethosu_job_put(job); +} + +static struct dma_fence *ethosu_job_run(struct drm_sched_job *sched_job) +{ + struct ethosu_job *job = to_ethosu_job(sched_job); + struct ethosu_device *dev = job->dev; + struct dma_fence *fence = job->done_fence; + + if (unlikely(job->base.s_fence->finished.error)) + return NULL; + + dma_fence_init(fence, ðosu_fence_ops, &dev->fence_lock, + dev->fence_context, ++dev->emit_seqno); + dma_fence_get(fence); + + scoped_guard(mutex, &dev->job_lock) { + dev->in_flight_job = job; + ethosu_job_hw_submit(dev, job); + } + + return fence; +} + +static void ethosu_job_handle_irq(struct ethosu_device *dev) +{ + u32 status = readl_relaxed(dev->regs + NPU_REG_STATUS); + + if (status & (STATUS_BUS_STATUS | STATUS_CMD_PARSE_ERR)) { + dev_err(dev->base.dev, "Error IRQ - %x\n", status); + drm_sched_fault(&dev->sched); + return; + } + + scoped_guard(mutex, &dev->job_lock) { + if (dev->in_flight_job) { + dma_fence_signal(dev->in_flight_job->done_fence); + dev->in_flight_job = NULL; + } + } +} + +static irqreturn_t ethosu_job_irq_handler_thread(int irq, void *data) +{ + struct ethosu_device *dev = data; + + ethosu_job_handle_irq(dev); + + return IRQ_HANDLED; +} + +static irqreturn_t ethosu_job_irq_handler(int irq, void *data) +{ + struct ethosu_device *dev = data; + u32 status = readl_relaxed(dev->regs + NPU_REG_STATUS); + + if (!(status & STATUS_IRQ_RAISED)) + 
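+		/* Not our interrupt; the IRQ line is shared (IRQF_SHARED). */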
return IRQ_NONE; + + writel_relaxed(CMD_CLEAR_IRQ, dev->regs + NPU_REG_CMD); + return IRQ_WAKE_THREAD; +} + +static enum drm_gpu_sched_stat ethosu_job_timedout(struct drm_sched_job *bad) +{ + struct ethosu_job *job = to_ethosu_job(bad); + struct ethosu_device *dev = job->dev; + bool running; + u32 *bocmds = to_drm_gem_dma_obj(job->cmd_bo)->vaddr; + u32 cmdaddr; + + cmdaddr = readl_relaxed(dev->regs + NPU_REG_QREAD); + running = FIELD_GET(STATUS_STATE_RUNNING, readl_relaxed(dev->regs + NPU_REG_STATUS)); + + if (running) { + int ret; + u32 reg; + + ret = readl_relaxed_poll_timeout(dev->regs + NPU_REG_QREAD, + reg, + reg != cmdaddr, + USEC_PER_MSEC, 100 * USEC_PER_MSEC); + + /* If still running and progress is being made, just return */ + if (!ret) + return DRM_GPU_SCHED_STAT_NO_HANG; + } + + dev_err(dev->base.dev, "NPU sched timed out: NPU %s, cmdstream offset 0x%x: 0x%x\n", + running ? "running" : "stopped", + cmdaddr, bocmds[cmdaddr / 4]); + + drm_sched_stop(&dev->sched, bad); + + scoped_guard(mutex, &dev->job_lock) + dev->in_flight_job = NULL; + + /* Proceed with reset now. */ + pm_runtime_force_suspend(dev->base.dev); + pm_runtime_force_resume(dev->base.dev); + + /* Restart the scheduler */ + drm_sched_start(&dev->sched, 0); + + return DRM_GPU_SCHED_STAT_RESET; +} + +static const struct drm_sched_backend_ops ethosu_sched_ops = { + .run_job = ethosu_job_run, + .timedout_job = ethosu_job_timedout, + .free_job = ethosu_job_free +}; + +int ethosu_job_init(struct ethosu_device *edev) +{ + struct device *dev = edev->base.dev; + struct drm_sched_init_args args = { + .ops = ðosu_sched_ops, + .num_rqs = DRM_SCHED_PRIORITY_COUNT, + .credit_limit = 1, + .timeout = msecs_to_jiffies(JOB_TIMEOUT_MS), + .name = dev_name(dev), + .dev = dev, + }; + int ret; + + spin_lock_init(&edev->fence_lock); + ret = devm_mutex_init(dev, &edev->job_lock); + if (ret) + return ret; + ret = devm_mutex_init(dev, &edev->sched_lock); + if (ret) + return ret; + + edev->irq = platform_get_irq(to_platform_device(dev), 0); + if (edev->irq < 0) + return edev->irq; + + ret = devm_request_threaded_irq(dev, edev->irq, + ethosu_job_irq_handler, + ethosu_job_irq_handler_thread, + IRQF_SHARED, KBUILD_MODNAME, + edev); + if (ret) { + dev_err(dev, "failed to request irq\n"); + return ret; + } + + edev->fence_context = dma_fence_context_alloc(1); + + ret = drm_sched_init(&edev->sched, &args); + if (ret) { + dev_err(dev, "Failed to create scheduler: %d\n", ret); + goto err_sched; + } + + return 0; + +err_sched: + drm_sched_fini(&edev->sched); + return ret; +} + +void ethosu_job_fini(struct ethosu_device *dev) +{ + drm_sched_fini(&dev->sched); +} + +int ethosu_job_open(struct ethosu_file_priv *ethosu_priv) +{ + struct ethosu_device *dev = ethosu_priv->edev; + struct drm_gpu_scheduler *sched = &dev->sched; + int ret; + + ret = drm_sched_entity_init(ðosu_priv->sched_entity, + DRM_SCHED_PRIORITY_NORMAL, + &sched, 1, NULL); + return WARN_ON(ret); +} + +void ethosu_job_close(struct ethosu_file_priv *ethosu_priv) +{ + struct drm_sched_entity *entity = ðosu_priv->sched_entity; + + drm_sched_entity_destroy(entity); +} + +static int ethosu_ioctl_submit_job(struct drm_device *dev, struct drm_file *file, + struct drm_ethosu_job *job) +{ + struct ethosu_device *edev = to_ethosu_device(dev); + struct ethosu_file_priv *file_priv = file->driver_priv; + struct ethosu_job *ejob = NULL; + struct ethosu_validated_cmdstream_info *cmd_info; + int ret = 0; + + /* BO region 2 is reserved if SRAM is used */ + if (job->region_bo_handles[ETHOSU_SRAM_REGION] && 
+	    job->sram_size)
+		return -EINVAL;
+
+	if (edev->npu_info.sram_size < job->sram_size)
+		return -EINVAL;
+
+	ejob = kzalloc(sizeof(*ejob), GFP_KERNEL);
+	if (!ejob)
+		return -ENOMEM;
+
+	kref_init(&ejob->refcount);
+
+	ejob->dev = edev;
+	ejob->sram_size = job->sram_size;
+
+	ejob->done_fence = kzalloc(sizeof(*ejob->done_fence), GFP_KERNEL);
+	if (!ejob->done_fence) {
+		ret = -ENOMEM;
+		goto out_put_job;
+	}
+
+	ret = drm_sched_job_init(&ejob->base,
+				 &file_priv->sched_entity,
+				 1, NULL, file->client_id);
+	if (ret)
+		goto out_put_job;
+
+	ejob->cmd_bo = drm_gem_object_lookup(file, job->cmd_bo);
+	if (!ejob->cmd_bo) {
+		ret = -ENOENT;
+		goto out_cleanup_job;
+	}
+	cmd_info = to_ethosu_bo(ejob->cmd_bo)->info;
+	if (!cmd_info) {
+		ret = -EINVAL;
+		goto out_cleanup_job;
+	}
+
+	for (int i = 0; i < NPU_BASEP_REGION_MAX; i++) {
+		struct drm_gem_object *gem;
+
+		/* Can only omit a BO handle if the region is not used or used for SRAM */
+		if (!job->region_bo_handles[i] &&
+		    (!cmd_info->region_size[i] || (i == ETHOSU_SRAM_REGION && job->sram_size)))
+			continue;
+
+		if (job->region_bo_handles[i] && !cmd_info->region_size[i]) {
+			dev_err(dev->dev,
+				"Cmdstream BO handle %d set for unused region %d\n",
+				job->region_bo_handles[i], i);
+			ret = -EINVAL;
+			goto out_cleanup_job;
+		}
+
+		gem = drm_gem_object_lookup(file, job->region_bo_handles[i]);
+		if (!gem) {
+			dev_err(dev->dev,
+				"Invalid BO handle %d for region %d\n",
+				job->region_bo_handles[i], i);
+			ret = -ENOENT;
+			goto out_cleanup_job;
+		}
+
+		ejob->region_bo[ejob->region_cnt] = gem;
+		ejob->region_bo_num[ejob->region_cnt] = i;
+		ejob->region_cnt++;
+
+		if (to_ethosu_bo(gem)->info) {
+			dev_err(dev->dev,
+				"Cmdstream BO handle %d cannot be used for region %d\n",
+				job->region_bo_handles[i], i);
+			ret = -EINVAL;
+			goto out_cleanup_job;
+		}
+
+		/* Verify the command stream doesn't have accesses outside the BO */
+		if (cmd_info->region_size[i] > gem->size) {
+			dev_err(dev->dev,
+				"cmd stream region %d size greater than BO size (%llu > %zu)\n",
+				i, cmd_info->region_size[i], gem->size);
+			ret = -EOVERFLOW;
+			goto out_cleanup_job;
+		}
+	}
+	ret = ethosu_job_push(ejob);
+
+out_cleanup_job:
+	if (ret)
+		drm_sched_job_cleanup(&ejob->base);
+out_put_job:
+	ethosu_job_put(ejob);
+
+	return ret;
+}
+
+int ethosu_ioctl_submit(struct drm_device *dev, void *data, struct drm_file *file)
+{
+	struct drm_ethosu_submit *args = data;
+	int ret = 0;
+	unsigned int i = 0;
+
+	if (args->pad) {
+		drm_dbg(dev, "Reserved field in drm_ethosu_submit struct should be 0.\n");
+		return -EINVAL;
+	}
+
+	struct drm_ethosu_job __free(kvfree) *jobs =
+		kvmalloc_array(args->job_count, sizeof(*jobs), GFP_KERNEL);
+	if (!jobs)
+		return -ENOMEM;
+
+	if (copy_from_user(jobs,
+			   (void __user *)(uintptr_t)args->jobs,
+			   args->job_count * sizeof(*jobs))) {
+		drm_dbg(dev, "Failed to copy incoming job array\n");
+		return -EFAULT;
+	}
+
+	for (i = 0; i < args->job_count; i++) {
+		ret = ethosu_ioctl_submit_job(dev, file, &jobs[i]);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
diff --git a/drivers/accel/ethosu/ethosu_job.h b/drivers/accel/ethosu/ethosu_job.h
new file mode 100644
index 000000000000..ff1cf448d094
--- /dev/null
+++ b/drivers/accel/ethosu/ethosu_job.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0-only OR MIT */
+/* Copyright 2024-2025 Tomeu Vizoso <tomeu@tomeuvizoso.net> */
+/* Copyright 2025 Arm, Ltd.
*/ + +#ifndef __ETHOSU_JOB_H__ +#define __ETHOSU_JOB_H__ + +#include <linux/kref.h> +#include <drm/gpu_scheduler.h> + +struct ethosu_device; +struct ethosu_file_priv; + +struct ethosu_job { + struct drm_sched_job base; + struct ethosu_device *dev; + + struct drm_gem_object *cmd_bo; + struct drm_gem_object *region_bo[NPU_BASEP_REGION_MAX]; + u8 region_bo_num[NPU_BASEP_REGION_MAX]; + u8 region_cnt; + u32 sram_size; + + /* Fence to be signaled by drm-sched once its done with the job */ + struct dma_fence *inference_done_fence; + + /* Fence to be signaled by IRQ handler when the job is complete. */ + struct dma_fence *done_fence; + + struct kref refcount; +}; + +int ethosu_ioctl_submit(struct drm_device *dev, void *data, struct drm_file *file); + +int ethosu_job_init(struct ethosu_device *dev); +void ethosu_job_fini(struct ethosu_device *dev); +int ethosu_job_open(struct ethosu_file_priv *ethosu_priv); +void ethosu_job_close(struct ethosu_file_priv *ethosu_priv); + +#endif diff --git a/drivers/accel/ivpu/Makefile b/drivers/accel/ivpu/Makefile index 1029e0bab061..dbf76b8a5b4c 100644 --- a/drivers/accel/ivpu/Makefile +++ b/drivers/accel/ivpu/Makefile @@ -6,6 +6,7 @@ intel_vpu-y := \ ivpu_fw.o \ ivpu_fw_log.o \ ivpu_gem.o \ + ivpu_gem_userptr.o \ ivpu_hw.o \ ivpu_hw_btrs.o \ ivpu_hw_ip.o \ diff --git a/drivers/accel/ivpu/ivpu_debugfs.c b/drivers/accel/ivpu/ivpu_debugfs.c index cd24ccd20ba6..3bd85ee6c26b 100644 --- a/drivers/accel/ivpu/ivpu_debugfs.c +++ b/drivers/accel/ivpu/ivpu_debugfs.c @@ -398,35 +398,25 @@ static int dct_active_set(void *data, u64 active_percent) DEFINE_DEBUGFS_ATTRIBUTE(ivpu_dct_fops, dct_active_get, dct_active_set, "%llu\n"); +static void print_priority_band(struct seq_file *s, struct ivpu_hw_info *hw, + int band, const char *name) +{ + seq_printf(s, "%-9s: grace_period %9u process_grace_period %9u process_quantum %9u\n", + name, + hw->hws.grace_period[band], + hw->hws.process_grace_period[band], + hw->hws.process_quantum[band]); +} + static int priority_bands_show(struct seq_file *s, void *v) { struct ivpu_device *vdev = s->private; struct ivpu_hw_info *hw = vdev->hw; - for (int band = VPU_JOB_SCHEDULING_PRIORITY_BAND_IDLE; - band < VPU_JOB_SCHEDULING_PRIORITY_BAND_COUNT; band++) { - switch (band) { - case VPU_JOB_SCHEDULING_PRIORITY_BAND_IDLE: - seq_puts(s, "Idle: "); - break; - - case VPU_JOB_SCHEDULING_PRIORITY_BAND_NORMAL: - seq_puts(s, "Normal: "); - break; - - case VPU_JOB_SCHEDULING_PRIORITY_BAND_FOCUS: - seq_puts(s, "Focus: "); - break; - - case VPU_JOB_SCHEDULING_PRIORITY_BAND_REALTIME: - seq_puts(s, "Realtime: "); - break; - } - - seq_printf(s, "grace_period %9u process_grace_period %9u process_quantum %9u\n", - hw->hws.grace_period[band], hw->hws.process_grace_period[band], - hw->hws.process_quantum[band]); - } + print_priority_band(s, hw, VPU_JOB_SCHEDULING_PRIORITY_BAND_IDLE, "Idle"); + print_priority_band(s, hw, VPU_JOB_SCHEDULING_PRIORITY_BAND_NORMAL, "Normal"); + print_priority_band(s, hw, VPU_JOB_SCHEDULING_PRIORITY_BAND_FOCUS, "Focus"); + print_priority_band(s, hw, VPU_JOB_SCHEDULING_PRIORITY_BAND_REALTIME, "Realtime"); return 0; } diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index 3289751b4757..3d6fccdefdd6 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -57,7 +57,7 @@ MODULE_PARM_DESC(pll_max_ratio, "Maximum PLL ratio used to set NPU frequency"); int ivpu_sched_mode = IVPU_SCHED_MODE_AUTO; module_param_named(sched_mode, ivpu_sched_mode, int, 0444); -MODULE_PARM_DESC(sched_mode, "Scheduler 
mode: -1 - Use default scheduler, 0 - Use OS scheduler, 1 - Use HW scheduler"); +MODULE_PARM_DESC(sched_mode, "Scheduler mode: -1 - Use default scheduler, 0 - Use OS scheduler (supported on 27XX - 50XX), 1 - Use HW scheduler"); bool ivpu_disable_mmu_cont_pages; module_param_named(disable_mmu_cont_pages, ivpu_disable_mmu_cont_pages, bool, 0444); @@ -134,6 +134,8 @@ bool ivpu_is_capable(struct ivpu_device *vdev, u32 capability) return true; case DRM_IVPU_CAP_DMA_MEMORY_RANGE: return true; + case DRM_IVPU_CAP_BO_CREATE_FROM_USERPTR: + return true; case DRM_IVPU_CAP_MANAGE_CMDQ: return vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW; default: @@ -200,6 +202,9 @@ static int ivpu_get_param_ioctl(struct drm_device *dev, void *data, struct drm_f case DRM_IVPU_PARAM_CAPABILITIES: args->value = ivpu_is_capable(vdev, args->index); break; + case DRM_IVPU_PARAM_PREEMPT_BUFFER_SIZE: + args->value = ivpu_fw_preempt_buf_size(vdev); + break; default: ret = -EINVAL; break; @@ -310,6 +315,7 @@ static const struct drm_ioctl_desc ivpu_drm_ioctls[] = { DRM_IOCTL_DEF_DRV(IVPU_CMDQ_CREATE, ivpu_cmdq_create_ioctl, 0), DRM_IOCTL_DEF_DRV(IVPU_CMDQ_DESTROY, ivpu_cmdq_destroy_ioctl, 0), DRM_IOCTL_DEF_DRV(IVPU_CMDQ_SUBMIT, ivpu_cmdq_submit_ioctl, 0), + DRM_IOCTL_DEF_DRV(IVPU_BO_CREATE_FROM_USERPTR, ivpu_bo_create_from_userptr_ioctl, 0), }; static int ivpu_wait_for_ready(struct ivpu_device *vdev) @@ -377,8 +383,7 @@ int ivpu_boot(struct ivpu_device *vdev) drm_WARN_ON(&vdev->drm, atomic_read(&vdev->job_timeout_counter)); drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa)); - /* Update boot params located at first 4KB of FW memory */ - ivpu_fw_boot_params_setup(vdev, ivpu_bo_vaddr(vdev->fw->mem)); + ivpu_fw_boot_params_setup(vdev, ivpu_bo_vaddr(vdev->fw->mem_bp)); ret = ivpu_hw_boot_fw(vdev); if (ret) { @@ -450,6 +455,9 @@ int ivpu_shutdown(struct ivpu_device *vdev) static const struct file_operations ivpu_fops = { .owner = THIS_MODULE, DRM_ACCEL_FOPS, +#ifdef CONFIG_PROC_FS + .show_fdinfo = drm_show_fdinfo, +#endif }; static const struct drm_driver driver = { @@ -464,6 +472,9 @@ static const struct drm_driver driver = { .ioctls = ivpu_drm_ioctls, .num_ioctls = ARRAY_SIZE(ivpu_drm_ioctls), .fops = &ivpu_fops, +#ifdef CONFIG_PROC_FS + .show_fdinfo = drm_show_memory_stats, +#endif .name = DRIVER_NAME, .desc = DRIVER_DESC, @@ -705,6 +716,7 @@ static struct pci_device_id ivpu_pci_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_LNL) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PTL_P) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_WCL) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_NVL) }, { } }; MODULE_DEVICE_TABLE(pci, ivpu_pci_ids); diff --git a/drivers/accel/ivpu/ivpu_drv.h b/drivers/accel/ivpu/ivpu_drv.h index 62ab1c654e63..5b34b6f50e69 100644 --- a/drivers/accel/ivpu/ivpu_drv.h +++ b/drivers/accel/ivpu/ivpu_drv.h @@ -27,6 +27,7 @@ #define PCI_DEVICE_ID_LNL 0x643e #define PCI_DEVICE_ID_PTL_P 0xb03e #define PCI_DEVICE_ID_WCL 0xfd3e +#define PCI_DEVICE_ID_NVL 0xd71d #define IVPU_HW_IP_37XX 37 #define IVPU_HW_IP_40XX 40 @@ -78,6 +79,7 @@ #define IVPU_DBG_KREF BIT(11) #define IVPU_DBG_RPM BIT(12) #define IVPU_DBG_MMU_MAP BIT(13) +#define IVPU_DBG_IOCTL BIT(14) #define ivpu_err(vdev, fmt, ...) 
\ drm_err(&(vdev)->drm, "%s(): " fmt, __func__, ##__VA_ARGS__) @@ -245,6 +247,8 @@ static inline int ivpu_hw_ip_gen(struct ivpu_device *vdev) case PCI_DEVICE_ID_PTL_P: case PCI_DEVICE_ID_WCL: return IVPU_HW_IP_50XX; + case PCI_DEVICE_ID_NVL: + return IVPU_HW_IP_60XX; default: dump_stack(); ivpu_err(vdev, "Unknown NPU IP generation\n"); @@ -261,6 +265,7 @@ static inline int ivpu_hw_btrs_gen(struct ivpu_device *vdev) case PCI_DEVICE_ID_LNL: case PCI_DEVICE_ID_PTL_P: case PCI_DEVICE_ID_WCL: + case PCI_DEVICE_ID_NVL: return IVPU_HW_BTRS_LNL; default: dump_stack(); diff --git a/drivers/accel/ivpu/ivpu_fw.c b/drivers/accel/ivpu/ivpu_fw.c index 9db741695401..48386d2cddbb 100644 --- a/drivers/accel/ivpu/ivpu_fw.c +++ b/drivers/accel/ivpu/ivpu_fw.c @@ -17,15 +17,10 @@ #include "ivpu_ipc.h" #include "ivpu_pm.h" -#define FW_GLOBAL_MEM_START (2ull * SZ_1G) -#define FW_GLOBAL_MEM_END (3ull * SZ_1G) -#define FW_SHARED_MEM_SIZE SZ_256M /* Must be aligned to FW_SHARED_MEM_ALIGNMENT */ -#define FW_SHARED_MEM_ALIGNMENT SZ_128K /* VPU MTRR limitation */ -#define FW_RUNTIME_MAX_SIZE SZ_512M #define FW_SHAVE_NN_MAX_SIZE SZ_2M -#define FW_RUNTIME_MIN_ADDR (FW_GLOBAL_MEM_START) -#define FW_RUNTIME_MAX_ADDR (FW_GLOBAL_MEM_END - FW_SHARED_MEM_SIZE) #define FW_FILE_IMAGE_OFFSET (VPU_FW_HEADER_SIZE + FW_VERSION_HEADER_SIZE) +#define FW_PREEMPT_BUF_MIN_SIZE SZ_4K +#define FW_PREEMPT_BUF_MAX_SIZE SZ_32M #define WATCHDOG_MSS_REDIRECT 32 #define WATCHDOG_NCE_REDIRECT 33 @@ -61,12 +56,14 @@ static struct { { IVPU_HW_IP_40XX, "intel/vpu/vpu_40xx_v0.0.bin" }, { IVPU_HW_IP_50XX, "intel/vpu/vpu_50xx_v1.bin" }, { IVPU_HW_IP_50XX, "intel/vpu/vpu_50xx_v0.0.bin" }, + { IVPU_HW_IP_60XX, "intel/vpu/vpu_60xx_v1.bin" }, }; /* Production fw_names from the table above */ MODULE_FIRMWARE("intel/vpu/vpu_37xx_v1.bin"); MODULE_FIRMWARE("intel/vpu/vpu_40xx_v1.bin"); MODULE_FIRMWARE("intel/vpu/vpu_50xx_v1.bin"); +MODULE_FIRMWARE("intel/vpu/vpu_60xx_v1.bin"); static int ivpu_fw_request(struct ivpu_device *vdev) { @@ -131,9 +128,14 @@ ivpu_fw_check_api_ver_lt(struct ivpu_device *vdev, const struct vpu_firmware_hea return false; } -static bool is_within_range(u64 addr, size_t size, u64 range_start, size_t range_size) +bool ivpu_is_within_range(u64 addr, size_t size, struct ivpu_addr_range *range) { - if (addr < range_start || addr + size > range_start + range_size) + u64 addr_end; + + if (!range || check_add_overflow(addr, size, &addr_end)) + return false; + + if (addr < range->start || addr_end > range->end) return false; return true; @@ -142,6 +144,12 @@ static bool is_within_range(u64 addr, size_t size, u64 range_start, size_t range static u32 ivpu_fw_sched_mode_select(struct ivpu_device *vdev, const struct vpu_firmware_header *fw_hdr) { + if (ivpu_hw_ip_gen(vdev) >= IVPU_HW_IP_60XX && + ivpu_sched_mode == VPU_SCHEDULING_MODE_OS) { + ivpu_warn(vdev, "OS sched mode is not supported, using HW mode\n"); + return VPU_SCHEDULING_MODE_HW; + } + if (ivpu_sched_mode != IVPU_SCHED_MODE_AUTO) return ivpu_sched_mode; @@ -151,11 +159,56 @@ ivpu_fw_sched_mode_select(struct ivpu_device *vdev, const struct vpu_firmware_he return VPU_SCHEDULING_MODE_HW; } +static void +ivpu_preemption_config_parse(struct ivpu_device *vdev, const struct vpu_firmware_header *fw_hdr) +{ + struct ivpu_fw_info *fw = vdev->fw; + u32 primary_preempt_buf_size, secondary_preempt_buf_size; + + if (fw_hdr->preemption_buffer_1_max_size) + primary_preempt_buf_size = fw_hdr->preemption_buffer_1_max_size; + else + primary_preempt_buf_size = fw_hdr->preemption_buffer_1_size; + + if 
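+	/* Use the max size when the FW header provides one, else the fixed size. */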
(fw_hdr->preemption_buffer_2_max_size) + secondary_preempt_buf_size = fw_hdr->preemption_buffer_2_max_size; + else + secondary_preempt_buf_size = fw_hdr->preemption_buffer_2_size; + + ivpu_dbg(vdev, FW_BOOT, "Preemption buffer size, primary: %u, secondary: %u\n", + primary_preempt_buf_size, secondary_preempt_buf_size); + + if (primary_preempt_buf_size < FW_PREEMPT_BUF_MIN_SIZE || + secondary_preempt_buf_size < FW_PREEMPT_BUF_MIN_SIZE) { + ivpu_warn(vdev, "Preemption buffers size too small\n"); + return; + } + + if (primary_preempt_buf_size > FW_PREEMPT_BUF_MAX_SIZE || + secondary_preempt_buf_size > FW_PREEMPT_BUF_MAX_SIZE) { + ivpu_warn(vdev, "Preemption buffers size too big\n"); + return; + } + + if (fw->sched_mode != VPU_SCHEDULING_MODE_HW) + return; + + if (ivpu_test_mode & IVPU_TEST_MODE_MIP_DISABLE) + return; + + vdev->fw->primary_preempt_buf_size = ALIGN(primary_preempt_buf_size, PAGE_SIZE); + vdev->fw->secondary_preempt_buf_size = ALIGN(secondary_preempt_buf_size, PAGE_SIZE); +} + static int ivpu_fw_parse(struct ivpu_device *vdev) { struct ivpu_fw_info *fw = vdev->fw; const struct vpu_firmware_header *fw_hdr = (const void *)fw->file->data; - u64 runtime_addr, image_load_addr, runtime_size, image_size; + struct ivpu_addr_range fw_image_range; + u64 boot_params_addr, boot_params_size; + u64 fw_version_addr, fw_version_size; + u64 runtime_addr, runtime_size; + u64 image_load_addr, image_size; if (fw->file->size <= FW_FILE_IMAGE_OFFSET) { ivpu_err(vdev, "Firmware file is too small: %zu\n", fw->file->size); @@ -167,18 +220,37 @@ static int ivpu_fw_parse(struct ivpu_device *vdev) return -EINVAL; } - runtime_addr = fw_hdr->boot_params_load_address; - runtime_size = fw_hdr->runtime_size; - image_load_addr = fw_hdr->image_load_address; - image_size = fw_hdr->image_size; + boot_params_addr = fw_hdr->boot_params_load_address; + boot_params_size = SZ_4K; - if (runtime_addr < FW_RUNTIME_MIN_ADDR || runtime_addr > FW_RUNTIME_MAX_ADDR) { - ivpu_err(vdev, "Invalid firmware runtime address: 0x%llx\n", runtime_addr); + if (!ivpu_is_within_range(boot_params_addr, boot_params_size, &vdev->hw->ranges.runtime)) { + ivpu_err(vdev, "Invalid boot params address: 0x%llx\n", boot_params_addr); return -EINVAL; } - if (runtime_size < fw->file->size || runtime_size > FW_RUNTIME_MAX_SIZE) { - ivpu_err(vdev, "Invalid firmware runtime size: %llu\n", runtime_size); + fw_version_addr = fw_hdr->firmware_version_load_address; + fw_version_size = ALIGN(fw_hdr->firmware_version_size, SZ_4K); + + if (fw_version_size != SZ_4K) { + ivpu_err(vdev, "Invalid firmware version size: %u\n", + fw_hdr->firmware_version_size); + return -EINVAL; + } + + if (!ivpu_is_within_range(fw_version_addr, fw_version_size, &vdev->hw->ranges.runtime)) { + ivpu_err(vdev, "Invalid firmware version address: 0x%llx\n", fw_version_addr); + return -EINVAL; + } + + runtime_addr = fw_hdr->image_load_address; + runtime_size = fw_hdr->runtime_size - boot_params_size - fw_version_size; + + image_load_addr = fw_hdr->image_load_address; + image_size = fw_hdr->image_size; + + if (!ivpu_is_within_range(runtime_addr, runtime_size, &vdev->hw->ranges.runtime)) { + ivpu_err(vdev, "Invalid firmware runtime address: 0x%llx and size %llu\n", + runtime_addr, runtime_size); return -EINVAL; } @@ -187,23 +259,25 @@ static int ivpu_fw_parse(struct ivpu_device *vdev) return -EINVAL; } - if (image_load_addr < runtime_addr || - image_load_addr + image_size > runtime_addr + runtime_size) { - ivpu_err(vdev, "Invalid firmware load address size: 0x%llx and size %llu\n", + if 
(!ivpu_is_within_range(image_load_addr, image_size, &vdev->hw->ranges.runtime)) { + ivpu_err(vdev, "Invalid firmware load address: 0x%llx and size %llu\n", image_load_addr, image_size); return -EINVAL; } - if (fw_hdr->shave_nn_fw_size > FW_SHAVE_NN_MAX_SIZE) { - ivpu_err(vdev, "SHAVE NN firmware is too big: %u\n", fw_hdr->shave_nn_fw_size); + if (ivpu_hw_range_init(vdev, &fw_image_range, image_load_addr, image_size)) return -EINVAL; - } - if (fw_hdr->entry_point < image_load_addr || - fw_hdr->entry_point >= image_load_addr + image_size) { + if (!ivpu_is_within_range(fw_hdr->entry_point, SZ_4K, &fw_image_range)) { ivpu_err(vdev, "Invalid entry point: 0x%llx\n", fw_hdr->entry_point); return -EINVAL; } + + if (fw_hdr->shave_nn_fw_size > FW_SHAVE_NN_MAX_SIZE) { + ivpu_err(vdev, "SHAVE NN firmware is too big: %u\n", fw_hdr->shave_nn_fw_size); + return -EINVAL; + } + ivpu_dbg(vdev, FW_BOOT, "Header version: 0x%x, format 0x%x\n", fw_hdr->header_version, fw_hdr->image_format); @@ -217,6 +291,10 @@ static int ivpu_fw_parse(struct ivpu_device *vdev) if (IVPU_FW_CHECK_API_COMPAT(vdev, fw_hdr, JSM, 3)) return -EINVAL; + fw->boot_params_addr = boot_params_addr; + fw->boot_params_size = boot_params_size; + fw->fw_version_addr = fw_version_addr; + fw->fw_version_size = fw_version_size; fw->runtime_addr = runtime_addr; fw->runtime_size = runtime_size; fw->image_load_offset = image_load_addr - runtime_addr; @@ -235,22 +313,13 @@ static int ivpu_fw_parse(struct ivpu_device *vdev) fw->sched_mode = ivpu_fw_sched_mode_select(vdev, fw_hdr); ivpu_info(vdev, "Scheduler mode: %s\n", fw->sched_mode ? "HW" : "OS"); - if (fw_hdr->preemption_buffer_1_max_size) - fw->primary_preempt_buf_size = fw_hdr->preemption_buffer_1_max_size; - else - fw->primary_preempt_buf_size = fw_hdr->preemption_buffer_1_size; + ivpu_preemption_config_parse(vdev, fw_hdr); + ivpu_dbg(vdev, FW_BOOT, "Mid-inference preemption %s supported\n", + ivpu_fw_preempt_buf_size(vdev) ? 
"is" : "is not"); - if (fw_hdr->preemption_buffer_2_max_size) - fw->secondary_preempt_buf_size = fw_hdr->preemption_buffer_2_max_size; - else - fw->secondary_preempt_buf_size = fw_hdr->preemption_buffer_2_size; - ivpu_dbg(vdev, FW_BOOT, "Preemption buffer sizes: primary %u, secondary %u\n", - fw->primary_preempt_buf_size, fw->secondary_preempt_buf_size); - - if (fw_hdr->ro_section_start_address && !is_within_range(fw_hdr->ro_section_start_address, - fw_hdr->ro_section_size, - fw_hdr->image_load_address, - fw_hdr->image_size)) { + if (fw_hdr->ro_section_start_address && + !ivpu_is_within_range(fw_hdr->ro_section_start_address, fw_hdr->ro_section_size, + &fw_image_range)) { ivpu_err(vdev, "Invalid read-only section: start address 0x%llx, size %u\n", fw_hdr->ro_section_start_address, fw_hdr->ro_section_size); return -EINVAL; @@ -259,12 +328,18 @@ static int ivpu_fw_parse(struct ivpu_device *vdev) fw->read_only_addr = fw_hdr->ro_section_start_address; fw->read_only_size = fw_hdr->ro_section_size; - ivpu_dbg(vdev, FW_BOOT, "Size: file %lu image %u runtime %u shavenn %u\n", - fw->file->size, fw->image_size, fw->runtime_size, fw->shave_nn_size); - ivpu_dbg(vdev, FW_BOOT, "Address: runtime 0x%llx, load 0x%llx, entry point 0x%llx\n", - fw->runtime_addr, image_load_addr, fw->entry_point); + ivpu_dbg(vdev, FW_BOOT, "Boot params: address 0x%llx, size %llu\n", + fw->boot_params_addr, fw->boot_params_size); + ivpu_dbg(vdev, FW_BOOT, "FW version: address 0x%llx, size %llu\n", + fw->fw_version_addr, fw->fw_version_size); + ivpu_dbg(vdev, FW_BOOT, "Runtime: address 0x%llx, size %u\n", + fw->runtime_addr, fw->runtime_size); + ivpu_dbg(vdev, FW_BOOT, "Image load offset: 0x%llx, size %u\n", + fw->image_load_offset, fw->image_size); ivpu_dbg(vdev, FW_BOOT, "Read-only section: address 0x%llx, size %u\n", fw->read_only_addr, fw->read_only_size); + ivpu_dbg(vdev, FW_BOOT, "FW entry point: 0x%llx\n", fw->entry_point); + ivpu_dbg(vdev, FW_BOOT, "SHAVE NN size: %u\n", fw->shave_nn_size); return 0; } @@ -291,39 +366,33 @@ ivpu_fw_init_wa(struct ivpu_device *vdev) IVPU_PRINT_WA(disable_d0i3_msg); } -static int ivpu_fw_update_global_range(struct ivpu_device *vdev) -{ - struct ivpu_fw_info *fw = vdev->fw; - u64 start = ALIGN(fw->runtime_addr + fw->runtime_size, FW_SHARED_MEM_ALIGNMENT); - u64 size = FW_SHARED_MEM_SIZE; - - if (start + size > FW_GLOBAL_MEM_END) { - ivpu_err(vdev, "No space for shared region, start %lld, size %lld\n", start, size); - return -EINVAL; - } - - ivpu_hw_range_init(&vdev->hw->ranges.global, start, size); - return 0; -} - static int ivpu_fw_mem_init(struct ivpu_device *vdev) { struct ivpu_fw_info *fw = vdev->fw; - struct ivpu_addr_range fw_range; int log_verb_size; int ret; - ret = ivpu_fw_update_global_range(vdev); - if (ret) - return ret; + fw->mem_bp = ivpu_bo_create_runtime(vdev, fw->boot_params_addr, fw->boot_params_size, + DRM_IVPU_BO_WC | DRM_IVPU_BO_MAPPABLE); + if (!fw->mem_bp) { + ivpu_err(vdev, "Failed to create firmware boot params memory buffer\n"); + return -ENOMEM; + } - fw_range.start = fw->runtime_addr; - fw_range.end = fw->runtime_addr + fw->runtime_size; - fw->mem = ivpu_bo_create(vdev, &vdev->gctx, &fw_range, fw->runtime_size, - DRM_IVPU_BO_WC | DRM_IVPU_BO_MAPPABLE); + fw->mem_fw_ver = ivpu_bo_create_runtime(vdev, fw->fw_version_addr, fw->fw_version_size, + DRM_IVPU_BO_WC | DRM_IVPU_BO_MAPPABLE); + if (!fw->mem_fw_ver) { + ivpu_err(vdev, "Failed to create firmware version memory buffer\n"); + ret = -ENOMEM; + goto err_free_bp; + } + + fw->mem = ivpu_bo_create_runtime(vdev, 
fw->runtime_addr, fw->runtime_size, + DRM_IVPU_BO_WC | DRM_IVPU_BO_MAPPABLE); if (!fw->mem) { ivpu_err(vdev, "Failed to create firmware runtime memory buffer\n"); - return -ENOMEM; + ret = -ENOMEM; + goto err_free_fw_ver; } ret = ivpu_mmu_context_set_pages_ro(vdev, &vdev->gctx, fw->read_only_addr, @@ -372,6 +441,10 @@ err_free_log_crit: ivpu_bo_free(fw->mem_log_crit); err_free_fw_mem: ivpu_bo_free(fw->mem); +err_free_fw_ver: + ivpu_bo_free(fw->mem_fw_ver); +err_free_bp: + ivpu_bo_free(fw->mem_bp); return ret; } @@ -387,10 +460,14 @@ static void ivpu_fw_mem_fini(struct ivpu_device *vdev) ivpu_bo_free(fw->mem_log_verb); ivpu_bo_free(fw->mem_log_crit); ivpu_bo_free(fw->mem); + ivpu_bo_free(fw->mem_fw_ver); + ivpu_bo_free(fw->mem_bp); fw->mem_log_verb = NULL; fw->mem_log_crit = NULL; fw->mem = NULL; + fw->mem_fw_ver = NULL; + fw->mem_bp = NULL; } int ivpu_fw_init(struct ivpu_device *vdev) @@ -483,11 +560,6 @@ static void ivpu_fw_boot_params_print(struct ivpu_device *vdev, struct vpu_boot_ ivpu_dbg(vdev, FW_BOOT, "boot_params.cache_defaults[VPU_BOOT_L2_CACHE_CFG_NN].cfg = 0x%x\n", boot_params->cache_defaults[VPU_BOOT_L2_CACHE_CFG_NN].cfg); - ivpu_dbg(vdev, FW_BOOT, "boot_params.global_memory_allocator_base = 0x%llx\n", - boot_params->global_memory_allocator_base); - ivpu_dbg(vdev, FW_BOOT, "boot_params.global_memory_allocator_size = 0x%x\n", - boot_params->global_memory_allocator_size); - ivpu_dbg(vdev, FW_BOOT, "boot_params.shave_nn_fw_base = 0x%llx\n", boot_params->shave_nn_fw_base); @@ -495,10 +567,6 @@ static void ivpu_fw_boot_params_print(struct ivpu_device *vdev, struct vpu_boot_ boot_params->watchdog_irq_mss); ivpu_dbg(vdev, FW_BOOT, "boot_params.watchdog_irq_nce = 0x%x\n", boot_params->watchdog_irq_nce); - ivpu_dbg(vdev, FW_BOOT, "boot_params.host_to_vpu_irq = 0x%x\n", - boot_params->host_to_vpu_irq); - ivpu_dbg(vdev, FW_BOOT, "boot_params.job_done_irq = 0x%x\n", - boot_params->job_done_irq); ivpu_dbg(vdev, FW_BOOT, "boot_params.host_version_id = 0x%x\n", boot_params->host_version_id); @@ -546,6 +614,8 @@ static void ivpu_fw_boot_params_print(struct ivpu_device *vdev, struct vpu_boot_ boot_params->system_time_us); ivpu_dbg(vdev, FW_BOOT, "boot_params.power_profile = 0x%x\n", boot_params->power_profile); + ivpu_dbg(vdev, FW_BOOT, "boot_params.vpu_uses_ecc_mca_signal = 0x%x\n", + boot_params->vpu_uses_ecc_mca_signal); } void ivpu_fw_boot_params_setup(struct ivpu_device *vdev, struct vpu_boot_params *boot_params) @@ -572,6 +642,7 @@ void ivpu_fw_boot_params_setup(struct ivpu_device *vdev, struct vpu_boot_params return; } + memset(boot_params, 0, sizeof(*boot_params)); vdev->pm->is_warmboot = false; boot_params->magic = VPU_BOOT_PARAMS_MAGIC; @@ -647,6 +718,8 @@ void ivpu_fw_boot_params_setup(struct ivpu_device *vdev, struct vpu_boot_params boot_params->d0i3_entry_vpu_ts = 0; if (IVPU_WA(disable_d0i2)) boot_params->power_profile |= BIT(1); + boot_params->vpu_uses_ecc_mca_signal = + ivpu_hw_uses_ecc_mca_signal(vdev) ? 
VPU_BOOT_MCA_ECC_BOTH : 0; boot_params->system_time_us = ktime_to_us(ktime_get_real()); wmb(); /* Flush WC buffers after writing bootparams */ diff --git a/drivers/accel/ivpu/ivpu_fw.h b/drivers/accel/ivpu/ivpu_fw.h index 7081913fb0dd..00945892b55e 100644 --- a/drivers/accel/ivpu/ivpu_fw.h +++ b/drivers/accel/ivpu/ivpu_fw.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation */ #ifndef __IVPU_FW_H__ @@ -19,10 +19,16 @@ struct ivpu_fw_info { const struct firmware *file; const char *name; char version[FW_VERSION_STR_SIZE]; + struct ivpu_bo *mem_bp; + struct ivpu_bo *mem_fw_ver; struct ivpu_bo *mem; struct ivpu_bo *mem_shave_nn; struct ivpu_bo *mem_log_crit; struct ivpu_bo *mem_log_verb; + u64 boot_params_addr; + u64 boot_params_size; + u64 fw_version_addr; + u64 fw_version_size; u64 runtime_addr; u32 runtime_size; u64 image_load_offset; @@ -42,6 +48,7 @@ struct ivpu_fw_info { u64 last_heartbeat; }; +bool ivpu_is_within_range(u64 addr, size_t size, struct ivpu_addr_range *range); int ivpu_fw_init(struct ivpu_device *vdev); void ivpu_fw_fini(struct ivpu_device *vdev); void ivpu_fw_load(struct ivpu_device *vdev); @@ -52,4 +59,9 @@ static inline bool ivpu_fw_is_cold_boot(struct ivpu_device *vdev) return vdev->fw->entry_point == vdev->fw->cold_boot_entry_point; } +static inline u32 ivpu_fw_preempt_buf_size(struct ivpu_device *vdev) +{ + return vdev->fw->primary_preempt_buf_size + vdev->fw->secondary_preempt_buf_size; +} + #endif /* __IVPU_FW_H__ */ diff --git a/drivers/accel/ivpu/ivpu_gem.c b/drivers/accel/ivpu/ivpu_gem.c index 59cfcf3eaded..ece68f570b7e 100644 --- a/drivers/accel/ivpu/ivpu_gem.c +++ b/drivers/accel/ivpu/ivpu_gem.c @@ -15,6 +15,7 @@ #include <drm/drm_utils.h> #include "ivpu_drv.h" +#include "ivpu_fw.h" #include "ivpu_gem.h" #include "ivpu_hw.h" #include "ivpu_mmu.h" @@ -27,8 +28,8 @@ static const struct drm_gem_object_funcs ivpu_gem_funcs; static inline void ivpu_dbg_bo(struct ivpu_device *vdev, struct ivpu_bo *bo, const char *action) { ivpu_dbg(vdev, BO, - "%6s: bo %8p vpu_addr %9llx size %8zu ctx %d has_pages %d dma_mapped %d mmu_mapped %d wc %d imported %d\n", - action, bo, bo->vpu_addr, ivpu_bo_size(bo), bo->ctx_id, + "%6s: bo %8p size %9zu ctx %d vpu_addr %9llx pages %d sgt %d mmu_mapped %d wc %d imported %d\n", + action, bo, ivpu_bo_size(bo), bo->ctx_id, bo->vpu_addr, (bool)bo->base.pages, (bool)bo->base.sgt, bo->mmu_mapped, bo->base.map_wc, (bool)drm_gem_is_imported(&bo->base.base)); } @@ -43,22 +44,47 @@ static inline void ivpu_bo_unlock(struct ivpu_bo *bo) dma_resv_unlock(bo->base.base.resv); } +static struct sg_table *ivpu_bo_map_attachment(struct ivpu_device *vdev, struct ivpu_bo *bo) +{ + struct sg_table *sgt; + + drm_WARN_ON(&vdev->drm, !bo->base.base.import_attach); + + ivpu_bo_lock(bo); + + sgt = bo->base.sgt; + if (!sgt) { + sgt = dma_buf_map_attachment(bo->base.base.import_attach, DMA_BIDIRECTIONAL); + if (IS_ERR(sgt)) + ivpu_err(vdev, "Failed to map BO in IOMMU: %ld\n", PTR_ERR(sgt)); + else + bo->base.sgt = sgt; + } + + ivpu_bo_unlock(bo); + + return sgt; +} + /* - * ivpu_bo_pin() - pin the backing physical pages and map them to VPU. + * ivpu_bo_bind() - pin the backing physical pages and map them to VPU. * * This function pins physical memory pages, then maps the physical pages * to IOMMU address space and finally updates the VPU MMU page tables * to allow the VPU to translate VPU address to IOMMU address. 
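+ *
+ * For imported dma-buf objects the backing sg_table is obtained from the
+ * dma-buf attachment via ivpu_bo_map_attachment() instead of the shmem
+ * pages helper.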
*/ -int __must_check ivpu_bo_pin(struct ivpu_bo *bo) +int __must_check ivpu_bo_bind(struct ivpu_bo *bo) { struct ivpu_device *vdev = ivpu_bo_to_vdev(bo); struct sg_table *sgt; int ret = 0; - ivpu_dbg_bo(vdev, bo, "pin"); + ivpu_dbg_bo(vdev, bo, "bind"); - sgt = drm_gem_shmem_get_pages_sgt(&bo->base); + if (bo->base.base.import_attach) + sgt = ivpu_bo_map_attachment(vdev, bo); + else + sgt = drm_gem_shmem_get_pages_sgt(&bo->base); if (IS_ERR(sgt)) { ret = PTR_ERR(sgt); ivpu_err(vdev, "Failed to map BO in IOMMU: %d\n", ret); @@ -70,7 +96,7 @@ int __must_check ivpu_bo_pin(struct ivpu_bo *bo) if (!bo->mmu_mapped) { drm_WARN_ON(&vdev->drm, !bo->ctx); ret = ivpu_mmu_context_map_sgt(vdev, bo->ctx, bo->vpu_addr, sgt, - ivpu_bo_is_snooped(bo)); + ivpu_bo_is_snooped(bo), ivpu_bo_is_read_only(bo)); if (ret) { ivpu_err(vdev, "Failed to map BO in MMU: %d\n", ret); goto unlock; @@ -99,9 +125,9 @@ ivpu_bo_alloc_vpu_addr(struct ivpu_bo *bo, struct ivpu_mmu_context *ctx, ret = ivpu_mmu_context_insert_node(ctx, range, ivpu_bo_size(bo), &bo->mm_node); if (!ret) { bo->ctx = ctx; + bo->ctx_id = ctx->id; bo->vpu_addr = bo->mm_node.start; - } else { - ivpu_err(vdev, "Failed to add BO to context %u: %d\n", ctx->id, ret); + ivpu_dbg_bo(vdev, bo, "vaddr"); } ivpu_bo_unlock(bo); @@ -115,7 +141,7 @@ static void ivpu_bo_unbind_locked(struct ivpu_bo *bo) { struct ivpu_device *vdev = ivpu_bo_to_vdev(bo); - lockdep_assert(dma_resv_held(bo->base.base.resv) || !kref_read(&bo->base.base.refcount)); + dma_resv_assert_held(bo->base.base.resv); if (bo->mmu_mapped) { drm_WARN_ON(&vdev->drm, !bo->ctx); @@ -130,13 +156,15 @@ static void ivpu_bo_unbind_locked(struct ivpu_bo *bo) bo->ctx = NULL; } - if (drm_gem_is_imported(&bo->base.base)) - return; - if (bo->base.sgt) { - dma_unmap_sgtable(vdev->drm.dev, bo->base.sgt, DMA_BIDIRECTIONAL, 0); - sg_free_table(bo->base.sgt); - kfree(bo->base.sgt); + if (bo->base.base.import_attach) { + dma_buf_unmap_attachment(bo->base.base.import_attach, + bo->base.sgt, DMA_BIDIRECTIONAL); + } else { + dma_unmap_sgtable(vdev->drm.dev, bo->base.sgt, DMA_BIDIRECTIONAL, 0); + sg_free_table(bo->base.sgt); + kfree(bo->base.sgt); + } bo->base.sgt = NULL; } } @@ -182,10 +210,11 @@ struct drm_gem_object *ivpu_gem_create_object(struct drm_device *dev, size_t siz struct drm_gem_object *ivpu_gem_prime_import(struct drm_device *dev, struct dma_buf *dma_buf) { + struct ivpu_device *vdev = to_ivpu_device(dev); struct device *attach_dev = dev->dev; struct dma_buf_attachment *attach; - struct sg_table *sgt; struct drm_gem_object *obj; + struct ivpu_bo *bo; int ret; attach = dma_buf_attach(dma_buf, attach_dev); @@ -194,25 +223,25 @@ struct drm_gem_object *ivpu_gem_prime_import(struct drm_device *dev, get_dma_buf(dma_buf); - sgt = dma_buf_map_attachment_unlocked(attach, DMA_BIDIRECTIONAL); - if (IS_ERR(sgt)) { - ret = PTR_ERR(sgt); - goto fail_detach; - } - - obj = drm_gem_shmem_prime_import_sg_table(dev, attach, sgt); + obj = drm_gem_shmem_prime_import_sg_table(dev, attach, NULL); if (IS_ERR(obj)) { ret = PTR_ERR(obj); - goto fail_unmap; + goto fail_detach; } obj->import_attach = attach; obj->resv = dma_buf->resv; + bo = to_ivpu_bo(obj); + + mutex_lock(&vdev->bo_list_lock); + list_add_tail(&bo->bo_list_node, &vdev->bo_list); + mutex_unlock(&vdev->bo_list_lock); + + ivpu_dbg(vdev, BO, "import: bo %8p size %9zu\n", bo, ivpu_bo_size(bo)); + return obj; -fail_unmap: - dma_buf_unmap_attachment_unlocked(attach, sgt, DMA_BIDIRECTIONAL); fail_detach: dma_buf_detach(dma_buf, attach); dma_buf_put(dma_buf); @@ -220,7 +249,7 @@ 
fail_detach: return ERR_PTR(ret); } -static struct ivpu_bo *ivpu_bo_alloc(struct ivpu_device *vdev, u64 size, u32 flags, u32 ctx_id) +static struct ivpu_bo *ivpu_bo_alloc(struct ivpu_device *vdev, u64 size, u32 flags) { struct drm_gem_shmem_object *shmem; struct ivpu_bo *bo; @@ -238,7 +267,6 @@ static struct ivpu_bo *ivpu_bo_alloc(struct ivpu_device *vdev, u64 size, u32 fla return ERR_CAST(shmem); bo = to_ivpu_bo(&shmem->base); - bo->ctx_id = ctx_id; bo->base.map_wc = flags & DRM_IVPU_BO_WC; bo->flags = flags; @@ -246,7 +274,7 @@ static struct ivpu_bo *ivpu_bo_alloc(struct ivpu_device *vdev, u64 size, u32 fla list_add_tail(&bo->bo_list_node, &vdev->bo_list); mutex_unlock(&vdev->bo_list_lock); - ivpu_dbg_bo(vdev, bo, "alloc"); + ivpu_dbg(vdev, BO, " alloc: bo %8p size %9llu\n", bo, size); return bo; } @@ -259,8 +287,8 @@ static int ivpu_gem_bo_open(struct drm_gem_object *obj, struct drm_file *file) struct ivpu_addr_range *range; if (bo->ctx) { - ivpu_warn(vdev, "Can't add BO to ctx %u: already in ctx %u\n", - file_priv->ctx.id, bo->ctx->id); + ivpu_dbg(vdev, IOCTL, "Can't add BO %pe to ctx %u: already in ctx %u\n", + bo, file_priv->ctx.id, bo->ctx->id); return -EALREADY; } @@ -281,23 +309,41 @@ static void ivpu_gem_bo_free(struct drm_gem_object *obj) ivpu_dbg_bo(vdev, bo, "free"); + drm_WARN_ON(&vdev->drm, list_empty(&bo->bo_list_node)); + mutex_lock(&vdev->bo_list_lock); list_del(&bo->bo_list_node); - mutex_unlock(&vdev->bo_list_lock); drm_WARN_ON(&vdev->drm, !drm_gem_is_imported(&bo->base.base) && !dma_resv_test_signaled(obj->resv, DMA_RESV_USAGE_READ)); drm_WARN_ON(&vdev->drm, ivpu_bo_size(bo) == 0); drm_WARN_ON(&vdev->drm, bo->base.vaddr); + ivpu_bo_lock(bo); ivpu_bo_unbind_locked(bo); + ivpu_bo_unlock(bo); + + mutex_unlock(&vdev->bo_list_lock); + drm_WARN_ON(&vdev->drm, bo->mmu_mapped); drm_WARN_ON(&vdev->drm, bo->ctx); drm_WARN_ON(obj->dev, refcount_read(&bo->base.pages_use_count) > 1); + drm_WARN_ON(obj->dev, bo->base.base.vma_node.vm_files.rb_node); drm_gem_shmem_free(&bo->base); } +static enum drm_gem_object_status ivpu_gem_status(struct drm_gem_object *obj) +{ + struct ivpu_bo *bo = to_ivpu_bo(obj); + enum drm_gem_object_status status = 0; + + if (ivpu_bo_is_resident(bo)) + status |= DRM_GEM_OBJECT_RESIDENT; + + return status; +} + static const struct drm_gem_object_funcs ivpu_gem_funcs = { .free = ivpu_gem_bo_free, .open = ivpu_gem_bo_open, @@ -308,6 +354,7 @@ static const struct drm_gem_object_funcs ivpu_gem_funcs = { .vmap = drm_gem_shmem_object_vmap, .vunmap = drm_gem_shmem_object_vunmap, .mmap = drm_gem_shmem_object_mmap, + .status = ivpu_gem_status, .vm_ops = &drm_gem_shmem_vm_ops, }; @@ -320,25 +367,33 @@ int ivpu_bo_create_ioctl(struct drm_device *dev, void *data, struct drm_file *fi struct ivpu_bo *bo; int ret; - if (args->flags & ~DRM_IVPU_BO_FLAGS) + if (args->flags & ~DRM_IVPU_BO_FLAGS) { + ivpu_dbg(vdev, IOCTL, "Invalid BO flags 0x%x\n", args->flags); return -EINVAL; + } - if (size == 0) + if (size == 0) { + ivpu_dbg(vdev, IOCTL, "Invalid BO size %llu\n", args->size); return -EINVAL; + } - bo = ivpu_bo_alloc(vdev, size, args->flags, file_priv->ctx.id); + bo = ivpu_bo_alloc(vdev, size, args->flags); if (IS_ERR(bo)) { - ivpu_err(vdev, "Failed to allocate BO: %pe (ctx %u size %llu flags 0x%x)", + ivpu_dbg(vdev, IOCTL, "Failed to allocate BO: %pe ctx %u size %llu flags 0x%x\n", bo, file_priv->ctx.id, args->size, args->flags); return PTR_ERR(bo); } + drm_WARN_ON(&vdev->drm, bo->base.base.handle_count != 0); + ret = drm_gem_handle_create(file, &bo->base.base, 
&args->handle); - if (ret) - ivpu_err(vdev, "Failed to create handle for BO: %pe (ctx %u size %llu flags 0x%x)", + if (ret) { + ivpu_dbg(vdev, IOCTL, "Failed to create handle for BO: %pe ctx %u size %llu flags 0x%x\n", bo, file_priv->ctx.id, args->size, args->flags); - else + } else { args->vpu_addr = bo->vpu_addr; + drm_WARN_ON(&vdev->drm, bo->base.base.handle_count != 1); + } drm_gem_object_put(&bo->base.base); @@ -360,18 +415,21 @@ ivpu_bo_create(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, drm_WARN_ON(&vdev->drm, !PAGE_ALIGNED(range->end)); drm_WARN_ON(&vdev->drm, !PAGE_ALIGNED(size)); - bo = ivpu_bo_alloc(vdev, size, flags, IVPU_GLOBAL_CONTEXT_MMU_SSID); + bo = ivpu_bo_alloc(vdev, size, flags); if (IS_ERR(bo)) { - ivpu_err(vdev, "Failed to allocate BO: %pe (vpu_addr 0x%llx size %llu flags 0x%x)", + ivpu_err(vdev, "Failed to allocate BO: %pe vpu_addr 0x%llx size %llu flags 0x%x\n", bo, range->start, size, flags); return NULL; } ret = ivpu_bo_alloc_vpu_addr(bo, ctx, range); - if (ret) + if (ret) { + ivpu_err(vdev, "Failed to allocate NPU address for BO: %pe ctx %u size %llu: %d\n", + bo, ctx->id, size, ret); goto err_put; + } - ret = ivpu_bo_pin(bo); + ret = ivpu_bo_bind(bo); if (ret) goto err_put; @@ -391,6 +449,21 @@ err_put: return NULL; } +struct ivpu_bo *ivpu_bo_create_runtime(struct ivpu_device *vdev, u64 addr, u64 size, u32 flags) +{ + struct ivpu_addr_range range; + + if (!ivpu_is_within_range(addr, size, &vdev->hw->ranges.runtime)) { + ivpu_err(vdev, "Invalid runtime BO address 0x%llx size %llu\n", addr, size); + return NULL; + } + + if (ivpu_hw_range_init(vdev, &range, addr, size)) + return NULL; + + return ivpu_bo_create(vdev, &vdev->gctx, &range, size, flags); +} + struct ivpu_bo *ivpu_bo_create_global(struct ivpu_device *vdev, u64 size, u32 flags) { return ivpu_bo_create(vdev, &vdev->gctx, &vdev->hw->ranges.global, size, flags); diff --git a/drivers/accel/ivpu/ivpu_gem.h b/drivers/accel/ivpu/ivpu_gem.h index aa8ff14f7aae..0c3350f22b55 100644 --- a/drivers/accel/ivpu/ivpu_gem.h +++ b/drivers/accel/ivpu/ivpu_gem.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation */ #ifndef __IVPU_GEM_H__ #define __IVPU_GEM_H__ @@ -24,19 +24,22 @@ struct ivpu_bo { bool mmu_mapped; }; -int ivpu_bo_pin(struct ivpu_bo *bo); +int ivpu_bo_bind(struct ivpu_bo *bo); void ivpu_bo_unbind_all_bos_from_context(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx); struct drm_gem_object *ivpu_gem_create_object(struct drm_device *dev, size_t size); struct drm_gem_object *ivpu_gem_prime_import(struct drm_device *dev, struct dma_buf *dma_buf); struct ivpu_bo *ivpu_bo_create(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, struct ivpu_addr_range *range, u64 size, u32 flags); +struct ivpu_bo *ivpu_bo_create_runtime(struct ivpu_device *vdev, u64 addr, u64 size, u32 flags); struct ivpu_bo *ivpu_bo_create_global(struct ivpu_device *vdev, u64 size, u32 flags); void ivpu_bo_free(struct ivpu_bo *bo); int ivpu_bo_create_ioctl(struct drm_device *dev, void *data, struct drm_file *file); int ivpu_bo_info_ioctl(struct drm_device *dev, void *data, struct drm_file *file); int ivpu_bo_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file); +int ivpu_bo_create_from_userptr_ioctl(struct drm_device *dev, void *data, + struct drm_file *file); void ivpu_bo_list(struct drm_device *dev, struct drm_printer *p); void ivpu_bo_list_print(struct drm_device *dev); @@ -74,6 +77,16 @@ static inline 
bool ivpu_bo_is_snooped(struct ivpu_bo *bo) return ivpu_bo_cache_mode(bo) == DRM_IVPU_BO_CACHED; } +static inline bool ivpu_bo_is_read_only(struct ivpu_bo *bo) +{ + return bo->flags & DRM_IVPU_BO_READ_ONLY; +} + +static inline bool ivpu_bo_is_resident(struct ivpu_bo *bo) +{ + return !!bo->base.pages; +} + static inline void *ivpu_to_cpu_addr(struct ivpu_bo *bo, u32 vpu_addr) { if (vpu_addr < bo->vpu_addr) @@ -96,4 +109,9 @@ static inline u32 cpu_to_vpu_addr(struct ivpu_bo *bo, void *cpu_addr) return bo->vpu_addr + (cpu_addr - ivpu_bo_vaddr(bo)); } +static inline bool ivpu_bo_is_mappable(struct ivpu_bo *bo) +{ + return bo->flags & DRM_IVPU_BO_MAPPABLE; +} + #endif /* __IVPU_GEM_H__ */ diff --git a/drivers/accel/ivpu/ivpu_gem_userptr.c b/drivers/accel/ivpu/ivpu_gem_userptr.c new file mode 100644 index 000000000000..25ba606164c0 --- /dev/null +++ b/drivers/accel/ivpu/ivpu_gem_userptr.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020-2025 Intel Corporation + */ + +#include <linux/dma-buf.h> +#include <linux/err.h> +#include <linux/highmem.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/scatterlist.h> +#include <linux/slab.h> +#include <linux/capability.h> + +#include <drm/drm_device.h> +#include <drm/drm_file.h> +#include <drm/drm_gem.h> + +#include "ivpu_drv.h" +#include "ivpu_gem.h" + +static struct sg_table * +ivpu_gem_userptr_dmabuf_map(struct dma_buf_attachment *attachment, + enum dma_data_direction direction) +{ + struct sg_table *sgt = attachment->dmabuf->priv; + int ret; + + ret = dma_map_sgtable(attachment->dev, sgt, direction, DMA_ATTR_SKIP_CPU_SYNC); + if (ret) + return ERR_PTR(ret); + + return sgt; +} + +static void ivpu_gem_userptr_dmabuf_unmap(struct dma_buf_attachment *attachment, + struct sg_table *sgt, + enum dma_data_direction direction) +{ + dma_unmap_sgtable(attachment->dev, sgt, direction, DMA_ATTR_SKIP_CPU_SYNC); +} + +static void ivpu_gem_userptr_dmabuf_release(struct dma_buf *dma_buf) +{ + struct sg_table *sgt = dma_buf->priv; + struct sg_page_iter page_iter; + struct page *page; + + for_each_sgtable_page(sgt, &page_iter, 0) { + page = sg_page_iter_page(&page_iter); + unpin_user_page(page); + } + + sg_free_table(sgt); + kfree(sgt); +} + +static const struct dma_buf_ops ivpu_gem_userptr_dmabuf_ops = { + .map_dma_buf = ivpu_gem_userptr_dmabuf_map, + .unmap_dma_buf = ivpu_gem_userptr_dmabuf_unmap, + .release = ivpu_gem_userptr_dmabuf_release, +}; + +static struct dma_buf * +ivpu_create_userptr_dmabuf(struct ivpu_device *vdev, void __user *user_ptr, + size_t size, uint32_t flags) +{ + struct dma_buf_export_info exp_info = {}; + struct dma_buf *dma_buf; + struct sg_table *sgt; + struct page **pages; + unsigned long nr_pages = size >> PAGE_SHIFT; + unsigned int gup_flags = FOLL_LONGTERM; + int ret, i, pinned; + + /* Add FOLL_WRITE only if the BO is not read-only */ + if (!(flags & DRM_IVPU_BO_READ_ONLY)) + gup_flags |= FOLL_WRITE; + + pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); + + pinned = pin_user_pages_fast((unsigned long)user_ptr, nr_pages, gup_flags, pages); + if (pinned < 0) { + ret = pinned; + ivpu_dbg(vdev, IOCTL, "Failed to pin user pages: %d\n", ret); + goto free_pages_array; + } + + if (pinned != nr_pages) { + ivpu_dbg(vdev, IOCTL, "Pinned %d pages, expected %lu\n", pinned, nr_pages); + ret = -EFAULT; + goto unpin_pages; + } + + sgt = kmalloc(sizeof(*sgt), GFP_KERNEL); + if (!sgt) { + ret = -ENOMEM; + goto unpin_pages; + } + + ret = 
sg_alloc_table_from_pages(sgt, pages, nr_pages, 0, size, GFP_KERNEL); + if (ret) { + ivpu_dbg(vdev, IOCTL, "Failed to create sg table: %d\n", ret); + goto free_sgt; + } + + exp_info.exp_name = "ivpu_userptr_dmabuf"; + exp_info.owner = THIS_MODULE; + exp_info.ops = &ivpu_gem_userptr_dmabuf_ops; + exp_info.size = size; + exp_info.flags = O_RDWR | O_CLOEXEC; + exp_info.priv = sgt; + + dma_buf = dma_buf_export(&exp_info); + if (IS_ERR(dma_buf)) { + ret = PTR_ERR(dma_buf); + ivpu_dbg(vdev, IOCTL, "Failed to export userptr dma-buf: %d\n", ret); + goto free_sg_table; + } + + kvfree(pages); + return dma_buf; + +free_sg_table: + sg_free_table(sgt); +free_sgt: + kfree(sgt); +unpin_pages: + for (i = 0; i < pinned; i++) + unpin_user_page(pages[i]); +free_pages_array: + kvfree(pages); + return ERR_PTR(ret); +} + +static struct ivpu_bo * +ivpu_bo_create_from_userptr(struct ivpu_device *vdev, void __user *user_ptr, + size_t size, uint32_t flags) +{ + struct dma_buf *dma_buf; + struct drm_gem_object *obj; + struct ivpu_bo *bo; + + dma_buf = ivpu_create_userptr_dmabuf(vdev, user_ptr, size, flags); + if (IS_ERR(dma_buf)) + return ERR_CAST(dma_buf); + + obj = ivpu_gem_prime_import(&vdev->drm, dma_buf); + if (IS_ERR(obj)) { + dma_buf_put(dma_buf); + return ERR_CAST(obj); + } + + dma_buf_put(dma_buf); + + bo = to_ivpu_bo(obj); + bo->flags = flags; + + return bo; +} + +int ivpu_bo_create_from_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file) +{ + struct drm_ivpu_bo_create_from_userptr *args = data; + struct ivpu_file_priv *file_priv = file->driver_priv; + struct ivpu_device *vdev = to_ivpu_device(dev); + void __user *user_ptr = u64_to_user_ptr(args->user_ptr); + struct ivpu_bo *bo; + int ret; + + if (args->flags & ~(DRM_IVPU_BO_HIGH_MEM | DRM_IVPU_BO_DMA_MEM | DRM_IVPU_BO_READ_ONLY)) { + ivpu_dbg(vdev, IOCTL, "Invalid BO flags: 0x%x\n", args->flags); + return -EINVAL; + } + + if (!args->user_ptr || !args->size) { + ivpu_dbg(vdev, IOCTL, "Userptr or size are zero: ptr %llx size %llu\n", + args->user_ptr, args->size); + return -EINVAL; + } + + if (!PAGE_ALIGNED(args->user_ptr) || !PAGE_ALIGNED(args->size)) { + ivpu_dbg(vdev, IOCTL, "Userptr or size not page aligned: ptr %llx size %llu\n", + args->user_ptr, args->size); + return -EINVAL; + } + + if (!access_ok(user_ptr, args->size)) { + ivpu_dbg(vdev, IOCTL, "Userptr is not accessible: ptr %llx size %llu\n", + args->user_ptr, args->size); + return -EFAULT; + } + + bo = ivpu_bo_create_from_userptr(vdev, user_ptr, args->size, args->flags); + if (IS_ERR(bo)) + return PTR_ERR(bo); + + ret = drm_gem_handle_create(file, &bo->base.base, &args->handle); + if (ret) { + ivpu_dbg(vdev, IOCTL, "Failed to create handle for BO: %pe ctx %u size %llu flags 0x%x\n", + bo, file_priv->ctx.id, args->size, args->flags); + } else { + ivpu_dbg(vdev, BO, "Created userptr BO: handle=%u vpu_addr=0x%llx size=%llu flags=0x%x\n", + args->handle, bo->vpu_addr, args->size, bo->flags); + args->vpu_addr = bo->vpu_addr; + } + + drm_gem_object_put(&bo->base.base); + + return ret; +} diff --git a/drivers/accel/ivpu/ivpu_hw.c b/drivers/accel/ivpu/ivpu_hw.c index 08dcc31b56f4..d69cd0d93569 100644 --- a/drivers/accel/ivpu/ivpu_hw.c +++ b/drivers/accel/ivpu/ivpu_hw.c @@ -8,6 +8,8 @@ #include "ivpu_hw_btrs.h" #include "ivpu_hw_ip.h" +#include <asm/msr-index.h> +#include <asm/msr.h> #include <linux/dmi.h> #include <linux/fault-inject.h> #include <linux/pm_runtime.h> @@ -20,6 +22,10 @@ module_param_named_unsafe(fail_hw, ivpu_fail_hw, charp, 0444); MODULE_PARM_DESC(fail_hw, 
"<interval>,<probability>,<space>,<times>"); #endif +#define FW_SHARED_MEM_ALIGNMENT SZ_512K /* VPU MTRR limitation */ + +#define ECC_MCA_SIGNAL_ENABLE_MASK 0xff + static char *platform_to_str(u32 platform) { switch (platform) { @@ -147,19 +153,39 @@ static void priority_bands_init(struct ivpu_device *vdev) vdev->hw->hws.process_quantum[VPU_JOB_SCHEDULING_PRIORITY_BAND_REALTIME] = 200000; } +int ivpu_hw_range_init(struct ivpu_device *vdev, struct ivpu_addr_range *range, u64 start, u64 size) +{ + u64 end; + + if (!range || check_add_overflow(start, size, &end)) { + ivpu_err(vdev, "Invalid range: start 0x%llx size %llu\n", start, size); + return -EINVAL; + } + + range->start = start; + range->end = end; + + return 0; +} + static void memory_ranges_init(struct ivpu_device *vdev) { if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) { - ivpu_hw_range_init(&vdev->hw->ranges.global, 0x80000000, SZ_512M); - ivpu_hw_range_init(&vdev->hw->ranges.user, 0x88000000, 511 * SZ_1M); - ivpu_hw_range_init(&vdev->hw->ranges.shave, 0x180000000, SZ_2G); - ivpu_hw_range_init(&vdev->hw->ranges.dma, 0x200000000, SZ_128G); + ivpu_hw_range_init(vdev, &vdev->hw->ranges.runtime, 0x84800000, SZ_64M); + ivpu_hw_range_init(vdev, &vdev->hw->ranges.global, 0x90000000, SZ_256M); + ivpu_hw_range_init(vdev, &vdev->hw->ranges.user, 0xa0000000, 511 * SZ_1M); + ivpu_hw_range_init(vdev, &vdev->hw->ranges.shave, 0x180000000, SZ_2G); + ivpu_hw_range_init(vdev, &vdev->hw->ranges.dma, 0x200000000, SZ_128G); } else { - ivpu_hw_range_init(&vdev->hw->ranges.global, 0x80000000, SZ_512M); - ivpu_hw_range_init(&vdev->hw->ranges.shave, 0x80000000, SZ_2G); - ivpu_hw_range_init(&vdev->hw->ranges.user, 0x100000000, SZ_256G); + ivpu_hw_range_init(vdev, &vdev->hw->ranges.runtime, 0x80000000, SZ_64M); + ivpu_hw_range_init(vdev, &vdev->hw->ranges.global, 0x90000000, SZ_256M); + ivpu_hw_range_init(vdev, &vdev->hw->ranges.shave, 0x80000000, SZ_2G); + ivpu_hw_range_init(vdev, &vdev->hw->ranges.user, 0x100000000, SZ_256G); vdev->hw->ranges.dma = vdev->hw->ranges.user; } + + drm_WARN_ON(&vdev->drm, !IS_ALIGNED(vdev->hw->ranges.global.start, + FW_SHARED_MEM_ALIGNMENT)); } static int wp_enable(struct ivpu_device *vdev) @@ -373,3 +399,22 @@ irqreturn_t ivpu_hw_irq_handler(int irq, void *ptr) pm_runtime_mark_last_busy(vdev->drm.dev); return IRQ_HANDLED; } + +bool ivpu_hw_uses_ecc_mca_signal(struct ivpu_device *vdev) +{ + unsigned long long msr_integrity_caps; + int ret; + + if (ivpu_hw_ip_gen(vdev) < IVPU_HW_IP_50XX) + return false; + + ret = rdmsrq_safe(MSR_INTEGRITY_CAPS, &msr_integrity_caps); + if (ret) { + ivpu_warn(vdev, "Error reading MSR_INTEGRITY_CAPS: %d", ret); + return false; + } + + ivpu_dbg(vdev, MISC, "MSR_INTEGRITY_CAPS: 0x%llx\n", msr_integrity_caps); + + return msr_integrity_caps & ECC_MCA_SIGNAL_ENABLE_MASK; +} diff --git a/drivers/accel/ivpu/ivpu_hw.h b/drivers/accel/ivpu/ivpu_hw.h index d79668fe1609..b6d0f0d0dccc 100644 --- a/drivers/accel/ivpu/ivpu_hw.h +++ b/drivers/accel/ivpu/ivpu_hw.h @@ -21,6 +21,7 @@ struct ivpu_hw_info { bool (*ip_irq_handler)(struct ivpu_device *vdev, int irq); } irq; struct { + struct ivpu_addr_range runtime; struct ivpu_addr_range global; struct ivpu_addr_range user; struct ivpu_addr_range shave; @@ -51,6 +52,8 @@ struct ivpu_hw_info { }; int ivpu_hw_init(struct ivpu_device *vdev); +int ivpu_hw_range_init(struct ivpu_device *vdev, struct ivpu_addr_range *range, u64 start, + u64 size); int ivpu_hw_power_up(struct ivpu_device *vdev); int ivpu_hw_power_down(struct ivpu_device *vdev); int ivpu_hw_reset(struct 
ivpu_device *vdev); @@ -60,6 +63,7 @@ void ivpu_irq_handlers_init(struct ivpu_device *vdev); void ivpu_hw_irq_enable(struct ivpu_device *vdev); void ivpu_hw_irq_disable(struct ivpu_device *vdev); irqreturn_t ivpu_hw_irq_handler(int irq, void *ptr); +bool ivpu_hw_uses_ecc_mca_signal(struct ivpu_device *vdev); static inline u32 ivpu_hw_btrs_irq_handler(struct ivpu_device *vdev, int irq) { @@ -71,12 +75,6 @@ static inline u32 ivpu_hw_ip_irq_handler(struct ivpu_device *vdev, int irq) return vdev->hw->irq.ip_irq_handler(vdev, irq); } -static inline void ivpu_hw_range_init(struct ivpu_addr_range *range, u64 start, u64 size) -{ - range->start = start; - range->end = start + size; -} - static inline u64 ivpu_hw_range_size(const struct ivpu_addr_range *range) { return range->end - range->start; diff --git a/drivers/accel/ivpu/ivpu_hw_btrs.c b/drivers/accel/ivpu/ivpu_hw_btrs.c index afdb3b2aa72a..06e65c592618 100644 --- a/drivers/accel/ivpu/ivpu_hw_btrs.c +++ b/drivers/accel/ivpu/ivpu_hw_btrs.c @@ -321,6 +321,14 @@ static int wait_for_pll_lock(struct ivpu_device *vdev, bool enable) return REGB_POLL_FLD(VPU_HW_BTRS_MTL_PLL_STATUS, LOCK, exp_val, PLL_TIMEOUT_US); } +static int wait_for_cdyn_deassert(struct ivpu_device *vdev) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + return 0; + + return REGB_POLL_FLD(VPU_HW_BTRS_LNL_CDYN, CDYN, 0, PLL_TIMEOUT_US); +} + int ivpu_hw_btrs_wp_drive(struct ivpu_device *vdev, bool enable) { struct wp_request wp; @@ -354,6 +362,14 @@ int ivpu_hw_btrs_wp_drive(struct ivpu_device *vdev, bool enable) return ret; } + if (!enable) { + ret = wait_for_cdyn_deassert(vdev); + if (ret) { + ivpu_err(vdev, "Timed out waiting for CDYN deassert\n"); + return ret; + } + } + return 0; } @@ -673,7 +689,7 @@ bool ivpu_hw_btrs_irq_handler_lnl(struct ivpu_device *vdev, int irq) if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, SURV_ERR, status)) { ivpu_dbg(vdev, IRQ, "Survivability IRQ\n"); - queue_work(system_wq, &vdev->irq_dct_work); + queue_work(system_percpu_wq, &vdev->irq_dct_work); } if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, FREQ_CHANGE, status)) { @@ -752,7 +768,7 @@ int ivpu_hw_btrs_dct_get_request(struct ivpu_device *vdev, bool *enable) } } -void ivpu_hw_btrs_dct_set_status(struct ivpu_device *vdev, bool enable, u32 active_percent) +void ivpu_hw_btrs_dct_set_status(struct ivpu_device *vdev, bool enable, u8 active_percent) { u32 val = 0; u32 cmd = enable ? 
DCT_ENABLE : DCT_DISABLE; diff --git a/drivers/accel/ivpu/ivpu_hw_btrs.h b/drivers/accel/ivpu/ivpu_hw_btrs.h index 032c384ac3d4..c4c10e22f30f 100644 --- a/drivers/accel/ivpu/ivpu_hw_btrs.h +++ b/drivers/accel/ivpu/ivpu_hw_btrs.h @@ -36,7 +36,7 @@ u32 ivpu_hw_btrs_dpu_freq_get(struct ivpu_device *vdev); bool ivpu_hw_btrs_irq_handler_mtl(struct ivpu_device *vdev, int irq); bool ivpu_hw_btrs_irq_handler_lnl(struct ivpu_device *vdev, int irq); int ivpu_hw_btrs_dct_get_request(struct ivpu_device *vdev, bool *enable); -void ivpu_hw_btrs_dct_set_status(struct ivpu_device *vdev, bool enable, u32 active_percent); +void ivpu_hw_btrs_dct_set_status(struct ivpu_device *vdev, bool enable, u8 active_percent); u32 ivpu_hw_btrs_telemetry_offset_get(struct ivpu_device *vdev); u32 ivpu_hw_btrs_telemetry_size_get(struct ivpu_device *vdev); u32 ivpu_hw_btrs_telemetry_enable_get(struct ivpu_device *vdev); diff --git a/drivers/accel/ivpu/ivpu_hw_btrs_lnl_reg.h b/drivers/accel/ivpu/ivpu_hw_btrs_lnl_reg.h index fff2ef2cada6..a81a9ba540fa 100644 --- a/drivers/accel/ivpu/ivpu_hw_btrs_lnl_reg.h +++ b/drivers/accel/ivpu/ivpu_hw_btrs_lnl_reg.h @@ -74,6 +74,9 @@ #define VPU_HW_BTRS_LNL_PLL_FREQ 0x00000148u #define VPU_HW_BTRS_LNL_PLL_FREQ_RATIO_MASK GENMASK(15, 0) +#define VPU_HW_BTRS_LNL_CDYN 0x0000014cu +#define VPU_HW_BTRS_LNL_CDYN_CDYN_MASK GENMASK(15, 0) + #define VPU_HW_BTRS_LNL_TILE_FUSE 0x00000150u #define VPU_HW_BTRS_LNL_TILE_FUSE_VALID_MASK BIT_MASK(0) #define VPU_HW_BTRS_LNL_TILE_FUSE_CONFIG_MASK GENMASK(6, 1) diff --git a/drivers/accel/ivpu/ivpu_hw_ip.c b/drivers/accel/ivpu/ivpu_hw_ip.c index 2bf9882ab52e..06aa1e7dc50b 100644 --- a/drivers/accel/ivpu/ivpu_hw_ip.c +++ b/drivers/accel/ivpu/ivpu_hw_ip.c @@ -691,6 +691,13 @@ static void pwr_island_delay_set(struct ivpu_device *vdev) status = high ? 46 : 3; break; + case PCI_DEVICE_ID_NVL: + post = high ? 198 : 17; + post1 = 0; + post2 = high ? 
198 : 17; + status = 0; + break; + default: dump_stack(); ivpu_err(vdev, "Unknown device ID\n"); @@ -889,6 +896,9 @@ static int soc_cpu_drive_40xx(struct ivpu_device *vdev, bool enable) static int soc_cpu_enable(struct ivpu_device *vdev) { + if (ivpu_hw_ip_gen(vdev) >= IVPU_HW_IP_60XX) + return 0; + return soc_cpu_drive_40xx(vdev, true); } diff --git a/drivers/accel/ivpu/ivpu_ipc.c b/drivers/accel/ivpu/ivpu_ipc.c index 5f00809d448a..1f13bf95b2b3 100644 --- a/drivers/accel/ivpu/ivpu_ipc.c +++ b/drivers/accel/ivpu/ivpu_ipc.c @@ -459,7 +459,7 @@ void ivpu_ipc_irq_handler(struct ivpu_device *vdev) } } - queue_work(system_wq, &vdev->irq_ipc_work); + queue_work(system_percpu_wq, &vdev->irq_ipc_work); } void ivpu_ipc_irq_work_fn(struct work_struct *work) diff --git a/drivers/accel/ivpu/ivpu_job.c b/drivers/accel/ivpu/ivpu_job.c index 060f1fc031d3..4f8564e2878a 100644 --- a/drivers/accel/ivpu/ivpu_job.c +++ b/drivers/accel/ivpu/ivpu_job.c @@ -34,22 +34,20 @@ static void ivpu_cmdq_ring_db(struct ivpu_device *vdev, struct ivpu_cmdq *cmdq) static int ivpu_preemption_buffers_create(struct ivpu_device *vdev, struct ivpu_file_priv *file_priv, struct ivpu_cmdq *cmdq) { - u64 primary_size = ALIGN(vdev->fw->primary_preempt_buf_size, PAGE_SIZE); - u64 secondary_size = ALIGN(vdev->fw->secondary_preempt_buf_size, PAGE_SIZE); - - if (vdev->fw->sched_mode != VPU_SCHEDULING_MODE_HW || - ivpu_test_mode & IVPU_TEST_MODE_MIP_DISABLE) + if (ivpu_fw_preempt_buf_size(vdev) == 0) return 0; cmdq->primary_preempt_buf = ivpu_bo_create(vdev, &file_priv->ctx, &vdev->hw->ranges.user, - primary_size, DRM_IVPU_BO_WC); + vdev->fw->primary_preempt_buf_size, + DRM_IVPU_BO_WC); if (!cmdq->primary_preempt_buf) { ivpu_err(vdev, "Failed to create primary preemption buffer\n"); return -ENOMEM; } cmdq->secondary_preempt_buf = ivpu_bo_create(vdev, &file_priv->ctx, &vdev->hw->ranges.dma, - secondary_size, DRM_IVPU_BO_WC); + vdev->fw->secondary_preempt_buf_size, + DRM_IVPU_BO_WC); if (!cmdq->secondary_preempt_buf) { ivpu_err(vdev, "Failed to create secondary preemption buffer\n"); goto err_free_primary; @@ -66,20 +64,39 @@ err_free_primary: static void ivpu_preemption_buffers_free(struct ivpu_device *vdev, struct ivpu_file_priv *file_priv, struct ivpu_cmdq *cmdq) { - if (vdev->fw->sched_mode != VPU_SCHEDULING_MODE_HW) - return; - if (cmdq->primary_preempt_buf) ivpu_bo_free(cmdq->primary_preempt_buf); if (cmdq->secondary_preempt_buf) ivpu_bo_free(cmdq->secondary_preempt_buf); } +static int ivpu_preemption_job_init(struct ivpu_device *vdev, struct ivpu_file_priv *file_priv, + struct ivpu_cmdq *cmdq, struct ivpu_job *job) +{ + int ret; + + /* Use preemption buffer provided by the user space */ + if (job->primary_preempt_buf) + return 0; + + if (!cmdq->primary_preempt_buf) { + /* Allocate per command queue preemption buffers */ + ret = ivpu_preemption_buffers_create(vdev, file_priv, cmdq); + if (ret) + return ret; + } + + /* Use preemption buffers allocated by the kernel */ + job->primary_preempt_buf = cmdq->primary_preempt_buf; + job->secondary_preempt_buf = cmdq->secondary_preempt_buf; + + return 0; +} + static struct ivpu_cmdq *ivpu_cmdq_alloc(struct ivpu_file_priv *file_priv) { struct ivpu_device *vdev = file_priv->vdev; struct ivpu_cmdq *cmdq; - int ret; cmdq = kzalloc(sizeof(*cmdq), GFP_KERNEL); if (!cmdq) @@ -89,10 +106,6 @@ static struct ivpu_cmdq *ivpu_cmdq_alloc(struct ivpu_file_priv *file_priv) if (!cmdq->mem) goto err_free_cmdq; - ret = ivpu_preemption_buffers_create(vdev, file_priv, cmdq); - if (ret) - ivpu_warn(vdev, "Failed to 
allocate preemption buffers, preemption limited\n"); - return cmdq; err_free_cmdq: @@ -219,11 +232,13 @@ static int ivpu_register_db(struct ivpu_file_priv *file_priv, struct ivpu_cmdq * ret = ivpu_jsm_register_db(vdev, file_priv->ctx.id, cmdq->db_id, cmdq->mem->vpu_addr, ivpu_bo_size(cmdq->mem)); - if (!ret) + if (!ret) { ivpu_dbg(vdev, JOB, "DB %d registered to cmdq %d ctx %d priority %d\n", cmdq->db_id, cmdq->id, file_priv->ctx.id, cmdq->priority); - else + } else { xa_erase(&vdev->db_xa, cmdq->db_id); + cmdq->db_id = 0; + } return ret; } @@ -333,7 +348,7 @@ static struct ivpu_cmdq *ivpu_cmdq_acquire(struct ivpu_file_priv *file_priv, u32 cmdq = xa_load(&file_priv->cmdq_xa, cmdq_id); if (!cmdq) { - ivpu_warn_ratelimited(vdev, "Failed to find command queue with ID: %u\n", cmdq_id); + ivpu_dbg(vdev, IOCTL, "Failed to find command queue with ID: %u\n", cmdq_id); return NULL; } @@ -427,17 +442,14 @@ static int ivpu_cmdq_push_job(struct ivpu_cmdq *cmdq, struct ivpu_job *job) if (unlikely(ivpu_test_mode & IVPU_TEST_MODE_NULL_SUBMISSION)) entry->flags = VPU_JOB_FLAGS_NULL_SUBMISSION_MASK; - if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW) { - if (cmdq->primary_preempt_buf) { - entry->primary_preempt_buf_addr = cmdq->primary_preempt_buf->vpu_addr; - entry->primary_preempt_buf_size = ivpu_bo_size(cmdq->primary_preempt_buf); - } + if (job->primary_preempt_buf) { + entry->primary_preempt_buf_addr = job->primary_preempt_buf->vpu_addr; + entry->primary_preempt_buf_size = ivpu_bo_size(job->primary_preempt_buf); + } - if (cmdq->secondary_preempt_buf) { - entry->secondary_preempt_buf_addr = cmdq->secondary_preempt_buf->vpu_addr; - entry->secondary_preempt_buf_size = - ivpu_bo_size(cmdq->secondary_preempt_buf); - } + if (job->secondary_preempt_buf) { + entry->secondary_preempt_buf_addr = job->secondary_preempt_buf->vpu_addr; + entry->secondary_preempt_buf_size = ivpu_bo_size(job->secondary_preempt_buf); } wmb(); /* Ensure that tail is updated after filling entry */ @@ -522,7 +534,7 @@ ivpu_job_create(struct ivpu_file_priv *file_priv, u32 engine_idx, u32 bo_count) job->bo_count = bo_count; job->done_fence = ivpu_fence_create(vdev); if (!job->done_fence) { - ivpu_warn_ratelimited(vdev, "Failed to create a fence\n"); + ivpu_err(vdev, "Failed to create a fence\n"); goto err_free_job; } @@ -552,21 +564,26 @@ static struct ivpu_job *ivpu_job_remove_from_submitted_jobs(struct ivpu_device * return job; } -static int ivpu_job_signal_and_destroy(struct ivpu_device *vdev, u32 job_id, u32 job_status) +bool ivpu_job_handle_engine_error(struct ivpu_device *vdev, u32 job_id, u32 job_status) { - struct ivpu_job *job; - lockdep_assert_held(&vdev->submitted_jobs_lock); - job = xa_load(&vdev->submitted_jobs_xa, job_id); - if (!job) - return -ENOENT; + switch (job_status) { + case VPU_JSM_STATUS_PROCESSING_ERR: + case VPU_JSM_STATUS_ENGINE_RESET_REQUIRED_MIN ... 
VPU_JSM_STATUS_ENGINE_RESET_REQUIRED_MAX: + { + struct ivpu_job *job = xa_load(&vdev->submitted_jobs_xa, job_id); - if (job_status == VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW) { + if (!job) + return false; + + /* Trigger an engine reset */ guard(mutex)(&job->file_priv->lock); + job->job_status = job_status; + if (job->file_priv->has_mmu_faults) - return 0; + return false; /* * Mark context as faulty and defer destruction of the job to jobs abort thread @@ -574,23 +591,43 @@ static int ivpu_job_signal_and_destroy(struct ivpu_device *vdev, u32 job_id, u32 * status and ensure both are handled in the same way */ job->file_priv->has_mmu_faults = true; - queue_work(system_wq, &vdev->context_abort_work); - return 0; + queue_work(system_percpu_wq, &vdev->context_abort_work); + return true; } + default: + /* Complete job with error status, engine reset not required */ + break; + } + + return false; +} - job = ivpu_job_remove_from_submitted_jobs(vdev, job_id); +static int ivpu_job_signal_and_destroy(struct ivpu_device *vdev, u32 job_id, u32 job_status) +{ + struct ivpu_job *job; + + lockdep_assert_held(&vdev->submitted_jobs_lock); + + job = xa_load(&vdev->submitted_jobs_xa, job_id); if (!job) return -ENOENT; - if (job->file_priv->has_mmu_faults) - job_status = DRM_IVPU_JOB_STATUS_ABORTED; + ivpu_job_remove_from_submitted_jobs(vdev, job_id); - job->bos[CMD_BUF_IDX]->job_status = job_status; + if (job->job_status == VPU_JSM_STATUS_SUCCESS) { + if (job->file_priv->has_mmu_faults) + job->job_status = DRM_IVPU_JOB_STATUS_ABORTED; + else + job->job_status = job_status; + } + + job->bos[CMD_BUF_IDX]->job_status = job->job_status; dma_fence_signal(job->done_fence); trace_job("done", job); ivpu_dbg(vdev, JOB, "Job complete: id %3u ctx %2d cmdq_id %u engine %d status 0x%x\n", - job->job_id, job->file_priv->ctx.id, job->cmdq_id, job->engine_idx, job_status); + job->job_id, job->file_priv->ctx.id, job->cmdq_id, job->engine_idx, + job->job_status); ivpu_job_destroy(job); ivpu_stop_job_timeout_detection(vdev); @@ -650,7 +687,6 @@ static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id) else cmdq = ivpu_cmdq_acquire(file_priv, cmdq_id); if (!cmdq) { - ivpu_warn_ratelimited(vdev, "Failed to get job queue, ctx %d\n", file_priv->ctx.id); ret = -EINVAL; goto err_unlock; } @@ -661,6 +697,13 @@ static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id) goto err_unlock; } + ret = ivpu_preemption_job_init(vdev, file_priv, cmdq, job); + if (ret) { + ivpu_err(vdev, "Failed to initialize preemption buffers for job %d: %d\n", + job->job_id, ret); + goto err_unlock; + } + job->cmdq_id = cmdq->id; is_first_job = xa_empty(&vdev->submitted_jobs_xa); @@ -714,7 +757,7 @@ err_unlock: static int ivpu_job_prepare_bos_for_submit(struct drm_file *file, struct ivpu_job *job, u32 *buf_handles, - u32 buf_count, u32 commands_offset) + u32 buf_count, u32 commands_offset, u32 preempt_buffer_index) { struct ivpu_file_priv *file_priv = job->file_priv; struct ivpu_device *vdev = file_priv->vdev; @@ -727,40 +770,58 @@ ivpu_job_prepare_bos_for_submit(struct drm_file *file, struct ivpu_job *job, u32 for (i = 0; i < buf_count; i++) { struct drm_gem_object *obj = drm_gem_object_lookup(file, buf_handles[i]); - if (!obj) + if (!obj) { + ivpu_dbg(vdev, IOCTL, "Failed to lookup GEM object with handle %u\n", + buf_handles[i]); return -ENOENT; + } job->bos[i] = to_ivpu_bo(obj); - ret = ivpu_bo_pin(job->bos[i]); + ret = ivpu_bo_bind(job->bos[i]); if (ret) return ret; } bo = job->bos[CMD_BUF_IDX]; if 
(!dma_resv_test_signaled(bo->base.base.resv, DMA_RESV_USAGE_READ)) { - ivpu_warn(vdev, "Buffer is already in use\n"); + ivpu_dbg(vdev, IOCTL, "Buffer is already in use by another job\n"); return -EBUSY; } if (commands_offset >= ivpu_bo_size(bo)) { - ivpu_warn(vdev, "Invalid command buffer offset %u\n", commands_offset); + ivpu_dbg(vdev, IOCTL, "Invalid commands offset %u for buffer size %zu\n", + commands_offset, ivpu_bo_size(bo)); return -EINVAL; } job->cmd_buf_vpu_addr = bo->vpu_addr + commands_offset; + if (preempt_buffer_index) { + struct ivpu_bo *preempt_bo = job->bos[preempt_buffer_index]; + + if (ivpu_bo_size(preempt_bo) < ivpu_fw_preempt_buf_size(vdev)) { + ivpu_dbg(vdev, IOCTL, "Preemption buffer is too small\n"); + return -EINVAL; + } + if (ivpu_bo_is_mappable(preempt_bo)) { + ivpu_dbg(vdev, IOCTL, "Preemption buffer cannot be mappable\n"); + return -EINVAL; + } + job->primary_preempt_buf = preempt_bo; + } + ret = drm_gem_lock_reservations((struct drm_gem_object **)job->bos, buf_count, &acquire_ctx); if (ret) { - ivpu_warn(vdev, "Failed to lock reservations: %d\n", ret); + ivpu_warn_ratelimited(vdev, "Failed to lock reservations: %d\n", ret); return ret; } for (i = 0; i < buf_count; i++) { ret = dma_resv_reserve_fences(job->bos[i]->base.base.resv, 1); if (ret) { - ivpu_warn(vdev, "Failed to reserve fences: %d\n", ret); + ivpu_warn_ratelimited(vdev, "Failed to reserve fences: %d\n", ret); goto unlock_reservations; } } @@ -780,7 +841,7 @@ unlock_reservations: static int ivpu_submit(struct drm_file *file, struct ivpu_file_priv *file_priv, u32 cmdq_id, u32 buffer_count, u32 engine, void __user *buffers_ptr, u32 cmds_offset, - u8 priority) + u32 preempt_buffer_index, u8 priority) { struct ivpu_device *vdev = file_priv->vdev; struct ivpu_job *job; @@ -807,16 +868,14 @@ static int ivpu_submit(struct drm_file *file, struct ivpu_file_priv *file_priv, job = ivpu_job_create(file_priv, engine, buffer_count); if (!job) { - ivpu_err(vdev, "Failed to create job\n"); ret = -ENOMEM; goto err_exit_dev; } - ret = ivpu_job_prepare_bos_for_submit(file, job, buf_handles, buffer_count, cmds_offset); - if (ret) { - ivpu_err(vdev, "Failed to prepare job: %d\n", ret); + ret = ivpu_job_prepare_bos_for_submit(file, job, buf_handles, buffer_count, cmds_offset, + preempt_buffer_index); + if (ret) goto err_destroy_job; - } down_read(&vdev->pm->reset_lock); ret = ivpu_job_submit(job, priority, cmdq_id); @@ -842,58 +901,91 @@ err_free_handles: int ivpu_submit_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct ivpu_file_priv *file_priv = file->driver_priv; + struct ivpu_device *vdev = file_priv->vdev; struct drm_ivpu_submit *args = data; u8 priority; - if (args->engine != DRM_IVPU_ENGINE_COMPUTE) + if (args->engine != DRM_IVPU_ENGINE_COMPUTE) { + ivpu_dbg(vdev, IOCTL, "Invalid engine %d\n", args->engine); return -EINVAL; + } - if (args->priority > DRM_IVPU_JOB_PRIORITY_REALTIME) + if (args->priority > DRM_IVPU_JOB_PRIORITY_REALTIME) { + ivpu_dbg(vdev, IOCTL, "Invalid priority %d\n", args->priority); return -EINVAL; + } - if (args->buffer_count == 0 || args->buffer_count > JOB_MAX_BUFFER_COUNT) + if (args->buffer_count == 0 || args->buffer_count > JOB_MAX_BUFFER_COUNT) { + ivpu_dbg(vdev, IOCTL, "Invalid buffer count %u\n", args->buffer_count); return -EINVAL; + } - if (!IS_ALIGNED(args->commands_offset, 8)) + if (!IS_ALIGNED(args->commands_offset, 8)) { + ivpu_dbg(vdev, IOCTL, "Invalid commands offset %u\n", args->commands_offset); return -EINVAL; + } - if (!file_priv->ctx.id) + if 
(!file_priv->ctx.id) { + ivpu_dbg(vdev, IOCTL, "Context not initialized\n"); return -EINVAL; + } - if (file_priv->has_mmu_faults) + if (file_priv->has_mmu_faults) { + ivpu_dbg(vdev, IOCTL, "Context %u has MMU faults\n", file_priv->ctx.id); return -EBADFD; + } priority = ivpu_job_to_jsm_priority(args->priority); return ivpu_submit(file, file_priv, 0, args->buffer_count, args->engine, - (void __user *)args->buffers_ptr, args->commands_offset, priority); + (void __user *)args->buffers_ptr, args->commands_offset, 0, priority); } int ivpu_cmdq_submit_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct ivpu_file_priv *file_priv = file->driver_priv; + struct ivpu_device *vdev = file_priv->vdev; struct drm_ivpu_cmdq_submit *args = data; - if (!ivpu_is_capable(file_priv->vdev, DRM_IVPU_CAP_MANAGE_CMDQ)) + if (!ivpu_is_capable(file_priv->vdev, DRM_IVPU_CAP_MANAGE_CMDQ)) { + ivpu_dbg(vdev, IOCTL, "Command queue management not supported\n"); return -ENODEV; + } - if (args->cmdq_id < IVPU_CMDQ_MIN_ID || args->cmdq_id > IVPU_CMDQ_MAX_ID) + if (args->cmdq_id < IVPU_CMDQ_MIN_ID || args->cmdq_id > IVPU_CMDQ_MAX_ID) { + ivpu_dbg(vdev, IOCTL, "Invalid command queue ID %u\n", args->cmdq_id); return -EINVAL; + } - if (args->buffer_count == 0 || args->buffer_count > JOB_MAX_BUFFER_COUNT) + if (args->buffer_count == 0 || args->buffer_count > JOB_MAX_BUFFER_COUNT) { + ivpu_dbg(vdev, IOCTL, "Invalid buffer count %u\n", args->buffer_count); return -EINVAL; + } - if (!IS_ALIGNED(args->commands_offset, 8)) + if (args->preempt_buffer_index >= args->buffer_count) { + ivpu_dbg(vdev, IOCTL, "Invalid preemption buffer index %u\n", + args->preempt_buffer_index); return -EINVAL; + } - if (!file_priv->ctx.id) + if (!IS_ALIGNED(args->commands_offset, 8)) { + ivpu_dbg(vdev, IOCTL, "Invalid commands offset %u\n", args->commands_offset); return -EINVAL; + } - if (file_priv->has_mmu_faults) + if (!file_priv->ctx.id) { + ivpu_dbg(vdev, IOCTL, "Context not initialized\n"); + return -EINVAL; + } + + if (file_priv->has_mmu_faults) { + ivpu_dbg(vdev, IOCTL, "Context %u has MMU faults\n", file_priv->ctx.id); return -EBADFD; + } return ivpu_submit(file, file_priv, args->cmdq_id, args->buffer_count, VPU_ENGINE_COMPUTE, - (void __user *)args->buffers_ptr, args->commands_offset, 0); + (void __user *)args->buffers_ptr, args->commands_offset, + args->preempt_buffer_index, 0); } int ivpu_cmdq_create_ioctl(struct drm_device *dev, void *data, struct drm_file *file) @@ -904,11 +996,15 @@ int ivpu_cmdq_create_ioctl(struct drm_device *dev, void *data, struct drm_file * struct ivpu_cmdq *cmdq; int ret; - if (!ivpu_is_capable(vdev, DRM_IVPU_CAP_MANAGE_CMDQ)) + if (!ivpu_is_capable(vdev, DRM_IVPU_CAP_MANAGE_CMDQ)) { + ivpu_dbg(vdev, IOCTL, "Command queue management not supported\n"); return -ENODEV; + } - if (args->priority > DRM_IVPU_JOB_PRIORITY_REALTIME) + if (args->priority > DRM_IVPU_JOB_PRIORITY_REALTIME) { + ivpu_dbg(vdev, IOCTL, "Invalid priority %d\n", args->priority); return -EINVAL; + } ret = ivpu_rpm_get(vdev); if (ret < 0) @@ -936,8 +1032,10 @@ int ivpu_cmdq_destroy_ioctl(struct drm_device *dev, void *data, struct drm_file u32 cmdq_id = 0; int ret; - if (!ivpu_is_capable(vdev, DRM_IVPU_CAP_MANAGE_CMDQ)) + if (!ivpu_is_capable(vdev, DRM_IVPU_CAP_MANAGE_CMDQ)) { + ivpu_dbg(vdev, IOCTL, "Command queue management not supported\n"); return -ENODEV; + } ret = ivpu_rpm_get(vdev); if (ret < 0) @@ -984,7 +1082,9 @@ ivpu_job_done_callback(struct ivpu_device *vdev, struct ivpu_ipc_hdr *ipc_hdr, payload = (struct 
vpu_ipc_msg_payload_job_done *)&jsm_msg->payload; mutex_lock(&vdev->submitted_jobs_lock); - ivpu_job_signal_and_destroy(vdev, payload->job_id, payload->job_status); + if (!ivpu_job_handle_engine_error(vdev, payload->job_id, payload->job_status)) + /* No engine error, complete the job normally */ + ivpu_job_signal_and_destroy(vdev, payload->job_id, payload->job_status); mutex_unlock(&vdev->submitted_jobs_lock); } @@ -1012,7 +1112,7 @@ void ivpu_context_abort_work_fn(struct work_struct *work) if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW) if (ivpu_jsm_reset_engine(vdev, 0)) - return; + goto runtime_put; mutex_lock(&vdev->context_list_lock); xa_for_each(&vdev->context_xa, ctx_id, file_priv) { @@ -1036,7 +1136,7 @@ void ivpu_context_abort_work_fn(struct work_struct *work) goto runtime_put; if (ivpu_jsm_hws_resume_engine(vdev, 0)) - return; + goto runtime_put; /* * In hardware scheduling mode NPU already has stopped processing jobs * and won't send us any further notifications, thus we have to free job related resources @@ -1049,6 +1149,5 @@ void ivpu_context_abort_work_fn(struct work_struct *work) mutex_unlock(&vdev->submitted_jobs_lock); runtime_put: - pm_runtime_mark_last_busy(vdev->drm.dev); pm_runtime_put_autosuspend(vdev->drm.dev); } diff --git a/drivers/accel/ivpu/ivpu_job.h b/drivers/accel/ivpu/ivpu_job.h index 2e301c2eea7b..3ab61e6a5616 100644 --- a/drivers/accel/ivpu/ivpu_job.h +++ b/drivers/accel/ivpu/ivpu_job.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation */ #ifndef __IVPU_JOB_H__ @@ -15,12 +15,17 @@ struct ivpu_device; struct ivpu_file_priv; /** - * struct ivpu_cmdq - Object representing device queue used to send jobs. - * @jobq: Pointer to job queue memory shared with the device - * @mem: Memory allocated for the job queue, shared with device - * @entry_count Number of job entries in the queue - * @db_id: Doorbell assigned to this job queue - * @db_registered: True if doorbell is registered in device + * struct ivpu_cmdq - Represents a command queue for submitting jobs to the VPU. + * Tracks queue memory, preemption buffers, and metadata for job management. + * @jobq: Pointer to job queue memory shared with the device + * @primary_preempt_buf: Primary preemption buffer for this queue (optional) + * @secondary_preempt_buf: Secondary preemption buffer for this queue (optional) + * @mem: Memory allocated for the job queue, shared with device + * @entry_count: Number of job entries in the queue + * @id: Unique command queue ID + * @db_id: Doorbell ID assigned to this job queue + * @priority: Priority level of the command queue + * @is_legacy: True if this is a legacy command queue */ struct ivpu_cmdq { struct vpu_job_queue *jobq; @@ -35,16 +40,22 @@ struct ivpu_cmdq { }; /** - * struct ivpu_job - KMD object that represents batchbuffer / DMA buffer. - * Each batch / DMA buffer is a job to be submitted and executed by the VPU FW. - * This is a unit of execution, and be tracked by the job_id for - * any status reporting from VPU FW through IPC JOB RET/DONE message. 
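 *
 * Illustrative note on the ivpu_job_done_callback() change above (sketch only;
 * names other than the VPU_JSM_STATUS_* constants are hypothetical): job-done
 * notifications are now split by firmware status, and only statuses outside
 * the engine-reset range complete the job directly:
 *
 *	static bool status_needs_engine_reset(u32 status)
 *	{
 *		return status == VPU_JSM_STATUS_PROCESSING_ERR ||
 *		       (status >= VPU_JSM_STATUS_ENGINE_RESET_REQUIRED_MIN &&
 *			status <= VPU_JSM_STATUS_ENGINE_RESET_REQUIRED_MAX);
 *	}
 *
 * When such a status is seen, ivpu_job_handle_engine_error() marks the context
 * as faulty and defers cleanup to ivpu_context_abort_work_fn(); otherwise
 * ivpu_job_signal_and_destroy() signals the done fence with the reported
 * status.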
- * @file_priv: The client that submitted this job - * @job_id: Job ID for KMD tracking and job status reporting from VPU FW - * @status: Status of the Job from IPC JOB RET/DONE message - * @batch_buffer: CPU vaddr points to the batch buffer memory allocated for the job - * @submit_status_offset: Offset within batch buffer where job completion handler - will update the job status + * struct ivpu_job - Representing a batch or DMA buffer submitted to the VPU. + * Each job is a unit of execution, tracked by job_id for status reporting from VPU FW. + * The structure holds all resources and metadata needed for job submission, execution, + * and completion handling. + * @vdev: Pointer to the VPU device + * @file_priv: The client context that submitted this job + * @done_fence: Fence signaled when job completes + * @cmd_buf_vpu_addr: VPU address of the command buffer for this job + * @cmdq_id: Command queue ID used for submission + * @job_id: Unique job ID for tracking and status reporting + * @engine_idx: Engine index for job execution + * @job_status: Status reported by firmware for this job + * @primary_preempt_buf: Primary preemption buffer for job + * @secondary_preempt_buf: Secondary preemption buffer for job (optional) + * @bo_count: Number of buffer objects associated with this job + * @bos: Array of buffer objects used by the job (batch buffer is at index 0) */ struct ivpu_job { struct ivpu_device *vdev; @@ -54,6 +65,9 @@ struct ivpu_job { u32 cmdq_id; u32 job_id; u32 engine_idx; + u32 job_status; + struct ivpu_bo *primary_preempt_buf; + struct ivpu_bo *secondary_preempt_buf; size_t bo_count; struct ivpu_bo *bos[] __counted_by(bo_count); }; @@ -71,6 +85,7 @@ void ivpu_cmdq_abort_all_jobs(struct ivpu_device *vdev, u32 ctx_id, u32 cmdq_id) void ivpu_job_done_consumer_init(struct ivpu_device *vdev); void ivpu_job_done_consumer_fini(struct ivpu_device *vdev); +bool ivpu_job_handle_engine_error(struct ivpu_device *vdev, u32 job_id, u32 job_status); void ivpu_context_abort_work_fn(struct work_struct *work); void ivpu_jobs_abort_all(struct ivpu_device *vdev); diff --git a/drivers/accel/ivpu/ivpu_mmu.c b/drivers/accel/ivpu/ivpu_mmu.c index 5ea010568faa..e1baf6b64935 100644 --- a/drivers/accel/ivpu/ivpu_mmu.c +++ b/drivers/accel/ivpu/ivpu_mmu.c @@ -970,7 +970,7 @@ void ivpu_mmu_irq_evtq_handler(struct ivpu_device *vdev) } } - queue_work(system_wq, &vdev->context_abort_work); + queue_work(system_percpu_wq, &vdev->context_abort_work); } void ivpu_mmu_evtq_dump(struct ivpu_device *vdev) diff --git a/drivers/accel/ivpu/ivpu_mmu_context.c b/drivers/accel/ivpu/ivpu_mmu_context.c index f0267efa55aa..87ad593ef47d 100644 --- a/drivers/accel/ivpu/ivpu_mmu_context.c +++ b/drivers/accel/ivpu/ivpu_mmu_context.c @@ -430,7 +430,7 @@ static void ivpu_mmu_context_unmap_pages(struct ivpu_mmu_context *ctx, u64 vpu_a int ivpu_mmu_context_map_sgt(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, - u64 vpu_addr, struct sg_table *sgt, bool llc_coherent) + u64 vpu_addr, struct sg_table *sgt, bool llc_coherent, bool read_only) { size_t start_vpu_addr = vpu_addr; struct scatterlist *sg; @@ -450,6 +450,8 @@ ivpu_mmu_context_map_sgt(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, prot = IVPU_MMU_ENTRY_MAPPED; if (llc_coherent) prot |= IVPU_MMU_ENTRY_FLAG_LLC_COHERENT; + if (read_only) + prot |= IVPU_MMU_ENTRY_FLAG_RO; mutex_lock(&ctx->lock); @@ -527,7 +529,8 @@ ivpu_mmu_context_unmap_sgt(struct ivpu_device *vdev, struct ivpu_mmu_context *ct ret = ivpu_mmu_invalidate_tlb(vdev, ctx->id); if (ret) - 
ivpu_warn(vdev, "Failed to invalidate TLB for ctx %u: %d\n", ctx->id, ret); + ivpu_warn_ratelimited(vdev, "Failed to invalidate TLB for ctx %u: %d\n", + ctx->id, ret); } int @@ -568,7 +571,7 @@ void ivpu_mmu_context_init(struct ivpu_device *vdev, struct ivpu_mmu_context *ct mutex_init(&ctx->lock); if (!context_id) { - start = vdev->hw->ranges.global.start; + start = vdev->hw->ranges.runtime.start; end = vdev->hw->ranges.shave.end; } else { start = min_t(u64, vdev->hw->ranges.user.start, vdev->hw->ranges.shave.start); diff --git a/drivers/accel/ivpu/ivpu_mmu_context.h b/drivers/accel/ivpu/ivpu_mmu_context.h index f255310968cf..663a11a9db11 100644 --- a/drivers/accel/ivpu/ivpu_mmu_context.h +++ b/drivers/accel/ivpu/ivpu_mmu_context.h @@ -42,7 +42,7 @@ int ivpu_mmu_context_insert_node(struct ivpu_mmu_context *ctx, const struct ivpu void ivpu_mmu_context_remove_node(struct ivpu_mmu_context *ctx, struct drm_mm_node *node); int ivpu_mmu_context_map_sgt(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, - u64 vpu_addr, struct sg_table *sgt, bool llc_coherent); + u64 vpu_addr, struct sg_table *sgt, bool llc_coherent, bool read_only); void ivpu_mmu_context_unmap_sgt(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, u64 vpu_addr, struct sg_table *sgt); int ivpu_mmu_context_set_pages_ro(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, diff --git a/drivers/accel/ivpu/ivpu_ms.c b/drivers/accel/ivpu/ivpu_ms.c index 2a043baf10ca..1d9c1cb17924 100644 --- a/drivers/accel/ivpu/ivpu_ms.c +++ b/drivers/accel/ivpu/ivpu_ms.c @@ -8,6 +8,7 @@ #include "ivpu_drv.h" #include "ivpu_gem.h" +#include "ivpu_hw.h" #include "ivpu_jsm_msg.h" #include "ivpu_ms.h" #include "ivpu_pm.h" @@ -37,8 +38,8 @@ int ivpu_ms_start_ioctl(struct drm_device *dev, void *data, struct drm_file *fil struct drm_ivpu_metric_streamer_start *args = data; struct ivpu_device *vdev = file_priv->vdev; struct ivpu_ms_instance *ms; - u64 single_buff_size; u32 sample_size; + u64 buf_size; int ret; if (!args->metric_group_mask || !args->read_period_samples || @@ -52,7 +53,8 @@ int ivpu_ms_start_ioctl(struct drm_device *dev, void *data, struct drm_file *fil mutex_lock(&file_priv->ms_lock); if (get_instance_by_mask(file_priv, args->metric_group_mask)) { - ivpu_err(vdev, "Instance already exists (mask %#llx)\n", args->metric_group_mask); + ivpu_dbg(vdev, IOCTL, "Instance already exists (mask %#llx)\n", + args->metric_group_mask); ret = -EALREADY; goto unlock; } @@ -69,12 +71,18 @@ int ivpu_ms_start_ioctl(struct drm_device *dev, void *data, struct drm_file *fil if (ret) goto err_free_ms; - single_buff_size = sample_size * - ((u64)args->read_period_samples * MS_READ_PERIOD_MULTIPLIER); - ms->bo = ivpu_bo_create_global(vdev, PAGE_ALIGN(single_buff_size * MS_NUM_BUFFERS), - DRM_IVPU_BO_CACHED | DRM_IVPU_BO_MAPPABLE); + buf_size = PAGE_ALIGN((u64)args->read_period_samples * sample_size * + MS_READ_PERIOD_MULTIPLIER * MS_NUM_BUFFERS); + if (buf_size > ivpu_hw_range_size(&vdev->hw->ranges.global)) { + ivpu_dbg(vdev, IOCTL, "Requested MS buffer size %llu exceeds range size %llu\n", + buf_size, ivpu_hw_range_size(&vdev->hw->ranges.global)); + ret = -EINVAL; + goto err_free_ms; + } + + ms->bo = ivpu_bo_create_global(vdev, buf_size, DRM_IVPU_BO_CACHED | DRM_IVPU_BO_MAPPABLE); if (!ms->bo) { - ivpu_err(vdev, "Failed to allocate MS buffer (size %llu)\n", single_buff_size); + ivpu_dbg(vdev, IOCTL, "Failed to allocate MS buffer (size %llu)\n", buf_size); ret = -ENOMEM; goto err_free_ms; } @@ -175,7 +183,8 @@ int ivpu_ms_get_data_ioctl(struct drm_device 
*dev, void *data, struct drm_file * ms = get_instance_by_mask(file_priv, args->metric_group_mask); if (!ms) { - ivpu_err(vdev, "Instance doesn't exist for mask: %#llx\n", args->metric_group_mask); + ivpu_dbg(vdev, IOCTL, "Instance doesn't exist for mask: %#llx\n", + args->metric_group_mask); ret = -EINVAL; goto unlock; } diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index 475ddc94f1cf..480c075d87f6 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -54,7 +54,7 @@ static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev) static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev) { struct ivpu_fw_info *fw = vdev->fw; - struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem); + struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem_bp); if (!bp->save_restore_ret_address) { ivpu_pm_prepare_cold_boot(vdev); @@ -186,7 +186,7 @@ void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason) if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) { ivpu_hw_diagnose_failure(vdev); ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */ - queue_work(system_unbound_wq, &vdev->pm->recovery_work); + queue_work(system_dfl_wq, &vdev->pm->recovery_work); } } @@ -226,7 +226,8 @@ void ivpu_start_job_timeout_detection(struct ivpu_device *vdev) unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr; /* No-op if already queued */ - queue_delayed_work(system_wq, &vdev->pm->job_timeout_work, msecs_to_jiffies(timeout_ms)); + queue_delayed_work(system_percpu_wq, &vdev->pm->job_timeout_work, + msecs_to_jiffies(timeout_ms)); } void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev) @@ -359,7 +360,6 @@ int ivpu_rpm_get(struct ivpu_device *vdev) void ivpu_rpm_put(struct ivpu_device *vdev) { - pm_runtime_mark_last_busy(vdev->drm.dev); pm_runtime_put_autosuspend(vdev->drm.dev); } @@ -428,7 +428,6 @@ void ivpu_pm_enable(struct ivpu_device *vdev) struct device *dev = vdev->drm.dev; pm_runtime_allow(dev); - pm_runtime_mark_last_busy(dev); pm_runtime_put_autosuspend(dev); } @@ -502,6 +501,11 @@ void ivpu_pm_irq_dct_work_fn(struct work_struct *work) else ret = ivpu_pm_dct_disable(vdev); - if (!ret) - ivpu_hw_btrs_dct_set_status(vdev, enable, vdev->pm->dct_active_percent); + if (!ret) { + /* Convert percent to U1.7 format */ + u8 val = DIV_ROUND_CLOSEST(vdev->pm->dct_active_percent * 128, 100); + + ivpu_hw_btrs_dct_set_status(vdev, enable, val); + } + } diff --git a/drivers/accel/ivpu/ivpu_sysfs.c b/drivers/accel/ivpu/ivpu_sysfs.c index 268ab7744a8b..d250a10caca9 100644 --- a/drivers/accel/ivpu/ivpu_sysfs.c +++ b/drivers/accel/ivpu/ivpu_sysfs.c @@ -63,7 +63,8 @@ npu_memory_utilization_show(struct device *dev, struct device_attribute *attr, c mutex_lock(&vdev->bo_list_lock); list_for_each_entry(bo, &vdev->bo_list, bo_list_node) - total_npu_memory += bo->base.base.size; + if (ivpu_bo_is_resident(bo)) + total_npu_memory += ivpu_bo_size(bo); mutex_unlock(&vdev->bo_list_lock); return sysfs_emit(buf, "%lld\n", total_npu_memory); diff --git a/drivers/accel/ivpu/vpu_jsm_api.h b/drivers/accel/ivpu/vpu_jsm_api.h index 4b6b2b3d2583..bca6a44dc041 100644 --- a/drivers/accel/ivpu/vpu_jsm_api.h +++ b/drivers/accel/ivpu/vpu_jsm_api.h @@ -1,15 +1,16 @@ /* SPDX-License-Identifier: MIT */ /* - * Copyright (c) 2020-2024, Intel Corporation. + * Copyright (c) 2020-2025, Intel Corporation. 
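 *
 * Illustrative note on the DCT change in ivpu_pm_irq_dct_work_fn() above (not
 * part of this header): ivpu_hw_btrs_dct_set_status() now takes the active
 * duty cycle as a U1.7 fixed-point value, so the percentage is scaled by
 * 128/100 before being passed down, e.g.:
 *
 *	u8 val = DIV_ROUND_CLOSEST(percent * 128, 100);
 *
 * which maps 25% -> 32 (0x20), 50% -> 64 (0x40) and 100% -> 128 (0x80).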
+ */ + +/** + * @addtogroup Jsm + * @{ */ /** * @file * @brief JSM shared definitions - * - * @ingroup Jsm - * @brief JSM shared definitions - * @{ */ #ifndef VPU_JSM_API_H #define VPU_JSM_API_H @@ -22,7 +23,7 @@ /* * Minor version changes when API backward compatibility is preserved. */ -#define VPU_JSM_API_VER_MINOR 29 +#define VPU_JSM_API_VER_MINOR 33 /* * API header changed (field names, documentation, formatting) but API itself has not been changed @@ -71,9 +72,15 @@ #define VPU_JSM_STATUS_MVNCI_OUT_OF_RESOURCES 0xAU #define VPU_JSM_STATUS_MVNCI_NOT_IMPLEMENTED 0xBU #define VPU_JSM_STATUS_MVNCI_INTERNAL_ERROR 0xCU -/* Job status returned when the job was preempted mid-inference */ +/* @deprecated (use VPU_JSM_STATUS_PREEMPTED_MID_COMMAND instead) */ #define VPU_JSM_STATUS_PREEMPTED_MID_INFERENCE 0xDU +/* Job status returned when the job was preempted mid-command */ +#define VPU_JSM_STATUS_PREEMPTED_MID_COMMAND 0xDU +/* Range of status codes that require engine reset */ +#define VPU_JSM_STATUS_ENGINE_RESET_REQUIRED_MIN 0xEU #define VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW 0xEU +#define VPU_JSM_STATUS_MVNCI_PREEMPTION_TIMED_OUT 0xFU +#define VPU_JSM_STATUS_ENGINE_RESET_REQUIRED_MAX 0x1FU /* * Host <-> VPU IPC channels. @@ -134,11 +141,21 @@ enum { * 2. Native fence queues are only supported on VPU 40xx onwards. */ VPU_JOB_QUEUE_FLAGS_USE_NATIVE_FENCE_MASK = (1 << 1U), - /* * Enable turbo mode for testing NPU performance; not recommended for regular usage. */ - VPU_JOB_QUEUE_FLAGS_TURBO_MODE = (1 << 2U) + VPU_JOB_QUEUE_FLAGS_TURBO_MODE = (1 << 2U), + /* + * Queue error detection mode flag + * For 'interactive' queues (this bit not set), the FW will identify queues that have not + * completed a job inside the TDR timeout as in error as part of engine reset sequence. + * For 'non-interactive' queues (this bit set), the FW will identify queues that have not + * progressed the heartbeat inside the non-interactive no-progress timeout as in error as + * part of engine reset sequence. Additionally, there is an upper limit applied to these + * queues: even if they progress the heartbeat, if they run longer than non-interactive + * timeout, then the FW will also identify them as in error. + */ + VPU_JOB_QUEUE_FLAGS_NON_INTERACTIVE = (1 << 3U) }; /* @@ -209,7 +226,7 @@ enum { */ #define VPU_INLINE_CMD_TYPE_FENCE_SIGNAL 0x2 -/* +/** * Job scheduling priority bands for both hardware scheduling and OS scheduling. */ enum vpu_job_scheduling_priority_band { @@ -220,16 +237,16 @@ enum vpu_job_scheduling_priority_band { VPU_JOB_SCHEDULING_PRIORITY_BAND_COUNT = 4, }; -/* +/** * Job format. * Jobs defines the actual workloads to be executed by a given engine. */ struct vpu_job_queue_entry { - /**< Address of VPU commands batch buffer */ + /** Address of VPU commands batch buffer */ u64 batch_buf_addr; - /**< Job ID */ + /** Job ID */ u32 job_id; - /**< Flags bit field, see VPU_JOB_FLAGS_* above */ + /** Flags bit field, see VPU_JOB_FLAGS_* above */ u32 flags; /** * Doorbell ring timestamp taken by KMD from SoC's global system clock, in @@ -237,20 +254,20 @@ struct vpu_job_queue_entry { * to match other profiling timestamps. 
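 *
 * Illustrative note (sketch only): with the preempt_buffer_index UAPI handled
 * in ivpu_job.c above, ivpu_cmdq_push_job() now fills the preemption fields of
 * this entry from the per-job buffers instead of the per-queue ones, roughly:
 *
 *	entry->primary_preempt_buf_addr = job->primary_preempt_buf->vpu_addr;
 *	entry->primary_preempt_buf_size = ivpu_bo_size(job->primary_preempt_buf);
 *
 * with the secondary buffer handled the same way when present.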
*/ u64 doorbell_timestamp; - /**< Extra id for job tracking, used only in the firmware perf traces */ + /** Extra id for job tracking, used only in the firmware perf traces */ u64 host_tracking_id; - /**< Address of the primary preemption buffer to use for this job */ + /** Address of the primary preemption buffer to use for this job */ u64 primary_preempt_buf_addr; - /**< Size of the primary preemption buffer to use for this job */ + /** Size of the primary preemption buffer to use for this job */ u32 primary_preempt_buf_size; - /**< Size of secondary preemption buffer to use for this job */ + /** Size of secondary preemption buffer to use for this job */ u32 secondary_preempt_buf_size; - /**< Address of secondary preemption buffer to use for this job */ + /** Address of secondary preemption buffer to use for this job */ u64 secondary_preempt_buf_addr; u64 reserved_0; }; -/* +/** * Inline command format. * Inline commands are the commands executed at scheduler level (typically, * synchronization directives). Inline command and job objects must be of @@ -258,34 +275,36 @@ struct vpu_job_queue_entry { */ struct vpu_inline_cmd { u64 reserved_0; - /* Inline command type, see VPU_INLINE_CMD_TYPE_* defines. */ + /** Inline command type, see VPU_INLINE_CMD_TYPE_* defines. */ u32 type; - /* Flags bit field, see VPU_JOB_FLAGS_* above. */ + /** Flags bit field, see VPU_JOB_FLAGS_* above. */ u32 flags; - /* Inline command payload. Depends on inline command type. */ - union { - /* Fence (wait and signal) commands' payload. */ - struct { - /* Fence object handle. */ + /** Inline command payload. Depends on inline command type. */ + union payload { + /** Fence (wait and signal) commands' payload. */ + struct fence { + /** Fence object handle. */ u64 fence_handle; - /* User VA of the current fence value. */ + /** User VA of the current fence value. */ u64 current_value_va; - /* User VA of the monitored fence value (read-only). */ + /** User VA of the monitored fence value (read-only). */ u64 monitored_value_va; - /* Value to wait for or write in fence location. */ + /** Value to wait for or write in fence location. */ u64 value; - /* User VA of the log buffer in which to add log entry on completion. */ + /** User VA of the log buffer in which to add log entry on completion. */ u64 log_buffer_va; - /* NPU private data. */ + /** NPU private data. */ u64 npu_private_data; } fence; - /* Other commands do not have a payload. */ - /* Payload definition for future inline commands can be inserted here. */ + /** + * Other commands do not have a payload: + * Payload definition for future inline commands can be inserted here. + */ u64 reserved_1[6]; } payload; }; -/* +/** * Job queue slots can be populated either with job objects or inline command objects. */ union vpu_jobq_slot { @@ -293,7 +312,7 @@ union vpu_jobq_slot { struct vpu_inline_cmd inline_cmd; }; -/* +/** * Job queue control registers. */ struct vpu_job_queue_header { @@ -301,18 +320,18 @@ struct vpu_job_queue_header { u32 head; u32 tail; u32 flags; - /* Set to 1 to indicate priority_band field is valid */ + /** Set to 1 to indicate priority_band field is valid */ u32 priority_band_valid; - /* + /** * Priority for the work of this job queue, valid only if the HWS is NOT used - * and the `priority_band_valid` is set to 1. It is applied only during - * the VPU_JSM_MSG_REGISTER_DB message processing. - * The device firmware might use the `priority_band` to optimize the power + * and the @ref priority_band_valid is set to 1. 
It is applied only during + * the @ref VPU_JSM_MSG_REGISTER_DB message processing. + * The device firmware might use the priority_band to optimize the power * management logic, but it will not affect the order of jobs. * Available priority bands: @see enum vpu_job_scheduling_priority_band */ u32 priority_band; - /* Inside realtime band assigns a further priority, limited to 0..31 range */ + /** Inside realtime band assigns a further priority, limited to 0..31 range */ u32 realtime_priority_level; u32 reserved_0[9]; }; @@ -337,16 +356,16 @@ enum vpu_trace_entity_type { VPU_TRACE_ENTITY_TYPE_HW_COMPONENT = 2, }; -/* +/** * HWS specific log buffer header details. * Total size is 32 bytes. */ struct vpu_hws_log_buffer_header { - /* Written by VPU after adding a log entry. Initialised by host to 0. */ + /** Written by VPU after adding a log entry. Initialised by host to 0. */ u32 first_free_entry_index; - /* Incremented by VPU every time the VPU writes the 0th entry; initialised by host to 0. */ + /** Incremented by VPU every time the VPU writes the 0th entry; initialised by host to 0. */ u32 wraparound_count; - /* + /** * This is the number of buffers that can be stored in the log buffer provided by the host. * It is written by host before passing buffer to VPU. VPU should consider it read-only. */ @@ -354,14 +373,14 @@ struct vpu_hws_log_buffer_header { u64 reserved[2]; }; -/* +/** * HWS specific log buffer entry details. * Total size is 32 bytes. */ struct vpu_hws_log_buffer_entry { - /* VPU timestamp must be an invariant timer tick (not impacted by DVFS) */ + /** VPU timestamp must be an invariant timer tick (not impacted by DVFS) */ u64 vpu_timestamp; - /* + /** * Operation type: * 0 - context state change * 1 - queue new work @@ -371,7 +390,7 @@ struct vpu_hws_log_buffer_entry { */ u32 operation_type; u32 reserved; - /* Operation data depends on operation type */ + /** Operation data depends on operation type */ u64 operation_data[2]; }; @@ -381,51 +400,54 @@ enum vpu_hws_native_fence_log_type { VPU_HWS_NATIVE_FENCE_LOG_TYPE_SIGNALS = 2 }; -/* HWS native fence log buffer header. */ +/** HWS native fence log buffer header. */ struct vpu_hws_native_fence_log_header { union { struct { - /* Index of the first free entry in buffer. */ + /** Index of the first free entry in buffer. */ u32 first_free_entry_idx; - /* Incremented each time NPU wraps around the buffer to write next entry. */ + /** + * Incremented whenever the NPU wraps around the buffer and writes + * to the first entry again. + */ u32 wraparound_count; }; - /* Field allowing atomic update of both fields above. */ + /** Field allowing atomic update of both fields above. */ u64 atomic_wraparound_and_entry_idx; }; - /* Log buffer type, see enum vpu_hws_native_fence_log_type. */ + /** Log buffer type, see enum vpu_hws_native_fence_log_type. */ u64 type; - /* Allocated number of entries in the log buffer. */ + /** Allocated number of entries in the log buffer. */ u64 entry_nb; u64 reserved[2]; }; -/* Native fence log operation types. */ +/** Native fence log operation types. */ enum vpu_hws_native_fence_log_op { VPU_HWS_NATIVE_FENCE_LOG_OP_SIGNAL_EXECUTED = 0, VPU_HWS_NATIVE_FENCE_LOG_OP_WAIT_UNBLOCKED = 1 }; -/* HWS native fence log entry. */ +/** HWS native fence log entry. */ struct vpu_hws_native_fence_log_entry { - /* Newly signaled/unblocked fence value. */ + /** Newly signaled/unblocked fence value. */ u64 fence_value; - /* Native fence object handle to which this operation belongs. 
*/ + /** Native fence object handle to which this operation belongs. */ u64 fence_handle; - /* Operation type, see enum vpu_hws_native_fence_log_op. */ + /** Operation type, see enum vpu_hws_native_fence_log_op. */ u64 op_type; u64 reserved_0; - /* + /** * VPU_HWS_NATIVE_FENCE_LOG_OP_WAIT_UNBLOCKED only: Timestamp at which fence * wait was started (in NPU SysTime). */ u64 fence_wait_start_ts; u64 reserved_1; - /* Timestamp at which fence operation was completed (in NPU SysTime). */ + /** Timestamp at which fence operation was completed (in NPU SysTime). */ u64 fence_end_ts; }; -/* Native fence log buffer. */ +/** Native fence log buffer. */ struct vpu_hws_native_fence_log_buffer { struct vpu_hws_native_fence_log_header header; struct vpu_hws_native_fence_log_entry entry[]; @@ -435,10 +457,17 @@ struct vpu_hws_native_fence_log_buffer { * Host <-> VPU IPC messages types. */ enum vpu_ipc_msg_type { + /** Unsupported command */ VPU_JSM_MSG_UNKNOWN = 0xFFFFFFFF, - /* IPC Host -> Device, Async commands */ + /** IPC Host -> Device, base id for async commands */ VPU_JSM_MSG_ASYNC_CMD = 0x1100, + /** + * Reset engine. The NPU cancels all the jobs currently executing on the target + * engine making the engine become idle and then does a HW reset, before returning + * to the host. + * @see struct vpu_ipc_msg_payload_engine_reset + */ VPU_JSM_MSG_ENGINE_RESET = VPU_JSM_MSG_ASYNC_CMD, /** * Preempt engine. The NPU stops (preempts) all the jobs currently @@ -448,10 +477,24 @@ enum vpu_ipc_msg_type { * the target engine, but it stops processing them (until the queue doorbell * is rung again); the host is responsible to reset the job queue, either * after preemption or when resubmitting jobs to the queue. + * @see vpu_ipc_msg_payload_engine_preempt */ VPU_JSM_MSG_ENGINE_PREEMPT = 0x1101, + /** + * OS scheduling doorbell register command + * @see vpu_ipc_msg_payload_register_db + */ VPU_JSM_MSG_REGISTER_DB = 0x1102, + /** + * OS scheduling doorbell unregister command + * @see vpu_ipc_msg_payload_unregister_db + */ VPU_JSM_MSG_UNREGISTER_DB = 0x1103, + /** + * Query engine heartbeat. Heartbeat is expected to increase monotonically + * and increase while work is being progressed by NPU. + * @see vpu_ipc_msg_payload_query_engine_hb + */ VPU_JSM_MSG_QUERY_ENGINE_HB = 0x1104, VPU_JSM_MSG_GET_POWER_LEVEL_COUNT = 0x1105, VPU_JSM_MSG_GET_POWER_LEVEL = 0x1106, @@ -477,6 +520,7 @@ enum vpu_ipc_msg_type { * aborted and removed from internal scheduling queues. All doorbells assigned * to the host_ssid are unregistered and any internal FW resources belonging to * the host_ssid are released. 
+ * @see vpu_ipc_msg_payload_ssid_release */ VPU_JSM_MSG_SSID_RELEASE = 0x110e, /** @@ -504,43 +548,78 @@ enum vpu_ipc_msg_type { * @see vpu_jsm_metric_streamer_start */ VPU_JSM_MSG_METRIC_STREAMER_INFO = 0x1112, - /** Control command: Priority band setup */ + /** + * Control command: Priority band setup + * @see vpu_ipc_msg_payload_hws_priority_band_setup + */ VPU_JSM_MSG_SET_PRIORITY_BAND_SETUP = 0x1113, - /** Control command: Create command queue */ + /** + * Control command: Create command queue + * @see vpu_ipc_msg_payload_hws_create_cmdq + */ VPU_JSM_MSG_CREATE_CMD_QUEUE = 0x1114, - /** Control command: Destroy command queue */ + /** + * Control command: Destroy command queue + * @see vpu_ipc_msg_payload_hws_destroy_cmdq + */ VPU_JSM_MSG_DESTROY_CMD_QUEUE = 0x1115, - /** Control command: Set context scheduling properties */ + /** + * Control command: Set context scheduling properties + * @see vpu_ipc_msg_payload_hws_set_context_sched_properties + */ VPU_JSM_MSG_SET_CONTEXT_SCHED_PROPERTIES = 0x1116, - /* + /** * Register a doorbell to notify VPU of new work. The doorbell may later be * deallocated or reassigned to another context. + * @see vpu_jsm_hws_register_db */ VPU_JSM_MSG_HWS_REGISTER_DB = 0x1117, - /** Control command: Log buffer setting */ + /** + * Control command: Log buffer setting + * @see vpu_ipc_msg_payload_hws_set_scheduling_log + */ VPU_JSM_MSG_HWS_SET_SCHEDULING_LOG = 0x1118, - /* Control command: Suspend command queue. */ + /** + * Control command: Suspend command queue. + * @see vpu_ipc_msg_payload_hws_suspend_cmdq + */ VPU_JSM_MSG_HWS_SUSPEND_CMDQ = 0x1119, - /* Control command: Resume command queue */ + /** + * Control command: Resume command queue + * @see vpu_ipc_msg_payload_hws_resume_cmdq + */ VPU_JSM_MSG_HWS_RESUME_CMDQ = 0x111a, - /* Control command: Resume engine after reset */ + /** + * Control command: Resume engine after reset + * @see vpu_ipc_msg_payload_hws_resume_engine + */ VPU_JSM_MSG_HWS_ENGINE_RESUME = 0x111b, - /* Control command: Enable survivability/DCT mode */ + /** + * Control command: Enable survivability/DCT mode + * @see vpu_ipc_msg_payload_pwr_dct_control + */ VPU_JSM_MSG_DCT_ENABLE = 0x111c, - /* Control command: Disable survivability/DCT mode */ + /** + * Control command: Disable survivability/DCT mode + * This command has no payload + */ VPU_JSM_MSG_DCT_DISABLE = 0x111d, /** * Dump VPU state. To be used for debug purposes only. - * NOTE: Please introduce new ASYNC commands before this one. * + * This command has no payload. + * NOTE: Please introduce new ASYNC commands before this one. */ VPU_JSM_MSG_STATE_DUMP = 0x11FF, - /* IPC Host -> Device, General commands */ + /** IPC Host -> Device, base id for general commands */ VPU_JSM_MSG_GENERAL_CMD = 0x1200, + /** Unsupported command */ VPU_JSM_MSG_BLOB_DEINIT_DEPRECATED = VPU_JSM_MSG_GENERAL_CMD, /** * Control dyndbg behavior by executing a dyndbg command; equivalent to - * Linux command: `echo '<dyndbg_cmd>' > <debugfs>/dynamic_debug/control`. 
+ * Linux command: + * @verbatim echo '<dyndbg_cmd>' > <debugfs>/dynamic_debug/control @endverbatim + * @see vpu_ipc_msg_payload_dyndbg_control */ VPU_JSM_MSG_DYNDBG_CONTROL = 0x1201, /** @@ -548,17 +627,35 @@ enum vpu_ipc_msg_type { */ VPU_JSM_MSG_PWR_D0I3_ENTER = 0x1202, - /* IPC Device -> Host, Job completion */ + /** + * IPC Device -> Host, Job completion + * @see struct vpu_ipc_msg_payload_job_done + */ VPU_JSM_MSG_JOB_DONE = 0x2100, - /* IPC Device -> Host, Fence signalled */ + /** + * IPC Device -> Host, Fence signalled + * @see vpu_ipc_msg_payload_native_fence_signalled + */ VPU_JSM_MSG_NATIVE_FENCE_SIGNALLED = 0x2101, /* IPC Device -> Host, Async command completion */ VPU_JSM_MSG_ASYNC_CMD_DONE = 0x2200, + /** + * IPC Device -> Host, engine reset complete + * @see vpu_ipc_msg_payload_engine_reset_done + */ VPU_JSM_MSG_ENGINE_RESET_DONE = VPU_JSM_MSG_ASYNC_CMD_DONE, + /** + * Preempt complete message + * @see vpu_ipc_msg_payload_engine_preempt_done + */ VPU_JSM_MSG_ENGINE_PREEMPT_DONE = 0x2201, VPU_JSM_MSG_REGISTER_DB_DONE = 0x2202, VPU_JSM_MSG_UNREGISTER_DB_DONE = 0x2203, + /** + * Response to query engine heartbeat. + * @see vpu_ipc_msg_payload_query_engine_hb_done + */ VPU_JSM_MSG_QUERY_ENGINE_HB_DONE = 0x2204, VPU_JSM_MSG_GET_POWER_LEVEL_COUNT_DONE = 0x2205, VPU_JSM_MSG_GET_POWER_LEVEL_DONE = 0x2206, @@ -575,7 +672,10 @@ enum vpu_ipc_msg_type { VPU_JSM_MSG_TRACE_GET_CAPABILITY_RSP = 0x220c, /** Response to VPU_JSM_MSG_TRACE_GET_NAME. */ VPU_JSM_MSG_TRACE_GET_NAME_RSP = 0x220d, - /** Response to VPU_JSM_MSG_SSID_RELEASE. */ + /** + * Response to VPU_JSM_MSG_SSID_RELEASE. + * @see vpu_ipc_msg_payload_ssid_release + */ VPU_JSM_MSG_SSID_RELEASE_DONE = 0x220e, /** * Response to VPU_JSM_MSG_METRIC_STREAMER_START. @@ -605,37 +705,71 @@ enum vpu_ipc_msg_type { /** * Asynchronous event sent from the VPU to the host either when the current * metric buffer is full or when the VPU has collected a multiple of - * @notify_sample_count samples as indicated through the start command - * (VPU_JSM_MSG_METRIC_STREAMER_START). Returns information about collected - * metric data. + * @ref vpu_jsm_metric_streamer_start::notify_sample_count samples as indicated + * through the start command (VPU_JSM_MSG_METRIC_STREAMER_START). Returns + * information about collected metric data. 
* @see vpu_jsm_metric_streamer_done */ VPU_JSM_MSG_METRIC_STREAMER_NOTIFICATION = 0x2213, - /** Response to control command: Priority band setup */ + /** + * Response to control command: Priority band setup + * @see vpu_ipc_msg_payload_hws_priority_band_setup + */ VPU_JSM_MSG_SET_PRIORITY_BAND_SETUP_RSP = 0x2214, - /** Response to control command: Create command queue */ + /** + * Response to control command: Create command queue + * @see vpu_ipc_msg_payload_hws_create_cmdq_rsp + */ VPU_JSM_MSG_CREATE_CMD_QUEUE_RSP = 0x2215, - /** Response to control command: Destroy command queue */ + /** + * Response to control command: Destroy command queue + * @see vpu_ipc_msg_payload_hws_destroy_cmdq + */ VPU_JSM_MSG_DESTROY_CMD_QUEUE_RSP = 0x2216, - /** Response to control command: Set context scheduling properties */ + /** + * Response to control command: Set context scheduling properties + * @see vpu_ipc_msg_payload_hws_set_context_sched_properties + */ VPU_JSM_MSG_SET_CONTEXT_SCHED_PROPERTIES_RSP = 0x2217, - /** Response to control command: Log buffer setting */ + /** + * Response to control command: Log buffer setting + * @see vpu_ipc_msg_payload_hws_set_scheduling_log + */ VPU_JSM_MSG_HWS_SET_SCHEDULING_LOG_RSP = 0x2218, - /* IPC Device -> Host, HWS notify index entry of log buffer written */ + /** + * IPC Device -> Host, HWS notify index entry of log buffer written + * @see vpu_ipc_msg_payload_hws_scheduling_log_notification + */ VPU_JSM_MSG_HWS_SCHEDULING_LOG_NOTIFICATION = 0x2219, - /* IPC Device -> Host, HWS completion of a context suspend request */ + /** + * IPC Device -> Host, HWS completion of a context suspend request + * @see vpu_ipc_msg_payload_hws_suspend_cmdq + */ VPU_JSM_MSG_HWS_SUSPEND_CMDQ_DONE = 0x221a, - /* Response to control command: Resume command queue */ + /** + * Response to control command: Resume command queue + * @see vpu_ipc_msg_payload_hws_resume_cmdq + */ VPU_JSM_MSG_HWS_RESUME_CMDQ_RSP = 0x221b, - /* Response to control command: Resume engine command response */ + /** + * Response to control command: Resume engine command response + * @see vpu_ipc_msg_payload_hws_resume_engine + */ VPU_JSM_MSG_HWS_RESUME_ENGINE_DONE = 0x221c, - /* Response to control command: Enable survivability/DCT mode */ + /** + * Response to control command: Enable survivability/DCT mode + * This command has no payload + */ VPU_JSM_MSG_DCT_ENABLE_DONE = 0x221d, - /* Response to control command: Disable survivability/DCT mode */ + /** + * Response to control command: Disable survivability/DCT mode + * This command has no payload + */ VPU_JSM_MSG_DCT_DISABLE_DONE = 0x221e, /** * Response to state dump control command. - * NOTE: Please introduce new ASYNC responses before this one. * + * This command has no payload. + * NOTE: Please introduce new ASYNC responses before this one. */ VPU_JSM_MSG_STATE_DUMP_RSP = 0x22FF, @@ -653,57 +787,66 @@ enum vpu_ipc_msg_type { enum vpu_ipc_msg_status { VPU_JSM_MSG_FREE, VPU_JSM_MSG_ALLOCATED }; -/* - * Host <-> LRT IPC message payload definitions +/** + * Engine reset request payload + * @see VPU_JSM_MSG_ENGINE_RESET */ struct vpu_ipc_msg_payload_engine_reset { - /* Engine to be reset. */ + /** Engine to be reset. */ u32 engine_idx; - /* Reserved */ + /** Reserved */ u32 reserved_0; }; +/** + * Engine preemption request struct + * @see VPU_JSM_MSG_ENGINE_PREEMPT + */ struct vpu_ipc_msg_payload_engine_preempt { - /* Engine to be preempted. */ + /** Engine to be preempted. */ u32 engine_idx; - /* ID of the preemption request. 
*/ + /** ID of the preemption request. */ u32 preempt_id; }; -/* - * @brief Register doorbell command structure. +/** + * Register doorbell command structure. * This structure supports doorbell registration for only OS scheduling. * @see VPU_JSM_MSG_REGISTER_DB */ struct vpu_ipc_msg_payload_register_db { - /* Index of the doorbell to register. */ + /** Index of the doorbell to register. */ u32 db_idx; - /* Reserved */ + /** Reserved */ u32 reserved_0; - /* Virtual address in Global GTT pointing to the start of job queue. */ + /** Virtual address in Global GTT pointing to the start of job queue. */ u64 jobq_base; - /* Size of the job queue in bytes. */ + /** Size of the job queue in bytes. */ u32 jobq_size; - /* Host sub-stream ID for the context assigned to the doorbell. */ + /** Host sub-stream ID for the context assigned to the doorbell. */ u32 host_ssid; }; /** - * @brief Unregister doorbell command structure. + * Unregister doorbell command structure. * Request structure to unregister a doorbell for both HW and OS scheduling. * @see VPU_JSM_MSG_UNREGISTER_DB */ struct vpu_ipc_msg_payload_unregister_db { - /* Index of the doorbell to unregister. */ + /** Index of the doorbell to unregister. */ u32 db_idx; - /* Reserved */ + /** Reserved */ u32 reserved_0; }; +/** + * Heartbeat request structure + * @see VPU_JSM_MSG_QUERY_ENGINE_HB + */ struct vpu_ipc_msg_payload_query_engine_hb { - /* Engine to return heartbeat value. */ + /** Engine to return heartbeat value. */ u32 engine_idx; - /* Reserved */ + /** Reserved */ u32 reserved_0; }; @@ -723,10 +866,14 @@ struct vpu_ipc_msg_payload_power_level { u32 reserved_0; }; +/** + * Structure for requesting ssid release + * @see VPU_JSM_MSG_SSID_RELEASE + */ struct vpu_ipc_msg_payload_ssid_release { - /* Host sub-stream ID for the context to be released. */ + /** Host sub-stream ID for the context to be released. */ u32 host_ssid; - /* Reserved */ + /** Reserved */ u32 reserved_0; }; @@ -752,7 +899,7 @@ struct vpu_jsm_metric_streamer_start { u64 sampling_rate; /** * If > 0 the VPU will send a VPU_JSM_MSG_METRIC_STREAMER_NOTIFICATION message - * after every @notify_sample_count samples is collected or dropped by the VPU. + * after every @ref notify_sample_count samples is collected or dropped by the VPU. * If set to UINT_MAX the VPU will only generate a notification when the metric * buffer is full. If set to 0 the VPU will never generate a notification. */ @@ -762,9 +909,9 @@ struct vpu_jsm_metric_streamer_start { * Address and size of the buffer where the VPU will write metric data. The * VPU writes all counters from enabled metric groups one after another. If * there is no space left to write data at the next sample period the VPU - * will switch to the next buffer (@see next_buffer_addr) and will optionally - * send a notification to the host driver if @notify_sample_count is non-zero. - * If @next_buffer_addr is NULL the VPU will stop collecting metric data. + * will switch to the next buffer (@ref next_buffer_addr) and will optionally + * send a notification to the host driver if @ref notify_sample_count is non-zero. + * If @ref next_buffer_addr is NULL the VPU will stop collecting metric data. */ u64 buffer_addr; u64 buffer_size; @@ -827,63 +974,80 @@ struct vpu_jsm_metric_streamer_update { u64 next_buffer_size; }; +/** + * Device -> host job completion message. + * @see VPU_JSM_MSG_JOB_DONE + */ struct vpu_ipc_msg_payload_job_done { - /* Engine to which the job was submitted. */ + /** Engine to which the job was submitted. 
*/ u32 engine_idx; - /* Index of the doorbell to which the job was submitted */ + /** Index of the doorbell to which the job was submitted */ u32 db_idx; - /* ID of the completed job */ + /** ID of the completed job */ u32 job_id; - /* Status of the completed job */ + /** Status of the completed job */ u32 job_status; - /* Host SSID */ + /** Host SSID */ u32 host_ssid; - /* Zero Padding */ + /** Zero Padding */ u32 reserved_0; - /* Command queue id */ + /** Command queue id */ u64 cmdq_id; }; -/* +/** * Notification message upon native fence signalling. * @see VPU_JSM_MSG_NATIVE_FENCE_SIGNALLED */ struct vpu_ipc_msg_payload_native_fence_signalled { - /* Engine ID. */ + /** Engine ID. */ u32 engine_idx; - /* Host SSID. */ + /** Host SSID. */ u32 host_ssid; - /* CMDQ ID */ + /** CMDQ ID */ u64 cmdq_id; - /* Fence object handle. */ + /** Fence object handle. */ u64 fence_handle; }; +/** + * vpu_ipc_msg_payload_engine_reset_done will contain an array of this structure + * which contains which queues caused reset if FW was able to detect any error. + * @see vpu_ipc_msg_payload_engine_reset_done + */ struct vpu_jsm_engine_reset_context { - /* Host SSID */ + /** Host SSID */ u32 host_ssid; - /* Zero Padding */ + /** Zero Padding */ u32 reserved_0; - /* Command queue id */ + /** Command queue id */ u64 cmdq_id; - /* See VPU_ENGINE_RESET_CONTEXT_* defines */ + /** See VPU_ENGINE_RESET_CONTEXT_* defines */ u64 flags; }; +/** + * Engine reset response. + * @see VPU_JSM_MSG_ENGINE_RESET_DONE + */ struct vpu_ipc_msg_payload_engine_reset_done { - /* Engine ordinal */ + /** Engine ordinal */ u32 engine_idx; - /* Number of impacted contexts */ + /** Number of impacted contexts */ u32 num_impacted_contexts; - /* Array of impacted command queue ids and their flags */ + /** Array of impacted command queue ids and their flags */ struct vpu_jsm_engine_reset_context impacted_contexts[VPU_MAX_ENGINE_RESET_IMPACTED_CONTEXTS]; }; +/** + * Preemption response struct + * @see VPU_JSM_MSG_ENGINE_PREEMPT_DONE + */ struct vpu_ipc_msg_payload_engine_preempt_done { - /* Engine preempted. */ + /** Engine preempted. */ u32 engine_idx; - /* ID of the preemption request. */ + /** ID of the preemption request. */ u32 preempt_id; }; @@ -912,12 +1076,16 @@ struct vpu_ipc_msg_payload_unregister_db_done { u32 reserved_0; }; +/** + * Structure for heartbeat response + * @see VPU_JSM_MSG_QUERY_ENGINE_HB_DONE + */ struct vpu_ipc_msg_payload_query_engine_hb_done { - /* Engine returning heartbeat value. */ + /** Engine returning heartbeat value. */ u32 engine_idx; - /* Reserved */ + /** Reserved */ u32 reserved_0; - /* Heartbeat value. */ + /** Heartbeat value. */ u64 heartbeat; }; @@ -937,7 +1105,10 @@ struct vpu_ipc_msg_payload_get_power_level_count_done { u8 power_limit[16]; }; -/* HWS priority band setup request / response */ +/** + * HWS priority band setup request / response + * @see VPU_JSM_MSG_SET_PRIORITY_BAND_SETUP + */ struct vpu_ipc_msg_payload_hws_priority_band_setup { /* * Grace period in 100ns units when preempting another priority band for @@ -964,15 +1135,23 @@ struct vpu_ipc_msg_payload_hws_priority_band_setup { * TDR timeout value in milliseconds. Default value of 0 meaning no timeout. */ u32 tdr_timeout; + /* Non-interactive queue timeout for no progress of heartbeat in milliseconds. + * Default value of 0 meaning no timeout. + */ + u32 non_interactive_no_progress_timeout; + /* + * Non-interactive queue upper limit timeout value in milliseconds. Default + * value of 0 meaning no timeout. 
+ */ + u32 non_interactive_timeout; }; -/* +/** * @brief HWS create command queue request. * Host will create a command queue via this command. * Note: Cmdq group is a handle of an object which * may contain one or more command queues. * @see VPU_JSM_MSG_CREATE_CMD_QUEUE - * @see VPU_JSM_MSG_CREATE_CMD_QUEUE_RSP */ struct vpu_ipc_msg_payload_hws_create_cmdq { /* Process id */ @@ -993,66 +1172,73 @@ struct vpu_ipc_msg_payload_hws_create_cmdq { u32 reserved_0; }; -/* - * @brief HWS create command queue response. - * @see VPU_JSM_MSG_CREATE_CMD_QUEUE +/** + * HWS create command queue response. * @see VPU_JSM_MSG_CREATE_CMD_QUEUE_RSP */ struct vpu_ipc_msg_payload_hws_create_cmdq_rsp { - /* Process id */ + /** Process id */ u64 process_id; - /* Host SSID */ + /** Host SSID */ u32 host_ssid; - /* Engine for which queue is being created */ + /** Engine for which queue is being created */ u32 engine_idx; - /* Command queue group */ + /** Command queue group */ u64 cmdq_group; - /* Command queue id */ + /** Command queue id */ u64 cmdq_id; }; -/* HWS destroy command queue request / response */ +/** + * HWS destroy command queue request / response + * @see VPU_JSM_MSG_DESTROY_CMD_QUEUE + * @see VPU_JSM_MSG_DESTROY_CMD_QUEUE_RSP + */ struct vpu_ipc_msg_payload_hws_destroy_cmdq { - /* Host SSID */ + /** Host SSID */ u32 host_ssid; - /* Zero Padding */ + /** Zero Padding */ u32 reserved; - /* Command queue id */ + /** Command queue id */ u64 cmdq_id; }; -/* HWS set context scheduling properties request / response */ +/** + * HWS set context scheduling properties request / response + * @see VPU_JSM_MSG_SET_CONTEXT_SCHED_PROPERTIES + * @see VPU_JSM_MSG_SET_CONTEXT_SCHED_PROPERTIES_RSP + */ struct vpu_ipc_msg_payload_hws_set_context_sched_properties { - /* Host SSID */ + /** Host SSID */ u32 host_ssid; - /* Zero Padding */ + /** Zero Padding */ u32 reserved_0; - /* Command queue id */ + /** Command queue id */ u64 cmdq_id; - /* + /** * Priority band to assign to work of this context. * Available priority bands: @see enum vpu_job_scheduling_priority_band */ u32 priority_band; - /* Inside realtime band assigns a further priority */ + /** Inside realtime band assigns a further priority */ u32 realtime_priority_level; - /* Priority relative to other contexts in the same process */ + /** Priority relative to other contexts in the same process */ s32 in_process_priority; - /* Zero padding / Reserved */ + /** Zero padding / Reserved */ u32 reserved_1; - /* + /** * Context quantum relative to other contexts of same priority in the same process * Minimum value supported by NPU is 1ms (10000 in 100ns units). */ u64 context_quantum; - /* Grace period when preempting context of the same priority within the same process */ + /** Grace period when preempting context of the same priority within the same process */ u64 grace_period_same_priority; - /* Grace period when preempting context of a lower priority within the same process */ + /** Grace period when preempting context of a lower priority within the same process */ u64 grace_period_lower_priority; }; -/* - * @brief Register doorbell command structure. +/** + * Register doorbell command structure. * This structure supports doorbell registration for both HW and OS scheduling. * Note: Queue base and size are added here so that the same structure can be used for * OS scheduling and HW scheduling. 
For OS scheduling, cmdq_id will be ignored @@ -1061,27 +1247,27 @@ struct vpu_ipc_msg_payload_hws_set_context_sched_properties { * @see VPU_JSM_MSG_HWS_REGISTER_DB */ struct vpu_jsm_hws_register_db { - /* Index of the doorbell to register. */ + /** Index of the doorbell to register. */ u32 db_id; - /* Host sub-stream ID for the context assigned to the doorbell. */ + /** Host sub-stream ID for the context assigned to the doorbell. */ u32 host_ssid; - /* ID of the command queue associated with the doorbell. */ + /** ID of the command queue associated with the doorbell. */ u64 cmdq_id; - /* Virtual address pointing to the start of command queue. */ + /** Virtual address pointing to the start of command queue. */ u64 cmdq_base; - /* Size of the command queue in bytes. */ + /** Size of the command queue in bytes. */ u64 cmdq_size; }; -/* - * @brief Structure to set another buffer to be used for scheduling-related logging. +/** + * Structure to set another buffer to be used for scheduling-related logging. * The size of the logging buffer and the number of entries is defined as part of the * buffer itself as described next. * The log buffer received from the host is made up of; - * - header: 32 bytes in size, as shown in 'struct vpu_hws_log_buffer_header'. + * - header: 32 bytes in size, as shown in @ref vpu_hws_log_buffer_header. * The header contains the number of log entries in the buffer. * - log entry: 0 to n-1, each log entry is 32 bytes in size, as shown in - * 'struct vpu_hws_log_buffer_entry'. + * @ref vpu_hws_log_buffer_entry. * The entry contains the VPU timestamp, operation type and data. * The host should provide the notify index value of log buffer to VPU. This is a * value defined within the log buffer and when written to will generate the @@ -1095,30 +1281,30 @@ struct vpu_jsm_hws_register_db { * @see VPU_JSM_MSG_HWS_SCHEDULING_LOG_NOTIFICATION */ struct vpu_ipc_msg_payload_hws_set_scheduling_log { - /* Engine ordinal */ + /** Engine ordinal */ u32 engine_idx; - /* Host SSID */ + /** Host SSID */ u32 host_ssid; - /* + /** * VPU log buffer virtual address. * Set to 0 to disable logging for this engine. */ u64 vpu_log_buffer_va; - /* + /** * Notify index of log buffer. VPU_JSM_MSG_HWS_SCHEDULING_LOG_NOTIFICATION * is generated when an event log is written to this index. */ u64 notify_index; - /* + /** * Field is now deprecated, will be removed when KMD is updated to support removal */ u32 enable_extra_events; - /* Zero Padding */ + /** Zero Padding */ u32 reserved_0; }; -/* - * @brief The scheduling log notification is generated by VPU when it writes +/** + * The scheduling log notification is generated by VPU when it writes * an event into the log buffer at the notify_index. VPU notifies host with * VPU_JSM_MSG_HWS_SCHEDULING_LOG_NOTIFICATION. This is an asynchronous * message from VPU to host. @@ -1126,14 +1312,14 @@ struct vpu_ipc_msg_payload_hws_set_scheduling_log { * @see VPU_JSM_MSG_HWS_SET_SCHEDULING_LOG */ struct vpu_ipc_msg_payload_hws_scheduling_log_notification { - /* Engine ordinal */ + /** Engine ordinal */ u32 engine_idx; - /* Zero Padding */ + /** Zero Padding */ u32 reserved_0; }; -/* - * @brief HWS suspend command queue request and done structure. +/** + * HWS suspend command queue request and done structure. 
* Host will request the suspend of contexts and VPU will; * - Suspend all work on this context * - Preempt any running work @@ -1152,21 +1338,21 @@ struct vpu_ipc_msg_payload_hws_scheduling_log_notification { * @see VPU_JSM_MSG_HWS_SUSPEND_CMDQ_DONE */ struct vpu_ipc_msg_payload_hws_suspend_cmdq { - /* Host SSID */ + /** Host SSID */ u32 host_ssid; - /* Zero Padding */ + /** Zero Padding */ u32 reserved_0; - /* Command queue id */ + /** Command queue id */ u64 cmdq_id; - /* + /** * Suspend fence value - reported by the VPU suspend context * completed once suspend is complete. */ u64 suspend_fence_value; }; -/* - * @brief HWS Resume command queue request / response structure. +/** + * HWS Resume command queue request / response structure. * Host will request the resume of a context; * - VPU will resume all work on this context * - Scheduler will allow this context to be scheduled @@ -1174,25 +1360,25 @@ struct vpu_ipc_msg_payload_hws_suspend_cmdq { * @see VPU_JSM_MSG_HWS_RESUME_CMDQ_RSP */ struct vpu_ipc_msg_payload_hws_resume_cmdq { - /* Host SSID */ + /** Host SSID */ u32 host_ssid; - /* Zero Padding */ + /** Zero Padding */ u32 reserved_0; - /* Command queue id */ + /** Command queue id */ u64 cmdq_id; }; -/* - * @brief HWS Resume engine request / response structure. - * After a HWS engine reset, all scheduling is stopped on VPU until a engine resume. +/** + * HWS Resume engine request / response structure. + * After a HWS engine reset, all scheduling is stopped on VPU until an engine resume. * Host shall send this command to resume scheduling of any valid queue. - * @see VPU_JSM_MSG_HWS_RESUME_ENGINE + * @see VPU_JSM_MSG_HWS_ENGINE_RESUME * @see VPU_JSM_MSG_HWS_RESUME_ENGINE_DONE */ struct vpu_ipc_msg_payload_hws_resume_engine { - /* Engine to be resumed */ + /** Engine to be resumed */ u32 engine_idx; - /* Reserved */ + /** Reserved */ u32 reserved_0; }; @@ -1326,7 +1512,7 @@ struct vpu_jsm_metric_streamer_done { /** * Metric group description placed in the metric buffer after successful completion * of the VPU_JSM_MSG_METRIC_STREAMER_INFO command. This is followed by one or more - * @vpu_jsm_metric_counter_descriptor records. + * @ref vpu_jsm_metric_counter_descriptor records. * @see VPU_JSM_MSG_METRIC_STREAMER_INFO */ struct vpu_jsm_metric_group_descriptor { @@ -1413,29 +1599,24 @@ struct vpu_jsm_metric_counter_descriptor { }; /** - * Payload for VPU_JSM_MSG_DYNDBG_CONTROL requests. + * Payload for @ref VPU_JSM_MSG_DYNDBG_CONTROL requests. * - * VPU_JSM_MSG_DYNDBG_CONTROL are used to control the VPU FW Dynamic Debug - * feature, which allows developers to selectively enable / disable MVLOG_DEBUG - * messages. This is equivalent to the Dynamic Debug functionality provided by - * Linux - * (https://www.kernel.org/doc/html/latest/admin-guide/dynamic-debug-howto.html) - * The host can control Dynamic Debug behavior by sending dyndbg commands, which - * have the same syntax as Linux - * dyndbg commands. + * VPU_JSM_MSG_DYNDBG_CONTROL requests are used to control the VPU FW dynamic debug + * feature, which allows developers to selectively enable/disable code to obtain + * additional FW information. This is equivalent to the dynamic debug functionality + * provided by Linux. The host can control dynamic debug behavior by sending dyndbg + * commands, using the same syntax as for Linux dynamic debug commands. * - * NOTE: in order for MVLOG_DEBUG messages to be actually printed, the host - * still has to set the logging level to MVLOG_DEBUG, using the - * VPU_JSM_MSG_TRACE_SET_CONFIG command. 
+ * @see https://www.kernel.org/doc/html/latest/admin-guide/dynamic-debug-howto.html. * - * The host can see the current dynamic debug configuration by executing a - * special 'show' command. The dyndbg configuration will be printed to the - * configured logging destination using MVLOG_INFO logging level. + * NOTE: + * As the dynamic debug feature uses MVLOG messages to provide information, the host + * must first set the logging level to MVLOG_DEBUG, using the @ref VPU_JSM_MSG_TRACE_SET_CONFIG + * command. */ struct vpu_ipc_msg_payload_dyndbg_control { /** - * Dyndbg command (same format as Linux dyndbg); must be a NULL-terminated - * string. + * Dyndbg command to be executed. */ char dyndbg_cmd[VPU_DYNDBG_CMD_MAX_LEN]; }; @@ -1456,7 +1637,7 @@ struct vpu_ipc_msg_payload_pwr_d0i3_enter { }; /** - * Payload for VPU_JSM_MSG_DCT_ENABLE message. + * Payload for @ref VPU_JSM_MSG_DCT_ENABLE message. * * Default values for DCT active/inactive times are 5.3ms and 30ms respectively, * corresponding to a 85% duty cycle. This payload allows the host to tune these @@ -1513,28 +1694,28 @@ union vpu_ipc_msg_payload { struct vpu_ipc_msg_payload_pwr_dct_control pwr_dct_control; }; -/* - * Host <-> LRT IPC message base structure. +/** + * Host <-> NPU IPC message base structure. * * NOTE: All instances of this object must be aligned on a 64B boundary * to allow proper handling of VPU cache operations. */ struct vpu_jsm_msg { - /* Reserved */ + /** Reserved */ u64 reserved_0; - /* Message type, see vpu_ipc_msg_type enum. */ + /** Message type, see @ref vpu_ipc_msg_type. */ u32 type; - /* Buffer status, see vpu_ipc_msg_status enum. */ + /** Buffer status, see @ref vpu_ipc_msg_status. */ u32 status; - /* + /** * Request ID, provided by the host in a request message and passed * back by VPU in the response message. */ u32 request_id; - /* Request return code set by the VPU, see VPU_JSM_STATUS_* defines. */ + /** Request return code set by the VPU, see VPU_JSM_STATUS_* defines. */ u32 result; u64 reserved_1; - /* Message payload depending on message type, see vpu_ipc_msg_payload union. */ + /** Message payload depending on message type, see vpu_ipc_msg_payload union. */ union vpu_ipc_msg_payload payload; }; diff --git a/drivers/accel/qaic/Kconfig b/drivers/accel/qaic/Kconfig index 5e405a19c157..116e42d152ca 100644 --- a/drivers/accel/qaic/Kconfig +++ b/drivers/accel/qaic/Kconfig @@ -9,6 +9,7 @@ config DRM_ACCEL_QAIC depends on PCI && HAS_IOMEM depends on MHI_BUS select CRC32 + select WANT_DEV_COREDUMP help Enables driver for Qualcomm's Cloud AI accelerator PCIe cards that are designed to accelerate Deep Learning inference workloads. 
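The "select WANT_DEV_COREDUMP" added above is there so the new qaic SSR path (qaic_ssr.c, introduced later in this series) can hand subsystem crashdumps to the devcoredump framework. Below is a minimal sketch of that generic pattern, not the driver's actual SSR code: the helper name and the way the snapshot is produced are assumptions; only dev_coredumpv() and its ownership rules are real kernel API. The framework takes ownership of a vmalloc'd buffer, exposes it under /sys/class/devcoredump, and vfree()s it once user space reads or discards the dump.

#include <linux/device.h>
#include <linux/devcoredump.h>
#include <linux/string.h>
#include <linux/vmalloc.h>

/* Illustrative only: hand a copy of a crash snapshot to devcoredump. */
static void example_report_crashdump(struct device *dev,
				     const void *snapshot, size_t len)
{
	void *dump;

	/* devcoredump requires a vmalloc'd buffer; it frees it with vfree() */
	dump = vmalloc(len);
	if (!dump)
		return;

	memcpy(dump, snapshot, len);

	/* Ownership of 'dump' passes to the devcoredump core here */
	dev_coredumpv(dev, dump, len, GFP_KERNEL);
}

Selecting WANT_DEV_COREDUMP rather than depending on DEV_COREDUMP directly keeps the feature user-disableable; when CONFIG_DEV_COREDUMP is not set, dev_coredumpv() degrades to an inline stub that simply frees the buffer, so callers like the sketch above need no #ifdef.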
diff --git a/drivers/accel/qaic/Makefile b/drivers/accel/qaic/Makefile index 1106b876f737..71f727b74da3 100644 --- a/drivers/accel/qaic/Makefile +++ b/drivers/accel/qaic/Makefile @@ -11,6 +11,8 @@ qaic-y := \ qaic_data.o \ qaic_drv.o \ qaic_ras.o \ + qaic_ssr.o \ + qaic_sysfs.o \ qaic_timesync.o \ sahara.o diff --git a/drivers/accel/qaic/qaic.h b/drivers/accel/qaic/qaic.h index 820d133236dd..fa7a8155658c 100644 --- a/drivers/accel/qaic/qaic.h +++ b/drivers/accel/qaic/qaic.h @@ -21,6 +21,7 @@ #define QAIC_DBC_BASE SZ_128K #define QAIC_DBC_SIZE SZ_4K +#define QAIC_SSR_DBC_SENTINEL U32_MAX /* No ongoing SSR sentinel */ #define QAIC_NO_PARTITION -1 @@ -47,6 +48,22 @@ enum __packed dev_states { QAIC_ONLINE, }; +enum dbc_states { + /* DBC is free and can be activated */ + DBC_STATE_IDLE, + /* DBC is activated and a workload is running on device */ + DBC_STATE_ASSIGNED, + /* Sub-system associated with this workload has crashed and it will shutdown soon */ + DBC_STATE_BEFORE_SHUTDOWN, + /* Sub-system associated with this workload has crashed and it has shutdown */ + DBC_STATE_AFTER_SHUTDOWN, + /* Sub-system associated with this workload is shutdown and it will be powered up soon */ + DBC_STATE_BEFORE_POWER_UP, + /* Sub-system associated with this workload is now powered up */ + DBC_STATE_AFTER_POWER_UP, + DBC_STATE_MAX, +}; + extern bool datapath_polling; struct qaic_user { @@ -114,6 +131,8 @@ struct dma_bridge_chan { unsigned int irq; /* Polling work item to simulate interrupts */ struct work_struct poll_work; + /* Represents various states of this DBC from enum dbc_states */ + unsigned int state; }; struct qaic_device { @@ -161,6 +180,8 @@ struct qaic_device { struct mhi_device *qts_ch; /* Work queue for tasks related to MHI "QAIC_TIMESYNC" channel */ struct workqueue_struct *qts_wq; + /* MHI "QAIC_TIMESYNC_PERIODIC" channel device */ + struct mhi_device *mqts_ch; /* Head of list of page allocated by MHI bootlog device */ struct list_head bootlog; /* MHI bootlog channel device */ @@ -177,6 +198,14 @@ struct qaic_device { unsigned int ue_count; /* Un-correctable non-fatal error count */ unsigned int ue_nf_count; + /* MHI SSR channel device */ + struct mhi_device *ssr_ch; + /* Work queue for tasks related to MHI SSR device */ + struct workqueue_struct *ssr_wq; + /* Buffer to collect SSR crashdump via SSR MHI channel */ + void *ssr_mhi_buf; + /* DBC which is under SSR. 
Sentinel U32_MAX would mean that no SSR in progress */ + u32 ssr_dbc; }; struct qaic_drm_device { @@ -195,6 +224,8 @@ struct qaic_drm_device { struct list_head users; /* Synchronizes access to users list */ struct mutex users_mutex; + /* Pointer to array of DBC sysfs attributes */ + void *sysfs_attrs; }; struct qaic_bo { @@ -317,6 +348,13 @@ int qaic_partial_execute_bo_ioctl(struct drm_device *dev, void *data, struct drm int qaic_wait_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int qaic_perf_stats_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int qaic_detach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); -void irq_polling_work(struct work_struct *work); +void qaic_irq_polling_work(struct work_struct *work); +void qaic_dbc_enter_ssr(struct qaic_device *qdev, u32 dbc_id); +void qaic_dbc_exit_ssr(struct qaic_device *qdev); + +/* qaic_sysfs.c */ +int qaic_sysfs_init(struct qaic_drm_device *qddev); +void qaic_sysfs_remove(struct qaic_drm_device *qddev); +void set_dbc_state(struct qaic_device *qdev, u32 dbc_id, unsigned int state); #endif /* _QAIC_H_ */ diff --git a/drivers/accel/qaic/qaic_control.c b/drivers/accel/qaic/qaic_control.c index b86a8e48e731..428d8f65bff3 100644 --- a/drivers/accel/qaic/qaic_control.c +++ b/drivers/accel/qaic/qaic_control.c @@ -17,6 +17,7 @@ #include <linux/overflow.h> #include <linux/pci.h> #include <linux/scatterlist.h> +#include <linux/sched/signal.h> #include <linux/types.h> #include <linux/uaccess.h> #include <linux/workqueue.h> @@ -30,7 +31,7 @@ #define MANAGE_MAGIC_NUMBER ((__force __le32)0x43494151) /* "QAIC" in little endian */ #define QAIC_DBC_Q_GAP SZ_256 #define QAIC_DBC_Q_BUF_ALIGN SZ_4K -#define QAIC_MANAGE_EXT_MSG_LENGTH SZ_64K /* Max DMA message length */ +#define QAIC_MANAGE_WIRE_MSG_LENGTH SZ_64K /* Max DMA message length */ #define QAIC_WRAPPER_MAX_SIZE SZ_4K #define QAIC_MHI_RETRY_WAIT_MS 100 #define QAIC_MHI_RETRY_MAX 20 @@ -309,6 +310,7 @@ static void save_dbc_buf(struct qaic_device *qdev, struct ioctl_resources *resou enable_dbc(qdev, dbc_id, usr); qdev->dbc[dbc_id].in_use = true; resources->buf = NULL; + set_dbc_state(qdev, dbc_id, DBC_STATE_ASSIGNED); } } @@ -367,7 +369,7 @@ static int encode_passthrough(struct qaic_device *qdev, void *trans, struct wrap if (in_trans->hdr.len % 8 != 0) return -EINVAL; - if (size_add(msg_hdr_len, in_trans->hdr.len) > QAIC_MANAGE_EXT_MSG_LENGTH) + if (size_add(msg_hdr_len, in_trans->hdr.len) > QAIC_MANAGE_WIRE_MSG_LENGTH) return -ENOSPC; trans_wrapper = add_wrapper(wrappers, @@ -495,7 +497,7 @@ static int encode_addr_size_pairs(struct dma_xfer *xfer, struct wrapper_list *wr nents = sgt->nents; nents_dma = nents; - *size = QAIC_MANAGE_EXT_MSG_LENGTH - msg_hdr_len - sizeof(**out_trans); + *size = QAIC_MANAGE_WIRE_MSG_LENGTH - msg_hdr_len - sizeof(**out_trans); for_each_sgtable_dma_sg(sgt, sg, i) { *size -= sizeof(*asp); /* Save 1K for possible follow-up transactions. */ @@ -576,7 +578,7 @@ static int encode_dma(struct qaic_device *qdev, void *trans, struct wrapper_list /* There should be enough space to hold at least one ASP entry. 
*/ if (size_add(msg_hdr_len, sizeof(*out_trans) + sizeof(struct wire_addr_size_pair)) > - QAIC_MANAGE_EXT_MSG_LENGTH) + QAIC_MANAGE_WIRE_MSG_LENGTH) return -ENOMEM; xfer = kmalloc(sizeof(*xfer), GFP_KERNEL); @@ -645,7 +647,7 @@ static int encode_activate(struct qaic_device *qdev, void *trans, struct wrapper msg = &wrapper->msg; msg_hdr_len = le32_to_cpu(msg->hdr.len); - if (size_add(msg_hdr_len, sizeof(*out_trans)) > QAIC_MANAGE_MAX_MSG_LENGTH) + if (size_add(msg_hdr_len, sizeof(*out_trans)) > QAIC_MANAGE_WIRE_MSG_LENGTH) return -ENOSPC; if (!in_trans->queue_size) @@ -655,8 +657,9 @@ static int encode_activate(struct qaic_device *qdev, void *trans, struct wrapper return -EINVAL; nelem = in_trans->queue_size; - size = (get_dbc_req_elem_size() + get_dbc_rsp_elem_size()) * nelem; - if (size / nelem != get_dbc_req_elem_size() + get_dbc_rsp_elem_size()) + if (check_mul_overflow((u32)(get_dbc_req_elem_size() + get_dbc_rsp_elem_size()), + nelem, + &size)) return -EINVAL; if (size + QAIC_DBC_Q_GAP + QAIC_DBC_Q_BUF_ALIGN < size) @@ -729,7 +732,7 @@ static int encode_status(struct qaic_device *qdev, void *trans, struct wrapper_l msg = &wrapper->msg; msg_hdr_len = le32_to_cpu(msg->hdr.len); - if (size_add(msg_hdr_len, in_trans->hdr.len) > QAIC_MANAGE_MAX_MSG_LENGTH) + if (size_add(msg_hdr_len, in_trans->hdr.len) > QAIC_MANAGE_WIRE_MSG_LENGTH) return -ENOSPC; trans_wrapper = add_wrapper(wrappers, sizeof(*trans_wrapper)); @@ -810,7 +813,7 @@ static int encode_message(struct qaic_device *qdev, struct manage_msg *user_msg, } if (ret) - break; + goto out; } if (user_len != user_msg->len) @@ -921,6 +924,7 @@ static int decode_deactivate(struct qaic_device *qdev, void *trans, u32 *msg_len } release_dbc(qdev, dbc_id); + set_dbc_state(qdev, dbc_id, DBC_STATE_IDLE); *msg_len += sizeof(*in_trans); return 0; @@ -1052,7 +1056,7 @@ static void *msg_xfer(struct qaic_device *qdev, struct wrapper_list *wrappers, u init_completion(&elem.xfer_done); if (likely(!qdev->cntl_lost_buf)) { /* - * The max size of request to device is QAIC_MANAGE_EXT_MSG_LENGTH. + * The max size of request to device is QAIC_MANAGE_WIRE_MSG_LENGTH. * The max size of response from device is QAIC_MANAGE_MAX_MSG_LENGTH. */ out_buf = kmalloc(QAIC_MANAGE_MAX_MSG_LENGTH, GFP_KERNEL); @@ -1079,7 +1083,6 @@ static void *msg_xfer(struct qaic_device *qdev, struct wrapper_list *wrappers, u list_for_each_entry(w, &wrappers->list, list) { kref_get(&w->ref_count); - retry_count = 0; ret = mhi_queue_buf(qdev->cntl_ch, DMA_TO_DEVICE, &w->msg, w->len, list_is_last(&w->list, &wrappers->list) ? 
MHI_EOT : MHI_CHAIN); if (ret) { diff --git a/drivers/accel/qaic/qaic_data.c b/drivers/accel/qaic/qaic_data.c index c4f117edb266..60cb4d65d48e 100644 --- a/drivers/accel/qaic/qaic_data.c +++ b/drivers/accel/qaic/qaic_data.c @@ -18,6 +18,7 @@ #include <linux/scatterlist.h> #include <linux/spinlock.h> #include <linux/srcu.h> +#include <linux/string.h> #include <linux/types.h> #include <linux/uaccess.h> #include <linux/wait.h> @@ -165,7 +166,7 @@ static void free_slice(struct kref *kref) drm_gem_object_put(&slice->bo->base); sg_free_table(slice->sgt); kfree(slice->sgt); - kfree(slice->reqs); + kvfree(slice->reqs); kfree(slice); } @@ -404,7 +405,7 @@ static int qaic_map_one_slice(struct qaic_device *qdev, struct qaic_bo *bo, goto free_sgt; } - slice->reqs = kcalloc(sgt->nents, sizeof(*slice->reqs), GFP_KERNEL); + slice->reqs = kvcalloc(sgt->nents, sizeof(*slice->reqs), GFP_KERNEL); if (!slice->reqs) { ret = -ENOMEM; goto free_slice; @@ -430,7 +431,7 @@ static int qaic_map_one_slice(struct qaic_device *qdev, struct qaic_bo *bo, return 0; free_req: - kfree(slice->reqs); + kvfree(slice->reqs); free_slice: kfree(slice); free_sgt: @@ -643,8 +644,36 @@ static void qaic_free_object(struct drm_gem_object *obj) kfree(bo); } +static struct sg_table *qaic_get_sg_table(struct drm_gem_object *obj) +{ + struct qaic_bo *bo = to_qaic_bo(obj); + struct scatterlist *sg, *sg_in; + struct sg_table *sgt, *sgt_in; + int i; + + sgt_in = bo->sgt; + + sgt = kmalloc(sizeof(*sgt), GFP_KERNEL); + if (!sgt) + return ERR_PTR(-ENOMEM); + + if (sg_alloc_table(sgt, sgt_in->orig_nents, GFP_KERNEL)) { + kfree(sgt); + return ERR_PTR(-ENOMEM); + } + + sg = sgt->sgl; + for_each_sgtable_sg(sgt_in, sg_in, i) { + memcpy(sg, sg_in, sizeof(*sg)); + sg = sg_next(sg); + } + + return sgt; +} + static const struct drm_gem_object_funcs qaic_gem_funcs = { .free = qaic_free_object, + .get_sg_table = qaic_get_sg_table, .print_info = qaic_gem_print_info, .mmap = qaic_gem_object_mmap, .vm_ops = &drm_vm_ops, @@ -953,8 +982,9 @@ int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi if (args->hdr.count == 0) return -EINVAL; - arg_size = args->hdr.count * sizeof(*slice_ent); - if (arg_size / args->hdr.count != sizeof(*slice_ent)) + if (check_mul_overflow((unsigned long)args->hdr.count, + (unsigned long)sizeof(*slice_ent), + &arg_size)) return -EINVAL; if (!(args->hdr.dir == DMA_TO_DEVICE || args->hdr.dir == DMA_FROM_DEVICE)) @@ -984,18 +1014,12 @@ int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi user_data = u64_to_user_ptr(args->data); - slice_ent = kzalloc(arg_size, GFP_KERNEL); - if (!slice_ent) { - ret = -EINVAL; + slice_ent = memdup_user(user_data, arg_size); + if (IS_ERR(slice_ent)) { + ret = PTR_ERR(slice_ent); goto unlock_dev_srcu; } - ret = copy_from_user(slice_ent, user_data, arg_size); - if (ret) { - ret = -EFAULT; - goto free_slice_ent; - } - obj = drm_gem_object_lookup(file_priv, args->hdr.handle); if (!obj) { ret = -ENOENT; @@ -1023,6 +1047,11 @@ int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi goto unlock_ch_srcu; } + if (dbc->id == qdev->ssr_dbc) { + ret = -EPIPE; + goto unlock_ch_srcu; + } + ret = qaic_prepare_bo(qdev, bo, &args->hdr); if (ret) goto unlock_ch_srcu; @@ -1300,8 +1329,6 @@ static int __qaic_execute_bo_ioctl(struct drm_device *dev, void *data, struct dr int usr_rcu_id, qdev_rcu_id; struct qaic_device *qdev; struct qaic_user *usr; - u8 __user *user_data; - unsigned long n; u64 received_ts; u32 queue_level; u64 submit_ts; @@ -1314,20 
+1341,12 @@ static int __qaic_execute_bo_ioctl(struct drm_device *dev, void *data, struct dr received_ts = ktime_get_ns(); size = is_partial ? sizeof(struct qaic_partial_execute_entry) : sizeof(*exec); - n = (unsigned long)size * args->hdr.count; - if (args->hdr.count == 0 || n / args->hdr.count != size) + if (args->hdr.count == 0) return -EINVAL; - user_data = u64_to_user_ptr(args->data); - - exec = kcalloc(args->hdr.count, size, GFP_KERNEL); - if (!exec) - return -ENOMEM; - - if (copy_from_user(exec, user_data, n)) { - ret = -EFAULT; - goto free_exec; - } + exec = memdup_array_user(u64_to_user_ptr(args->data), args->hdr.count, size); + if (IS_ERR(exec)) + return PTR_ERR(exec); usr = file_priv->driver_priv; usr_rcu_id = srcu_read_lock(&usr->qddev_lock); @@ -1356,6 +1375,11 @@ static int __qaic_execute_bo_ioctl(struct drm_device *dev, void *data, struct dr goto release_ch_rcu; } + if (dbc->id == qdev->ssr_dbc) { + ret = -EPIPE; + goto release_ch_rcu; + } + ret = mutex_lock_interruptible(&dbc->req_lock); if (ret) goto release_ch_rcu; @@ -1396,7 +1420,6 @@ unlock_dev_srcu: srcu_read_unlock(&qdev->dev_lock, qdev_rcu_id); unlock_usr_srcu: srcu_read_unlock(&usr->qddev_lock, usr_rcu_id); -free_exec: kfree(exec); return ret; } @@ -1491,7 +1514,7 @@ irqreturn_t dbc_irq_handler(int irq, void *data) return IRQ_WAKE_THREAD; } -void irq_polling_work(struct work_struct *work) +void qaic_irq_polling_work(struct work_struct *work) { struct dma_bridge_chan *dbc = container_of(work, struct dma_bridge_chan, poll_work); unsigned long flags; @@ -1709,6 +1732,11 @@ int qaic_wait_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file goto unlock_ch_srcu; } + if (dbc->id == qdev->ssr_dbc) { + ret = -EPIPE; + goto unlock_ch_srcu; + } + obj = drm_gem_object_lookup(file_priv, args->handle); if (!obj) { ret = -ENOENT; @@ -1729,6 +1757,9 @@ int qaic_wait_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file if (!dbc->usr) ret = -EPERM; + if (dbc->id == qdev->ssr_dbc) + ret = -EPIPE; + put_obj: drm_gem_object_put(obj); unlock_ch_srcu: @@ -1749,7 +1780,8 @@ int qaic_perf_stats_bo_ioctl(struct drm_device *dev, void *data, struct drm_file struct qaic_device *qdev; struct qaic_user *usr; struct qaic_bo *bo; - int ret, i; + int ret = 0; + int i; usr = file_priv->driver_priv; usr_rcu_id = srcu_read_lock(&usr->qddev_lock); @@ -1770,18 +1802,12 @@ int qaic_perf_stats_bo_ioctl(struct drm_device *dev, void *data, struct drm_file goto unlock_dev_srcu; } - ent = kcalloc(args->hdr.count, sizeof(*ent), GFP_KERNEL); - if (!ent) { - ret = -EINVAL; + ent = memdup_array_user(u64_to_user_ptr(args->data), args->hdr.count, sizeof(*ent)); + if (IS_ERR(ent)) { + ret = PTR_ERR(ent); goto unlock_dev_srcu; } - ret = copy_from_user(ent, u64_to_user_ptr(args->data), args->hdr.count * sizeof(*ent)); - if (ret) { - ret = -EFAULT; - goto free_ent; - } - for (i = 0; i < args->hdr.count; i++) { obj = drm_gem_object_lookup(file_priv, ent[i].handle); if (!obj) { @@ -1789,6 +1815,16 @@ int qaic_perf_stats_bo_ioctl(struct drm_device *dev, void *data, struct drm_file goto free_ent; } bo = to_qaic_bo(obj); + if (!bo->sliced) { + drm_gem_object_put(obj); + ret = -EINVAL; + goto free_ent; + } + if (bo->dbc->id != args->hdr.dbc_id) { + drm_gem_object_put(obj); + ret = -EINVAL; + goto free_ent; + } /* * perf stats ioctl is called before wait ioctl is complete then * the latency information is invalid. 
@@ -1927,6 +1963,17 @@ static void empty_xfer_list(struct qaic_device *qdev, struct dma_bridge_chan *db spin_unlock_irqrestore(&dbc->xfer_lock, flags); } +static void sync_empty_xfer_list(struct qaic_device *qdev, struct dma_bridge_chan *dbc) +{ + empty_xfer_list(qdev, dbc); + synchronize_srcu(&dbc->ch_lock); + /* + * Threads holding channel lock, may add more elements in the xfer_list. + * Flush out these elements from xfer_list. + */ + empty_xfer_list(qdev, dbc); +} + int disable_dbc(struct qaic_device *qdev, u32 dbc_id, struct qaic_user *usr) { if (!qdev->dbc[dbc_id].usr || qdev->dbc[dbc_id].usr->handle != usr->handle) @@ -1941,7 +1988,7 @@ int disable_dbc(struct qaic_device *qdev, u32 dbc_id, struct qaic_user *usr) * enable_dbc - Enable the DBC. DBCs are disabled by removing the context of * user. Add user context back to DBC to enable it. This function trusts the * DBC ID passed and expects the DBC to be disabled. - * @qdev: Qranium device handle + * @qdev: qaic device handle * @dbc_id: ID of the DBC * @usr: User context */ @@ -1955,13 +2002,7 @@ void wakeup_dbc(struct qaic_device *qdev, u32 dbc_id) struct dma_bridge_chan *dbc = &qdev->dbc[dbc_id]; dbc->usr = NULL; - empty_xfer_list(qdev, dbc); - synchronize_srcu(&dbc->ch_lock); - /* - * Threads holding channel lock, may add more elements in the xfer_list. - * Flush out these elements from xfer_list. - */ - empty_xfer_list(qdev, dbc); + sync_empty_xfer_list(qdev, dbc); } void release_dbc(struct qaic_device *qdev, u32 dbc_id) @@ -2002,3 +2043,30 @@ void qaic_data_get_fifo_info(struct dma_bridge_chan *dbc, u32 *head, u32 *tail) *head = readl(dbc->dbc_base + REQHP_OFF); *tail = readl(dbc->dbc_base + REQTP_OFF); } + +/* + * qaic_dbc_enter_ssr - Prepare to enter in sub system reset(SSR) for given DBC ID. + * @qdev: qaic device handle + * @dbc_id: ID of the DBC which will enter SSR + * + * The device will automatically deactivate the workload as not + * all errors can be silently recovered. The user will be + * notified and will need to decide the required recovery + * action to take. + */ +void qaic_dbc_enter_ssr(struct qaic_device *qdev, u32 dbc_id) +{ + qdev->ssr_dbc = dbc_id; + release_dbc(qdev, dbc_id); +} + +/* + * qaic_dbc_exit_ssr - Prepare to exit from sub system reset(SSR) for given DBC ID. + * @qdev: qaic device handle + * + * The DBC returns to an operational state and begins accepting work after exiting SSR. 
+ */ +void qaic_dbc_exit_ssr(struct qaic_device *qdev) +{ + qdev->ssr_dbc = QAIC_SSR_DBC_SENTINEL; +} diff --git a/drivers/accel/qaic/qaic_drv.c b/drivers/accel/qaic/qaic_drv.c index e162f4b8a262..4c70bd949d53 100644 --- a/drivers/accel/qaic/qaic_drv.c +++ b/drivers/accel/qaic/qaic_drv.c @@ -30,6 +30,7 @@ #include "qaic.h" #include "qaic_debugfs.h" #include "qaic_ras.h" +#include "qaic_ssr.h" #include "qaic_timesync.h" #include "sahara.h" @@ -270,6 +271,13 @@ static int qaic_create_drm_device(struct qaic_device *qdev, s32 partition_id) return ret; } + ret = qaic_sysfs_init(qddev); + if (ret) { + drm_dev_unregister(drm); + pci_dbg(qdev->pdev, "qaic_sysfs_init failed %d\n", ret); + return ret; + } + qaic_debugfs_init(qddev); return ret; @@ -281,6 +289,7 @@ static void qaic_destroy_drm_device(struct qaic_device *qdev, s32 partition_id) struct drm_device *drm = to_drm(qddev); struct qaic_user *usr; + qaic_sysfs_remove(qddev); drm_dev_unregister(drm); qddev->partition_id = 0; /* @@ -382,6 +391,7 @@ void qaic_dev_reset_clean_local_state(struct qaic_device *qdev) qaic_notify_reset(qdev); /* start tearing things down */ + qaic_clean_up_ssr(qdev); for (i = 0; i < qdev->num_dbc; ++i) release_dbc(qdev, i); } @@ -431,11 +441,18 @@ static struct qaic_device *create_qdev(struct pci_dev *pdev, qdev->qts_wq = qaicm_wq_init(drm, "qaic_ts"); if (IS_ERR(qdev->qts_wq)) return NULL; + qdev->ssr_wq = qaicm_wq_init(drm, "qaic_ssr"); + if (IS_ERR(qdev->ssr_wq)) + return NULL; ret = qaicm_srcu_init(drm, &qdev->dev_lock); if (ret) return NULL; + ret = qaic_ssr_init(qdev, drm); + if (ret) + pci_info(pdev, "QAIC SSR crashdump collection not supported.\n"); + qdev->qddev = qddev; qdev->pdev = pdev; qddev->qdev = qdev; @@ -545,7 +562,7 @@ static int init_msi(struct qaic_device *qdev, struct pci_dev *pdev) qdev->dbc[i].irq = pci_irq_vector(pdev, qdev->single_msi ? 0 : i + 1); if (!qdev->single_msi) disable_irq_nosync(qdev->dbc[i].irq); - INIT_WORK(&qdev->dbc[i].poll_work, irq_polling_work); + INIT_WORK(&qdev->dbc[i].poll_work, qaic_irq_polling_work); } } @@ -660,6 +677,92 @@ static const struct pci_error_handlers qaic_pci_err_handler = { .reset_done = qaic_pci_reset_done, }; +static bool qaic_is_under_reset(struct qaic_device *qdev) +{ + int rcu_id; + bool ret; + + rcu_id = srcu_read_lock(&qdev->dev_lock); + ret = qdev->dev_state != QAIC_ONLINE; + srcu_read_unlock(&qdev->dev_lock, rcu_id); + return ret; +} + +static bool qaic_data_path_busy(struct qaic_device *qdev) +{ + bool ret = false; + int dev_rcu_id; + int i; + + dev_rcu_id = srcu_read_lock(&qdev->dev_lock); + if (qdev->dev_state != QAIC_ONLINE) { + srcu_read_unlock(&qdev->dev_lock, dev_rcu_id); + return false; + } + for (i = 0; i < qdev->num_dbc; i++) { + struct dma_bridge_chan *dbc = &qdev->dbc[i]; + unsigned long flags; + int ch_rcu_id; + + ch_rcu_id = srcu_read_lock(&dbc->ch_lock); + if (!dbc->usr || !dbc->in_use) { + srcu_read_unlock(&dbc->ch_lock, ch_rcu_id); + continue; + } + spin_lock_irqsave(&dbc->xfer_lock, flags); + ret = !list_empty(&dbc->xfer_list); + spin_unlock_irqrestore(&dbc->xfer_lock, flags); + srcu_read_unlock(&dbc->ch_lock, ch_rcu_id); + if (ret) + break; + } + srcu_read_unlock(&qdev->dev_lock, dev_rcu_id); + return ret; +} + +static int qaic_pm_suspend(struct device *dev) +{ + struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(dev)); + + dev_dbg(dev, "Suspending..\n"); + if (qaic_data_path_busy(qdev)) { + dev_dbg(dev, "Device's datapath is busy. 
Aborting suspend..\n"); + return -EBUSY; + } + if (qaic_is_under_reset(qdev)) { + dev_dbg(dev, "Device is under reset. Aborting suspend..\n"); + return -EBUSY; + } + qaic_mqts_ch_stop_timer(qdev->mqts_ch); + qaic_pci_reset_prepare(qdev->pdev); + pci_save_state(qdev->pdev); + pci_disable_device(qdev->pdev); + pci_set_power_state(qdev->pdev, PCI_D3hot); + return 0; +} + +static int qaic_pm_resume(struct device *dev) +{ + struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(dev)); + int ret; + + dev_dbg(dev, "Resuming..\n"); + pci_set_power_state(qdev->pdev, PCI_D0); + pci_restore_state(qdev->pdev); + ret = pci_enable_device(qdev->pdev); + if (ret) { + dev_err(dev, "pci_enable_device failed on resume %d\n", ret); + return ret; + } + pci_set_master(qdev->pdev); + qaic_pci_reset_done(qdev->pdev); + return 0; +} + +static const struct dev_pm_ops qaic_pm_ops = { + SYSTEM_SLEEP_PM_OPS(qaic_pm_suspend, qaic_pm_resume) +}; + static struct pci_driver qaic_pci_driver = { .name = QAIC_NAME, .id_table = qaic_ids, @@ -667,6 +770,9 @@ static struct pci_driver qaic_pci_driver = { .remove = qaic_pci_remove, .shutdown = qaic_pci_shutdown, .err_handler = &qaic_pci_err_handler, + .driver = { + .pm = pm_sleep_ptr(&qaic_pm_ops), + }, }; static int __init qaic_init(void) @@ -702,9 +808,16 @@ static int __init qaic_init(void) ret = qaic_ras_register(); if (ret) pr_debug("qaic: qaic_ras_register failed %d\n", ret); + ret = qaic_ssr_register(); + if (ret) { + pr_debug("qaic: qaic_ssr_register failed %d\n", ret); + goto free_bootlog; + } return 0; +free_bootlog: + qaic_bootlog_unregister(); free_mhi: mhi_driver_unregister(&qaic_mhi_driver); free_pci: @@ -730,6 +843,7 @@ static void __exit qaic_exit(void) * reinitializing the link_up state after the cleanup is done. */ link_up = true; + qaic_ssr_unregister(); qaic_ras_unregister(); qaic_bootlog_unregister(); qaic_timesync_deinit(); diff --git a/drivers/accel/qaic/qaic_ras.c b/drivers/accel/qaic/qaic_ras.c index 914ffc4a9970..f1d52a710136 100644 --- a/drivers/accel/qaic/qaic_ras.c +++ b/drivers/accel/qaic/qaic_ras.c @@ -514,21 +514,21 @@ static ssize_t ce_count_show(struct device *dev, struct device_attribute *attr, { struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(dev)); - return snprintf(buf, PAGE_SIZE, "%d\n", qdev->ce_count); + return sysfs_emit(buf, "%d\n", qdev->ce_count); } static ssize_t ue_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(dev)); - return snprintf(buf, PAGE_SIZE, "%d\n", qdev->ue_count); + return sysfs_emit(buf, "%d\n", qdev->ue_count); } static ssize_t ue_nonfatal_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(dev)); - return snprintf(buf, PAGE_SIZE, "%d\n", qdev->ue_nf_count); + return sysfs_emit(buf, "%d\n", qdev->ue_nf_count); } static DEVICE_ATTR_RO(ce_count); diff --git a/drivers/accel/qaic/qaic_ssr.c b/drivers/accel/qaic/qaic_ssr.c new file mode 100644 index 000000000000..9b662d690371 --- /dev/null +++ b/drivers/accel/qaic/qaic_ssr.c @@ -0,0 +1,815 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* Copyright (c) 2020-2021, The Linux Foundation. All rights reserved. */ +/* Copyright (c) 2021-2024 Qualcomm Innovation Center, Inc. All rights reserved. 
*/ + +#include <asm/byteorder.h> +#include <drm/drm_file.h> +#include <drm/drm_managed.h> +#include <linux/devcoredump.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/mhi.h> +#include <linux/workqueue.h> + +#include "qaic.h" +#include "qaic_ssr.h" + +#define SSR_RESP_MSG_SZ 32 +#define SSR_MHI_BUF_SIZE SZ_64K +#define SSR_MEM_READ_DATA_SIZE ((u64)SSR_MHI_BUF_SIZE - sizeof(struct ssr_crashdump)) +#define SSR_MEM_READ_CHUNK_SIZE ((u64)SSR_MEM_READ_DATA_SIZE - sizeof(struct ssr_memory_read_rsp)) + +#define DEBUG_TRANSFER_INFO BIT(0) +#define DEBUG_TRANSFER_INFO_RSP BIT(1) +#define MEMORY_READ BIT(2) +#define MEMORY_READ_RSP BIT(3) +#define DEBUG_TRANSFER_DONE BIT(4) +#define DEBUG_TRANSFER_DONE_RSP BIT(5) +#define SSR_EVENT BIT(8) +#define SSR_EVENT_RSP BIT(9) + +#define SSR_EVENT_NACK BIT(0) +#define BEFORE_SHUTDOWN BIT(1) +#define AFTER_SHUTDOWN BIT(2) +#define BEFORE_POWER_UP BIT(3) +#define AFTER_POWER_UP BIT(4) + +struct debug_info_table { + /* Save preferences. Default is mandatory */ + u64 save_perf; + /* Base address of the debug region */ + u64 mem_base; + /* Size of debug region in bytes */ + u64 len; + /* Description */ + char desc[20]; + /* Filename of debug region */ + char filename[20]; +}; + +struct _ssr_hdr { + __le32 cmd; + __le32 len; + __le32 dbc_id; +}; + +struct ssr_hdr { + u32 cmd; + u32 len; + u32 dbc_id; +}; + +struct ssr_debug_transfer_info { + struct ssr_hdr hdr; + u32 resv; + u64 tbl_addr; + u64 tbl_len; +} __packed; + +struct ssr_debug_transfer_info_rsp { + struct _ssr_hdr hdr; + __le32 ret; +} __packed; + +struct ssr_memory_read { + struct _ssr_hdr hdr; + __le32 resv; + __le64 addr; + __le64 len; +} __packed; + +struct ssr_memory_read_rsp { + struct _ssr_hdr hdr; + __le32 resv; + u8 data[]; +} __packed; + +struct ssr_debug_transfer_done { + struct _ssr_hdr hdr; + __le32 resv; +} __packed; + +struct ssr_debug_transfer_done_rsp { + struct _ssr_hdr hdr; + __le32 ret; +} __packed; + +struct ssr_event { + struct ssr_hdr hdr; + u32 event; +} __packed; + +struct ssr_event_rsp { + struct _ssr_hdr hdr; + __le32 event; +} __packed; + +struct ssr_resp { + /* Work struct to schedule work coming on QAIC_SSR channel */ + struct work_struct work; + /* Root struct of device, used to access device resources */ + struct qaic_device *qdev; + /* Buffer used by MHI for transfer requests */ + u8 data[] __aligned(8); +}; + +/* SSR crashdump book keeping structure */ +struct ssr_dump_info { + /* DBC associated with this SSR crashdump */ + struct dma_bridge_chan *dbc; + /* + * It will be used when we complete the crashdump download and switch + * to waiting on SSR events + */ + struct ssr_resp *resp; + /* MEMORY READ request MHI buffer.*/ + struct ssr_memory_read *read_buf_req; + /* TRUE: ->read_buf_req is queued for MHI transaction. 
FALSE: Otherwise */ + bool read_buf_req_queued; + /* Address of table in host */ + void *tbl_addr; + /* Total size of table */ + u64 tbl_len; + /* Offset of table(->tbl_addr) where the new chunk will be dumped */ + u64 tbl_off; + /* Address of table in device/target */ + u64 tbl_addr_dev; + /* Ptr to the entire dump */ + void *dump_addr; + /* Entire crashdump size */ + u64 dump_sz; + /* Offset of crashdump(->dump_addr) where the new chunk will be dumped */ + u64 dump_off; + /* Points to the table entry we are currently downloading */ + struct debug_info_table *tbl_ent; + /* Offset in the current table entry(->tbl_ent) for the next chunk */ + u64 tbl_ent_off; +}; + +struct ssr_crashdump { + /* + * Points to a bookkeeping struct maintained by the MHI SSR device while + * downloading an SSR crashdump. It is NULL when a crashdump download is + * not in progress. + */ + struct ssr_dump_info *dump_info; + /* Work struct to schedule work coming on QAIC_SSR channel */ + struct work_struct work; + /* Root struct of device, used to access device resources */ + struct qaic_device *qdev; + /* Buffer used by MHI for transfer requests */ + u8 data[]; +}; + +#define QAIC_SSR_DUMP_V1_MAGIC 0x1234567890abcdef +#define QAIC_SSR_DUMP_V1_VER 1 +struct dump_file_meta { + u64 magic; + u64 version; + u64 size; /* Total size of the entire dump */ + u64 tbl_len; /* Length of the table in bytes */ +}; + +/* + * Layout of crashdump + * +------------------------------------------+ + * | Crashdump Meta structure | + * | type: struct dump_file_meta | + * +------------------------------------------+ + * | Crashdump Table | + * | type: array of struct debug_info_table | + * | | + * | | + * | | + * +------------------------------------------+ + * | Crashdump | + * | | + * | | + * | | + * | | + * | | + * +------------------------------------------+ + */ + +static void free_ssr_dump_info(struct ssr_crashdump *ssr_crash) +{ + struct ssr_dump_info *dump_info = ssr_crash->dump_info; + + ssr_crash->dump_info = NULL; + if (!dump_info) + return; + if (!dump_info->read_buf_req_queued) + kfree(dump_info->read_buf_req); + vfree(dump_info->tbl_addr); + vfree(dump_info->dump_addr); + kfree(dump_info); +} + +void qaic_clean_up_ssr(struct qaic_device *qdev) +{ + struct ssr_crashdump *ssr_crash = qdev->ssr_mhi_buf; + + if (!ssr_crash) + return; + + qaic_dbc_exit_ssr(qdev); + free_ssr_dump_info(ssr_crash); +} + +static int alloc_dump(struct ssr_dump_info *dump_info) +{ + struct debug_info_table *tbl_ent = dump_info->tbl_addr; + struct dump_file_meta *dump_meta; + u64 tbl_sz_lp = 0; + u64 dump_size = 0; + + while (tbl_sz_lp < dump_info->tbl_len) { + le64_to_cpus(&tbl_ent->save_perf); + le64_to_cpus(&tbl_ent->mem_base); + le64_to_cpus(&tbl_ent->len); + + if (tbl_ent->len == 0) + return -EINVAL; + + dump_size += tbl_ent->len; + tbl_ent++; + tbl_sz_lp += sizeof(*tbl_ent); + } + + dump_info->dump_sz = dump_size + dump_info->tbl_len + sizeof(*dump_meta); + dump_info->dump_addr = vzalloc(dump_info->dump_sz); + if (!dump_info->dump_addr) + return -ENOMEM; + + /* Copy crashdump meta and table */ + dump_meta = dump_info->dump_addr; + dump_meta->magic = QAIC_SSR_DUMP_V1_MAGIC; + dump_meta->version = QAIC_SSR_DUMP_V1_VER; + dump_meta->size = dump_info->dump_sz; + dump_meta->tbl_len = dump_info->tbl_len; + memcpy(dump_info->dump_addr + sizeof(*dump_meta), dump_info->tbl_addr, dump_info->tbl_len); + /* Offset by crashdump meta and table (copied above) */ + dump_info->dump_off = dump_info->tbl_len + sizeof(*dump_meta); + + return 0; +} + +static int
send_xfer_done(struct qaic_device *qdev, void *resp, u32 dbc_id) +{ + struct ssr_debug_transfer_done *xfer_done; + int ret; + + xfer_done = kmalloc(sizeof(*xfer_done), GFP_KERNEL); + if (!xfer_done) { + ret = -ENOMEM; + goto out; + } + + ret = mhi_queue_buf(qdev->ssr_ch, DMA_FROM_DEVICE, resp, SSR_RESP_MSG_SZ, MHI_EOT); + if (ret) + goto free_xfer_done; + + xfer_done->hdr.cmd = cpu_to_le32(DEBUG_TRANSFER_DONE); + xfer_done->hdr.len = cpu_to_le32(sizeof(*xfer_done)); + xfer_done->hdr.dbc_id = cpu_to_le32(dbc_id); + + ret = mhi_queue_buf(qdev->ssr_ch, DMA_TO_DEVICE, xfer_done, sizeof(*xfer_done), MHI_EOT); + if (ret) + goto free_xfer_done; + + return 0; + +free_xfer_done: + kfree(xfer_done); +out: + return ret; +} + +static int mem_read_req(struct qaic_device *qdev, u64 dest_addr, u64 dest_len) +{ + struct ssr_crashdump *ssr_crash = qdev->ssr_mhi_buf; + struct ssr_memory_read *read_buf_req; + struct ssr_dump_info *dump_info; + int ret; + + dump_info = ssr_crash->dump_info; + ret = mhi_queue_buf(qdev->ssr_ch, DMA_FROM_DEVICE, ssr_crash->data, SSR_MEM_READ_DATA_SIZE, + MHI_EOT); + if (ret) + goto out; + + read_buf_req = dump_info->read_buf_req; + read_buf_req->hdr.cmd = cpu_to_le32(MEMORY_READ); + read_buf_req->hdr.len = cpu_to_le32(sizeof(*read_buf_req)); + read_buf_req->hdr.dbc_id = cpu_to_le32(qdev->ssr_dbc); + read_buf_req->addr = cpu_to_le64(dest_addr); + read_buf_req->len = cpu_to_le64(dest_len); + + ret = mhi_queue_buf(qdev->ssr_ch, DMA_TO_DEVICE, read_buf_req, sizeof(*read_buf_req), + MHI_EOT); + if (!ret) + dump_info->read_buf_req_queued = true; + +out: + return ret; +} + +static int ssr_copy_table(struct ssr_dump_info *dump_info, void *data, u64 len) +{ + if (len > dump_info->tbl_len - dump_info->tbl_off) + return -EINVAL; + + memcpy(dump_info->tbl_addr + dump_info->tbl_off, data, len); + dump_info->tbl_off += len; + + /* Entire table has been downloaded, alloc dump memory */ + if (dump_info->tbl_off == dump_info->tbl_len) { + dump_info->tbl_ent = dump_info->tbl_addr; + return alloc_dump(dump_info); + } + + return 0; +} + +static int ssr_copy_dump(struct ssr_dump_info *dump_info, void *data, u64 len) +{ + struct debug_info_table *tbl_ent; + + tbl_ent = dump_info->tbl_ent; + + if (len > tbl_ent->len - dump_info->tbl_ent_off) + return -EINVAL; + + memcpy(dump_info->dump_addr + dump_info->dump_off, data, len); + dump_info->dump_off += len; + dump_info->tbl_ent_off += len; + + /* + * Current segment (a entry in table) of the crashdump is complete, + * move to next one + */ + if (tbl_ent->len == dump_info->tbl_ent_off) { + dump_info->tbl_ent++; + dump_info->tbl_ent_off = 0; + } + + return 0; +} + +static void ssr_dump_worker(struct work_struct *work) +{ + struct ssr_crashdump *ssr_crash = container_of(work, struct ssr_crashdump, work); + struct qaic_device *qdev = ssr_crash->qdev; + struct ssr_memory_read_rsp *mem_rd_resp; + struct debug_info_table *tbl_ent; + struct ssr_dump_info *dump_info; + u64 dest_addr, dest_len; + struct _ssr_hdr *_hdr; + struct ssr_hdr hdr; + u64 data_len; + int ret; + + mem_rd_resp = (struct ssr_memory_read_rsp *)ssr_crash->data; + _hdr = &mem_rd_resp->hdr; + hdr.cmd = le32_to_cpu(_hdr->cmd); + hdr.len = le32_to_cpu(_hdr->len); + hdr.dbc_id = le32_to_cpu(_hdr->dbc_id); + + if (hdr.dbc_id != qdev->ssr_dbc) + goto reset_device; + + dump_info = ssr_crash->dump_info; + if (!dump_info) + goto reset_device; + + if (hdr.cmd != MEMORY_READ_RSP) + goto free_dump_info; + + if (hdr.len > SSR_MEM_READ_DATA_SIZE) + goto free_dump_info; + + data_len = hdr.len - 
sizeof(*mem_rd_resp); + + if (dump_info->tbl_off < dump_info->tbl_len) /* Chunk belongs to table */ + ret = ssr_copy_table(dump_info, mem_rd_resp->data, data_len); + else /* Chunk belongs to crashdump */ + ret = ssr_copy_dump(dump_info, mem_rd_resp->data, data_len); + + if (ret) + goto free_dump_info; + + if (dump_info->tbl_off < dump_info->tbl_len) { + /* Continue downloading table */ + dest_addr = dump_info->tbl_addr_dev + dump_info->tbl_off; + dest_len = min(SSR_MEM_READ_CHUNK_SIZE, dump_info->tbl_len - dump_info->tbl_off); + ret = mem_read_req(qdev, dest_addr, dest_len); + } else if (dump_info->dump_off < dump_info->dump_sz) { + /* Continue downloading crashdump */ + tbl_ent = dump_info->tbl_ent; + dest_addr = tbl_ent->mem_base + dump_info->tbl_ent_off; + dest_len = min(SSR_MEM_READ_CHUNK_SIZE, tbl_ent->len - dump_info->tbl_ent_off); + ret = mem_read_req(qdev, dest_addr, dest_len); + } else { + /* Crashdump download complete */ + ret = send_xfer_done(qdev, dump_info->resp->data, hdr.dbc_id); + } + + /* Most likely an MHI xfer has failed */ + if (ret) + goto free_dump_info; + + return; + +free_dump_info: + /* Free the allocated memory */ + free_ssr_dump_info(ssr_crash); +reset_device: + /* + * After a subsystem crash on the device, crashdump collection begins. If + * something goes wrong while collecting the crashdump, instead of handling + * the error we simply reset the device, since the best effort has already + * been made. + */ + mhi_soc_reset(qdev->mhi_cntrl); +} + +static struct ssr_dump_info *alloc_dump_info(struct qaic_device *qdev, + struct ssr_debug_transfer_info *debug_info) +{ + struct ssr_dump_info *dump_info; + int ret; + + le64_to_cpus(&debug_info->tbl_len); + le64_to_cpus(&debug_info->tbl_addr); + + if (debug_info->tbl_len == 0 || + debug_info->tbl_len % sizeof(struct debug_info_table) != 0) { + ret = -EINVAL; + goto out; + } + + /* Allocate the SSR crashdump bookkeeping structure */ + dump_info = kzalloc(sizeof(*dump_info), GFP_KERNEL); + if (!dump_info) { + ret = -ENOMEM; + goto out; + } + + /* Buffer used to send MEMORY READ request to device via MHI */ + dump_info->read_buf_req = kzalloc(sizeof(*dump_info->read_buf_req), GFP_KERNEL); + if (!dump_info->read_buf_req) { + ret = -ENOMEM; + goto free_dump_info; + } + + /* Crashdump meta table buffer */ + dump_info->tbl_addr = vzalloc(debug_info->tbl_len); + if (!dump_info->tbl_addr) { + ret = -ENOMEM; + goto free_read_buf_req; + } + + dump_info->tbl_addr_dev = debug_info->tbl_addr; + dump_info->tbl_len = debug_info->tbl_len; + + return dump_info; + +free_read_buf_req: + kfree(dump_info->read_buf_req); +free_dump_info: + kfree(dump_info); +out: + return ERR_PTR(ret); +} + +static int dbg_xfer_info_rsp(struct qaic_device *qdev, struct dma_bridge_chan *dbc, + struct ssr_debug_transfer_info *debug_info) +{ + struct ssr_debug_transfer_info_rsp *debug_rsp; + struct ssr_crashdump *ssr_crash = NULL; + int ret = 0, ret2; + + debug_rsp = kmalloc(sizeof(*debug_rsp), GFP_KERNEL); + if (!debug_rsp) + return -ENOMEM; + + if (!qdev->ssr_mhi_buf) { + ret = -ENOMEM; + goto send_rsp; + } + + if (dbc->state != DBC_STATE_BEFORE_POWER_UP) { + ret = -EINVAL; + goto send_rsp; + } + + ssr_crash = qdev->ssr_mhi_buf; + ssr_crash->dump_info = alloc_dump_info(qdev, debug_info); + if (IS_ERR(ssr_crash->dump_info)) { + ret = PTR_ERR(ssr_crash->dump_info); + ssr_crash->dump_info = NULL; + } + +send_rsp: + debug_rsp->hdr.cmd = cpu_to_le32(DEBUG_TRANSFER_INFO_RSP); + debug_rsp->hdr.len = cpu_to_le32(sizeof(*debug_rsp)); + debug_rsp->hdr.dbc_id = cpu_to_le32(dbc->id); + /* + *
0 = Return an ACK confirming the host is ready to download crashdump + * 1 = Return an NACK confirming the host is not ready to download crashdump + */ + debug_rsp->ret = cpu_to_le32(ret ? 1 : 0); + + ret2 = mhi_queue_buf(qdev->ssr_ch, DMA_TO_DEVICE, debug_rsp, sizeof(*debug_rsp), MHI_EOT); + if (ret2) { + free_ssr_dump_info(ssr_crash); + kfree(debug_rsp); + return ret2; + } + + return ret; +} + +static void dbg_xfer_done_rsp(struct qaic_device *qdev, struct dma_bridge_chan *dbc, + struct ssr_debug_transfer_done_rsp *xfer_rsp) +{ + struct ssr_crashdump *ssr_crash = qdev->ssr_mhi_buf; + u32 status = le32_to_cpu(xfer_rsp->ret); + struct device *dev = &qdev->pdev->dev; + struct ssr_dump_info *dump_info; + + dump_info = ssr_crash->dump_info; + if (!dump_info) + return; + + if (status) { + free_ssr_dump_info(ssr_crash); + return; + } + + dev_coredumpv(dev, dump_info->dump_addr, dump_info->dump_sz, GFP_KERNEL); + /* dev_coredumpv will free dump_info->dump_addr */ + dump_info->dump_addr = NULL; + free_ssr_dump_info(ssr_crash); +} + +static void ssr_worker(struct work_struct *work) +{ + struct ssr_resp *resp = container_of(work, struct ssr_resp, work); + struct ssr_hdr *hdr = (struct ssr_hdr *)resp->data; + struct ssr_dump_info *dump_info = NULL; + struct qaic_device *qdev = resp->qdev; + struct ssr_crashdump *ssr_crash; + struct ssr_event_rsp *event_rsp; + struct dma_bridge_chan *dbc; + struct ssr_event *event; + u32 ssr_event_ack; + int ret; + + le32_to_cpus(&hdr->cmd); + le32_to_cpus(&hdr->len); + le32_to_cpus(&hdr->dbc_id); + + if (hdr->len > SSR_RESP_MSG_SZ) + goto out; + + if (hdr->dbc_id >= qdev->num_dbc) + goto out; + + dbc = &qdev->dbc[hdr->dbc_id]; + + switch (hdr->cmd) { + case DEBUG_TRANSFER_INFO: + ret = dbg_xfer_info_rsp(qdev, dbc, (struct ssr_debug_transfer_info *)resp->data); + if (ret) + break; + + ssr_crash = qdev->ssr_mhi_buf; + dump_info = ssr_crash->dump_info; + dump_info->dbc = dbc; + dump_info->resp = resp; + + /* Start by downloading debug table */ + ret = mem_read_req(qdev, dump_info->tbl_addr_dev, + min(dump_info->tbl_len, SSR_MEM_READ_CHUNK_SIZE)); + if (ret) { + free_ssr_dump_info(ssr_crash); + break; + } + + /* + * Till now everything went fine, which means that we will be + * collecting crashdump chunk by chunk. Do not queue a response + * buffer for SSR cmds till the crashdump is complete. + */ + return; + case SSR_EVENT: + event = (struct ssr_event *)hdr; + le32_to_cpus(&event->event); + ssr_event_ack = event->event; + ssr_crash = qdev->ssr_mhi_buf; + + switch (event->event) { + case BEFORE_SHUTDOWN: + set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_BEFORE_SHUTDOWN); + qaic_dbc_enter_ssr(qdev, hdr->dbc_id); + break; + case AFTER_SHUTDOWN: + set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_AFTER_SHUTDOWN); + break; + case BEFORE_POWER_UP: + set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_BEFORE_POWER_UP); + break; + case AFTER_POWER_UP: + /* + * If dump info is a non NULL value it means that we + * have received this SSR event while downloading a + * crashdump for this DBC is still in progress. 
NACK + * the SSR event + */ + if (ssr_crash && ssr_crash->dump_info) { + free_ssr_dump_info(ssr_crash); + ssr_event_ack = SSR_EVENT_NACK; + break; + } + + set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_AFTER_POWER_UP); + break; + default: + break; + } + + event_rsp = kmalloc(sizeof(*event_rsp), GFP_KERNEL); + if (!event_rsp) + break; + + event_rsp->hdr.cmd = cpu_to_le32(SSR_EVENT_RSP); + event_rsp->hdr.len = cpu_to_le32(sizeof(*event_rsp)); + event_rsp->hdr.dbc_id = cpu_to_le32(hdr->dbc_id); + event_rsp->event = cpu_to_le32(ssr_event_ack); + + ret = mhi_queue_buf(qdev->ssr_ch, DMA_TO_DEVICE, event_rsp, sizeof(*event_rsp), + MHI_EOT); + if (ret) + kfree(event_rsp); + + if (event->event == AFTER_POWER_UP && ssr_event_ack != SSR_EVENT_NACK) { + qaic_dbc_exit_ssr(qdev); + set_dbc_state(qdev, hdr->dbc_id, DBC_STATE_IDLE); + } + + break; + case DEBUG_TRANSFER_DONE_RSP: + dbg_xfer_done_rsp(qdev, dbc, (struct ssr_debug_transfer_done_rsp *)hdr); + break; + default: + break; + } + +out: + ret = mhi_queue_buf(qdev->ssr_ch, DMA_FROM_DEVICE, resp->data, SSR_RESP_MSG_SZ, MHI_EOT); + if (ret) + kfree(resp); +} + +static int qaic_ssr_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_id *id) +{ + struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(mhi_dev->mhi_cntrl->cntrl_dev)); + struct ssr_resp *resp; + int ret; + + ret = mhi_prepare_for_transfer(mhi_dev); + if (ret) + return ret; + + resp = kzalloc(sizeof(*resp) + SSR_RESP_MSG_SZ, GFP_KERNEL); + if (!resp) { + mhi_unprepare_from_transfer(mhi_dev); + return -ENOMEM; + } + + resp->qdev = qdev; + INIT_WORK(&resp->work, ssr_worker); + + ret = mhi_queue_buf(mhi_dev, DMA_FROM_DEVICE, resp->data, SSR_RESP_MSG_SZ, MHI_EOT); + if (ret) { + kfree(resp); + mhi_unprepare_from_transfer(mhi_dev); + return ret; + } + + dev_set_drvdata(&mhi_dev->dev, qdev); + qdev->ssr_ch = mhi_dev; + + return 0; +} + +static void qaic_ssr_mhi_remove(struct mhi_device *mhi_dev) +{ + struct qaic_device *qdev; + + qdev = dev_get_drvdata(&mhi_dev->dev); + mhi_unprepare_from_transfer(qdev->ssr_ch); + qdev->ssr_ch = NULL; +} + +static void qaic_ssr_mhi_ul_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result) +{ + struct qaic_device *qdev = dev_get_drvdata(&mhi_dev->dev); + struct ssr_crashdump *ssr_crash = qdev->ssr_mhi_buf; + struct _ssr_hdr *hdr = mhi_result->buf_addr; + struct ssr_dump_info *dump_info; + + if (mhi_result->transaction_status) { + kfree(mhi_result->buf_addr); + return; + } + + /* + * MEMORY READ is used to download crashdump. And crashdump is + * downloaded chunk by chunk in a series of MEMORY READ SSR commands. + * Hence to avoid too many kmalloc() and kfree() of the same MEMORY READ + * request buffer, we allocate only one such buffer and free it only + * once. 
+ */ + if (le32_to_cpu(hdr->cmd) == MEMORY_READ) { + dump_info = ssr_crash->dump_info; + if (dump_info) { + dump_info->read_buf_req_queued = false; + return; + } + } + + kfree(mhi_result->buf_addr); +} + +static void qaic_ssr_mhi_dl_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result) +{ + struct ssr_resp *resp = container_of(mhi_result->buf_addr, struct ssr_resp, data); + struct qaic_device *qdev = dev_get_drvdata(&mhi_dev->dev); + struct ssr_crashdump *ssr_crash = qdev->ssr_mhi_buf; + bool memory_read_rsp = false; + + if (ssr_crash && ssr_crash->data == mhi_result->buf_addr) + memory_read_rsp = true; + + if (mhi_result->transaction_status) { + /* Do not free SSR crashdump buffer as it allocated via managed APIs */ + if (!memory_read_rsp) + kfree(resp); + return; + } + + if (memory_read_rsp) + queue_work(qdev->ssr_wq, &ssr_crash->work); + else + queue_work(qdev->ssr_wq, &resp->work); +} + +static const struct mhi_device_id qaic_ssr_mhi_match_table[] = { + { .chan = "QAIC_SSR", }, + {}, +}; + +static struct mhi_driver qaic_ssr_mhi_driver = { + .id_table = qaic_ssr_mhi_match_table, + .remove = qaic_ssr_mhi_remove, + .probe = qaic_ssr_mhi_probe, + .ul_xfer_cb = qaic_ssr_mhi_ul_xfer_cb, + .dl_xfer_cb = qaic_ssr_mhi_dl_xfer_cb, + .driver = { + .name = "qaic_ssr", + }, +}; + +int qaic_ssr_init(struct qaic_device *qdev, struct drm_device *drm) +{ + struct ssr_crashdump *ssr_crash; + + qdev->ssr_dbc = QAIC_SSR_DBC_SENTINEL; + + /* + * Device requests only one SSR at a time. So allocating only one + * buffer to download crashdump is good enough. + */ + ssr_crash = drmm_kzalloc(drm, SSR_MHI_BUF_SIZE, GFP_KERNEL); + if (!ssr_crash) + return -ENOMEM; + + ssr_crash->qdev = qdev; + INIT_WORK(&ssr_crash->work, ssr_dump_worker); + qdev->ssr_mhi_buf = ssr_crash; + + return 0; +} + +int qaic_ssr_register(void) +{ + return mhi_driver_register(&qaic_ssr_mhi_driver); +} + +void qaic_ssr_unregister(void) +{ + mhi_driver_unregister(&qaic_ssr_mhi_driver); +} diff --git a/drivers/accel/qaic/qaic_ssr.h b/drivers/accel/qaic/qaic_ssr.h new file mode 100644 index 000000000000..97ccff305750 --- /dev/null +++ b/drivers/accel/qaic/qaic_ssr.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * + * Copyright (c) 2020, The Linux Foundation. All rights reserved. + * Copyright (c) 2021, 2024 Qualcomm Innovation Center, Inc. All rights reserved. + */ + +#ifndef __QAIC_SSR_H__ +#define __QAIC_SSR_H__ + +struct drm_device; +struct qaic_device; + +int qaic_ssr_register(void); +void qaic_ssr_unregister(void); +void qaic_clean_up_ssr(struct qaic_device *qdev); +int qaic_ssr_init(struct qaic_device *qdev, struct drm_device *drm); +#endif /* __QAIC_SSR_H__ */ diff --git a/drivers/accel/qaic/qaic_sysfs.c b/drivers/accel/qaic/qaic_sysfs.c new file mode 100644 index 000000000000..e0afb0ffb589 --- /dev/null +++ b/drivers/accel/qaic/qaic_sysfs.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* Copyright (c) 2020-2025, The Linux Foundation. All rights reserved. 
*/ + +#include <drm/drm_file.h> +#include <drm/drm_managed.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/kobject.h> +#include <linux/mutex.h> +#include <linux/sysfs.h> + +#include "qaic.h" + +#define NAME_LEN 14 + +struct dbc_attribute { + struct device_attribute dev_attr; + u32 dbc_id; + char name[NAME_LEN]; +}; + +static ssize_t dbc_state_show(struct device *dev, struct device_attribute *a, char *buf) +{ + struct dbc_attribute *dbc_attr = container_of(a, struct dbc_attribute, dev_attr); + struct drm_minor *minor = dev_get_drvdata(dev); + struct qaic_device *qdev; + + qdev = to_qaic_device(minor->dev); + return sysfs_emit(buf, "%d\n", qdev->dbc[dbc_attr->dbc_id].state); +} + +void set_dbc_state(struct qaic_device *qdev, u32 dbc_id, unsigned int state) +{ + struct device *kdev = to_accel_kdev(qdev->qddev); + char *envp[3] = {}; + char state_str[16]; + char id_str[12]; + + envp[0] = id_str; + envp[1] = state_str; + + if (state >= DBC_STATE_MAX) + return; + if (dbc_id >= qdev->num_dbc) + return; + if (state == qdev->dbc[dbc_id].state) + return; + + scnprintf(id_str, ARRAY_SIZE(id_str), "DBC_ID=%d", dbc_id); + scnprintf(state_str, ARRAY_SIZE(state_str), "DBC_STATE=%d", state); + + qdev->dbc[dbc_id].state = state; + kobject_uevent_env(&kdev->kobj, KOBJ_CHANGE, envp); +} + +int qaic_sysfs_init(struct qaic_drm_device *qddev) +{ + struct device *kdev = to_accel_kdev(qddev); + struct drm_device *drm = to_drm(qddev); + u32 num_dbc = qddev->qdev->num_dbc; + struct dbc_attribute *dbc_attrs; + int i, ret; + + dbc_attrs = drmm_kcalloc(drm, num_dbc, sizeof(*dbc_attrs), GFP_KERNEL); + if (!dbc_attrs) + return -ENOMEM; + + for (i = 0; i < num_dbc; ++i) { + struct dbc_attribute *dbc_attr = &dbc_attrs[i]; + + sysfs_attr_init(&dbc_attr->dev_attr.attr); + dbc_attr->dbc_id = i; + scnprintf(dbc_attr->name, NAME_LEN, "dbc%d_state", i); + dbc_attr->dev_attr.attr.name = dbc_attr->name; + dbc_attr->dev_attr.attr.mode = 0444; + dbc_attr->dev_attr.show = dbc_state_show; + ret = sysfs_create_file(&kdev->kobj, &dbc_attr->dev_attr.attr); + if (ret) { + int j; + + for (j = 0; j < i; ++j) { + dbc_attr = &dbc_attrs[j]; + sysfs_remove_file(&kdev->kobj, &dbc_attr->dev_attr.attr); + } + drmm_kfree(drm, dbc_attrs); + return ret; + } + } + + qddev->sysfs_attrs = dbc_attrs; + return 0; +} + +void qaic_sysfs_remove(struct qaic_drm_device *qddev) +{ + struct dbc_attribute *dbc_attrs = qddev->sysfs_attrs; + struct device *kdev = to_accel_kdev(qddev); + u32 num_dbc = qddev->qdev->num_dbc; + int i; + + if (!dbc_attrs) + return; + + qddev->sysfs_attrs = NULL; + for (i = 0; i < num_dbc; ++i) + sysfs_remove_file(&kdev->kobj, &dbc_attrs[i].dev_attr.attr); + drmm_kfree(to_drm(qddev), dbc_attrs); +} diff --git a/drivers/accel/qaic/qaic_timesync.c b/drivers/accel/qaic/qaic_timesync.c index 3fac540f8e03..8af2475f4f36 100644 --- a/drivers/accel/qaic/qaic_timesync.c +++ b/drivers/accel/qaic/qaic_timesync.c @@ -171,6 +171,13 @@ mod_timer: dev_err(mqtsdev->dev, "%s mod_timer error:%d\n", __func__, ret); } +void qaic_mqts_ch_stop_timer(struct mhi_device *mhi_dev) +{ + struct mqts_dev *mqtsdev = dev_get_drvdata(&mhi_dev->dev); + + timer_delete_sync(&mqtsdev->timer); +} + static int qaic_timesync_probe(struct mhi_device *mhi_dev, const struct mhi_device_id *id) { struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(mhi_dev->mhi_cntrl->cntrl_dev)); @@ -206,6 +213,7 @@ static int qaic_timesync_probe(struct mhi_device *mhi_dev, const struct mhi_devi timer->expires = jiffies + msecs_to_jiffies(timesync_delay_ms); 
add_timer(timer); dev_set_drvdata(&mhi_dev->dev, mqtsdev); + qdev->mqts_ch = mhi_dev; return 0; @@ -221,6 +229,7 @@ static void qaic_timesync_remove(struct mhi_device *mhi_dev) { struct mqts_dev *mqtsdev = dev_get_drvdata(&mhi_dev->dev); + mqtsdev->qdev->mqts_ch = NULL; timer_delete_sync(&mqtsdev->timer); mhi_unprepare_from_transfer(mqtsdev->mhi_dev); kfree(mqtsdev->sync_msg); diff --git a/drivers/accel/qaic/qaic_timesync.h b/drivers/accel/qaic/qaic_timesync.h index 851b7acd43bb..77b9c2b55057 100644 --- a/drivers/accel/qaic/qaic_timesync.h +++ b/drivers/accel/qaic/qaic_timesync.h @@ -6,6 +6,9 @@ #ifndef __QAIC_TIMESYNC_H__ #define __QAIC_TIMESYNC_H__ +#include <linux/mhi.h> + int qaic_timesync_init(void); void qaic_timesync_deinit(void); +void qaic_mqts_ch_stop_timer(struct mhi_device *mhi_dev); #endif /* __QAIC_TIMESYNC_H__ */ diff --git a/drivers/accel/qaic/sahara.c b/drivers/accel/qaic/sahara.c index 3ebcc1f7ff58..fd3c3b2d1fd3 100644 --- a/drivers/accel/qaic/sahara.c +++ b/drivers/accel/qaic/sahara.c @@ -159,6 +159,7 @@ struct sahara_context { struct sahara_packet *rx; struct work_struct fw_work; struct work_struct dump_work; + struct work_struct read_data_work; struct mhi_device *mhi_dev; const char * const *image_table; u32 table_size; @@ -174,7 +175,10 @@ struct sahara_context { u64 dump_image_offset; void *mem_dump_freespace; u64 dump_images_left; + u32 read_data_offset; + u32 read_data_length; bool is_mem_dump_mode; + bool non_streaming; }; static const char * const aic100_image_table[] = { @@ -194,6 +198,7 @@ static const char * const aic200_image_table[] = { [23] = "qcom/aic200/aop.mbn", [32] = "qcom/aic200/tz.mbn", [33] = "qcom/aic200/hypvm.mbn", + [38] = "qcom/aic200/xbl_config.elf", [39] = "qcom/aic200/aic200_abl.elf", [40] = "qcom/aic200/apdp.mbn", [41] = "qcom/aic200/devcfg.mbn", @@ -202,6 +207,7 @@ static const char * const aic200_image_table[] = { [49] = "qcom/aic200/shrm.elf", [50] = "qcom/aic200/cpucp.elf", [51] = "qcom/aic200/aop_devcfg.mbn", + [54] = "qcom/aic200/qupv3fw.elf", [57] = "qcom/aic200/cpucp_dtbs.elf", [62] = "qcom/aic200/uefi_dtbs.elf", [63] = "qcom/aic200/xbl_ac_config.mbn", @@ -213,9 +219,15 @@ static const char * const aic200_image_table[] = { [69] = "qcom/aic200/dcd.mbn", [73] = "qcom/aic200/gearvm.mbn", [74] = "qcom/aic200/sti.bin", - [75] = "qcom/aic200/pvs.bin", + [76] = "qcom/aic200/tz_qti_config.mbn", + [78] = "qcom/aic200/pvs.bin", }; +static bool is_streaming(struct sahara_context *context) +{ + return !context->non_streaming; +} + static int sahara_find_image(struct sahara_context *context, u32 image_id) { int ret; @@ -265,6 +277,8 @@ static void sahara_send_reset(struct sahara_context *context) int ret; context->is_mem_dump_mode = false; + context->read_data_offset = 0; + context->read_data_length = 0; context->tx[0]->cmd = cpu_to_le32(SAHARA_RESET_CMD); context->tx[0]->length = cpu_to_le32(SAHARA_RESET_LENGTH); @@ -319,9 +333,39 @@ static void sahara_hello(struct sahara_context *context) dev_err(&context->mhi_dev->dev, "Unable to send hello response %d\n", ret); } +static int read_data_helper(struct sahara_context *context, int buf_index) +{ + enum mhi_flags mhi_flag; + u32 pkt_data_len; + int ret; + + pkt_data_len = min(context->read_data_length, SAHARA_PACKET_MAX_SIZE); + + memcpy(context->tx[buf_index], + &context->firmware->data[context->read_data_offset], + pkt_data_len); + + context->read_data_offset += pkt_data_len; + context->read_data_length -= pkt_data_len; + + if (is_streaming(context) || !context->read_data_length) + mhi_flag = 
MHI_EOT; + else + mhi_flag = MHI_CHAIN; + + ret = mhi_queue_buf(context->mhi_dev, DMA_TO_DEVICE, + context->tx[buf_index], pkt_data_len, mhi_flag); + if (ret) { + dev_err(&context->mhi_dev->dev, "Unable to send read_data response %d\n", ret); + return ret; + } + + return 0; +} + static void sahara_read_data(struct sahara_context *context) { - u32 image_id, data_offset, data_len, pkt_data_len; + u32 image_id, data_offset, data_len; int ret; int i; @@ -357,7 +401,7 @@ static void sahara_read_data(struct sahara_context *context) * and is not needed here on error. */ - if (data_len > SAHARA_TRANSFER_MAX_SIZE) { + if (context->non_streaming && data_len > SAHARA_TRANSFER_MAX_SIZE) { dev_err(&context->mhi_dev->dev, "Malformed read_data packet - data len %d exceeds max xfer size %d\n", data_len, SAHARA_TRANSFER_MAX_SIZE); sahara_send_reset(context); @@ -378,22 +422,18 @@ static void sahara_read_data(struct sahara_context *context) return; } - for (i = 0; i < SAHARA_NUM_TX_BUF && data_len; ++i) { - pkt_data_len = min(data_len, SAHARA_PACKET_MAX_SIZE); - - memcpy(context->tx[i], &context->firmware->data[data_offset], pkt_data_len); + context->read_data_offset = data_offset; + context->read_data_length = data_len; - data_offset += pkt_data_len; - data_len -= pkt_data_len; + if (is_streaming(context)) { + schedule_work(&context->read_data_work); + return; + } - ret = mhi_queue_buf(context->mhi_dev, DMA_TO_DEVICE, - context->tx[i], pkt_data_len, - !data_len ? MHI_EOT : MHI_CHAIN); - if (ret) { - dev_err(&context->mhi_dev->dev, "Unable to send read_data response %d\n", - ret); - return; - } + for (i = 0; i < SAHARA_NUM_TX_BUF && context->read_data_length; ++i) { + ret = read_data_helper(context, i); + if (ret) + break; } } @@ -538,6 +578,7 @@ static void sahara_parse_dump_table(struct sahara_context *context) struct sahara_memory_dump_meta_v1 *dump_meta; u64 table_nents; u64 dump_length; + u64 mul_bytes; int ret; u64 i; @@ -551,8 +592,9 @@ static void sahara_parse_dump_table(struct sahara_context *context) dev_table[i].description[SAHARA_TABLE_ENTRY_STR_LEN - 1] = 0; dev_table[i].filename[SAHARA_TABLE_ENTRY_STR_LEN - 1] = 0; - dump_length = size_add(dump_length, le64_to_cpu(dev_table[i].length)); - if (dump_length == SIZE_MAX) { + if (check_add_overflow(dump_length, + le64_to_cpu(dev_table[i].length), + &dump_length)) { /* Discard the dump */ sahara_send_reset(context); return; @@ -568,14 +610,17 @@ static void sahara_parse_dump_table(struct sahara_context *context) dev_table[i].filename); } - dump_length = size_add(dump_length, sizeof(*dump_meta)); - if (dump_length == SIZE_MAX) { + if (check_add_overflow(dump_length, (u64)sizeof(*dump_meta), &dump_length)) { /* Discard the dump */ sahara_send_reset(context); return; } - dump_length = size_add(dump_length, size_mul(sizeof(*image_out_table), table_nents)); - if (dump_length == SIZE_MAX) { + if (check_mul_overflow((u64)sizeof(*image_out_table), table_nents, &mul_bytes)) { + /* Discard the dump */ + sahara_send_reset(context); + return; + } + if (check_add_overflow(dump_length, mul_bytes, &dump_length)) { /* Discard the dump */ sahara_send_reset(context); return; @@ -615,7 +660,7 @@ static void sahara_parse_dump_table(struct sahara_context *context) /* Request the first chunk of the first image */ context->dump_image = &image_out_table[0]; - dump_length = min(context->dump_image->length, SAHARA_READ_MAX_SIZE); + dump_length = min_t(u64, context->dump_image->length, SAHARA_READ_MAX_SIZE); /* Avoid requesting EOI sized data so that we can identify errors */ 
if (dump_length == SAHARA_END_OF_IMAGE_LENGTH) dump_length = SAHARA_END_OF_IMAGE_LENGTH / 2; @@ -663,7 +708,7 @@ static void sahara_parse_dump_image(struct sahara_context *context) /* Get next image chunk */ dump_length = context->dump_image->length - context->dump_image_offset; - dump_length = min(dump_length, SAHARA_READ_MAX_SIZE); + dump_length = min_t(u64, dump_length, SAHARA_READ_MAX_SIZE); /* Avoid requesting EOI sized data so that we can identify errors */ if (dump_length == SAHARA_END_OF_IMAGE_LENGTH) dump_length = SAHARA_END_OF_IMAGE_LENGTH / 2; @@ -742,6 +787,13 @@ error: sahara_send_reset(context); } +static void sahara_read_data_processing(struct work_struct *work) +{ + struct sahara_context *context = container_of(work, struct sahara_context, read_data_work); + + read_data_helper(context, 0); +} + static int sahara_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_id *id) { struct sahara_context *context; @@ -756,34 +808,56 @@ static int sahara_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_ if (!context->rx) return -ENOMEM; + if (!strcmp(mhi_dev->mhi_cntrl->name, "AIC200")) { + context->image_table = aic200_image_table; + context->table_size = ARRAY_SIZE(aic200_image_table); + } else { + context->image_table = aic100_image_table; + context->table_size = ARRAY_SIZE(aic100_image_table); + context->non_streaming = true; + } + /* - * AIC100 defines SAHARA_TRANSFER_MAX_SIZE as the largest value it - * will request for READ_DATA. This is larger than - * SAHARA_PACKET_MAX_SIZE, and we need 9x SAHARA_PACKET_MAX_SIZE to - * cover SAHARA_TRANSFER_MAX_SIZE. When the remote side issues a - * READ_DATA, it requires a transfer of the exact size requested. We - * can use MHI_CHAIN to link multiple buffers into a single transfer - * but the remote side will not consume the buffers until it sees an - * EOT, thus we need to allocate enough buffers to put in the tx fifo - * to cover an entire READ_DATA request of the max size. + * There are two firmware implementations for READ_DATA handling. + * The older "SBL" implementation defines a Sahara transfer size, and + * expects that the response is a single transport transfer. If the + * FW wants to transfer a file that is larger than the transfer size, + * the FW will issue multiple READ_DATA commands. For this + * implementation, we need to allocate enough buffers to contain the + * entire Sahara transfer size. + * + * The newer "XBL" implementation does not define a maximum transfer + * size and instead expects the data to be streamed over using the + * transport level MTU. The FW will issue a single READ_DATA command + * of whatever size, and consume multiple transport level transfers + * until the expected amount of data is consumed. For this + * implementation we only need a single buffer of the transport MTU + * but we'll need to be able to use it multiple times for a single + * READ_DATA request. + * + * AIC100 is the SBL implementation and defines SAHARA_TRANSFER_MAX_SIZE + * and we need 9x SAHARA_PACKET_MAX_SIZE to cover that. We can use + * MHI_CHAIN to link multiple buffers into a single transfer but the + * remote side will not consume the buffers until it sees an EOT, thus + * we need to allocate enough buffers to put in the tx fifo to cover an + * entire READ_DATA request of the max size. + * + * AIC200 is the XBL implementation, and so a single buffer will work. 
*/ for (i = 0; i < SAHARA_NUM_TX_BUF; ++i) { - context->tx[i] = devm_kzalloc(&mhi_dev->dev, SAHARA_PACKET_MAX_SIZE, GFP_KERNEL); + context->tx[i] = devm_kzalloc(&mhi_dev->dev, + SAHARA_PACKET_MAX_SIZE, + GFP_KERNEL); if (!context->tx[i]) return -ENOMEM; + if (is_streaming(context)) + break; } context->mhi_dev = mhi_dev; INIT_WORK(&context->fw_work, sahara_processing); INIT_WORK(&context->dump_work, sahara_dump_processing); - - if (!strcmp(mhi_dev->mhi_cntrl->name, "AIC200")) { - context->image_table = aic200_image_table; - context->table_size = ARRAY_SIZE(aic200_image_table); - } else { - context->image_table = aic100_image_table; - context->table_size = ARRAY_SIZE(aic100_image_table); - } + INIT_WORK(&context->read_data_work, sahara_read_data_processing); context->active_image_id = SAHARA_IMAGE_ID_NONE; dev_set_drvdata(&mhi_dev->dev, context); @@ -814,6 +888,10 @@ static void sahara_mhi_remove(struct mhi_device *mhi_dev) static void sahara_mhi_ul_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result) { + struct sahara_context *context = dev_get_drvdata(&mhi_dev->dev); + + if (!mhi_result->transaction_status && context->read_data_length && is_streaming(context)) + schedule_work(&context->read_data_work); } static void sahara_mhi_dl_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result) diff --git a/drivers/accel/rocket/rocket_gem.c b/drivers/accel/rocket/rocket_gem.c index 0551e11cc184..624c4ecf5a34 100644 --- a/drivers/accel/rocket/rocket_gem.c +++ b/drivers/accel/rocket/rocket_gem.c @@ -2,6 +2,7 @@ /* Copyright 2024-2025 Tomeu Vizoso <tomeu@tomeuvizoso.net> */ #include <drm/drm_device.h> +#include <drm/drm_print.h> #include <drm/drm_utils.h> #include <drm/rocket_accel.h> #include <linux/dma-mapping.h> |
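Note on the SSR devcoredump layout: the blob handed to dev_coredumpv() above is a struct dump_file_meta header, followed by the converted debug_info_table entries, followed by the raw dump data. The sketch below is a minimal host-side parser under stated assumptions: the devcoredump "data" path is illustrative, and fields are assumed to be in host endianness since the driver converts the table with le64_to_cpus() before copying it into the dump.

/*
 * Illustrative only: parse the header and table of a v1 QAIC SSR devcoredump.
 * Struct layouts mirror dump_file_meta and debug_info_table in qaic_ssr.c.
 */
#include <stdint.h>
#include <stdio.h>

struct dump_file_meta {
	uint64_t magic;    /* QAIC_SSR_DUMP_V1_MAGIC */
	uint64_t version;  /* QAIC_SSR_DUMP_V1_VER */
	uint64_t size;     /* total size of the entire dump */
	uint64_t tbl_len;  /* length of the table in bytes */
};

struct debug_info_table {
	uint64_t save_perf;
	uint64_t mem_base;
	uint64_t len;
	char desc[20];
	char filename[20];
};

int main(int argc, char **argv)
{
	/* e.g. /sys/class/devcoredump/devcd1/data (path is an assumption) */
	FILE *f = fopen(argc > 1 ? argv[1] : "data", "rb");
	struct dump_file_meta meta;
	struct debug_info_table ent;
	uint64_t off;

	if (!f || fread(&meta, sizeof(meta), 1, f) != 1)
		return 1;
	if (meta.magic != 0x1234567890abcdefULL || meta.version != 1) {
		fprintf(stderr, "not a v1 QAIC SSR dump\n");
		return 1;
	}
	/* Table entries follow the header; the raw dump data follows the table. */
	for (off = 0; off + sizeof(ent) <= meta.tbl_len; off += sizeof(ent)) {
		if (fread(&ent, sizeof(ent), 1, f) != 1)
			return 1;
		printf("%-20.20s %-20.20s base=0x%llx len=%llu\n",
		       ent.desc, ent.filename,
		       (unsigned long long)ent.mem_base,
		       (unsigned long long)ent.len);
	}
	fclose(f);
	return 0;
}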

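The per-DBC state attribute added in qaic_sysfs.c can be consumed from user space with a plain sysfs read. The attribute name (dbcN_state) and the DBC_ID=/DBC_STATE= uevent keys come from the patch itself; the /sys/class/accel/accel0 path in this sketch is an assumption about where the accel device node appears on a given system.

/* Illustrative only: read the dbc0 state exposed by the qaic sysfs attribute. */
#include <stdio.h>

int main(void)
{
	/* Assumed location of the accel device's sysfs directory. */
	FILE *f = fopen("/sys/class/accel/accel0/dbc0_state", "r");
	int state;

	if (!f || fscanf(f, "%d", &state) != 1)
		return 1;
	/*
	 * State transitions are also announced via KOBJ_CHANGE uevents
	 * carrying DBC_ID=<id> and DBC_STATE=<state>, so a daemon could
	 * instead watch "udevadm monitor --property" for the same events.
	 */
	printf("dbc0 state: %d\n", state);
	fclose(f);
	return 0;
}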