From 3698807094ecae945436921325f5c309d1123f11 Mon Sep 17 00:00:00 2001 From: Rajneesh Bhardwaj Date: Tue, 24 Aug 2021 16:13:41 -0400 Subject: drm/amdkfd: CRIU Introduce Checkpoint-Restore APIs Checkpoint-Restore in userspace (CRIU) is a powerful tool that can snapshot a running process and later restore it on same or a remote machine but expects the processes that have a device file (e.g. GPU) associated with them, provide necessary driver support to assist CRIU and its extensible plugin interface. Thus, In order to support the Checkpoint-Restore of any ROCm process, the AMD Radeon Open Compute Kernel driver, needs to provide a set of new APIs that provide necessary VRAM metadata and its contents to a userspace component (CRIU plugin) that can store it in form of image files. This introduces some new ioctls which will be used to checkpoint-Restore any KFD bound user process. KFD only allows ioctl calls from the same process that opened the KFD file descriptor. Since these ioctls are expected to be called from a KFD criu plugin which has elevated ptrace attached privileges and CAP_CHECKPOINT_RESTORE capabilities attached with the file descriptors so modify KFD to allow such calls. (API redesigned by David Yat Sin) Suggested-by: Felix Kuehling Reviewed-by: Felix Kuehling Signed-off-by: David Yat Sin Signed-off-by: Rajneesh Bhardwaj Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 81 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index af96af174dc4..49429a6c42fc 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -468,6 +468,82 @@ struct kfd_ioctl_smi_events_args { __u32 anon_fd; /* from KFD */ }; +/************************************************************************************************** + * CRIU IOCTLs (Checkpoint Restore In Userspace) + * + * When checkpointing a process, the userspace application will perform: + * 1. PROCESS_INFO op to determine current process information. This pauses execution and evicts + * all the queues. + * 2. CHECKPOINT op to checkpoint process contents (BOs, queues, events, svm-ranges) + * 3. UNPAUSE op to un-evict all the queues + * + * When restoring a process, the CRIU userspace application will perform: + * + * 1. RESTORE op to restore process contents + * 2. RESUME op to start the process + * + * Note: Queues are forced into an evicted state after a successful PROCESS_INFO. User + * application needs to perform an UNPAUSE operation after calling PROCESS_INFO. + */ + +enum kfd_criu_op { + KFD_CRIU_OP_PROCESS_INFO, + KFD_CRIU_OP_CHECKPOINT, + KFD_CRIU_OP_UNPAUSE, + KFD_CRIU_OP_RESTORE, + KFD_CRIU_OP_RESUME, +}; + +/** + * kfd_ioctl_criu_args - Arguments perform CRIU operation + * @devices: [in/out] User pointer to memory location for devices information. + * This is an array of type kfd_criu_device_bucket. + * @bos: [in/out] User pointer to memory location for BOs information + * This is an array of type kfd_criu_bo_bucket. + * @priv_data: [in/out] User pointer to memory location for private data + * @priv_data_size: [in/out] Size of priv_data in bytes + * @num_devices: [in/out] Number of GPUs used by process. Size of @devices array. + * @num_bos [in/out] Number of BOs used by process. Size of @bos array. + * @num_objects: [in/out] Number of objects used by process. Objects are opaque to + * user application. + * @pid: [in/out] PID of the process being checkpointed + * @op [in] Type of operation (kfd_criu_op) + * + * Return: 0 on success, -errno on failure + */ +struct kfd_ioctl_criu_args { + __u64 devices; /* Used during ops: CHECKPOINT, RESTORE */ + __u64 bos; /* Used during ops: CHECKPOINT, RESTORE */ + __u64 priv_data; /* Used during ops: CHECKPOINT, RESTORE */ + __u64 priv_data_size; /* Used during ops: PROCESS_INFO, RESTORE */ + __u32 num_devices; /* Used during ops: PROCESS_INFO, RESTORE */ + __u32 num_bos; /* Used during ops: PROCESS_INFO, RESTORE */ + __u32 num_objects; /* Used during ops: PROCESS_INFO, RESTORE */ + __u32 pid; /* Used during ops: PROCESS_INFO, RESUME */ + __u32 op; +}; + +struct kfd_criu_device_bucket { + __u32 user_gpu_id; + __u32 actual_gpu_id; + __u32 drm_fd; + __u32 pad; +}; + +struct kfd_criu_bo_bucket { + __u64 addr; + __u64 size; + __u64 offset; + __u64 restored_offset; /* During restore, updated offset for BO */ + __u32 gpu_id; /* This is the user_gpu_id */ + __u32 alloc_flags; + __u32 dmabuf_fd; + __u32 pad; +}; + +/* CRIU IOCTLs - END */ +/**************************************************************************************************/ + /* Register offset inside the remapped mmio page */ enum kfd_mmio_remap { @@ -742,7 +818,10 @@ struct kfd_ioctl_set_xnack_mode_args { #define AMDKFD_IOC_SET_XNACK_MODE \ AMDKFD_IOWR(0x21, struct kfd_ioctl_set_xnack_mode_args) +#define AMDKFD_IOC_CRIU_OP \ + AMDKFD_IOWR(0x22, struct kfd_ioctl_criu_args) + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x22 +#define AMDKFD_COMMAND_END 0x23 #endif -- cgit From 692996f2bef7aa1737e07554255ba0d9a73fb750 Mon Sep 17 00:00:00 2001 From: Rajneesh Bhardwaj Date: Tue, 18 Jan 2022 01:47:58 -0500 Subject: drm/amdkfd: Bump up KFD API version for CRIU - Change KFD minor version to 7 for CRIU Proposed userspace changes: https://github.com/RadeonOpenCompute/criu Reviewed-by: Felix Kuehling Signed-off-by: Rajneesh Bhardwaj Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 49429a6c42fc..e6a56c146920 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -32,9 +32,10 @@ * - 1.4 - Indicate new SRAM EDC bit in device properties * - 1.5 - Add SVM API * - 1.6 - Query clear flags in SVM get_attr API + * - 1.7 - Checkpoint Restore (CRIU) API */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 6 +#define KFD_IOCTL_MINOR_VERSION 7 struct kfd_ioctl_get_version_args { __u32 major_version; /* from KFD */ -- cgit From 5bdd3eb253544b1e80f904e1205699d0a126d2d6 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Fri, 4 Feb 2022 11:58:32 -0500 Subject: drm/amdkfd: Remove unused old debugger implementation Cleanup the kfd code by removing the unused old debugger implementation. The address watch was only ever implemented in the upstream driver for GFXv7 (Kaveri). The user mode tools runtime using this API was never open-sourced. Work on the old debugger prototype that used this API has been discontinued years ago. Only a small piece of resetting wavefronts is kept and is moved to kfd_device_queue_manager.c. Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index e6a56c146920..6e4268f5e482 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -756,16 +756,16 @@ struct kfd_ioctl_set_xnack_mode_args { #define AMDKFD_IOC_WAIT_EVENTS \ AMDKFD_IOWR(0x0C, struct kfd_ioctl_wait_events_args) -#define AMDKFD_IOC_DBG_REGISTER \ +#define AMDKFD_IOC_DBG_REGISTER_DEPRECATED \ AMDKFD_IOW(0x0D, struct kfd_ioctl_dbg_register_args) -#define AMDKFD_IOC_DBG_UNREGISTER \ +#define AMDKFD_IOC_DBG_UNREGISTER_DEPRECATED \ AMDKFD_IOW(0x0E, struct kfd_ioctl_dbg_unregister_args) -#define AMDKFD_IOC_DBG_ADDRESS_WATCH \ +#define AMDKFD_IOC_DBG_ADDRESS_WATCH_DEPRECATED \ AMDKFD_IOW(0x0F, struct kfd_ioctl_dbg_address_watch_args) -#define AMDKFD_IOC_DBG_WAVE_CONTROL \ +#define AMDKFD_IOC_DBG_WAVE_CONTROL_DEPRECATED \ AMDKFD_IOW(0x10, struct kfd_ioctl_dbg_wave_control_args) #define AMDKFD_IOC_SET_SCRATCH_BACKING_VA \ -- cgit From 79aa0367385ceaf5351ea77ea1fb66136739ea9d Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Tue, 15 Feb 2022 18:54:38 -0500 Subject: drm/amdkfd: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reference: https://www.kernel.org/doc/html/latest/process/deprecated.html#zero-length-and-one-element-arrays CC: Changcheng Deng Signed-off-by: Felix Kuehling Reviewed-by: Philip Yang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 6e4268f5e482..baec5a41de3e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -673,7 +673,7 @@ struct kfd_ioctl_svm_args { __u32 op; __u32 nattr; /* Variable length array of attributes */ - struct kfd_ioctl_svm_attribute attrs[0]; + struct kfd_ioctl_svm_attribute attrs[]; }; /** -- cgit From d58b8a99cbb84c1eb3b3613d23c1a328695a9455 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Fri, 25 Feb 2022 12:33:51 -0500 Subject: drm/amdkfd: Add SMI add event helper To remove duplicate code, unify event message format and simplify new event add in the following patches. Use KFD_SMI_EVENT_MSG_SIZE to define msg size, the same size will be used in user space to alloc the msg receive buffer. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index baec5a41de3e..b40687bf1014 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -463,6 +463,7 @@ enum kfd_smi_event { }; #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1)) +#define KFD_SMI_EVENT_MSG_SIZE 96 struct kfd_ioctl_smi_events_args { __u32 gpuid; /* to KFD */ -- cgit From 65722ff6181aa52c3d5b0929004af22a3a63e148 Mon Sep 17 00:00:00 2001 From: David Yat Sin Date: Tue, 8 Mar 2022 14:00:50 -0500 Subject: drm/amdkfd: CRIU export dmabuf handles for GTT BOs Export dmabuf handles for GTT BOs so that their contents can be accessed using SDMA during checkpoint/restore. v2: Squash in fix from David to set dmabuf handle to invalid for BOs that cannot be accessed using SDMA during checkpoint/restore. Signed-off-by: David Yat Sin Reviewed-by : Rajneesh Bhardwaj Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index b40687bf1014..42975e940758 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -33,9 +33,10 @@ * - 1.5 - Add SVM API * - 1.6 - Query clear flags in SVM get_attr API * - 1.7 - Checkpoint Restore (CRIU) API + * - 1.8 - CRIU - Support for SDMA transfers with GTT BOs */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 7 +#define KFD_IOCTL_MINOR_VERSION 8 struct kfd_ioctl_get_version_args { __u32 major_version; /* from KFD */ @@ -195,6 +196,8 @@ struct kfd_ioctl_dbg_wave_control_args { __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ }; +#define KFD_INVALID_FD 0xffffffff + /* Matching HSA_EVENTTYPE */ #define KFD_IOC_EVENT_SIGNAL 0 #define KFD_IOC_EVENT_NODECHANGE 1 -- cgit