/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES */ #ifndef __GENERIC_PT_COMMON_H #define __GENERIC_PT_COMMON_H #include #include #include /** * DOC: Generic Radix Page Table * * Generic Radix Page Table is a set of functions and helpers to efficiently * parse radix style page tables typically seen in HW implementations. The * interface is built to deliver similar code generation as the mm's pte/pmd/etc * system by fully inlining the exact code required to handle each table level. * * Like the mm subsystem each format contributes its parsing implementation * under common names and the common code implements the required algorithms. * * The system is divided into three logical levels: * * - The page table format and its manipulation functions * - Generic helpers to give a consistent API regardless of underlying format * - An algorithm implementation (e.g. IOMMU/DRM/KVM/MM) * * Multiple implementations are supported. The intention is to have the generic * format code be re-usable for whatever specialized implementation is required. * The generic code is solely about the format of the radix tree; it does not * include memory allocation or higher level decisions that are left for the * implementation. * * The generic framework supports a superset of functions across many HW * implementations: * * - Entries comprised of contiguous blocks of IO PTEs for larger page sizes * - Multi-level tables, up to 6 levels. Runtime selected top level * - Runtime variable table level size (ARM's concatenated tables) * - Expandable top level allowing dynamic sizing of table levels * - Optional leaf entries at any level * - 32-bit/64-bit virtual and output addresses, using every address bit * - Dirty tracking * - Sign extended addressing */ /** * struct pt_common - struct for all page table implementations */ struct pt_common { /** * @top_of_table: Encodes the table top pointer and the top level in a * single value. Must use READ_ONCE/WRITE_ONCE to access it. The lower * bits of the aligned table pointer are used for the level. */ uintptr_t top_of_table; /** * @max_oasz_lg2: Maximum number of bits the OA can contain. Upper bits * must be zero. This may be less than what the page table format * supports, but must not be more. */ u8 max_oasz_lg2; /** * @max_vasz_lg2: Maximum number of bits the VA can contain. Upper bits * are 0 or 1 depending on pt_full_va_prefix(). This may be less than * what the page table format supports, but must not be more. When * PT_FEAT_DYNAMIC_TOP is set this reflects the maximum VA capability. */ u8 max_vasz_lg2; /** * @features: Bitmap of `enum pt_features` */ unsigned int features; }; /* Encoding parameters for top_of_table */ enum { PT_TOP_LEVEL_BITS = 3, PT_TOP_LEVEL_MASK = GENMASK(PT_TOP_LEVEL_BITS - 1, 0), }; /** * enum pt_features - Features turned on in the table. Each symbol is a bit * position. */ enum pt_features { /** * @PT_FEAT_DMA_INCOHERENT: Cache flush page table memory before * assuming the HW can read it. Otherwise a SMP release is sufficient * for HW to read it. */ PT_FEAT_DMA_INCOHERENT, /** * @PT_FEAT_FULL_VA: The table can span the full VA range from 0 to * PT_VADDR_MAX. */ PT_FEAT_FULL_VA, /** * @PT_FEAT_DYNAMIC_TOP: The table's top level can be increased * dynamically during map. This requires HW support for atomically * setting both the table top pointer and the starting table level. */ PT_FEAT_DYNAMIC_TOP, /** * @PT_FEAT_SIGN_EXTEND: The top most bit of the valid VA range sign * extends up to the full pt_vaddr_t. This divides the page table into * three VA ranges:: * * 0 -> 2^N - 1 Lower * 2^N -> (MAX - 2^N - 1) Non-Canonical * MAX - 2^N -> MAX Upper * * In this mode pt_common::max_vasz_lg2 includes the sign bit and the * upper bits that don't fall within the translation are just validated. * * If not set there is no sign extension and valid VA goes from 0 to 2^N * - 1. */ PT_FEAT_SIGN_EXTEND, /** * @PT_FEAT_FLUSH_RANGE: IOTLB maintenance is done by flushing IOVA * ranges which will clean out any walk cache or any IOPTE fully * contained by the range. The optimization objective is to minimize the * number of flushes even if ranges include IOVA gaps that do not need * to be flushed. */ PT_FEAT_FLUSH_RANGE, /** * @PT_FEAT_FLUSH_RANGE_NO_GAPS: Like PT_FEAT_FLUSH_RANGE except that * the optimization objective is to only flush IOVA that has been * changed. This mode is suitable for cases like hypervisor shadowing * where flushing unchanged ranges may cause the hypervisor to reparse * significant amount of page table. */ PT_FEAT_FLUSH_RANGE_NO_GAPS, /* private: */ PT_FEAT_FMT_START, }; struct pt_amdv1 { struct pt_common common; }; enum { /* * The memory backing the tables is encrypted. Use __sme_set() to adjust * the page table pointers in the tree. This only works with * CONFIG_AMD_MEM_ENCRYPT. */ PT_FEAT_AMDV1_ENCRYPT_TABLES = PT_FEAT_FMT_START, /* * The PTEs are set to prevent cache incoherent traffic, such as PCI no * snoop. This is set either at creation time or before the first map * operation. */ PT_FEAT_AMDV1_FORCE_COHERENCE, }; struct pt_vtdss { struct pt_common common; }; enum { /* * The PTEs are set to prevent cache incoherent traffic, such as PCI no * snoop. This is set either at creation time or before the first map * operation. */ PT_FEAT_VTDSS_FORCE_COHERENCE = PT_FEAT_FMT_START, /* * Prevent creating read-only PTEs. Used to work around HW errata * ERRATA_772415_SPR17. */ PT_FEAT_VTDSS_FORCE_WRITEABLE, }; struct pt_x86_64 { struct pt_common common; }; enum { /* * The memory backing the tables is encrypted. Use __sme_set() to adjust * the page table pointers in the tree. This only works with * CONFIG_AMD_MEM_ENCRYPT. */ PT_FEAT_X86_64_AMD_ENCRYPT_TABLES = PT_FEAT_FMT_START, }; #endif