// SPDX-License-Identifier: GPL-2.0

#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/pgtable.h>

#include <asm/init.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/sev.h>
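
/*
 * Scratch page tables used for the early boot mappings, and the index of the
 * next unused one.
 */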
extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
extern unsigned int next_early_pgt;

static inline bool check_la57_support(void)
{
	/*
	 * 5-level paging is detected and enabled at kernel decompression
	 * stage. Only check if it has been enabled there.
	 */
	if (!(native_read_cr4() & X86_CR4_LA57))
		return false;
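
	/*
	 * Five paging levels are in use: record it and switch the paging
	 * geometry accordingly (PGDIR_SHIFT becomes 48 and the new P4D
	 * level gets 512 entries).
	 */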
	__pgtable_l5_enabled = 1;
	pgdir_shift = 48;
	ptrs_per_p4d = 512;

	return true;
}

static unsigned long __head sme_postprocess_startup(struct boot_params *bp,
						    pmdval_t *pmd,
						    unsigned long p2v_offset)
{
	unsigned long paddr, paddr_end;
	int i;

	/* Encrypt the kernel and related (if SME is active) */
	sme_encrypt_kernel(bp);

	/*
	 * Clear the memory encryption mask from the .bss..decrypted section.
	 * The bss section will be memset to zero later in the initialization so
	 * there is no need to zero it after changing the memory encryption
	 * attribute.
	 */
	if (sme_get_me_mask()) {
		paddr = (unsigned long)rip_rel_ptr(__start_bss_decrypted);
		paddr_end = (unsigned long)rip_rel_ptr(__end_bss_decrypted);

		for (; paddr < paddr_end; paddr += PMD_SIZE) {
			/*
			 * On SNP, transition the page to shared in the RMP table so that
			 * it is consistent with the page table attribute change.
			 *
			 * __start_bss_decrypted has a virtual address in the high range
			 * mapping (kernel .text). PVALIDATE, by way of
			 * early_snp_set_memory_shared(), requires a valid virtual
			 * address but the kernel is currently running off of the identity
			 * mapping so use the PA to get a *currently* valid virtual address.
			 */
			early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD);
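
			/*
			 * Strip the encryption mask from the PMD entry that
			 * maps this 2M range in the kernel's high mapping.
			 */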
			i = pmd_index(paddr - p2v_offset);
			pmd[i] -= sme_get_me_mask();
		}
	}

	/*
	 * Return the SME encryption mask (if SME is active) to be used as a
	 * modifier for the initial pgdir entry programmed into CR3.
	 */
	return sme_get_me_mask();
}

/*
 * This code is compiled using PIC codegen because it will execute from the
 * early 1:1 mapping of memory, which deviates from the mapping expected by the
 * linker. Due to this deviation, taking the address of a global variable will
 * produce an ambiguous result when using the plain & operator. Instead,
 * rip_rel_ptr() must be used, which will return the RIP-relative address in
 * the 1:1 mapping of memory. Kernel virtual addresses can be determined by
 * subtracting p2v_offset from the RIP-relative address.
 */
unsigned long __head __startup_64(unsigned long p2v_offset,
				  struct boot_params *bp)
{
	pmd_t (*early_pgts)[PTRS_PER_PMD] = rip_rel_ptr(early_dynamic_pgts);
	unsigned long physaddr = (unsigned long)rip_rel_ptr(_text);
	unsigned long va_text, va_end;
	unsigned long pgtable_flags;
	unsigned long load_delta;
	pgdval_t *pgd;
	p4dval_t *p4d;
	pudval_t *pud;
	pmdval_t *pmd, pmd_entry;
	bool la57;
	int i;

	la57 = check_la57_support();

	/* Is the address too large? */
	if (physaddr >> MAX_PHYSMEM_BITS)
		for (;;);

	/*
	 * Compute the delta between the address I am compiled to run at
	 * and the address I am actually running at.
	 */
	phys_base = load_delta = __START_KERNEL_map + p2v_offset;

	/* Is the address not 2M aligned? */
	if (load_delta & ~PMD_MASK)
		for (;;);
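
	/*
	 * The kernel-virtual start and end of the image, obtained by
	 * subtracting p2v_offset from the RIP-relative (1:1) addresses.
	 */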
	va_text = physaddr - p2v_offset;
	va_end = (unsigned long)rip_rel_ptr(_end) - p2v_offset;

	/* Include the SME encryption mask in the fixup value */
	load_delta += sme_get_me_mask();

	/* Fixup the physical addresses in the page table */
	pgd = rip_rel_ptr(early_top_pgt);
	pgd[pgd_index(__START_KERNEL_map)] += load_delta;
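
	/*
	 * With 5-level paging, level4_kernel_pgt sits between the top level
	 * and level3_kernel_pgt: fix up its last entry as well and point the
	 * kernel's PGD slot at it.
	 */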
	if (la57) {
		p4d = (p4dval_t *)rip_rel_ptr(level4_kernel_pgt);
		p4d[MAX_PTRS_PER_P4D - 1] += load_delta;

		pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE;
	}
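
	/*
	 * The last two entries of level3_kernel_pgt map the kernel image and
	 * the fixmap area; adjust them, and the fixmap PMDs below them, by
	 * the same load delta.
	 */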
	level3_kernel_pgt[PTRS_PER_PUD - 2].pud += load_delta;
	level3_kernel_pgt[PTRS_PER_PUD - 1].pud += load_delta;

	for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
		level2_fixmap_pgt[i].pmd += load_delta;

	/*
	 * Set up the identity mapping for the switchover. These
	 * entries should *NOT* have the global bit set! This also
	 * creates a bunch of nonsense entries but that is fine --
	 * it avoids problems around wraparound.
	 */
	pud = &early_pgts[0]->pmd;
	pmd = &early_pgts[1]->pmd;
	next_early_pgt = 2;

	pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
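
	/*
	 * Two adjacent entries are installed at each level so the 1:1 mapping
	 * stays valid even when the kernel image straddles an alignment
	 * boundary of that level.
	 */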
	if (la57) {
		p4d = &early_pgts[next_early_pgt++]->pmd;

		i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
		pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
		pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;

		i = physaddr >> P4D_SHIFT;
		p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
		p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
	} else {
		i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
		pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
		pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
	}
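
	/* One level down: both PUD slots point at the shared PMD table. */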
	i = physaddr >> PUD_SHIFT;
	pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
	pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
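
	/*
	 * Build the template for the 2M mappings: a large executable kernel
	 * page without the global bit, carrying the SME mask and the physical
	 * load address, then map the whole image from _text to _end.
	 */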
	pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
	pmd_entry += sme_get_me_mask();
	pmd_entry += physaddr;

	for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) {
		int idx = i + (physaddr >> PMD_SHIFT);

		pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
	}

	/*
	 * Fixup the kernel text+data virtual addresses. Note that
	 * we might write invalid pmds, when the kernel is relocated
	 * cleanup_highmap() fixes this up along with the mappings
	 * beyond _end.
	 *
	 * Only the region occupied by the kernel image has so far
	 * been checked against the table of usable memory regions
	 * provided by the firmware, so invalidate pages outside that
	 * region. A page table entry that maps to a reserved area of
	 * memory would allow processor speculation into that area,
	 * and on some hardware (particularly the UV platform) even
	 * speculative access to some reserved areas is caught as an
	 * error, causing the BIOS to halt the system.
	 */
	pmd = rip_rel_ptr(level2_kernel_pgt);

	/* invalidate pages before the kernel image */
	for (i = 0; i < pmd_index(va_text); i++)
		pmd[i] &= ~_PAGE_PRESENT;

	/* fixup pages that are part of the kernel image */
	for (; i <= pmd_index(va_end); i++)
		if (pmd[i] & _PAGE_PRESENT)
			pmd[i] += load_delta;

	/* invalidate pages after the kernel image */
	for (; i < PTRS_PER_PMD; i++)
		pmd[i] &= ~_PAGE_PRESENT;

	return sme_postprocess_startup(bp, pmd, p2v_offset);
}