1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
|
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* relocate_kernel.S - put the kernel image in place to boot
* Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
*/
#include <linux/linkage.h>
#include <linux/stringify.h>
#include <asm/alternative.h>
#include <asm/page_types.h>
#include <asm/kexec.h>
#include <asm/processor-flags.h>
#include <asm/pgtable_types.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
#include <asm/asm-offsets.h>
/*
 * Must be relocatable PIC code callable as a C function; in particular,
 * every return must be a plain RET and not a jump to the return thunk.
 */
/*
 * Helper macros.
 *  - PTR(x) shifts x left by 3, i.e. scales an index by 8.
 *  - PAGE_ATTR ORs together the present, RW, accessed and dirty page flags.
 *
 * NOTE(review): neither macro is referenced in the visible part of this file.
 * NOTE(review): PTR() does not parenthesise its argument in the expansion, so
 * only pass it a single term.
 */
#define PTR(x) (x << 3)
#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
/*
* The .text..relocate_kernel and .data..relocate_kernel sections are copied
* into the control page, and the remainder of the page is used as the stack.
*/
/*
 * Variables used by the relocation code. Every access to them in this file is
 * %rip-relative. The SYM_DATA() ones are global; nothing in the visible code
 * writes those, so presumably they are filled in by the caller's side.
 */
.section .data..relocate_kernel,"a";
/* Minimal CPU state */
/* %rsp stored by relocate_kernel after its pushes; reloaded by virtual_mapped */
SYM_DATA_LOCAL(saved_rsp, .quad 0)
/* CR0 as read in relocate_kernel; written back by virtual_mapped */
SYM_DATA_LOCAL(saved_cr0, .quad 0)
/* CR3 as read in relocate_kernel before the page-table switch */
SYM_DATA_LOCAL(saved_cr3, .quad 0)
/* CR4 as read in relocate_kernel before PGE is cleared */
SYM_DATA_LOCAL(saved_cr4, .quad 0)
/* other data */
/* Base used by identity_mapped to compute the address of virtual_mapped */
SYM_DATA(kexec_va_control_page, .quad 0)
/* Value loaded into CR3 by relocate_kernel and identity_mapped */
SYM_DATA(kexec_pa_table_page, .quad 0)
/* Scratch page for swap_pages; also hosts the callee's stack in .Lrelocate */
SYM_DATA(kexec_pa_swap_page, .quad 0)
/* Copy of the indirection_page argument, reused when swapping pages back */
SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
/* Debug UART: MMIO base (used by pr_setup if the port below is zero) */
SYM_DATA(kexec_debug_8250_mmio32, .quad 0)
/* Debug UART: I/O port base (pr_setup prefers it when non-zero) */
SYM_DATA(kexec_debug_8250_port, .word 0)
/*
 * GDT installed by identity_mapped.
 *
 * The first 8 bytes (.word/.long/.word) occupy the slot of descriptor 0.
 * Their leading word holds the table limit (size in bytes minus one);
 * identity_mapped fetches it with "pushw (%rax)" while building the GDTR
 * on the stack. Three 8-byte descriptors follow, labelled below.
 */
.balign 16
SYM_DATA_START_LOCAL(kexec_debug_gdt)
.word kexec_debug_gdt_end - kexec_debug_gdt - 1
.long 0
.word 0
.quad 0x00cf9a000000ffff /* __KERNEL32_CS */
.quad 0x00af9a000000ffff /* __KERNEL_CS */
.quad 0x00cf92000000ffff /* __KERNEL_DS */
SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end)
/*
 * IDT storage: 0x100 zero-initialised bytes. identity_mapped loads it with a
 * limit of 0xff, i.e. exactly this area. Nothing in the visible part of this
 * file fills in the entries; the comment in identity_mapped says the kernel
 * creates them.
 */
.balign 8
SYM_DATA_START(kexec_debug_idt)
.skip 0x100, 0x00
SYM_DATA_END(kexec_debug_idt)
.section .text..relocate_kernel,"ax";
.code64
/*
 * relocate_kernel - entry point of the relocation code.
 *
 * Saves the callee-saved registers, RFLAGS, %rsp and CR0/CR3/CR4, switches
 * to the page tables in kexec_pa_table_page, moves the stack to the end of
 * the control page and jumps to identity_mapped through the control page's
 * physical address.
 *
 * Control does not come back to this routine. The matching pops and the RET
 * to the original caller are in virtual_mapped.
 */
SYM_CODE_START_NOALIGN(relocate_kernel)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
/*
 * %rdi indirection_page
 * %rsi pa_control_page
 * %rdx start address
 * %rcx preserve_context
 * %r8 host_mem_enc_active
 */
/*
 * Save the CPU context, used for jumping back. virtual_mapped pops these in
 * the reverse order.
 */
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
pushf
/*
 * Invalidate GDT/IDT, zero out flags.
 * The two zero quadwords serve as an all-zero descriptor-table operand for
 * LIDT/LGDT. One is then dropped and the other is popped into RFLAGS, which
 * leaves the flags saved by PUSHF above on top of the stack.
 */
pushq $0
pushq $0
lidt (%rsp)
lgdt (%rsp)
addq $8, %rsp
popfq
/* Switch to the identity mapped page tables; old CR3 is kept in %rax */
movq %cr3, %rax
movq kexec_pa_table_page(%rip), %r9
movq %r9, %cr3
/* Leave CR4 in %r13 to enable the right paging mode later. */
movq %cr4, %r13
/* Disable global pages immediately to ensure this mapping is RWX */
movq %r13, %r12
andq $~(X86_CR4_PGE), %r12
movq %r12, %cr4
/*
 * Save %rsp and CRs. saved_cr4 receives the unmodified CR4 (%r13) and
 * saved_cr3 the CR3 value read before the switch above (%rax).
 */
movq %r13, saved_cr4(%rip)
movq %rsp, saved_rsp(%rip)
movq %rax, saved_cr3(%rip)
movq %cr0, %rax
movq %rax, saved_cr0(%rip)
/* save indirection list for jumping back */
movq %rdi, pa_backup_pages_map(%rip)
/* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
movq %rcx, %r11
/* setup a new stack at the end of the physical control page */
lea PAGE_SIZE(%rsi), %rsp
/*
 * jump to identity mapped page
 * Net effect of the two instructions below:
 *   %rsi = pa_control_page + (identity_mapped - __relocate_kernel_start)
 * Both immediates are written relative to the local label 0.
 * NOTE(review): __relocate_kernel_start is defined outside this file;
 * presumably it marks the start of the image copied into the control page.
 */
0: addq $identity_mapped - 0b, %rsi
subq $__relocate_kernel_start - 0b, %rsi
ANNOTATE_RETPOLINE_SAFE
jmp *%rsi
SYM_CODE_END(relocate_kernel)
/*
 * identity_mapped - second stage, reached from relocate_kernel through the
 * physical address of the control page.
 *
 * Installs the debug GDT and IDT, puts CR0 and CR4 into a known state, runs
 * swap_pages over the indirection list and then either
 *  - returns into the start address with all registers but %rsp zeroed
 *    (preserve_context == 0), or
 *  - calls the start address and, once that returns, swaps the pages back
 *    and continues in virtual_mapped (preserve_context != 0).
 */
SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
UNWIND_HINT_END_OF_STACK
/*
 * %rdi indirection page
 * %rdx start address
 * %r8 host_mem_enc_active
 * %r9 page table page
 * %r11 preserve_context
 * %r13 original CR4 when relocate_kernel() was invoked
 */
/*
 * store the start address on the stack
 * It is consumed either by the RET on the non-preserving path or by the
 * POP at .Lrelocate.
 */
pushq %rdx
/*
 * Create a GDTR (16 bits limit, 64 bits addr) on stack
 * The base is pushed first, then the limit word read from the start of
 * kexec_debug_gdt, so the 10-byte operand starts at (%rsp).
 */
leaq kexec_debug_gdt(%rip), %rax
pushq %rax
pushw (%rax)
/* Load the GDT, put the stack back */
lgdt (%rsp)
addq $10, %rsp
/* Test that we can load segments */
movq %ds, %rax
movq %rax, %ds
/*
 * Now an IDTR on the stack to load the IDT the kernel created
 * Same layout as the GDTR above, with a fixed limit of 0xff.
 */
leaq kexec_debug_idt(%rip), %rsi
pushq %rsi
pushw $0xff
lidt (%rsp)
addq $10, %rsp
//int3
/*
 * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
 * below.
 */
movq %cr4, %rax
andq $~(X86_CR4_CET), %rax
movq %rax, %cr4
/*
 * Set cr0 to a known state:
 * - Paging enabled
 * - Alignment check disabled
 * - Write protect disabled
 * - No task switch
 * - Don't do FP software emulation.
 * - Protected mode enabled
 *
 * The ORL writes %eax, which also zeroes the upper half of %rax.
 */
movq %cr0, %rax
andq $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
orl $(X86_CR0_PG | X86_CR0_PE), %eax
movq %rax, %cr0
/*
 * Set cr4 to a known state:
 * - physical address extension enabled
 * - 5-level paging, if it was enabled before
 * - Machine check exception on TDX guest, if it was enabled before.
 * Clearing MCE might not be allowed in TDX guests, depending on setup.
 *
 * Use R13 that contains the original CR4 value, read in relocate_kernel().
 * PAE is always set in the original CR4.
 *
 * The ANDL keeps only PAE and LA57 from the original value. The ORL of MCE
 * is patched in only when X86_FEATURE_TDX_GUEST is set.
 */
andl $(X86_CR4_PAE | X86_CR4_LA57), %r13d
ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST
movq %r13, %cr4
/* Flush the TLB (needed?) */
movq %r9, %cr3
/*
 * If SME is active, there could be old encrypted cache line
 * entries that will conflict with the now unencrypted memory
 * used by kexec. Flush the caches before copying the kernel.
 */
testq %r8, %r8
jz .Lsme_off
wbinvd
.Lsme_off:
/* %rdi = indirection page, %r11 = preserve_context; %r11 is not modified */
call swap_pages
/*
 * To be certain of avoiding problems with self-modifying code
 * I need to execute a serializing instruction here.
 * So I flush the TLB by reloading %cr3 here, it's handy,
 * and not processor dependent.
 */
movq %cr3, %rax
movq %rax, %cr3
testq %r11, %r11 /* preserve_context */
jnz .Lrelocate
/*
 * set all of the registers to known values
 * leave %rsp alone
 */
xorl %eax, %eax
xorl %ebx, %ebx
xorl %ecx, %ecx
xorl %edx, %edx
xorl %esi, %esi
xorl %edi, %edi
xorl %ebp, %ebp
xorl %r8d, %r8d
xorl %r9d, %r9d
xorl %r10d, %r10d
xorl %r11d, %r11d
xorl %r12d, %r12d
xorl %r13d, %r13d
xorl %r14d, %r14d
xorl %r15d, %r15d
/* The RET pops the start address pushed at the top of this routine */
ANNOTATE_UNRET_SAFE
ret
int3
.Lrelocate:
/* preserve_context path: fetch the start address pushed on entry */
popq %rdx
/* Use the swap page for the callee's stack */
movq kexec_pa_swap_page(%rip), %r10
leaq PAGE_SIZE(%r10), %rsp
/* push the existing entry point onto the callee's stack */
pushq %rdx
ANNOTATE_RETPOLINE_SAFE
call *%rdx
/*
 * get the re-entry point of the peer system
 * This pops the slot that was pushed just before the CALL; the callee is
 * expected to have left its re-entry address there. It stays in %rbp until
 * virtual_mapped moves it to %rax.
 */
popq %rbp
/* NOTE(review): %r10 is not read again in the visible code after this load */
movq kexec_pa_swap_page(%rip), %r10
movq pa_backup_pages_map(%rip), %rdi
movq kexec_pa_table_page(%rip), %rax
movq %rax, %cr3
/*
 * Find start (and end) of this physical mapping of control page
 * %r8 = address of the instruction after the LEA, rounded down to a page
 * boundary; the stack is then placed at the end of that page.
 */
leaq (%rip), %r8
ANNOTATE_NOENDBR
andq $PAGE_MASK, %r8
lea PAGE_SIZE(%r8), %rsp
movl $1, %r11d /* Ensure preserve_context flag is set */
call swap_pages
/*
 * Compute kexec_va_control_page + (virtual_mapped - __relocate_kernel_start)
 * and jump there by pushing the address and executing RET.
 */
movq kexec_va_control_page(%rip), %rax
0: addq $virtual_mapped - 0b, %rax
subq $__relocate_kernel_start - 0b, %rax
pushq %rax
ANNOTATE_UNRET_SAFE
ret
int3
SYM_CODE_END(identity_mapped)
/*
 * virtual_mapped - final stage of the preserve_context return path.
 *
 * Reached through the RET at the end of identity_mapped. Restores the stack
 * pointer and control registers saved by relocate_kernel, pops the registers
 * relocate_kernel pushed and returns to relocate_kernel's caller.
 *
 * In:  %rbp  re-entry point obtained in identity_mapped
 * Out: %rax  that re-entry point
 */
SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR // RET target, above
/* Back onto the stack relocate_kernel was using when it saved %rsp */
movq saved_rsp(%rip), %rsp
movq saved_cr4(%rip), %rax
movq %rax, %cr4
/* Both values are loaded before CR3 is written; CR3 goes first, then CR0 */
movq saved_cr3(%rip), %rax
movq saved_cr0(%rip), %r8
movq %rax, %cr3
movq %r8, %cr0
#ifdef CONFIG_KEXEC_JUMP
/*
 * Saved in save_processor_state.
 * saved_context is referenced by absolute address here, unlike the
 * %rip-relative variables above; it and saved_context_gdt_desc are defined
 * outside this file.
 */
movq $saved_context, %rax
lgdt saved_context_gdt_desc(%rax)
#endif
/*
 * relocate_kernel() returns the re-entry point for next time
 * %rbp must be copied before it is overwritten by the POP below.
 */
movq %rbp, %rax
/* Mirror image of the pushes at the top of relocate_kernel */
popf
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
ANNOTATE_UNRET_SAFE
ret
int3
SYM_CODE_END(virtual_mapped)
/*
 * Do the copies
 *
 * swap_pages - walk the indirection list and copy or swap one page per
 * source entry.
 *
 * In:
 *   %rdi  first list entry (the caller's indirection page value)
 *   %r11  preserve_context: zero = copy source to destination,
 *         non-zero = exchange source and destination through the page at
 *         kexec_pa_swap_page
 *
 * Each entry carries an address in the bits above the low 12 and a type in
 * the low bits, tested in this order:
 *   0x1  destination  - sets the current destination page (%rdi)
 *   0x2  indirection  - sets the page further entries are read from (%rbx)
 *   0x4  done         - ends the walk
 *   0x8  source       - copy/swap one page
 * Entries with none of these bits set are skipped.
 *
 * %rbx is only ever set by an indirection entry, so the first entry has to
 * be one before anything is read through it.
 *
 * Clobbers %rax, %rbx, %rcx, %rdx, %rsi, %rdi and flags. %r11 is only read.
 * NOTE(review): the REP MOVSQ copies assume the direction flag is clear.
 */
SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
UNWIND_HINT_END_OF_STACK
/*
 * %rdi indirection page
 * %r11 preserve_context
 */
movq %rdi, %rcx /* Put the indirection_page in %rcx */
xorl %edi, %edi
xorl %esi, %esi
jmp .Lstart /* Should start with an indirection record */
.Lloop: /* top, read another word for the indirection page */
movq (%rbx), %rcx
addq $8, %rbx
.Lstart:
testb $0x1, %cl /* is it a destination page? */
jz .Lnotdest
movq %rcx, %rdi
andq $0xfffffffffffff000, %rdi
jmp .Lloop
.Lnotdest:
testb $0x2, %cl /* is it an indirection page? */
jz .Lnotind
movq %rcx, %rbx
andq $0xfffffffffffff000, %rbx
jmp .Lloop
.Lnotind:
testb $0x4, %cl /* is it the done indicator? */
jz .Lnotdone
jmp .Ldone
.Lnotdone:
testb $0x8, %cl /* is it the source indicator? */
jz .Lloop /* Ignore it otherwise */
movq %rcx, %rsi /* For every source page do a copy */
andq $0xfffffffffffff000, %rsi
movq %rdi, %rdx /* Save destination page to %rdx */
movq %rsi, %rax /* Save source page to %rax */
testq %r11, %r11 /* Only actually swap for ::preserve_context */
jz .Lnoswap
/* Each copy below moves 512 quadwords, i.e. 4096 bytes */
/* copy source page to swap page */
movq kexec_pa_swap_page(%rip), %rdi
movl $512, %ecx
rep movsq
/* copy destination page to source page */
movq %rax, %rdi
movq %rdx, %rsi
movl $512, %ecx
rep movsq
/* copy swap page to destination page */
movq %rdx, %rdi
movq kexec_pa_swap_page(%rip), %rsi
.Lnoswap:
/*
 * Final (or, without preserve_context, only) copy into the destination.
 * It leaves %rdi one page past the destination, so a following source
 * entry lands in the next destination page unless a new destination
 * entry intervenes.
 */
movl $512, %ecx
rep movsq
lea PAGE_SIZE(%rax), %rsi
jmp .Lloop
.Ldone:
ANNOTATE_UNRET_SAFE
ret
int3
SYM_CODE_END(swap_pages)
/*
 * Generic 'print character' routine
 * - %al: Character to be printed (may clobber %rax)
 * - %rdx: MMIO address or port.
 *
 * pr_setup selects one of the implementations below and leaves its address
 * in %rsi.
 */
/* Bit tested in the line status register before a character is written */
#define XMTRDY 0x20
/* Register indices; NOTE(review): TXR is not referenced in the visible code */
#define TXR 0 /* Transmit register (WRITE) */
#define LSR 5 /* Line Status */
/*
 * pr_char_8250 - port I/O variant.
 * In:  %al = character, %dx = base I/O port
 * %dx is adjusted to the line status port for polling and restored before
 * the character is written to the base port. %ah is clobbered.
 */
SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250)
UNWIND_HINT_FUNC
ANNOTATE_NOENDBR
addw $LSR, %dx
/* Park the character in %ah, since INB overwrites %al */
xchg %al, %ah
.Lxmtrdy_loop:
inb %dx, %al
testb $XMTRDY, %al
jnz .Lready
pause
jmp .Lxmtrdy_loop
.Lready:
subw $LSR, %dx
/* Bring the character back into %al */
xchg %al, %ah
outb %al, %dx
/*
 * pr_char_null shares the RET below; pr_setup selects it as a do-nothing
 * output routine when neither a port nor an MMIO address is configured.
 */
pr_char_null:
ANNOTATE_NOENDBR
ANNOTATE_UNRET_SAFE
ret
SYM_CODE_END(pr_char_8250)
/*
 * pr_char_8250_mmio32 - memory-mapped variant of the print character routine.
 * In:  %al = character, %rdx = MMIO base address
 * Registers are addressed at a 4-byte stride (LSR * 4). The status byte is
 * read into %ah, so the character in %al survives the polling loop.
 * Clobbers %ah and flags; %rdx is left unchanged.
 */
SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250_mmio32)
UNWIND_HINT_FUNC
ANNOTATE_NOENDBR
.Lxmtrdy_loop_mmio:
movb (LSR*4)(%rdx), %ah
testb $XMTRDY, %ah
jnz .Lready_mmio
pause
jmp .Lxmtrdy_loop_mmio
.Lready_mmio:
/* Write the character to offset 0 of the MMIO block */
movb %al, (%rdx)
ANNOTATE_UNRET_SAFE
ret
SYM_CODE_END(pr_char_8250_mmio32)
/*
 * Load pr_char function pointer into %rsi and load %rdx with whatever
 * that function wants to see there (typically port/MMIO address).
 *
 * Selection order:
 *  1. kexec_debug_8250_port non-zero   -> pr_char_8250, %dx = port
 *  2. kexec_debug_8250_mmio32 non-zero -> pr_char_8250_mmio32, %rdx = address
 *  3. otherwise                        -> pr_char_null (prints nothing)
 *
 * Clobbers %rsi, %rdx and flags. Uses the numeric local label 1.
 */
.macro pr_setup
leaq pr_char_8250(%rip), %rsi
movw kexec_debug_8250_port(%rip), %dx
testw %dx, %dx
jnz 1f
leaq pr_char_8250_mmio32(%rip), %rsi
movq kexec_debug_8250_mmio32(%rip), %rdx
testq %rdx, %rdx
jnz 1f
leaq pr_char_null(%rip), %rsi
1:
.endm
/*
 * Print the nybble in %bl, clobber %rax
 *
 * In:  %bl  = value; only the low four bits are used
 *      %rsi = output routine, %rdx = its port/MMIO argument (see pr_setup)
 * Emits one lowercase hex digit. The routine ends with a tail jump to the
 * output routine, whose RET returns to pr_nybble's caller.
 */
SYM_CODE_START_LOCAL_NOALIGN(pr_nybble)
UNWIND_HINT_FUNC
movb %bl, %al
/* NOTE(review): the purpose of this NOP is not evident from this file */
nop
andb $0x0f, %al
/* 0..9 map to '0'..'9' (0x30..0x39) */
addb $0x30, %al
cmpb $0x3a, %al
jb 1f
/* 10..15: move 0x3a..0x3f up to 'a'..'f' */
addb $('a' - '0' - 10), %al
ANNOTATE_RETPOLINE_SAFE
1: jmp *%rsi
SYM_CODE_END(pr_nybble)
/*
 * pr_qword - print %rbx as 16 hex digits followed by a newline.
 *
 * In:  %rbx = value, %rsi = output routine, %rdx = its port/MMIO argument
 * Each iteration rotates %rbx left by four bits, bringing the next most
 * significant nybble into %bl. After 16 rotations %rbx holds its original
 * value again.
 * Clobbers %rax, %rcx and flags. Relies on the output routine leaving
 * %rbx, %rcx, %rdx and %rsi intact, which the routines in this file do.
 * The newline is emitted by a tail jump, so the output routine's RET returns
 * to pr_qword's caller.
 */
SYM_CODE_START_LOCAL_NOALIGN(pr_qword)
UNWIND_HINT_FUNC
movq $16, %rcx
1: rolq $4, %rbx
call pr_nybble
loop 1b
movb $'\n', %al
ANNOTATE_RETPOLINE_SAFE
jmp *%rsi
SYM_CODE_END(pr_qword)
/*
 * print_reg a, b, c, d, r - print a four-character label followed by the
 * 64-bit value of operand \r in hex and a newline.
 *
 * \a..\d are character constants emitted one by one through the output
 * routine in %rsi. \r is any source operand valid for "movq \r, %rbx";
 * exc_handler passes registers, %cr2 and offsets from %rsp.
 * Requires %rsi/%rdx as set up by pr_setup.
 * Clobbers %rax, %rbx, %rcx and flags.
 */
.macro print_reg a, b, c, d, r
movb $\a, %al
ANNOTATE_RETPOLINE_SAFE
call *%rsi
movb $\b, %al
ANNOTATE_RETPOLINE_SAFE
call *%rsi
movb $\c, %al
ANNOTATE_RETPOLINE_SAFE
call *%rsi
movb $\d, %al
ANNOTATE_RETPOLINE_SAFE
call *%rsi
/* No call is outstanding here, so %rsp-relative operands see the caller's frame */
movq \r, %rbx
call pr_qword
.endm
/*
 * kexec_debug_exc_vectors - table of entry stubs for exception vectors 0-15.
 *
 * Stub N starts at kexec_debug_exc_vectors + N * KEXEC_DEBUG_EXC_HANDLER_SIZE
 * (the constant is defined outside this file). Every stub leaves the same
 * two words on the stack for exc_handler: an error code and, above it at
 * the lower address, the exception number.
 */
SYM_CODE_START_NOALIGN(kexec_debug_exc_vectors)
/* Each of these is 6 bytes. */
/*
 * vec_err: for exceptions where an error code is already on the stack.
 * Two NOPs stand in for the "pushq $0" of vec_noerr.
 */
.macro vec_err exc
UNWIND_HINT_ENTRY
/* Place this stub at the start of its fixed-size slot */
. = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
nop
nop
pushq $\exc
jmp exc_handler
.endm
/*
 * vec_noerr: for exceptions without an error code; pushes a zero in its
 * place so the frame layout matches vec_err.
 */
.macro vec_noerr exc
UNWIND_HINT_ENTRY
/* Place this stub at the start of its fixed-size slot */
. = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
pushq $0
pushq $\exc
jmp exc_handler
.endm
ANNOTATE_NOENDBR
vec_noerr 0 // #DE
vec_noerr 1 // #DB
vec_noerr 2 // #NMI
vec_noerr 3 // #BP
vec_noerr 4 // #OF
vec_noerr 5 // #BR
vec_noerr 6 // #UD
vec_noerr 7 // #NM
vec_err 8 // #DF
vec_noerr 9
vec_err 10 // #TS
vec_err 11 // #NP
vec_err 12 // #SS
vec_err 13 // #GP
vec_err 14 // #PF
vec_noerr 15
SYM_CODE_END(kexec_debug_exc_vectors)
/*
 * exc_handler - common tail of the kexec_debug_exc_vectors stubs.
 *
 * Dumps the exception number, error code, the interrupted RIP/RSP, the
 * general purpose registers and CR2 through the debug UART chosen by
 * pr_setup. Returns to the interrupted code only for exception 3 (INT3);
 * every other exception ends in a HLT loop.
 */
SYM_CODE_START_LOCAL_NOALIGN(exc_handler)
/* No need for RET mitigations during kexec */
VALIDATE_UNRET_END
/* Spill the registers that pr_setup and the print helpers overwrite */
pushq %rax
pushq %rbx
pushq %rcx
pushq %rdx
pushq %rsi
/*
 * Stack frame
 * Offsets are relative to %rsp after the five pushes above.
 */
#define EXC_SS 0x58 /* Architectural... */
#define EXC_RSP 0x50
#define EXC_EFLAGS 0x48
#define EXC_CS 0x40
#define EXC_RIP 0x38
#define EXC_ERRORCODE 0x30 /* Either architectural or zero pushed by handler */
#define EXC_EXCEPTION 0x28 /* Pushed by handler entry point */
#define EXC_RAX 0x20 /* Pushed just above in exc_handler */
#define EXC_RBX 0x18
#define EXC_RCX 0x10
#define EXC_RDX 0x08
#define EXC_RSI 0x00
/* Set up %rdx/%rsi for debug output */
pr_setup
/* rip and exception info */
print_reg 'E', 'x', 'c', ':', EXC_EXCEPTION(%rsp)
print_reg 'E', 'r', 'r', ':', EXC_ERRORCODE(%rsp)
print_reg 'r', 'i', 'p', ':', EXC_RIP(%rsp)
print_reg 'r', 's', 'p', ':', EXC_RSP(%rsp)
/* We spilled these to the stack */
print_reg 'r', 'a', 'x', ':', EXC_RAX(%rsp)
print_reg 'r', 'b', 'x', ':', EXC_RBX(%rsp)
print_reg 'r', 'c', 'x', ':', EXC_RCX(%rsp)
print_reg 'r', 'd', 'x', ':', EXC_RDX(%rsp)
print_reg 'r', 's', 'i', ':', EXC_RSI(%rsp)
/* Other registers untouched */
print_reg 'r', 'd', 'i', ':', %rdi
print_reg 'r', '8', ' ', ':', %r8
print_reg 'r', '9', ' ', ':', %r9
print_reg 'r', '1', '0', ':', %r10
print_reg 'r', '1', '1', ':', %r11
print_reg 'r', '1', '2', ':', %r12
print_reg 'r', '1', '3', ':', %r13
print_reg 'r', '1', '4', ':', %r14
print_reg 'r', '1', '5', ':', %r15
print_reg 'c', 'r', '2', ':', %cr2
/* Only return from INT3 */
cmpq $3, EXC_EXCEPTION(%rsp)
jne .Ldie
/* Restore the spilled registers in reverse push order */
popq %rsi
popq %rdx
popq %rcx
popq %rbx
popq %rax
/* Drop the exception number and error code, leaving the IRET frame */
addq $16, %rsp
iretq
.Ldie:
/* Anything but INT3 is fatal: halt, and halt again if woken up */
hlt
jmp .Ldie
SYM_CODE_END(exc_handler)
|