Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/cpuid.c             |    1
-rw-r--r--  arch/x86/kvm/emulate.c           |  319
-rw-r--r--  arch/x86/kvm/fpu.h               |   66
-rw-r--r--  arch/x86/kvm/hyperv.c            |    2
-rw-r--r--  arch/x86/kvm/kvm_emulate.h       |   20
-rw-r--r--  arch/x86/kvm/lapic.c             |   44
-rw-r--r--  arch/x86/kvm/mmu.h               |    5
-rw-r--r--  arch/x86/kvm/mmu/mmu.c           |   94
-rw-r--r--  arch/x86/kvm/mmu/mmu_internal.h  |   10
-rw-r--r--  arch/x86/kvm/mmu/paging_tmpl.h   |    2
-rw-r--r--  arch/x86/kvm/mmu/spte.c          |    2
-rw-r--r--  arch/x86/kvm/mmu/spte.h          |   10
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.c       |   50
-rw-r--r--  arch/x86/kvm/svm/avic.c          |   86
-rw-r--r--  arch/x86/kvm/svm/nested.c        |   12
-rw-r--r--  arch/x86/kvm/svm/sev.c           |   47
-rw-r--r--  arch/x86/kvm/svm/svm.c           |  103
-rw-r--r--  arch/x86/kvm/svm/svm.h           |    4
-rw-r--r--  arch/x86/kvm/svm/vmenter.S       |   53
-rw-r--r--  arch/x86/kvm/vmx/main.c          |    9
-rw-r--r--  arch/x86/kvm/vmx/nested.c        |  173
-rw-r--r--  arch/x86/kvm/vmx/run_flags.h     |   10
-rw-r--r--  arch/x86/kvm/vmx/tdx.c           |  805
-rw-r--r--  arch/x86/kvm/vmx/tdx.h           |    9
-rw-r--r--  arch/x86/kvm/vmx/vmenter.S       |   51
-rw-r--r--  arch/x86/kvm/vmx/vmx.c           |  323
-rw-r--r--  arch/x86/kvm/vmx/vmx.h           |    2
-rw-r--r--  arch/x86/kvm/vmx/x86_ops.h       |    2
-rw-r--r--  arch/x86/kvm/x86.c               |  285
-rw-r--r--  arch/x86/kvm/x86.h               |   16
30 files changed, 1489 insertions, 1126 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 52524e0ca97f..d563a948318b 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -1135,6 +1135,7 @@ void kvm_set_cpu_caps(void)
F(AMD_STIBP),
F(AMD_STIBP_ALWAYS_ON),
F(AMD_IBRS_SAME_MODE),
+ PASSTHROUGH_F(EFER_LMSLE_MBZ),
F(AMD_PSFD),
F(AMD_IBPB_RET),
);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4e3da5b497b8..c8e292e9a24d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -81,9 +81,8 @@
*/
/* Operand sizes: 8-bit operands or specified/overridden size. */
-#define ByteOp (1<<0) /* 8-bit operands. */
-/* Destination operand type. */
-#define DstShift 1
+#define ByteOp (1<<0) /* 8-bit operands. */
+#define DstShift 1 /* Destination operand type at bits 1-5 */
#define ImplicitOps (OpImplicit << DstShift)
#define DstReg (OpReg << DstShift)
#define DstMem (OpMem << DstShift)
@@ -95,8 +94,7 @@
#define DstDX (OpDX << DstShift)
#define DstAccLo (OpAccLo << DstShift)
#define DstMask (OpMask << DstShift)
-/* Source operand type. */
-#define SrcShift 6
+#define SrcShift 6 /* Source operand type at bits 6-10 */
#define SrcNone (OpNone << SrcShift)
#define SrcReg (OpReg << SrcShift)
#define SrcMem (OpMem << SrcShift)
@@ -119,10 +117,10 @@
#define SrcAccHi (OpAccHi << SrcShift)
#define SrcMask (OpMask << SrcShift)
#define BitOp (1<<11)
-#define MemAbs (1<<12) /* Memory operand is absolute displacement */
+#define MemAbs (1<<12) /* Memory operand is absolute displacement */
#define String (1<<13) /* String instruction (rep capable) */
#define Stack (1<<14) /* Stack instruction (push/pop) */
-#define GroupMask (7<<15) /* Opcode uses one of the group mechanisms */
+#define GroupMask (7<<15) /* Group mechanisms, at bits 15-17 */
#define Group (1<<15) /* Bits 3:5 of modrm byte extend opcode */
#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */
#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
@@ -131,11 +129,8 @@
#define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */
#define ModeDual (7<<15) /* Different instruction for 32/64 bit */
#define Sse (1<<18) /* SSE Vector instruction */
-/* Generic ModRM decode. */
-#define ModRM (1<<19)
-/* Destination is only written; never read. */
-#define Mov (1<<20)
-/* Misc flags */
+#define ModRM (1<<19) /* Generic ModRM decode. */
+#define Mov (1<<20) /* Destination is only written; never read. */
#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */
#define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */
#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
@@ -143,11 +138,11 @@
#define Undefined (1<<25) /* No Such Instruction */
#define Lock (1<<26) /* lock prefix is allowed for the instruction */
#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
-#define No64 (1<<28)
+#define No64 (1<<28) /* Instruction generates #UD in 64-bit mode */
#define PageTable (1 << 29) /* instruction used to write page table */
#define NotImpl (1 << 30) /* instruction is not implemented */
-/* Source 2 operand type */
-#define Src2Shift (31)
+#define Avx ((u64)1 << 31) /* Instruction uses VEX prefix */
+#define Src2Shift (32) /* Source 2 operand type at bits 32-36 */
#define Src2None (OpNone << Src2Shift)
#define Src2Mem (OpMem << Src2Shift)
#define Src2CL (OpCL << Src2Shift)
@@ -161,12 +156,13 @@
#define Src2FS (OpFS << Src2Shift)
#define Src2GS (OpGS << Src2Shift)
#define Src2Mask (OpMask << Src2Shift)
+/* free: 37-39 */
#define Mmx ((u64)1 << 40) /* MMX Vector instruction */
-#define AlignMask ((u64)7 << 41)
+#define AlignMask ((u64)3 << 41) /* Memory alignment requirement at bits 41-42 */
#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */
#define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */
-#define Avx ((u64)3 << 41) /* Advanced Vector Extensions */
-#define Aligned16 ((u64)4 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */
+#define Aligned16 ((u64)3 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */
+/* free: 43-44 */
#define NoWrite ((u64)1 << 45) /* No writeback */
#define SrcWrite ((u64)1 << 46) /* Write back src operand */
#define NoMod ((u64)1 << 47) /* Mod field is ignored */
@@ -243,6 +239,13 @@ enum x86_transfer_type {
X86_TRANSFER_TASK_SWITCH,
};
+enum rex_bits {
+ REX_B = 1,
+ REX_X = 2,
+ REX_R = 4,
+ REX_W = 8,
+};
+
static void writeback_registers(struct x86_emulate_ctxt *ctxt)
{
unsigned long dirty = ctxt->regs_dirty;
@@ -622,7 +625,6 @@ static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size)
switch (alignment) {
case Unaligned:
- case Avx:
return 1;
case Aligned16:
return 16;
@@ -924,7 +926,7 @@ static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg,
int byteop)
{
void *p;
- int highbyte_regs = (ctxt->rex_prefix == 0) && byteop;
+ int highbyte_regs = (ctxt->rex_prefix == REX_NONE) && byteop;
if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1;
@@ -1030,6 +1032,7 @@ static void fetch_register_operand(struct operand *op)
op->val = *(u64 *)op->addr.reg;
break;
}
+ op->orig_val = op->val;
}
static int em_fninit(struct x86_emulate_ctxt *ctxt)
@@ -1075,17 +1078,17 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
- struct operand *op)
+static void __decode_register_operand(struct x86_emulate_ctxt *ctxt,
+ struct operand *op, int reg)
{
- unsigned int reg;
-
- if (ctxt->d & ModRM)
- reg = ctxt->modrm_reg;
- else
- reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
-
- if (ctxt->d & Sse) {
+ if ((ctxt->d & Avx) && ctxt->op_bytes == 32) {
+ op->type = OP_YMM;
+ op->bytes = 32;
+ op->addr.xmm = reg;
+ kvm_read_avx_reg(reg, &op->vec_val2);
+ return;
+ }
+ if (ctxt->d & (Avx|Sse)) {
op->type = OP_XMM;
op->bytes = 16;
op->addr.xmm = reg;
@@ -1103,9 +1106,20 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
op->type = OP_REG;
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp);
-
fetch_register_operand(op);
- op->orig_val = op->val;
+}
+
+static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
+ struct operand *op)
+{
+ unsigned int reg;
+
+ if (ctxt->d & ModRM)
+ reg = ctxt->modrm_reg;
+ else
+ reg = (ctxt->b & 7) | (ctxt->rex_bits & REX_B ? 8 : 0);
+
+ __decode_register_operand(ctxt, op, reg);
}
static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg)
@@ -1122,9 +1136,9 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
int rc = X86EMUL_CONTINUE;
ulong modrm_ea = 0;
- ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */
- index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */
- base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */
+ ctxt->modrm_reg = (ctxt->rex_bits & REX_R ? 8 : 0);
+ index_reg = (ctxt->rex_bits & REX_X ? 8 : 0);
+ base_reg = (ctxt->rex_bits & REX_B ? 8 : 0);
ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6;
ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
@@ -1132,24 +1146,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
ctxt->modrm_seg = VCPU_SREG_DS;
if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) {
- op->type = OP_REG;
- op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
- op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
- ctxt->d & ByteOp);
- if (ctxt->d & Sse) {
- op->type = OP_XMM;
- op->bytes = 16;
- op->addr.xmm = ctxt->modrm_rm;
- kvm_read_sse_reg(ctxt->modrm_rm, &op->vec_val);
- return rc;
- }
- if (ctxt->d & Mmx) {
- op->type = OP_MM;
- op->bytes = 8;
- op->addr.mm = ctxt->modrm_rm & 7;
- return rc;
- }
- fetch_register_operand(op);
+ __decode_register_operand(ctxt, op, ctxt->modrm_rm);
return rc;
}
@@ -1783,7 +1780,15 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
op->data,
op->bytes * op->count);
case OP_XMM:
- kvm_write_sse_reg(op->addr.xmm, &op->vec_val);
+ if (!(ctxt->d & Avx)) {
+ kvm_write_sse_reg(op->addr.xmm, &op->vec_val);
+ break;
+ }
+ /* full YMM write but with high bytes cleared */
+ memset(op->valptr + 16, 0, 16);
+ fallthrough;
+ case OP_YMM:
+ kvm_write_avx_reg(op->addr.xmm, &op->vec_val2);
break;
case OP_MM:
kvm_write_mmx_reg(op->addr.mm, &op->mm_val);
@@ -2466,7 +2471,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
setup_syscalls_segments(&cs, &ss);
- if ((ctxt->rex_prefix & 0x8) != 0x0)
+ if (ctxt->rex_bits & REX_W)
usermode = X86EMUL_MODE_PROT64;
else
usermode = X86EMUL_MODE_PROT32;
@@ -3958,6 +3963,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
+static const struct opcode ud = I(SrcNone, emulate_ud);
+
static const struct opcode group7_rm0[] = {
N,
I(SrcNone | Priv | EmulateOnUD, em_hypercall),
@@ -4114,7 +4121,7 @@ static const struct group_dual group15 = { {
} };
static const struct gprefix pfx_0f_6f_0f_7f = {
- I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
+ I(Mmx, em_mov), I(Sse | Avx | Aligned, em_mov), N, I(Sse | Avx | Unaligned, em_mov),
};
static const struct instr_dual instr_dual_0f_2b = {
@@ -4133,8 +4140,8 @@ static const struct gprefix pfx_0f_28_0f_29 = {
I(Aligned, em_mov), I(Aligned, em_mov), N, N,
};
-static const struct gprefix pfx_0f_e7 = {
- N, I(Sse, em_mov), N, N,
+static const struct gprefix pfx_0f_e7_0f_38_2a = {
+ N, I(Sse | Avx, em_mov), N, N,
};
static const struct escape escape_d9 = { {
@@ -4347,8 +4354,8 @@ static const struct opcode twobyte_table[256] = {
DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
/* 0x10 - 0x1F */
- GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_10_0f_11),
- GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_10_0f_11),
+ GP(ModRM | DstReg | SrcMem | Mov | Sse | Avx, &pfx_0f_10_0f_11),
+ GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_10_0f_11),
N, N, N, N, N, N,
D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 4 * prefetch + 4 * reserved NOP */
D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
@@ -4364,9 +4371,9 @@ static const struct opcode twobyte_table[256] = {
IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write,
check_dr_write),
N, N, N, N,
- GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29),
- GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29),
- N, GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_2b),
+ GP(ModRM | DstReg | SrcMem | Mov | Sse | Avx, &pfx_0f_28_0f_29),
+ GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_28_0f_29),
+ N, GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_2b),
N, N, N, N,
/* 0x30 - 0x3F */
II(ImplicitOps | Priv, em_wrmsr, wrmsr),
@@ -4431,7 +4438,7 @@ static const struct opcode twobyte_table[256] = {
/* 0xD0 - 0xDF */
N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
/* 0xE0 - 0xEF */
- N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7),
+ N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7_0f_38_2a),
N, N, N, N, N, N, N, N,
/* 0xF0 - 0xFF */
N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
@@ -4458,8 +4465,13 @@ static const struct gprefix three_byte_0f_38_f1 = {
* byte.
*/
static const struct opcode opcode_map_0f_38[256] = {
- /* 0x00 - 0x7f */
- X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
+ /* 0x00 - 0x1f */
+ X16(N), X16(N),
+ /* 0x20 - 0x2f */
+ X8(N),
+ X2(N), GP(SrcReg | DstMem | ModRM | Mov | Aligned, &pfx_0f_e7_0f_38_2a), N, N, N, N, N,
+ /* 0x30 - 0x7f */
+ X16(N), X16(N), X16(N), X16(N), X16(N),
/* 0x80 - 0xef */
X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
/* 0xf0 - 0xf1 */
@@ -4618,14 +4630,12 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
fetch_register_operand(op);
- op->orig_val = op->val;
break;
case OpAccLo:
op->type = OP_REG;
op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes;
op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
fetch_register_operand(op);
- op->orig_val = op->val;
break;
case OpAccHi:
if (ctxt->d & ByteOp) {
@@ -4636,7 +4646,6 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
op->bytes = ctxt->op_bytes;
op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
fetch_register_operand(op);
- op->orig_val = op->val;
break;
case OpDI:
op->type = OP_MEM;
@@ -4755,12 +4764,87 @@ done:
return rc;
}
+static int x86_decode_avx(struct x86_emulate_ctxt *ctxt,
+ u8 vex_1st, u8 vex_2nd, struct opcode *opcode)
+{
+ u8 vex_3rd, map, pp, l, v;
+ int rc = X86EMUL_CONTINUE;
+
+ if (ctxt->rep_prefix || ctxt->op_prefix || ctxt->rex_prefix)
+ goto ud;
+
+ if (vex_1st == 0xc5) {
+ /* Expand RVVVVlpp to VEX3 format */
+ vex_3rd = vex_2nd & ~0x80; /* VVVVlpp from VEX2, w=0 */
+ vex_2nd = (vex_2nd & 0x80) | 0x61; /* R from VEX2, X=1 B=1 mmmmm=00001 */
+ } else {
+ vex_3rd = insn_fetch(u8, ctxt);
+ }
+
+ /* vex_2nd = RXBmmmmm, vex_3rd = wVVVVlpp. Fix polarity */
+ vex_2nd ^= 0xE0; /* binary 11100000 */
+ vex_3rd ^= 0x78; /* binary 01111000 */
+
+ ctxt->rex_prefix = REX_PREFIX;
+ ctxt->rex_bits = (vex_2nd & 0xE0) >> 5; /* RXB */
+ ctxt->rex_bits |= (vex_3rd & 0x80) >> 4; /* w */
+ if (ctxt->rex_bits && ctxt->mode != X86EMUL_MODE_PROT64)
+ goto ud;
+
+ map = vex_2nd & 0x1f;
+ v = (vex_3rd >> 3) & 0xf;
+ l = vex_3rd & 0x4;
+ pp = vex_3rd & 0x3;
+
+ ctxt->b = insn_fetch(u8, ctxt);
+ switch (map) {
+ case 1:
+ ctxt->opcode_len = 2;
+ *opcode = twobyte_table[ctxt->b];
+ break;
+ case 2:
+ ctxt->opcode_len = 3;
+ *opcode = opcode_map_0f_38[ctxt->b];
+ break;
+ case 3:
+ /* no 0f 3a instructions are supported yet */
+ return X86EMUL_UNHANDLEABLE;
+ default:
+ goto ud;
+ }
+
+ /*
+	 * No three-operand instructions are supported yet; those that
+	 * *are* marked with the Avx flag reserve the VVVV field.
+ */
+ if (v)
+ goto ud;
+
+ if (l)
+ ctxt->op_bytes = 32;
+ else
+ ctxt->op_bytes = 16;
+
+ switch (pp) {
+ case 0: break;
+ case 1: ctxt->op_prefix = true; break;
+ case 2: ctxt->rep_prefix = 0xf3; break;
+ case 3: ctxt->rep_prefix = 0xf2; break;
+ }
+
+done:
+ return rc;
+ud:
+ *opcode = ud;
+ return rc;
+}
+
int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type)
{
int rc = X86EMUL_CONTINUE;
int mode = ctxt->mode;
int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
- bool op_prefix = false;
+ bool vex_prefix = false;
bool has_seg_override = false;
struct opcode opcode;
u16 dummy;
@@ -4812,7 +4896,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
for (;;) {
switch (ctxt->b = insn_fetch(u8, ctxt)) {
case 0x66: /* operand-size override */
- op_prefix = true;
+ ctxt->op_prefix = true;
/* switch between 2/4 bytes */
ctxt->op_bytes = def_op_bytes ^ 6;
break;
@@ -4851,7 +4935,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
case 0x40 ... 0x4f: /* REX */
if (mode != X86EMUL_MODE_PROT64)
goto done_prefixes;
- ctxt->rex_prefix = ctxt->b;
+ ctxt->rex_prefix = REX_PREFIX;
+ ctxt->rex_bits = ctxt->b & 0xf;
continue;
case 0xf0: /* LOCK */
ctxt->lock_prefix = 1;
@@ -4865,20 +4950,33 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int
}
/* Any legacy prefix after a REX prefix nullifies its effect. */
-
- ctxt->rex_prefix = 0;
+ ctxt->rex_prefix = REX_NONE;
+ ctxt->rex_bits = 0;
}
done_prefixes:
/* REX prefix. */
- if (ctxt->rex_prefix & 8)
- ctxt->op_bytes = 8; /* REX.W */
+ if (ctxt->rex_bits & REX_W)
+ ctxt->op_bytes = 8;
/* Opcode byte(s). */
- opcode = opcode_table[ctxt->b];
- /* Two-byte opcode? */
- if (ctxt->b == 0x0f) {
+ if (ctxt->b == 0xc4 || ctxt->b == 0xc5) {
+ /* VEX or LDS/LES */
+ u8 vex_2nd = insn_fetch(u8, ctxt);
+ if (mode != X86EMUL_MODE_PROT64 && (vex_2nd & 0xc0) != 0xc0) {
+ opcode = opcode_table[ctxt->b];
+ ctxt->modrm = vex_2nd;
+ /* the Mod/RM byte has been fetched already! */
+ goto done_modrm;
+ }
+
+ vex_prefix = true;
+ rc = x86_decode_avx(ctxt, ctxt->b, vex_2nd, &opcode);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ } else if (ctxt->b == 0x0f) {
+ /* Two- or three-byte opcode */
ctxt->opcode_len = 2;
ctxt->b = insn_fetch(u8, ctxt);
opcode = twobyte_table[ctxt->b];
@@ -4889,18 +4987,16 @@ done_prefixes:
ctxt->b = insn_fetch(u8, ctxt);
opcode = opcode_map_0f_38[ctxt->b];
}
+ } else {
+ /* Opcode byte(s). */
+ opcode = opcode_table[ctxt->b];
}
- ctxt->d = opcode.flags;
- if (ctxt->d & ModRM)
+ if (opcode.flags & ModRM)
ctxt->modrm = insn_fetch(u8, ctxt);
- /* vex-prefix instructions are not implemented */
- if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) &&
- (mode == X86EMUL_MODE_PROT64 || (ctxt->modrm & 0xc0) == 0xc0)) {
- ctxt->d = NotImpl;
- }
-
+done_modrm:
+ ctxt->d = opcode.flags;
while (ctxt->d & GroupMask) {
switch (ctxt->d & GroupMask) {
case Group:
@@ -4919,9 +5015,9 @@ done_prefixes:
opcode = opcode.u.group[goffset];
break;
case Prefix:
- if (ctxt->rep_prefix && op_prefix)
+ if (ctxt->rep_prefix && ctxt->op_prefix)
return EMULATION_FAILED;
- simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix;
+ simd_prefix = ctxt->op_prefix ? 0x66 : ctxt->rep_prefix;
switch (simd_prefix) {
case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
@@ -4966,6 +5062,19 @@ done_prefixes:
if (ctxt->d == 0)
return EMULATION_FAILED;
+ if (unlikely(vex_prefix)) {
+ /*
+ * Only specifically marked instructions support VEX. Since many
+ * instructions support it but are not annotated, return not implemented
+ * rather than #UD.
+ */
+ if (!(ctxt->d & Avx))
+ return EMULATION_FAILED;
+
+ if (!(ctxt->d & AlignMask))
+ ctxt->d |= Unaligned;
+ }
+
ctxt->execute = opcode.u.execute;
/*
@@ -5036,8 +5145,10 @@ done_prefixes:
if ((ctxt->d & No16) && ctxt->op_bytes == 2)
ctxt->op_bytes = 4;
- if (ctxt->d & Sse)
- ctxt->op_bytes = 16;
+ if (vex_prefix)
+ ;
+ else if (ctxt->d & Sse)
+ ctxt->op_bytes = 16, ctxt->d &= ~Avx;
else if (ctxt->d & Mmx)
ctxt->op_bytes = 8;
}
@@ -5137,8 +5248,10 @@ void init_decode_cache(struct x86_emulate_ctxt *ctxt)
{
/* Clear fields that are set conditionally but read without a guard. */
ctxt->rip_relative = false;
- ctxt->rex_prefix = 0;
+ ctxt->rex_prefix = REX_NONE;
+ ctxt->rex_bits = 0;
ctxt->lock_prefix = 0;
+ ctxt->op_prefix = false;
ctxt->rep_prefix = 0;
ctxt->regs_valid = 0;
ctxt->regs_dirty = 0;
@@ -5168,20 +5281,34 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts)
}
if (unlikely(ctxt->d &
- (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
+ (No64|Undefined|Avx|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
(ctxt->d & Undefined)) {
rc = emulate_ud(ctxt);
goto done;
}
- if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
- || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
+ if ((ctxt->d & (Avx|Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) {
rc = emulate_ud(ctxt);
goto done;
}
- if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+ if (ctxt->d & Avx) {
+ u64 xcr = 0;
+ if (!(ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE)
+ || ops->get_xcr(ctxt, 0, &xcr)
+ || !(xcr & XFEATURE_MASK_YMM)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+ } else if (ctxt->d & Sse) {
+ if (!(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+ }
+
+ if ((ctxt->d & (Avx|Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
rc = emulate_nm(ctxt);
goto done;
}
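
As a quick illustration of the bit manipulation done by x86_decode_avx() above, the following standalone user-space sketch (not part of the patch) expands the two-byte prefix "c5 f8" -- VMOVUPS with VEX.128 and an unused vvvv field -- into the VEX3-style fields the decoder works with. The constants mirror the hunk above.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint8_t vex_2nd = 0xf8;             /* the RvvvvLpp byte of "c5 f8" (fields inverted) */
	uint8_t vex_3rd = vex_2nd & ~0x80;  /* expand to VEX3: keep vvvvLpp, force w = 0 */
	unsigned int rex_bits, map, vvvv, l, pp;

	vex_2nd = (vex_2nd & 0x80) | 0x61;  /* keep inverted R, set inverted X = B = 1, mmmmm = 00001 */

	vex_2nd ^= 0xE0;                    /* fix polarity of R/X/B */
	vex_3rd ^= 0x78;                    /* fix polarity of vvvv  */

	rex_bits = ((vex_2nd & 0xE0) >> 5) | ((vex_3rd & 0x80) >> 4);
	map  = vex_2nd & 0x1f;
	vvvv = (vex_3rd >> 3) & 0xf;
	l    = !!(vex_3rd & 0x4);
	pp   = vex_3rd & 0x3;

	printf("rex_bits=%u map=%u vvvv=%u L=%u pp=%u\n", rex_bits, map, vvvv, l, pp);
	/* prints: rex_bits=0 map=1 vvvv=0 L=0 pp=0, i.e. 0f map, 128-bit, no SIMD prefix */
	return 0;
}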
diff --git a/arch/x86/kvm/fpu.h b/arch/x86/kvm/fpu.h
index 3ba12888bf66..f898781b6a06 100644
--- a/arch/x86/kvm/fpu.h
+++ b/arch/x86/kvm/fpu.h
@@ -15,6 +15,58 @@ typedef u32 __attribute__((vector_size(16))) sse128_t;
#define sse128_l3(x) ({ __sse128_u t; t.vec = x; t.as_u32[3]; })
#define sse128(lo, hi) ({ __sse128_u t; t.as_u64[0] = lo; t.as_u64[1] = hi; t.vec; })
+typedef u32 __attribute__((vector_size(32))) avx256_t;
+
+static inline void _kvm_read_avx_reg(int reg, avx256_t *data)
+{
+ switch (reg) {
+ case 0: asm("vmovdqa %%ymm0, %0" : "=m"(*data)); break;
+ case 1: asm("vmovdqa %%ymm1, %0" : "=m"(*data)); break;
+ case 2: asm("vmovdqa %%ymm2, %0" : "=m"(*data)); break;
+ case 3: asm("vmovdqa %%ymm3, %0" : "=m"(*data)); break;
+ case 4: asm("vmovdqa %%ymm4, %0" : "=m"(*data)); break;
+ case 5: asm("vmovdqa %%ymm5, %0" : "=m"(*data)); break;
+ case 6: asm("vmovdqa %%ymm6, %0" : "=m"(*data)); break;
+ case 7: asm("vmovdqa %%ymm7, %0" : "=m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("vmovdqa %%ymm8, %0" : "=m"(*data)); break;
+ case 9: asm("vmovdqa %%ymm9, %0" : "=m"(*data)); break;
+ case 10: asm("vmovdqa %%ymm10, %0" : "=m"(*data)); break;
+ case 11: asm("vmovdqa %%ymm11, %0" : "=m"(*data)); break;
+ case 12: asm("vmovdqa %%ymm12, %0" : "=m"(*data)); break;
+ case 13: asm("vmovdqa %%ymm13, %0" : "=m"(*data)); break;
+ case 14: asm("vmovdqa %%ymm14, %0" : "=m"(*data)); break;
+ case 15: asm("vmovdqa %%ymm15, %0" : "=m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static inline void _kvm_write_avx_reg(int reg, const avx256_t *data)
+{
+ switch (reg) {
+ case 0: asm("vmovdqa %0, %%ymm0" : : "m"(*data)); break;
+ case 1: asm("vmovdqa %0, %%ymm1" : : "m"(*data)); break;
+ case 2: asm("vmovdqa %0, %%ymm2" : : "m"(*data)); break;
+ case 3: asm("vmovdqa %0, %%ymm3" : : "m"(*data)); break;
+ case 4: asm("vmovdqa %0, %%ymm4" : : "m"(*data)); break;
+ case 5: asm("vmovdqa %0, %%ymm5" : : "m"(*data)); break;
+ case 6: asm("vmovdqa %0, %%ymm6" : : "m"(*data)); break;
+ case 7: asm("vmovdqa %0, %%ymm7" : : "m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("vmovdqa %0, %%ymm8" : : "m"(*data)); break;
+ case 9: asm("vmovdqa %0, %%ymm9" : : "m"(*data)); break;
+ case 10: asm("vmovdqa %0, %%ymm10" : : "m"(*data)); break;
+ case 11: asm("vmovdqa %0, %%ymm11" : : "m"(*data)); break;
+ case 12: asm("vmovdqa %0, %%ymm12" : : "m"(*data)); break;
+ case 13: asm("vmovdqa %0, %%ymm13" : : "m"(*data)); break;
+ case 14: asm("vmovdqa %0, %%ymm14" : : "m"(*data)); break;
+ case 15: asm("vmovdqa %0, %%ymm15" : : "m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
static inline void _kvm_read_sse_reg(int reg, sse128_t *data)
{
switch (reg) {
@@ -109,6 +161,20 @@ static inline void kvm_fpu_put(void)
fpregs_unlock();
}
+static inline void kvm_read_avx_reg(int reg, avx256_t *data)
+{
+ kvm_fpu_get();
+ _kvm_read_avx_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_write_avx_reg(int reg, const avx256_t *data)
+{
+ kvm_fpu_get();
+ _kvm_write_avx_reg(reg, data);
+ kvm_fpu_put();
+}
+
static inline void kvm_read_sse_reg(int reg, sse128_t *data)
{
kvm_fpu_get();
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 38595ecb990d..de92292eb1f5 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1568,7 +1568,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
* only, there can be valuable data in the rest which needs
* to be preserved e.g. on migration.
*/
- if (__put_user(0, (u32 __user *)addr))
+ if (put_user(0, (u32 __user *)addr))
return 1;
hv_vcpu->hv_vapic = data;
kvm_vcpu_mark_page_dirty(vcpu, gfn);
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 7b5ddb787a25..fb3dab4b5a53 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -237,6 +237,7 @@ struct x86_emulate_ops {
bool (*is_smm)(struct x86_emulate_ctxt *ctxt);
int (*leave_smm)(struct x86_emulate_ctxt *ctxt);
void (*triple_fault)(struct x86_emulate_ctxt *ctxt);
+ int (*get_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr);
int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr);
gva_t (*get_untagged_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr,
@@ -248,7 +249,7 @@ struct x86_emulate_ops {
/* Type, address-of, and value of an instruction's operand. */
struct operand {
- enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
+ enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_YMM, OP_MM, OP_NONE } type;
unsigned int bytes;
unsigned int count;
union {
@@ -267,11 +268,12 @@ struct operand {
union {
unsigned long val;
u64 val64;
- char valptr[sizeof(sse128_t)];
+ char valptr[sizeof(avx256_t)];
sse128_t vec_val;
+ avx256_t vec_val2;
u64 mm_val;
void *data;
- };
+ } __aligned(32);
};
#define X86_MAX_INSTRUCTION_LENGTH 15
@@ -317,6 +319,14 @@ typedef void (*fastop_t)(struct fastop *);
#define NR_EMULATOR_GPRS 8
#endif
+/*
+ * Distinguish between no prefix, REX, or in the future REX2.
+ */
+enum rex_type {
+ REX_NONE,
+ REX_PREFIX,
+};
+
struct x86_emulate_ctxt {
void *vcpu;
const struct x86_emulate_ops *ops;
@@ -348,6 +358,7 @@ struct x86_emulate_ctxt {
u8 opcode_len;
u8 b;
u8 intercept;
+ bool op_prefix;
u8 op_bytes;
u8 ad_bytes;
union {
@@ -357,7 +368,8 @@ struct x86_emulate_ctxt {
int (*check_perm)(struct x86_emulate_ctxt *ctxt);
bool rip_relative;
- u8 rex_prefix;
+ enum rex_type rex_prefix;
+ u8 rex_bits;
u8 lock_prefix;
u8 rep_prefix;
/* bitmaps of registers in _regs[] that can be read */
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0ae7f913d782..1597dd0b0cc6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2126,23 +2126,41 @@ static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg)
static void advance_periodic_target_expiration(struct kvm_lapic *apic)
{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
ktime_t now = ktime_get();
u64 tscl = rdtsc();
ktime_t delta;
/*
- * Synchronize both deadlines to the same time source or
- * differences in the periods (caused by differences in the
- * underlying clocks or numerical approximation errors) will
- * cause the two to drift apart over time as the errors
- * accumulate.
+ * Use kernel time as the time source for both the hrtimer deadline and
+ * TSC-based deadline so that they stay synchronized. Computing each
+ * deadline independently will cause the two deadlines to drift apart
+ * over time as differences in the periods accumulate, e.g. due to
+ * differences in the underlying clocks or numerical approximation errors.
*/
- apic->lapic_timer.target_expiration =
- ktime_add_ns(apic->lapic_timer.target_expiration,
- apic->lapic_timer.period);
- delta = ktime_sub(apic->lapic_timer.target_expiration, now);
- apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
- nsec_to_cycles(apic->vcpu, delta);
+ ktimer->target_expiration = ktime_add_ns(ktimer->target_expiration,
+ ktimer->period);
+
+ /*
+ * If the new expiration is in the past, e.g. because userspace stopped
+ * running the VM for an extended duration, then force the expiration
+ * to "now" and don't try to play catch-up with the missed events. KVM
+ * will only deliver a single interrupt regardless of how many events
+ * are pending, i.e. restarting the timer with an expiration in the
+ * past will do nothing more than waste host cycles, and can even lead
+ * to a hard lockup in extreme cases.
+ */
+ if (ktime_before(ktimer->target_expiration, now))
+ ktimer->target_expiration = now;
+
+ /*
+ * Note, ensuring the expiration isn't in the past also prevents delta
+ * from going negative, which could cause the TSC deadline to become
+	 * excessively large due to it being an unsigned value.
+ */
+ delta = ktime_sub(ktimer->target_expiration, now);
+ ktimer->tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
+ nsec_to_cycles(apic->vcpu, delta);
}
static void start_sw_period(struct kvm_lapic *apic)
@@ -2970,9 +2988,9 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
apic_timer_expired(apic, true);
- if (lapic_is_periodic(apic)) {
+ if (lapic_is_periodic(apic) && !WARN_ON_ONCE(!apic->lapic_timer.period)) {
advance_periodic_target_expiration(apic);
- hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ hrtimer_set_expires(&ktimer->timer, ktimer->target_expiration);
return HRTIMER_RESTART;
} else
return HRTIMER_NORESTART;
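
A minimal user-space sketch (illustration only, with made-up nanosecond values) of the clamping that advance_periodic_target_expiration() now applies: if the VM has been stopped for longer than the timer period, the next expiration snaps to "now" instead of replaying every missed tick.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t period = 1000000;   /* 1 ms periodic APIC timer   */
	int64_t target = 5000000;   /* last programmed expiration */
	int64_t now    = 42000000;  /* VM resumed much later      */

	target += period;           /* advance to the next period */
	if (target < now)           /* new behavior: don't play catch-up */
		target = now;

	printf("next expiration: %lld ns (delta to now: %lld ns)\n",
	       (long long)target, (long long)(target - now));
	/* prints: next expiration: 42000000 ns (delta to now: 0 ns) */
	return 0;
}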
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index f63074048ec6..830f46145692 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -235,8 +235,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return -(u32)fault & errcode;
}
-bool kvm_mmu_may_ignore_guest_pat(struct kvm *kvm);
-
int kvm_mmu_post_init_vm(struct kvm *kvm);
void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
@@ -257,8 +255,7 @@ extern bool tdp_mmu_enabled;
#define tdp_mmu_enabled false
#endif
-bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa);
-int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level);
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn);
static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 667d66cf76d5..02c450686b4a 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4859,7 +4859,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
*/
BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
- vcpu->arch.l1tf_flush_l1d = true;
+ kvm_request_l1tf_flush_l1d();
if (!flags) {
trace_kvm_page_fault(vcpu, fault_address, error_code);
@@ -4924,7 +4924,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return direct_page_fault(vcpu, fault);
}
-int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level)
+static int kvm_tdp_page_prefault(struct kvm_vcpu *vcpu, gpa_t gpa,
+ u64 error_code, u8 *level)
{
int r;
@@ -4966,7 +4967,6 @@ int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level
return -EIO;
}
}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_map_page);
long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
struct kvm_pre_fault_memory *range)
@@ -5002,7 +5002,7 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
* Shadow paging uses GVA for kvm page fault, so restrict to
* two-dimensional paging.
*/
- r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level);
+ r = kvm_tdp_page_prefault(vcpu, range->gpa | direct_bits, error_code, &level);
if (r < 0)
return r;
@@ -5014,6 +5014,86 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
return min(range->size, end - range->gpa);
}
+#ifdef CONFIG_KVM_GUEST_MEMFD
+static void kvm_assert_gmem_invalidate_lock_held(struct kvm_memory_slot *slot)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ if (WARN_ON_ONCE(!kvm_slot_has_gmem(slot)) ||
+ WARN_ON_ONCE(!slot->gmem.file) ||
+ WARN_ON_ONCE(!file_count(slot->gmem.file)))
+ return;
+
+ lockdep_assert_held(&file_inode(slot->gmem.file)->i_mapping->invalidate_lock);
+#endif
+}
+
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+{
+ struct kvm_page_fault fault = {
+ .addr = gfn_to_gpa(gfn),
+ .error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS,
+ .prefetch = true,
+ .is_tdp = true,
+ .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm),
+
+ .max_level = PG_LEVEL_4K,
+ .req_level = PG_LEVEL_4K,
+ .goal_level = PG_LEVEL_4K,
+ .is_private = true,
+
+ .gfn = gfn,
+ .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
+ .pfn = pfn,
+ .map_writable = true,
+ };
+ struct kvm *kvm = vcpu->kvm;
+ int r;
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * Mapping a pre-determined private pfn is intended only for use when
+ * populating a guest_memfd instance. Assert that the slot is backed
+ * by guest_memfd and that the gmem instance's invalidate_lock is held.
+ */
+ kvm_assert_gmem_invalidate_lock_held(fault.slot);
+
+ if (KVM_BUG_ON(!tdp_mmu_enabled, kvm))
+ return -EIO;
+
+ if (kvm_gfn_is_write_tracked(kvm, fault.slot, fault.gfn))
+ return -EPERM;
+
+ r = kvm_mmu_reload(vcpu);
+ if (r)
+ return r;
+
+ r = mmu_topup_memory_caches(vcpu, false);
+ if (r)
+ return r;
+
+ do {
+ if (signal_pending(current))
+ return -EINTR;
+
+ if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu))
+ return -EIO;
+
+ cond_resched();
+
+ guard(read_lock)(&kvm->mmu_lock);
+
+ r = kvm_tdp_mmu_map(vcpu, &fault);
+ } while (r == RET_PF_RETRY);
+
+ if (r != RET_PF_FIXED)
+ return -EIO;
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_map_private_pfn);
+#endif
+
static void nonpaging_init_context(struct kvm_mmu *context)
{
context->page_fault = nonpaging_page_fault;
@@ -5997,7 +6077,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
out:
return r;
}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
@@ -6863,6 +6942,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
write_unlock(&kvm->mmu_lock);
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_zap_gfn_range);
static bool slot_rmap_write_protect(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
@@ -7204,7 +7284,6 @@ restart:
return need_tlb_flush;
}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_zap_gfn_range);
static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
@@ -7364,6 +7443,9 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
{
WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
+ if (!enable_mmio_caching)
+ return;
+
gen &= MMIO_SPTE_GEN_MASK;
/*
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index ed5c01df21ba..73cdcbccc89e 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -39,16 +39,6 @@
#define INVALID_PAE_ROOT 0
#define IS_VALID_PAE_ROOT(x) (!!(x))
-static inline hpa_t kvm_mmu_get_dummy_root(void)
-{
- return my_zero_pfn(0) << PAGE_SHIFT;
-}
-
-static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page)
-{
- return is_zero_pfn(shadow_page >> PAGE_SHIFT);
-}
-
typedef u64 __rcu *tdp_ptep_t;
struct kvm_mmu_page {
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index ed762bb4b007..901cd2bd40b8 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -402,7 +402,7 @@ retry_walk:
goto error;
ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
- if (unlikely(__get_user(pte, ptep_user)))
+ if (unlikely(get_user(pte, ptep_user)))
goto error;
walker->ptep_user[walker->level - 1] = ptep_user;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 37647afde7d3..85a0473809b0 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -292,7 +292,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
mark_page_dirty_in_slot(vcpu->kvm, slot, gfn);
}
- if (static_branch_unlikely(&cpu_buf_vm_clear) &&
+ if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) &&
!kvm_vcpu_can_access_host_mmio(vcpu) &&
kvm_is_mmio_pfn(pfn, &is_host_mmio))
kvm_track_host_mmio_mapping(vcpu);
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 3133f066927e..91ce29fd6f1b 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -246,6 +246,16 @@ static inline int spte_index(u64 *sptep)
*/
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
+static inline hpa_t kvm_mmu_get_dummy_root(void)
+{
+ return my_zero_pfn(0) << PAGE_SHIFT;
+}
+
+static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page)
+{
+ return is_zero_pfn(shadow_page >> PAGE_SHIFT);
+}
+
static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
{
struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c5734ca5c17d..9c26038f6b77 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -362,9 +362,6 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
int level)
{
- kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
- int ret;
-
/*
* External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
* PTs are removed in a special order, involving free_external_spt().
@@ -377,9 +374,8 @@ static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
/* Zapping leaf spte is allowed only when write lock is held. */
lockdep_assert_held_write(&kvm->mmu_lock);
- /* Because write lock is held, operation should success. */
- ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_pfn);
- KVM_BUG_ON(ret, kvm);
+
+ kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte);
}
/**
@@ -519,7 +515,6 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp
bool was_present = is_shadow_present_pte(old_spte);
bool is_present = is_shadow_present_pte(new_spte);
bool is_leaf = is_present && is_last_spte(new_spte, level);
- kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
int ret = 0;
KVM_BUG_ON(was_present, kvm);
@@ -538,7 +533,7 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp
* external page table, or leaf.
*/
if (is_leaf) {
- ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn);
+ ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte);
} else {
void *external_spt = get_external_spt(gfn, new_spte, level);
@@ -1273,6 +1268,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
struct kvm_mmu_page *sp;
int ret = RET_PF_RETRY;
+ KVM_MMU_WARN_ON(!root || root->role.invalid);
+
kvm_mmu_hugepage_adjust(vcpu, fault);
trace_kvm_mmu_spte_requested(fault);
@@ -1939,13 +1936,16 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
*
* Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
*/
-static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
- struct kvm_mmu_page *root)
+int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ int *root_level)
{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
struct tdp_iter iter;
gfn_t gfn = addr >> PAGE_SHIFT;
int leaf = -1;
+ *root_level = vcpu->arch.mmu->root_role.level;
+
for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
@@ -1954,36 +1954,6 @@ static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
return leaf;
}
-int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
- int *root_level)
-{
- struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
- *root_level = vcpu->arch.mmu->root_role.level;
-
- return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root);
-}
-
-bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa)
-{
- struct kvm *kvm = vcpu->kvm;
- bool is_direct = kvm_is_addr_direct(kvm, gpa);
- hpa_t root = is_direct ? vcpu->arch.mmu->root.hpa :
- vcpu->arch.mmu->mirror_root_hpa;
- u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte;
- int leaf;
-
- lockdep_assert_held(&kvm->mmu_lock);
- rcu_read_lock();
- leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root));
- rcu_read_unlock();
- if (leaf < 0)
- return false;
-
- spte = sptes[leaf];
- return is_shadow_present_pte(spte) && is_last_spte(spte, leaf);
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_gpa_is_mapped);
-
/*
* Returns the last level spte pointer of the shadow page walk for the given
gpa, and sets *spte to the spte value. This spte may be non-present. If no
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index fef00546c885..6b77b2033208 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -106,7 +106,7 @@ static u32 next_vm_id = 0;
static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
static bool x2avic_enabled;
-
+static u32 x2avic_max_physical_id;
static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm,
bool intercept)
@@ -158,12 +158,40 @@ static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm,
svm->x2avic_msrs_intercepted = intercept;
}
+static u32 __avic_get_max_physical_id(struct kvm *kvm, struct kvm_vcpu *vcpu)
+{
+ u32 arch_max;
+
+ /*
+ * Return the largest size (x2APIC) when querying without a vCPU, e.g.
+	 * to allocate the per-VM table.
+ */
+ if (x2avic_enabled && (!vcpu || apic_x2apic_mode(vcpu->arch.apic)))
+ arch_max = x2avic_max_physical_id;
+ else
+ arch_max = AVIC_MAX_PHYSICAL_ID;
+
+ /*
+ * Despite its name, KVM_CAP_MAX_VCPU_ID represents the maximum APIC ID
+ * plus one, so the max possible APIC ID is one less than that.
+ */
+ return min(kvm->arch.max_vcpu_ids - 1, arch_max);
+}
+
+static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu)
+{
+ return __avic_get_max_physical_id(vcpu->kvm, vcpu);
+}
+
static void avic_activate_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
+
vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
+ vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu);
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
@@ -176,7 +204,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
*/
if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
vmcb->control.int_ctl |= X2APIC_MODE_MASK;
- vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
+
/* Disabling MSR intercept for x2APIC registers */
avic_set_x2apic_msr_interception(svm, false);
} else {
@@ -186,8 +214,6 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
*/
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
- /* For xAVIC and hybrid-xAVIC modes */
- vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
/* Enabling MSR intercept for x2APIC registers */
avic_set_x2apic_msr_interception(svm, true);
}
@@ -247,6 +273,30 @@ static int avic_ga_log_notifier(u32 ga_tag)
return 0;
}
+static int avic_get_physical_id_table_order(struct kvm *kvm)
+{
+ /* Provision for the maximum physical ID supported in x2avic mode */
+ return get_order((__avic_get_max_physical_id(kvm, NULL) + 1) * sizeof(u64));
+}
+
+int avic_alloc_physical_id_table(struct kvm *kvm)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+ if (!irqchip_in_kernel(kvm) || !enable_apicv)
+ return 0;
+
+ if (kvm_svm->avic_physical_id_table)
+ return 0;
+
+ kvm_svm->avic_physical_id_table = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ avic_get_physical_id_table_order(kvm));
+ if (!kvm_svm->avic_physical_id_table)
+ return -ENOMEM;
+
+ return 0;
+}
+
void avic_vm_destroy(struct kvm *kvm)
{
unsigned long flags;
@@ -256,7 +306,8 @@ void avic_vm_destroy(struct kvm *kvm)
return;
free_page((unsigned long)kvm_svm->avic_logical_id_table);
- free_page((unsigned long)kvm_svm->avic_physical_id_table);
+ free_pages((unsigned long)kvm_svm->avic_physical_id_table,
+ avic_get_physical_id_table_order(kvm));
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
hash_del(&kvm_svm->hnode);
@@ -274,10 +325,6 @@ int avic_vm_init(struct kvm *kvm)
if (!enable_apicv)
return 0;
- kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
- if (!kvm_svm->avic_physical_id_table)
- goto free_avic;
-
kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!kvm_svm->avic_logical_id_table)
goto free_avic;
@@ -342,7 +389,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
* fully initialized AVIC.
*/
if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
- (id > X2AVIC_MAX_PHYSICAL_ID)) {
+ (id > x2avic_max_physical_id)) {
kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
vcpu->arch.apic->apicv_active = false;
return 0;
@@ -562,7 +609,7 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
u32 icrl = svm->vmcb->control.exit_info_1;
u32 id = svm->vmcb->control.exit_info_2 >> 32;
- u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
+ u32 index = svm->vmcb->control.exit_info_2 & AVIC_PHYSICAL_MAX_INDEX_MASK;
struct kvm_lapic *apic = vcpu->arch.apic;
trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
@@ -962,7 +1009,8 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
- if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
+ if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >=
+ PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm)))
return;
/*
@@ -1024,7 +1072,8 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
lockdep_assert_preemption_disabled();
- if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
+ if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >=
+ PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm)))
return;
/*
@@ -1226,10 +1275,15 @@ bool __init avic_hardware_setup(void)
/* AVIC is a prerequisite for x2AVIC. */
x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
- if (x2avic_enabled)
- pr_info("x2AVIC enabled\n");
- else
+ if (x2avic_enabled) {
+ if (cpu_feature_enabled(X86_FEATURE_X2AVIC_EXT))
+ x2avic_max_physical_id = X2AVIC_4K_MAX_PHYSICAL_ID;
+ else
+ x2avic_max_physical_id = X2AVIC_MAX_PHYSICAL_ID;
+ pr_info("x2AVIC enabled (max %u vCPUs)\n", x2avic_max_physical_id + 1);
+ } else {
svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
+ }
/*
* Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
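
For the table sizing above, here is a rough user-space sketch of the arithmetic in avic_get_physical_id_table_order(). The value 4095 for the extended x2AVIC limit is an assumption based on the X2AVIC_4K_MAX_PHYSICAL_ID name, and get_order() below reimplements the kernel helper's semantics for illustration only.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Same semantics as the kernel's get_order(): smallest order whose page count holds size bytes. */
static int get_order(unsigned long size)
{
	int order = 0;

	size = (size - 1) >> PAGE_SHIFT;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	unsigned long max_id = 4095;  /* assumed X2AVIC_4K_MAX_PHYSICAL_ID */
	unsigned long bytes  = (max_id + 1) * sizeof(unsigned long long);  /* one u64 entry per ID */
	int order = get_order(bytes);

	printf("%lu bytes -> order %d (%lu pages)\n", bytes, order, (PAGE_SIZE << order) >> PAGE_SHIFT);
	/* prints: 32768 bytes -> order 3 (8 pages) */
	return 0;
}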
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index da6e80b3ac35..c81005b24522 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -613,6 +613,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
struct kvm_vcpu *vcpu = &svm->vcpu;
nested_vmcb02_compute_g_pat(svm);
+ vmcb_mark_dirty(vmcb02, VMCB_NPT);
/* Load the nested guest state */
if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
@@ -751,6 +752,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
+ vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP);
/*
* Stash vmcb02's counter if the guest hasn't moved past the guilty
@@ -1430,16 +1432,6 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
case SVM_EXIT_IOIO:
vmexit = nested_svm_intercept_ioio(svm);
break;
- case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
- if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
- if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
- vmexit = NESTED_EXIT_DONE;
- break;
- }
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
/*
* Host-intercepted exceptions have been checked already in
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 0835c664fbfd..f59c65abe3cf 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -65,20 +65,24 @@ module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 04
#define AP_RESET_HOLD_NAE_EVENT 1
#define AP_RESET_HOLD_MSR_PROTO 2
-/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */
-#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0)
-#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8)
-#define SNP_POLICY_MASK_SMT BIT_ULL(16)
-#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17)
-#define SNP_POLICY_MASK_DEBUG BIT_ULL(19)
-#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20)
-
-#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \
- SNP_POLICY_MASK_API_MAJOR | \
- SNP_POLICY_MASK_SMT | \
- SNP_POLICY_MASK_RSVD_MBO | \
- SNP_POLICY_MASK_DEBUG | \
- SNP_POLICY_MASK_SINGLE_SOCKET)
+/*
+ * SEV-SNP policy bits that can be supported by KVM. These include policy bits
+ * that have implementation support within KVM or policy bits that do not
+ * require implementation support within KVM to enforce the policy.
+ */
+#define KVM_SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \
+ SNP_POLICY_MASK_API_MAJOR | \
+ SNP_POLICY_MASK_SMT | \
+ SNP_POLICY_MASK_RSVD_MBO | \
+ SNP_POLICY_MASK_DEBUG | \
+ SNP_POLICY_MASK_SINGLE_SOCKET | \
+ SNP_POLICY_MASK_CXL_ALLOW | \
+ SNP_POLICY_MASK_MEM_AES_256_XTS | \
+ SNP_POLICY_MASK_RAPL_DIS | \
+ SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM | \
+ SNP_POLICY_MASK_PAGE_SWAP_DISABLE)
+
+static u64 snp_supported_policy_bits __ro_after_init;
#define INITIAL_VMSA_GPA 0xFFFFFFFFF000
@@ -2143,6 +2147,10 @@ int sev_dev_get_attr(u32 group, u64 attr, u64 *val)
*val = sev_supported_vmsa_features;
return 0;
+ case KVM_X86_SNP_POLICY_BITS:
+ *val = snp_supported_policy_bits;
+ return 0;
+
default:
return -ENXIO;
}
@@ -2207,7 +2215,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (params.flags)
return -EINVAL;
- if (params.policy & ~SNP_POLICY_MASK_VALID)
+ if (params.policy & ~snp_supported_policy_bits)
return -EINVAL;
/* Check for policy bits that must be set */
@@ -3100,8 +3108,11 @@ out:
else if (sev_snp_supported)
sev_snp_supported = is_sev_snp_initialized();
- if (sev_snp_supported)
+ if (sev_snp_supported) {
+ snp_supported_policy_bits = sev_get_snp_policy_bits() &
+ KVM_SNP_POLICY_MASK_VALID;
nr_ciphertext_hiding_asids = init_args.max_snp_asid;
+ }
/*
* If ciphertext hiding is enabled, the joint SEV-ES/SEV-SNP
@@ -5085,10 +5096,10 @@ struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
/* Check if the SEV policy allows debugging */
if (sev_snp_guest(vcpu->kvm)) {
- if (!(sev->policy & SNP_POLICY_DEBUG))
+ if (!(sev->policy & SNP_POLICY_MASK_DEBUG))
return NULL;
} else {
- if (sev->policy & SEV_POLICY_NODBG)
+ if (sev->policy & SEV_POLICY_MASK_NODBG)
return NULL;
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9d29b2e7e855..f56c2d895011 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -272,6 +272,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
}
static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
+ int emul_type,
bool commit_side_effects)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -293,7 +294,7 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
if (unlikely(!commit_side_effects))
old_rflags = svm->vmcb->save.rflags;
- if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
+ if (!kvm_emulate_instruction(vcpu, emul_type))
return 0;
if (unlikely(!commit_side_effects))
@@ -311,11 +312,13 @@ done:
static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
- return __svm_skip_emulated_instruction(vcpu, true);
+ return __svm_skip_emulated_instruction(vcpu, EMULTYPE_SKIP, true);
}
-static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
+static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu, u8 vector)
{
+ const int emul_type = EMULTYPE_SKIP | EMULTYPE_SKIP_SOFT_INT |
+ EMULTYPE_SET_SOFT_INT_VECTOR(vector);
unsigned long rip, old_rip = kvm_rip_read(vcpu);
struct vcpu_svm *svm = to_svm(vcpu);
@@ -331,7 +334,7 @@ static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
* in use, the skip must not commit any side effects such as clearing
* the interrupt shadow or RFLAGS.RF.
*/
- if (!__svm_skip_emulated_instruction(vcpu, !nrips))
+ if (!__svm_skip_emulated_instruction(vcpu, emul_type, !nrips))
return -EIO;
rip = kvm_rip_read(vcpu);
@@ -367,7 +370,7 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu)
kvm_deliver_exception_payload(vcpu, ex);
if (kvm_exception_is_soft(ex->vector) &&
- svm_update_soft_interrupt_rip(vcpu))
+ svm_update_soft_interrupt_rip(vcpu, ex->vector))
return;
svm->vmcb->control.event_inj = ex->vector
@@ -1198,6 +1201,11 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
svm->vmcb = target_vmcb->ptr;
}
+static int svm_vcpu_precreate(struct kvm *kvm)
+{
+ return avic_alloc_physical_id_table(kvm);
+}
+
static int svm_vcpu_create(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
@@ -3442,13 +3450,8 @@ static bool svm_check_exit_valid(u64 exit_code)
static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
{
- vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
dump_vmcb(vcpu);
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = exit_code;
- vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ kvm_prepare_unexpected_reason_exit(vcpu, exit_code);
return 0;
}
@@ -3633,11 +3636,12 @@ static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
+ struct kvm_queued_interrupt *intr = &vcpu->arch.interrupt;
struct vcpu_svm *svm = to_svm(vcpu);
u32 type;
- if (vcpu->arch.interrupt.soft) {
- if (svm_update_soft_interrupt_rip(vcpu))
+ if (intr->soft) {
+ if (svm_update_soft_interrupt_rip(vcpu, intr->nr))
return;
type = SVM_EVTINJ_TYPE_SOFT;
@@ -3645,12 +3649,10 @@ static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
type = SVM_EVTINJ_TYPE_INTR;
}
- trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
- vcpu->arch.interrupt.soft, reinjected);
+ trace_kvm_inj_virq(intr->nr, intr->soft, reinjected);
++vcpu->stat.irq_injections;
- svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
- SVM_EVTINJ_VALID | type;
+ svm->vmcb->control.event_inj = intr->nr | SVM_EVTINJ_VALID | type;
}
void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
@@ -4251,7 +4253,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
clgi();
- kvm_load_guest_xsave_state(vcpu);
/*
* Hardware only context switches DEBUGCTL if LBR virtualization is
@@ -4294,7 +4295,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
update_debugctlmsr(vcpu->arch.host_debugctl);
- kvm_load_host_xsave_state(vcpu);
stgi();
/* Any pending NMI will happen here */
@@ -4326,14 +4326,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
- /*
- * We need to handle MC intercepts here before the vcpu has a chance to
- * change the physical cpu
- */
- if (unlikely(svm->vmcb->control.exit_code ==
- SVM_EXIT_EXCP_BASE + MC_VECTOR))
- svm_handle_mce(vcpu);
-
trace_kvm_exit(vcpu, KVM_ISA_SVM);
svm_complete_interrupts(vcpu);
@@ -4526,31 +4518,45 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
case SVM_EXIT_WRITE_CR0: {
unsigned long cr0, val;
- if (info->intercept == x86_intercept_cr_write)
+ /*
+ * Adjust the exit code accordingly if a CR other than CR0 is
+ * being written, and skip straight to the common handling as
+ * only CR0 has an additional selective intercept.
+ */
+ if (info->intercept == x86_intercept_cr_write && info->modrm_reg) {
icpt_info.exit_code += info->modrm_reg;
-
- if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
- info->intercept == x86_intercept_clts)
break;
+ }
- if (!(vmcb12_is_intercept(&svm->nested.ctl,
- INTERCEPT_SELECTIVE_CR0)))
+ /*
+ * Convert the exit_code to SVM_EXIT_CR0_SEL_WRITE if a
+ * selective CR0 intercept is triggered (the common logic will
+ * treat the selective intercept as being enabled). Note, the
+ * unconditional intercept has higher priority, i.e. this is
+ * only relevant if *only* the selective intercept is enabled.
+ */
+ if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_CR0_WRITE) ||
+ !(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))
break;
- cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
- val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
+ /* CLTS never triggers INTERCEPT_SELECTIVE_CR0 */
+ if (info->intercept == x86_intercept_clts)
+ break;
+ /* LMSW always triggers INTERCEPT_SELECTIVE_CR0 */
if (info->intercept == x86_intercept_lmsw) {
- cr0 &= 0xfUL;
- val &= 0xfUL;
- /* lmsw can't clear PE - catch this here */
- if (cr0 & X86_CR0_PE)
- val |= X86_CR0_PE;
+ icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+ break;
}
+ /*
+	 * MOV-to-CR0 only triggers INTERCEPT_SELECTIVE_CR0 if a bit outside
+	 * of SVM_CR0_SELECTIVE_MASK is changed.
+ */
+ cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
+ val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
if (cr0 ^ val)
icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
-
break;
}
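[Illustration only, not part of the diff: assuming SVM_CR0_SELECTIVE_MASK covers only CR0.TS and CR0.MP (its definition in svm.h), a MOV to CR0 that merely toggles TS or MP leaves cr0 == val above and falls through without the selective exit code, while a write that also changes e.g. CR0.PG differs outside the mask and is converted to SVM_EXIT_CR0_SEL_WRITE.]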
case SVM_EXIT_READ_DR0:
@@ -4622,8 +4628,16 @@ out:
static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
- if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
+ switch (to_svm(vcpu)->vmcb->control.exit_code) {
+ case SVM_EXIT_EXCP_BASE + MC_VECTOR:
+ svm_handle_mce(vcpu);
+ break;
+ case SVM_EXIT_INTR:
vcpu->arch.at_instruction_boundary = true;
+ break;
+ default:
+ break;
+ }
}
static void svm_setup_mce(struct kvm_vcpu *vcpu)
@@ -5012,6 +5026,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = {
.emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu,
.has_emulated_msr = svm_has_emulated_msr,
+ .vcpu_precreate = svm_vcpu_precreate,
.vcpu_create = svm_vcpu_create,
.vcpu_free = svm_vcpu_free,
.vcpu_reset = svm_vcpu_reset,
@@ -5316,7 +5331,9 @@ static __init int svm_hardware_setup(void)
if (nested) {
pr_info("Nested Virtualization enabled\n");
- kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
+ kvm_enable_efer_bits(EFER_SVME);
+ if (!boot_cpu_has(X86_FEATURE_EFER_LMSLE_MBZ))
+ kvm_enable_efer_bits(EFER_LMSLE);
r = nested_svm_init_msrpm_merge_offsets();
if (r)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index dd78e6402345..9e151dbdef25 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -117,9 +117,6 @@ struct kvm_sev_info {
cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */
};
-#define SEV_POLICY_NODBG BIT_ULL(0)
-#define SNP_POLICY_DEBUG BIT_ULL(19)
-
struct kvm_svm {
struct kvm kvm;
@@ -807,6 +804,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
bool __init avic_hardware_setup(void);
void avic_hardware_unsetup(void);
+int avic_alloc_physical_id_table(struct kvm *kvm);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 235c4af6b692..3392bcadfb89 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -52,11 +52,23 @@
* there must not be any returns or indirect branches between this code
* and vmentry.
*/
- movl SVM_spec_ctrl(%_ASM_DI), %eax
- cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax
+#ifdef CONFIG_X86_64
+ mov SVM_spec_ctrl(%rdi), %rdx
+ cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx
+ je 801b
+ movl %edx, %eax
+ shr $32, %rdx
+#else
+ mov SVM_spec_ctrl(%edi), %eax
+ mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx
+ xor %eax, %ecx
+ mov SVM_spec_ctrl + 4(%edi), %edx
+ mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %esi
+ xor %edx, %esi
+ or %esi, %ecx
je 801b
+#endif
mov $MSR_IA32_SPEC_CTRL, %ecx
- xor %edx, %edx
wrmsr
jmp 801b
.endm
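[Illustration only, not part of the diff: a minimal C sketch of what the 64-bit RESTORE_GUEST_SPEC_CTRL path above encodes. The helper name is hypothetical; the real logic must stay in assembly because no returns or indirect branches are allowed between it and VMRUN.]

	static __always_inline void restore_guest_spec_ctrl_sketch(u64 guest_spec_ctrl)
	{
		/* Skip the expensive WRMSR when guest and host values already match. */
		if (guest_spec_ctrl == this_cpu_read(x86_spec_ctrl_current))
			return;

		/*
		 * wrmsrl() splits the 64-bit value into EAX (low) and EDX (high),
		 * which the "movl %edx, %eax; shr $32, %rdx" sequence does by hand.
		 */
		wrmsrl(MSR_IA32_SPEC_CTRL, guest_spec_ctrl);
	}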
@@ -81,17 +93,31 @@
jnz 998f
rdmsr
movl %eax, SVM_spec_ctrl(%_ASM_DI)
+ movl %edx, SVM_spec_ctrl + 4(%_ASM_DI)
998:
-
/* Now restore the host value of the MSR if different from the guest's. */
- movl PER_CPU_VAR(x86_spec_ctrl_current), %eax
- cmp SVM_spec_ctrl(%_ASM_DI), %eax
+#ifdef CONFIG_X86_64
+ mov PER_CPU_VAR(x86_spec_ctrl_current), %rdx
+ cmp SVM_spec_ctrl(%rdi), %rdx
je 901b
- xor %edx, %edx
+ movl %edx, %eax
+ shr $32, %rdx
+#else
+ mov PER_CPU_VAR(x86_spec_ctrl_current), %eax
+ mov SVM_spec_ctrl(%edi), %esi
+ xor %eax, %esi
+ mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edx
+ mov SVM_spec_ctrl + 4(%edi), %edi
+ xor %edx, %edi
+ or %edi, %esi
+ je 901b
+#endif
wrmsr
jmp 901b
.endm
+#define SVM_CLEAR_CPU_BUFFERS \
+ ALTERNATIVE "", __CLEAR_CPU_BUFFERS, X86_FEATURE_CLEAR_CPU_BUF_VM
/**
* __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
@@ -134,7 +160,7 @@ SYM_FUNC_START(__svm_vcpu_run)
mov %_ASM_ARG1, %_ASM_DI
.endif
- /* Clobbers RAX, RCX, RDX. */
+ /* Clobbers RAX, RCX, RDX (and ESI on 32-bit), consumes RDI (@svm). */
RESTORE_GUEST_SPEC_CTRL
/*
@@ -170,7 +196,7 @@ SYM_FUNC_START(__svm_vcpu_run)
mov VCPU_RDI(%_ASM_DI), %_ASM_DI
/* Clobbers EFLAGS.ZF */
- VM_CLEAR_CPU_BUFFERS
+ SVM_CLEAR_CPU_BUFFERS
/* Enter guest mode */
3: vmrun %_ASM_AX
@@ -211,7 +237,10 @@ SYM_FUNC_START(__svm_vcpu_run)
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT
- /* Clobbers RAX, RCX, RDX. */
+ /*
+ * Clobbers RAX, RCX, RDX (and ESI, EDI on 32-bit), consumes RDI (@svm)
+ * and RSP (pointer to @spec_ctrl_intercepted).
+ */
RESTORE_HOST_SPEC_CTRL
/*
@@ -331,7 +360,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
mov %rdi, SEV_ES_RDI (%rdx)
mov %rsi, SEV_ES_RSI (%rdx)
- /* Clobbers RAX, RCX, RDX (@hostsa). */
+ /* Clobbers RAX, RCX, and RDX (@hostsa), consumes RDI (@svm). */
RESTORE_GUEST_SPEC_CTRL
/* Get svm->current_vmcb->pa into RAX. */
@@ -339,7 +368,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
mov KVM_VMCB_pa(%rax), %rax
/* Clobbers EFLAGS.ZF */
- VM_CLEAR_CPU_BUFFERS
+ SVM_CLEAR_CPU_BUFFERS
/* Enter guest mode */
1: vmrun %rax
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 0eb2773b2ae2..a46ccd670785 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -831,6 +831,14 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
return tdx_vcpu_ioctl(vcpu, argp);
}
+static int vt_vcpu_mem_enc_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ if (!is_td_vcpu(vcpu))
+ return -EINVAL;
+
+ return tdx_vcpu_unlocked_ioctl(vcpu, argp);
+}
+
static int vt_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
bool is_private)
{
@@ -1005,6 +1013,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl),
.vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl),
+ .vcpu_mem_enc_unlocked_ioctl = vt_op_tdx_only(vcpu_mem_enc_unlocked_ioctl),
.gmem_max_mapping_level = vt_op_tdx_only(gmem_max_mapping_level)
};
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index bcea087b642f..40777278eabb 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -23,8 +23,8 @@
static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
-static bool __read_mostly nested_early_check = 0;
-module_param(nested_early_check, bool, S_IRUGO);
+static bool __ro_after_init warn_on_missed_cc;
+module_param(warn_on_missed_cc, bool, 0444);
#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
@@ -555,6 +555,9 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
return -EINVAL;
+ if (CC(!nested_cpu_has_vid(vmcs12) && vmcs12->tpr_threshold >> 4))
+ return -EINVAL;
+
return 0;
}
@@ -761,7 +764,7 @@ static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
vmcs12->vmcs_link_pointer, VMCS12_SIZE))
return;
- kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
+ kvm_read_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu),
VMCS12_SIZE);
}
@@ -780,7 +783,7 @@ static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
vmcs12->vmcs_link_pointer, VMCS12_SIZE))
return;
- kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
+ kvm_write_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu),
VMCS12_SIZE);
}
@@ -2296,15 +2299,6 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
return;
vmx->nested.vmcs02_initialized = true;
- /*
- * We don't care what the EPTP value is we just need to guarantee
- * it's valid so we don't get a false positive when doing early
- * consistency checks.
- */
- if (enable_ept && nested_early_check)
- vmcs_write64(EPT_POINTER,
- construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
-
if (vmx->ve_info)
vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));
@@ -2749,7 +2743,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
vcpu->arch.pat = vmcs12->guest_ia32_pat;
} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
- vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
+ vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat);
}
vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
@@ -2961,6 +2955,10 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
}
}
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING) &&
+ CC(!vmcs12->tsc_multiplier))
+ return -EINVAL;
+
return 0;
}
@@ -3078,6 +3076,38 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
return 0;
}
+static int nested_vmx_check_controls_late(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ void *vapic = to_vmx(vcpu)->nested.virtual_apic_map.hva;
+ u32 vtpr = vapic ? (*(u32 *)(vapic + APIC_TASKPRI)) >> 4 : 0;
+
+ /*
+ * Don't bother with the consistency checks if KVM isn't configured to
+ * WARN on missed consistency checks, as KVM needs to rely on hardware
+ * to fully detect an illegal vTPR vs. TRP Threshold combination due to
+ * the vTPR being writable by L1 at all times (it's an in-memory value,
+ * not a VMCS field). I.e. even if the check passes now, it might fail
+ * at the actual VM-Enter.
+ *
+ * Keying off the module param also allows treating an invalid vAPIC
+ * mapping as a consistency check failure without increasing the risk
+ * of breaking a "real" VM.
+ */
+ if (!warn_on_missed_cc)
+ return 0;
+
+ if ((exec_controls_get(to_vmx(vcpu)) & CPU_BASED_TPR_SHADOW) &&
+ nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) &&
+ !nested_cpu_has_vid(vmcs12) &&
+ !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+ (CC(!vapic) ||
+ CC((vmcs12->tpr_threshold & GENMASK(3, 0)) > (vtpr & GENMASK(3, 0)))))
+ return -EINVAL;
+
+ return 0;
+}
+
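[Illustration only, not part of the diff: a worked example of the late check above. If vmcs12->tpr_threshold is 5 and L1's in-memory VTPR is 0x30, then vtpr (already shifted right by 4) is 3, so 5 > 3 and the check fails, mirroring the architectural rule that TPR threshold bits 3:0 must not exceed VTPR[7:4] when the TPR shadow is used without virtual-interrupt delivery or virtualized APIC accesses.]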
static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
@@ -3333,84 +3363,6 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
return 0;
}
-static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr3, cr4;
- bool vm_fail;
-
- if (!nested_early_check)
- return 0;
-
- if (vmx->msr_autoload.host.nr)
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
- if (vmx->msr_autoload.guest.nr)
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
-
- preempt_disable();
-
- vmx_prepare_switch_to_guest(vcpu);
-
- /*
- * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
- * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
- * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
- * there is no need to preserve other bits or save/restore the field.
- */
- vmcs_writel(GUEST_RFLAGS, 0);
-
- cr3 = __get_current_cr3_fast();
- if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
- vmcs_writel(HOST_CR3, cr3);
- vmx->loaded_vmcs->host_state.cr3 = cr3;
- }
-
- cr4 = cr4_read_shadow();
- if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
- vmcs_writel(HOST_CR4, cr4);
- vmx->loaded_vmcs->host_state.cr4 = cr4;
- }
-
- vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
- __vmx_vcpu_run_flags(vmx));
-
- if (vmx->msr_autoload.host.nr)
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
- if (vmx->msr_autoload.guest.nr)
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
-
- if (vm_fail) {
- u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
-
- preempt_enable();
-
- trace_kvm_nested_vmenter_failed(
- "early hardware check VM-instruction error: ", error);
- WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
- }
-
- /*
- * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
- */
- if (hw_breakpoint_active())
- set_debugreg(__this_cpu_read(cpu_dr7), 7);
- local_irq_enable();
- preempt_enable();
-
- /*
- * A non-failing VMEntry means we somehow entered guest mode with
- * an illegal RIP, and that's just the tip of the iceberg. There
- * is no telling what memory has been modified or what state has
- * been exposed to unknown code. Hitting this all but guarantees
- * a (very critical) hardware issue.
- */
- WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
- VMX_EXIT_REASONS_FAILED_VMENTRY));
-
- return 0;
-}
-
#ifdef CONFIG_KVM_HYPERV
static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
{
@@ -3667,22 +3619,18 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
&vmx->nested.pre_vmenter_ssp_tbl);
/*
- * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
- * nested early checks are disabled. In the event of a "late" VM-Fail,
- * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
- * software model to the pre-VMEntry host state. When EPT is disabled,
- * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
- * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
- * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
- * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
- * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
- * guaranteed to be overwritten with a shadow CR3 prior to re-entering
- * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
- * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
- * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
- * path would need to manually save/restore vmcs01.GUEST_CR3.
+ * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled. In the
+ * event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but
+ * not KVM, KVM must unwind its software model to the pre-VM-Entry host
+ * state. When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not
+ * L1's "real" CR3, which causes nested_vmx_restore_host_state() to
+ * corrupt vcpu->arch.cr3. Stuffing vmcs01.GUEST_CR3 results in the
+ * unwind naturally setting arch.cr3 to the correct value. Smashing
+ * vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind,
+ * reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be
+ * overwritten with a shadow CR3 prior to re-entering L1.
*/
- if (!enable_ept && !nested_early_check)
+ if (!enable_ept)
vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
@@ -3695,7 +3643,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
}
- if (nested_vmx_check_vmentry_hw(vcpu)) {
+ if (nested_vmx_check_controls_late(vcpu, vmcs12)) {
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
return NVMX_VMENTRY_VMFAIL;
}
@@ -3880,7 +3828,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
goto vmentry_failed;
/* Hide L1D cache contents from the nested guest. */
- vmx->vcpu.arch.l1tf_flush_l1d = true;
+ kvm_request_l1tf_flush_l1d();
/*
* Must happen outside of nested_vmx_enter_non_root_mode() as it will
@@ -5164,12 +5112,13 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
/*
* The only expected VM-instruction error is "VM entry with
* invalid control field(s)." Anything else indicates a
- * problem with L0. And we should never get here with a
- * VMFail of any type if early consistency checks are enabled.
+ * problem with L0.
*/
WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- WARN_ON_ONCE(nested_early_check);
+
+ /* VM-Fail at VM-Entry means KVM missed a consistency check. */
+ WARN_ON_ONCE(warn_on_missed_cc);
}
/*
diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h
index 2f20fb170def..6a87a12135fb 100644
--- a/arch/x86/kvm/vmx/run_flags.h
+++ b/arch/x86/kvm/vmx/run_flags.h
@@ -2,12 +2,8 @@
#ifndef __KVM_X86_VMX_RUN_FLAGS_H
#define __KVM_X86_VMX_RUN_FLAGS_H
-#define VMX_RUN_VMRESUME_SHIFT 0
-#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1
-#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT 2
-
-#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT)
-#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT)
-#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT)
+#define VMX_RUN_VMRESUME BIT(0)
+#define VMX_RUN_SAVE_SPEC_CTRL BIT(1)
+#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(2)
#endif /* __KVM_X86_VMX_RUN_FLAGS_H */
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 0a49c863c811..2d7a4d52ccfb 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -24,20 +24,33 @@
#undef pr_fmt
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define pr_tdx_error(__fn, __err) \
- pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err)
+#define __TDX_BUG_ON(__err, __f, __kvm, __fmt, __args...) \
+({ \
+ struct kvm *_kvm = (__kvm); \
+ bool __ret = !!(__err); \
+ \
+ if (WARN_ON_ONCE(__ret && (!_kvm || !_kvm->vm_bugged))) { \
+ if (_kvm) \
+ kvm_vm_bugged(_kvm); \
+ pr_err_ratelimited("SEAMCALL " __f " failed: 0x%llx" __fmt "\n",\
+ __err, __args); \
+ } \
+ unlikely(__ret); \
+})
+
+#define TDX_BUG_ON(__err, __fn, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, "%s", "")
+
+#define TDX_BUG_ON_1(__err, __fn, a1, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx", a1)
+
+#define TDX_BUG_ON_2(__err, __fn, a1, a2, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx", a1, a2)
+
+#define TDX_BUG_ON_3(__err, __fn, a1, a2, a3, __kvm) \
+ __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 ", 0x%llx, " #a3 " 0x%llx", \
+ a1, a2, a3)
-#define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \
- pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__)
-
-#define pr_tdx_error_1(__fn, __err, __rcx) \
- __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx)
-
-#define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \
- __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx)
-
-#define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \
- __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8)
bool enable_tdx __ro_after_init;
module_param_named(tdx, enable_tdx, bool, 0444);
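[Illustration only, not part of the diff: the intended call pattern for the new TDX_BUG_ON helpers, mirroring the conversions later in this patch. On failure the VM is marked bugged and a rate-limited line such as "SEAMCALL TDH_MEM_PAGE_AUG failed: 0x..., entry 0x..., level_state 0x..." is logged (the exact text is inferred from the format strings above).]

	err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
	if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_AUG, entry, level_state, kvm))
		return -EIO;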
@@ -281,25 +294,34 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
vcpu->cpu = -1;
}
-static void tdx_no_vcpus_enter_start(struct kvm *kvm)
-{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
-
- kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
-}
-
-static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
-{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
-}
+/*
+ * Execute a SEAMCALL related to removing/blocking S-EPT entries, with a single
+ * retry (if necessary) after forcing vCPUs to exit and wait for the operation
+ * to complete. All flows that remove/block S-EPT entries run with mmu_lock
+ * held for write, i.e. are mutually exclusive with each other, but they aren't
+ * mutually exclusive with running vCPUs, and so can fail with "operand busy"
+ * if a vCPU acquires a relevant lock in the TDX-Module, e.g. when doing TDCALL.
+ *
+ * Note, the retry is guaranteed to succeed, absent KVM and/or TDX-Module bugs.
+ */
+#define tdh_do_no_vcpus(tdh_func, kvm, args...) \
+({ \
+ struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm); \
+ u64 __err; \
+ \
+ lockdep_assert_held_write(&kvm->mmu_lock); \
+ \
+ __err = tdh_func(args); \
+ if (unlikely(tdx_operand_busy(__err))) { \
+ WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true); \
+ kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); \
+ \
+ __err = tdh_func(args); \
+ \
+ WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false); \
+ } \
+ __err; \
+})
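[Illustration only, not part of the diff: a usage sketch matching tdx_track() later in this patch. The SEAMCALL is issued once; only on an "operand busy" error are vCPUs forced out of the guest for a single retry, which is expected to succeed.]

	err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td);
	TDX_BUG_ON(err, TDH_MEM_TRACK, kvm);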
/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
static int __tdx_reclaim_page(struct page *page)
@@ -313,10 +335,9 @@ static int __tdx_reclaim_page(struct page *page)
* before the HKID is released and control pages have also been
* released at this point, so there is no possibility of contention.
*/
- if (WARN_ON_ONCE(err)) {
- pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
+ if (TDX_BUG_ON_3(err, TDH_PHYMEM_PAGE_RECLAIM, rcx, rdx, r8, NULL))
return -EIO;
- }
+
return 0;
}
@@ -404,8 +425,8 @@ static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
return;
smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
- if (KVM_BUG_ON(arg.err, vcpu->kvm))
- pr_tdx_error(TDH_VP_FLUSH, arg.err);
+
+ TDX_BUG_ON(arg.err, TDH_VP_FLUSH, vcpu->kvm);
}
void tdx_disable_virtualization_cpu(void)
@@ -464,8 +485,7 @@ static void smp_func_do_phymem_cache_wb(void *unused)
}
out:
- if (WARN_ON_ONCE(err))
- pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
+ TDX_BUG_ON(err, TDH_PHYMEM_CACHE_WB, NULL);
}
void tdx_mmu_release_hkid(struct kvm *kvm)
@@ -504,8 +524,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
err = tdh_mng_vpflushdone(&kvm_tdx->td);
if (err == TDX_FLUSHVP_NOT_DONE)
goto out;
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error(TDH_MNG_VPFLUSHDONE, err);
+ if (TDX_BUG_ON(err, TDH_MNG_VPFLUSHDONE, kvm)) {
pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
kvm_tdx->hkid);
goto out;
@@ -528,8 +547,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
* tdh_mng_key_freeid() will fail.
*/
err = tdh_mng_key_freeid(&kvm_tdx->td);
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error(TDH_MNG_KEY_FREEID, err);
+ if (TDX_BUG_ON(err, TDH_MNG_KEY_FREEID, kvm)) {
pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
kvm_tdx->hkid);
} else {
@@ -580,10 +598,9 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm)
* when it is reclaiming TDCS).
*/
err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
+ if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
return;
- }
+
tdx_quirk_reset_page(kvm_tdx->td.tdr_page);
__free_page(kvm_tdx->td.tdr_page);
@@ -606,11 +623,8 @@ static int tdx_do_tdh_mng_key_config(void *param)
/* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
err = tdh_mng_key_config(&kvm_tdx->td);
-
- if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
- pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
+ if (TDX_BUG_ON(err, TDH_MNG_KEY_CONFIG, &kvm_tdx->kvm))
return -EIO;
- }
return 0;
}
@@ -763,25 +777,6 @@ static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
}
-/*
- * Compared to vmx_prepare_switch_to_guest(), there is not much to do
- * as SEAMCALL/SEAMRET calls take care of most of save and restore.
- */
-void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vt *vt = to_vt(vcpu);
-
- if (vt->guest_state_loaded)
- return;
-
- if (likely(is_64bit_mm(current->mm)))
- vt->msr_host_kernel_gs_base = current->thread.gsbase;
- else
- vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
-
- vt->guest_state_loaded = true;
-}
-
struct tdx_uret_msr {
u32 msr;
unsigned int slot;
@@ -795,19 +790,38 @@ static struct tdx_uret_msr tdx_uret_msrs[] = {
{.msr = MSR_TSC_AUX,},
};
-static void tdx_user_return_msr_update_cache(void)
+void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vt *vt = to_vt(vcpu);
int i;
+ if (vt->guest_state_loaded)
+ return;
+
+ if (likely(is_64bit_mm(current->mm)))
+ vt->msr_host_kernel_gs_base = current->thread.gsbase;
+ else
+ vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+
+ vt->guest_state_loaded = true;
+
+ /*
+ * Explicitly set user-return MSRs that are clobbered by the TDX-Module
+ * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be
+ * written by the TDX-Module. Don't rely on the TDX-Module to actually
+ * clobber the MSRs, as the contract is poorly defined and not upheld.
+ * E.g. the TDX-Module will synthesize an EPT Violation without doing
+ * VM-Enter if it suspects a zero-step attack, and never "restore" VMM
+ * state.
+ */
for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
- kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
- tdx_uret_msrs[i].defval);
+ kvm_set_user_return_msr(tdx_uret_msrs[i].slot,
+ tdx_uret_msrs[i].defval, -1ull);
}
static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
{
struct vcpu_vt *vt = to_vt(vcpu);
- struct vcpu_tdx *tdx = to_tdx(vcpu);
if (!vt->guest_state_loaded)
return;
@@ -815,11 +829,6 @@ static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
++vcpu->stat.host_state_reload;
wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
- if (tdx->guest_entered) {
- tdx_user_return_msr_update_cache();
- tdx->guest_entered = false;
- }
-
vt->guest_state_loaded = false;
}
@@ -829,19 +838,52 @@ void tdx_vcpu_put(struct kvm_vcpu *vcpu)
tdx_prepare_switch_to_host(vcpu);
}
+/*
+ * Life cycles for a TD and a vCPU:
+ * 1. KVM_CREATE_VM ioctl.
+ * TD state is TD_STATE_UNINITIALIZED.
+ * hkid is not assigned at this stage.
+ * 2. KVM_TDX_INIT_VM ioctl.
+ * TD transitions to TD_STATE_INITIALIZED.
+ * hkid is assigned after this stage.
+ * 3. KVM_CREATE_VCPU ioctl. (only when TD is TD_STATE_INITIALIZED).
+ * 3.1 tdx_vcpu_create() transitions vCPU state to VCPU_TD_STATE_UNINITIALIZED.
+ * 3.2 vcpu_load() and vcpu_put() in kvm_arch_vcpu_create().
+ * 3.3 (conditional) if any error encountered after kvm_arch_vcpu_create()
+ * kvm_arch_vcpu_destroy() --> tdx_vcpu_free().
+ * 4. KVM_TDX_INIT_VCPU ioctl.
+ * tdx_vcpu_init() transitions vCPU state to VCPU_TD_STATE_INITIALIZED.
+ * vCPU control structures are allocated at this stage.
+ * 5. kvm_destroy_vm().
+ * 5.1 tdx_mmu_release_hkid(): (1) tdh_vp_flush(), disassociates all vCPUs.
+ * (2) puts hkid to !assigned state.
+ * 5.2 kvm_destroy_vcpus() --> tdx_vcpu_free():
+ * transitions vCPU to VCPU_TD_STATE_UNINITIALIZED state.
+ * 5.3 tdx_vm_destroy()
+ * transitions TD to TD_STATE_UNINITIALIZED state.
+ *
+ * tdx_vcpu_free() can be invoked only at 3.3 or 5.2.
+ * - If at 3.3, hkid is still assigned, but the vCPU must be in
+ * VCPU_TD_STATE_UNINITIALIZED state.
+ * - If at 5.2, hkid must be !assigned and all vCPUs must be in
+ *   VCPU_TD_STATE_INITIALIZED state and have been disassociated.
+ */
void tdx_vcpu_free(struct kvm_vcpu *vcpu)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
struct vcpu_tdx *tdx = to_tdx(vcpu);
int i;
+ if (vcpu->cpu != -1) {
+ KVM_BUG_ON(tdx->state == VCPU_TD_STATE_INITIALIZED, vcpu->kvm);
+ tdx_flush_vp_on_cpu(vcpu);
+ return;
+ }
+
/*
* It is not possible to reclaim pages while hkid is assigned. It might
- * be assigned if:
- * 1. the TD VM is being destroyed but freeing hkid failed, in which
- * case the pages are leaked
- * 2. TD VCPU creation failed and this on the error path, in which case
- * there is nothing to do anyway
+ * be assigned if the TD VM is being destroyed but freeing hkid failed,
+ * in which case the pages are leaked.
*/
if (is_hkid_assigned(kvm_tdx))
return;
@@ -856,7 +898,7 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu)
}
if (tdx->vp.tdvpr_page) {
tdx_reclaim_control_page(tdx->vp.tdvpr_page);
- tdx->vp.tdvpr_page = 0;
+ tdx->vp.tdvpr_page = NULL;
tdx->vp.tdvpr_pa = 0;
}
@@ -1059,7 +1101,6 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
update_debugctlmsr(vcpu->arch.host_debugctl);
tdx_load_host_xsave_state(vcpu);
- tdx->guest_entered = true;
vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
@@ -1069,9 +1110,6 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
return EXIT_FASTPATH_NONE;
- if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
- kvm_machine_check();
-
trace_kvm_exit(vcpu, KVM_ISA_VMX);
if (unlikely(tdx_failed_vmentry(vcpu)))
@@ -1583,137 +1621,79 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
}
-static void tdx_unpin(struct kvm *kvm, struct page *page)
+static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ kvm_pfn_t pfn)
{
- put_page(page);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err, entry, level_state;
+ gpa_t gpa = gfn_to_gpa(gfn);
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm) ||
+ KVM_BUG_ON(!kvm_tdx->page_add_src, kvm))
+ return -EIO;
+
+ err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
+ kvm_tdx->page_add_src, &entry, &level_state);
+ if (unlikely(tdx_operand_busy(err)))
+ return -EBUSY;
+
+ if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_ADD, entry, level_state, kvm))
+ return -EIO;
+
+ return 0;
}
static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, struct page *page)
+ enum pg_level level, kvm_pfn_t pfn)
{
int tdx_level = pg_level_to_tdx_sept_level(level);
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct page *page = pfn_to_page(pfn);
gpa_t gpa = gfn_to_gpa(gfn);
u64 entry, level_state;
u64 err;
err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
- if (unlikely(tdx_operand_busy(err))) {
- tdx_unpin(kvm, page);
+ if (unlikely(tdx_operand_busy(err)))
return -EBUSY;
- }
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
- tdx_unpin(kvm, page);
+ if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_AUG, entry, level_state, kvm))
return -EIO;
- }
-
- return 0;
-}
-
-/*
- * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
- * callback tdx_gmem_post_populate() then maps pages into private memory.
- * through the a seamcall TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the
- * private EPT structures for the page to have been built before, which is
- * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that
- * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
- * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
- * are no half-initialized shared EPT pages.
- */
-static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn)
-{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
-
- if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
- return -EINVAL;
- /* nr_premapped will be decreased when tdh_mem_page_add() is called. */
- atomic64_inc(&kvm_tdx->nr_premapped);
return 0;
}
static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn)
+ enum pg_level level, u64 mirror_spte)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- struct page *page = pfn_to_page(pfn);
+ kvm_pfn_t pfn = spte_to_pfn(mirror_spte);
/* TODO: handle large pages. */
if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
- return -EINVAL;
+ return -EIO;
- /*
- * Because guest_memfd doesn't support page migration with
- * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
- * migration. Until guest_memfd supports page migration, prevent page
- * migration.
- * TODO: Once guest_memfd introduces callback on page migration,
- * implement it and remove get_page/put_page().
- */
- get_page(page);
+ WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) ||
+ (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK);
/*
- * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
- * barrier in tdx_td_finalize().
+ * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory()
+ * before kvm_tdx->state. Userspace must not be allowed to pre-fault
+ * arbitrary memory until the initial memory image is finalized. Pairs
+ * with the smp_wmb() in tdx_td_finalize().
*/
smp_rmb();
- if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
- return tdx_mem_page_aug(kvm, gfn, level, page);
-
- return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
-}
-
-static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, struct page *page)
-{
- int tdx_level = pg_level_to_tdx_sept_level(level);
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- gpa_t gpa = gfn_to_gpa(gfn);
- u64 err, entry, level_state;
-
- /* TODO: handle large pages. */
- if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
- return -EINVAL;
-
- if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
- return -EINVAL;
/*
- * When zapping private page, write lock is held. So no race condition
- * with other vcpu sept operation.
- * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
+ * If the TD isn't finalized/runnable, then userspace is initializing
+ * the VM image via KVM_TDX_INIT_MEM_REGION; ADD the page to the TD.
*/
- err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
- &level_state);
-
- if (unlikely(tdx_operand_busy(err))) {
- /*
- * The second retry is expected to succeed after kicking off all
- * other vCPUs and prevent them from invoking TDH.VP.ENTER.
- */
- tdx_no_vcpus_enter_start(kvm);
- err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
- &level_state);
- tdx_no_vcpus_enter_stop(kvm);
- }
-
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
- return -EIO;
- }
-
- err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
+ if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
+ return tdx_mem_page_add(kvm, gfn, level, pfn);
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
- return -EIO;
- }
- tdx_quirk_reset_page(page);
- tdx_unpin(kvm, page);
- return 0;
+ return tdx_mem_page_aug(kvm, gfn, level, pfn);
}
static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
@@ -1729,81 +1709,13 @@ static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
if (unlikely(tdx_operand_busy(err)))
return -EBUSY;
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
+ if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm))
return -EIO;
- }
return 0;
}
/*
- * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is
- * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called
- * successfully.
- *
- * Since tdh_mem_sept_add() must have been invoked successfully before a
- * non-leaf entry present in the mirrored page table, the SEPT ZAP related
- * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead
- * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the
- * SEPT.
- *
- * Further check if the returned entry from SEPT walking is with RWX permissions
- * to filter out anything unexpected.
- *
- * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from
- * level_state returned from a SEAMCALL error is the same as that passed into
- * the SEAMCALL.
- */
-static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
- u64 entry, int level)
-{
- if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
- return false;
-
- if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
- return false;
-
- if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
- return false;
-
- return true;
-}
-
-static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, struct page *page)
-{
- int tdx_level = pg_level_to_tdx_sept_level(level);
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level);
- u64 err, entry, level_state;
-
- /* For now large page isn't supported yet. */
- WARN_ON_ONCE(level != PG_LEVEL_4K);
-
- err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
-
- if (unlikely(tdx_operand_busy(err))) {
- /* After no vCPUs enter, the second retry is expected to succeed */
- tdx_no_vcpus_enter_start(kvm);
- err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
- tdx_no_vcpus_enter_stop(kvm);
- }
- if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
- !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
- atomic64_dec(&kvm_tdx->nr_premapped);
- tdx_unpin(kvm, page);
- return 0;
- }
-
- if (KVM_BUG_ON(err, kvm)) {
- pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
- return -EIO;
- }
- return 1;
-}
-
-/*
* Ensure shared and private EPTs to be flushed on all vCPUs.
* tdh_mem_track() is the only caller that increases TD epoch. An increase in
* the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
@@ -1836,18 +1748,15 @@ static void tdx_track(struct kvm *kvm)
if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
return;
+ /*
+ * The full sequence of TDH.MEM.TRACK and forcing vCPUs out of guest
+ * mode must be serialized, as TDH.MEM.TRACK will fail if the previous
+ * tracking epoch hasn't completed.
+ */
lockdep_assert_held_write(&kvm->mmu_lock);
- err = tdh_mem_track(&kvm_tdx->td);
- if (unlikely(tdx_operand_busy(err))) {
- /* After no vCPUs enter, the second retry is expected to succeed */
- tdx_no_vcpus_enter_start(kvm);
- err = tdh_mem_track(&kvm_tdx->td);
- tdx_no_vcpus_enter_stop(kvm);
- }
-
- if (KVM_BUG_ON(err, kvm))
- pr_tdx_error(TDH_MEM_TRACK, err);
+ err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td);
+ TDX_BUG_ON(err, TDH_MEM_TRACK, kvm);
kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}
@@ -1866,7 +1775,7 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
* and slot move/deletion.
*/
if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
- return -EINVAL;
+ return -EIO;
/*
* The HKID assigned to this TD was already freed and cache was
@@ -1875,11 +1784,16 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
return tdx_reclaim_page(virt_to_page(private_spt));
}
-static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn)
+static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, u64 mirror_spte)
{
- struct page *page = pfn_to_page(pfn);
- int ret;
+ struct page *page = pfn_to_page(spte_to_pfn(mirror_spte));
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u64 err, entry, level_state;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
/*
* HKID is released after all private pages have been removed, and set
@@ -1887,11 +1801,16 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
* there can't be anything populated in the private EPT.
*/
if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
- return -EINVAL;
+ return;
- ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
- if (ret <= 0)
- return ret;
+ /* TODO: handle large pages. */
+ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+ return;
+
+ err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa,
+ tdx_level, &entry, &level_state);
+ if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm))
+ return;
/*
* TDX requires TLB tracking before dropping private page. Do
@@ -1899,7 +1818,21 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
*/
tdx_track(kvm);
- return tdx_sept_drop_private_spte(kvm, gfn, level, page);
+ /*
+	 * mmu_lock is held for write when zapping private pages, so there is no
+	 * race with S-EPT operations from other vCPUs. Races with TDH.VP.ENTER
+	 * (due to the 0-step mitigation) and guest TDCALLs are still possible.
+ */
+ err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa,
+ tdx_level, &entry, &level_state);
+ if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm))
+ return;
+
+ err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
+ if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm))
+ return;
+
+ tdx_quirk_reset_page(page);
}
void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
@@ -2145,11 +2078,7 @@ int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
}
unhandled_exit:
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = vp_enter_ret;
- vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ kvm_prepare_unexpected_reason_exit(vcpu, vp_enter_ret);
return 0;
}
@@ -2282,37 +2211,28 @@ static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
if (cmd->flags)
return -EINVAL;
- caps = kzalloc(sizeof(*caps) +
- sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
- GFP_KERNEL);
- if (!caps)
- return -ENOMEM;
-
user_caps = u64_to_user_ptr(cmd->data);
- if (get_user(nr_user_entries, &user_caps->cpuid.nent)) {
- ret = -EFAULT;
- goto out;
- }
+ if (get_user(nr_user_entries, &user_caps->cpuid.nent))
+ return -EFAULT;
- if (nr_user_entries < td_conf->num_cpuid_config) {
- ret = -E2BIG;
- goto out;
- }
+ if (nr_user_entries < td_conf->num_cpuid_config)
+ return -E2BIG;
+
+ caps = kzalloc(struct_size(caps, cpuid.entries,
+ td_conf->num_cpuid_config), GFP_KERNEL);
+ if (!caps)
+ return -ENOMEM;
ret = init_kvm_tdx_caps(td_conf, caps);
if (ret)
goto out;
- if (copy_to_user(user_caps, caps, sizeof(*caps))) {
+ if (copy_to_user(user_caps, caps, struct_size(caps, cpuid.entries,
+ caps->cpuid.nent))) {
ret = -EFAULT;
goto out;
}
- if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
- caps->cpuid.nent *
- sizeof(caps->cpuid.entries[0])))
- ret = -EFAULT;
-
out:
/* kfree() accepts NULL. */
kfree(caps);
@@ -2537,8 +2457,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
goto free_packages;
}
- if (WARN_ON_ONCE(err)) {
- pr_tdx_error(TDH_MNG_CREATE, err);
+ if (TDX_BUG_ON(err, TDH_MNG_CREATE, kvm)) {
ret = -EIO;
goto free_packages;
}
@@ -2579,8 +2498,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
ret = -EAGAIN;
goto teardown;
}
- if (WARN_ON_ONCE(err)) {
- pr_tdx_error(TDH_MNG_ADDCX, err);
+ if (TDX_BUG_ON(err, TDH_MNG_ADDCX, kvm)) {
ret = -EIO;
goto teardown;
}
@@ -2597,8 +2515,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
*seamcall_err = err;
ret = -EINVAL;
goto teardown;
- } else if (WARN_ON_ONCE(err)) {
- pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
+ } else if (TDX_BUG_ON_1(err, TDH_MNG_INIT, rcx, kvm)) {
ret = -EIO;
goto teardown;
}
@@ -2642,7 +2559,7 @@ free_tdcs:
free_tdr:
if (tdr_page)
__free_page(tdr_page);
- kvm_tdx->td.tdr_page = 0;
+ kvm_tdx->td.tdr_page = NULL;
free_hkid:
tdx_hkid_free(kvm_tdx);
@@ -2747,11 +2664,53 @@ err_out:
return -EIO;
}
+typedef void *tdx_vm_state_guard_t;
+
+static tdx_vm_state_guard_t tdx_acquire_vm_state_locks(struct kvm *kvm)
+{
+ int r;
+
+ mutex_lock(&kvm->lock);
+
+ if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) {
+ r = -EBUSY;
+ goto out_err;
+ }
+
+ r = kvm_lock_all_vcpus(kvm);
+ if (r)
+ goto out_err;
+
+ /*
+ * Note the unintuitive ordering! vcpu->mutex must be taken outside
+ * kvm->slots_lock!
+ */
+ mutex_lock(&kvm->slots_lock);
+ return kvm;
+
+out_err:
+ mutex_unlock(&kvm->lock);
+ return ERR_PTR(r);
+}
+
+static void tdx_release_vm_state_locks(struct kvm *kvm)
+{
+ mutex_unlock(&kvm->slots_lock);
+ kvm_unlock_all_vcpus(kvm);
+ mutex_unlock(&kvm->lock);
+}
+
+DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t,
+ if (!IS_ERR(_T)) tdx_release_vm_state_locks(_T),
+ tdx_acquire_vm_state_locks(kvm), struct kvm *kvm);
+
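[Illustration only, not part of the diff: minimal usage of the guard class, matching tdx_vm_ioctl() below. kvm->lock, every vcpu->mutex and kvm->slots_lock are held for the rest of the scope and released automatically when "guard" goes out of scope.]

	CLASS(tdx_vm_state_guard, guard)(kvm);
	if (IS_ERR(guard))
		return PTR_ERR(guard);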
static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
+ struct kvm_tdx_init_vm __user *user_data = u64_to_user_ptr(cmd->data);
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
struct kvm_tdx_init_vm *init_vm;
struct td_params *td_params = NULL;
+ u32 nr_user_entries;
int ret;
BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
@@ -2763,28 +2722,16 @@ static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
if (cmd->flags)
return -EINVAL;
- init_vm = kmalloc(sizeof(*init_vm) +
- sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
- GFP_KERNEL);
- if (!init_vm)
- return -ENOMEM;
-
- if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
- ret = -EFAULT;
- goto out;
- }
+ if (get_user(nr_user_entries, &user_data->cpuid.nent))
+ return -EFAULT;
- if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
- ret = -E2BIG;
- goto out;
- }
+ if (nr_user_entries > KVM_MAX_CPUID_ENTRIES)
+ return -E2BIG;
- if (copy_from_user(init_vm->cpuid.entries,
- u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
- flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
- ret = -EFAULT;
- goto out;
- }
+ init_vm = memdup_user(user_data,
+ struct_size(user_data, cpuid.entries, nr_user_entries));
+ if (IS_ERR(init_vm))
+ return PTR_ERR(init_vm);
if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
ret = -EINVAL;
@@ -2868,24 +2815,14 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
- guard(mutex)(&kvm->slots_lock);
-
if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
return -EINVAL;
- /*
- * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue
- * TDH.MEM.PAGE.ADD().
- */
- if (atomic64_read(&kvm_tdx->nr_premapped))
- return -EINVAL;
cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
if (tdx_operand_busy(cmd->hw_error))
return -EBUSY;
- if (KVM_BUG_ON(cmd->hw_error, kvm)) {
- pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error);
+ if (TDX_BUG_ON(cmd->hw_error, TDH_MR_FINALIZE, kvm))
return -EIO;
- }
kvm_tdx->state = TD_STATE_RUNNABLE;
/* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
@@ -2894,27 +2831,38 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
return 0;
}
-int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
+static int tdx_get_cmd(void __user *argp, struct kvm_tdx_cmd *cmd)
{
- struct kvm_tdx_cmd tdx_cmd;
- int r;
-
- if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd)))
+ if (copy_from_user(cmd, argp, sizeof(*cmd)))
return -EFAULT;
/*
- * Userspace should never set hw_error. It is used to fill
- * hardware-defined error by the kernel.
+ * Userspace should never set hw_error. KVM writes hw_error to report
+ * hardware-defined error back to userspace.
*/
- if (tdx_cmd.hw_error)
+ if (cmd->hw_error)
return -EINVAL;
- mutex_lock(&kvm->lock);
+ return 0;
+}
+
+int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_tdx_cmd tdx_cmd;
+ int r;
+
+ r = tdx_get_cmd(argp, &tdx_cmd);
+ if (r)
+ return r;
+
+ if (tdx_cmd.id == KVM_TDX_CAPABILITIES)
+ return tdx_get_capabilities(&tdx_cmd);
+
+ CLASS(tdx_vm_state_guard, guard)(kvm);
+ if (IS_ERR(guard))
+ return PTR_ERR(guard);
switch (tdx_cmd.id) {
- case KVM_TDX_CAPABILITIES:
- r = tdx_get_capabilities(&tdx_cmd);
- break;
case KVM_TDX_INIT_VM:
r = tdx_td_init(kvm, &tdx_cmd);
break;
@@ -2922,15 +2870,12 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
r = tdx_td_finalize(kvm, &tdx_cmd);
break;
default:
- r = -EINVAL;
- goto out;
+ return -EINVAL;
}
if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
- r = -EFAULT;
+ return -EFAULT;
-out:
- mutex_unlock(&kvm->lock);
return r;
}
@@ -2972,16 +2917,14 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
}
err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
- if (KVM_BUG_ON(err, vcpu->kvm)) {
+ if (TDX_BUG_ON(err, TDH_VP_CREATE, vcpu->kvm)) {
ret = -EIO;
- pr_tdx_error(TDH_VP_CREATE, err);
goto free_tdcx;
}
for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
- if (KVM_BUG_ON(err, vcpu->kvm)) {
- pr_tdx_error(TDH_VP_ADDCX, err);
+ if (TDX_BUG_ON(err, TDH_VP_ADDCX, vcpu->kvm)) {
/*
* Pages already added are reclaimed by the vcpu_free
* method, but the rest are freed here.
@@ -2994,10 +2937,19 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
}
}
- err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
- if (KVM_BUG_ON(err, vcpu->kvm)) {
- pr_tdx_error(TDH_VP_INIT, err);
- return -EIO;
+ /*
+ * tdh_vp_init() can take an exclusive lock of the TDR resource inside
+ * the TDX-Module. The TDR resource is also taken as shared in several
+ * no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention
+ * (TDX-Module locks are try-lock implementations with no slow path).
+ * Take mmu_lock for write to reflect the nature of the lock taken by
+ * the TDX-Module, and to ensure the no-fail MMU paths succeed, e.g. if
+ * a concurrent PUNCH_HOLE on guest_memfd triggers removal of SPTEs.
+ */
+ scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
+ err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
+ if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm))
+ return -EIO;
}
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -3016,7 +2968,7 @@ free_tdcx:
free_tdvpr:
if (tdx->vp.tdvpr_page)
__free_page(tdx->vp.tdvpr_page);
- tdx->vp.tdvpr_page = 0;
+ tdx->vp.tdvpr_page = NULL;
tdx->vp.tdvpr_pa = 0;
return ret;
@@ -3054,7 +3006,8 @@ static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_i
static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
{
- struct kvm_cpuid2 __user *output, *td_cpuid;
+ struct kvm_cpuid2 __user *output;
+ struct kvm_cpuid2 *td_cpuid;
int r = 0, i = 0, leaf;
u32 level;
@@ -3167,15 +3120,15 @@ struct tdx_gmem_post_populate_arg {
static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
void __user *src, int order, void *_arg)
{
- u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
struct tdx_gmem_post_populate_arg *arg = _arg;
- struct kvm_vcpu *vcpu = arg->vcpu;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err, entry, level_state;
gpa_t gpa = gfn_to_gpa(gfn);
- u8 level = PG_LEVEL_4K;
struct page *src_page;
int ret, i;
- u64 err, entry, level_state;
+
+ if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm))
+ return -EIO;
/*
* Get the source page if it has been faulted in. Return failure if the
@@ -3187,49 +3140,29 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
if (ret != 1)
return -ENOMEM;
- ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
- if (ret < 0)
- goto out;
-
- /*
- * The private mem cannot be zapped after kvm_tdp_map_page()
- * because all paths are covered by slots_lock and the
- * filemap invalidate lock. Check that they are indeed enough.
- */
- if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
- scoped_guard(read_lock, &kvm->mmu_lock) {
- if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) {
- ret = -EIO;
- goto out;
- }
- }
- }
+ kvm_tdx->page_add_src = src_page;
+ ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
+ kvm_tdx->page_add_src = NULL;
- ret = 0;
- err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
- src_page, &entry, &level_state);
- if (err) {
- ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO;
- goto out;
- }
+ put_page(src_page);
- if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
- atomic64_dec(&kvm_tdx->nr_premapped);
+ if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION))
+ return ret;
- if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
- for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
- err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry,
- &level_state);
- if (err) {
- ret = -EIO;
- break;
- }
- }
+ /*
+ * Note, MR.EXTEND can fail if the S-EPT mapping is somehow removed
+ * between mapping the pfn and now, but slots_lock prevents memslot
+ * updates, filemap_invalidate_lock() prevents guest_memfd updates,
+ * mmu_notifier events can't reach S-EPT entries, and KVM's internal
+ * zapping flows are mutually exclusive with S-EPT mappings.
+ */
+ for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
+ err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, &level_state);
+ if (TDX_BUG_ON_2(err, TDH_MR_EXTEND, entry, level_state, kvm))
+ return -EIO;
}
-out:
- put_page(src_page);
- return ret;
+ return 0;
}
static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
@@ -3245,8 +3178,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
if (tdx->state != VCPU_TD_STATE_INITIALIZED)
return -EINVAL;
- guard(mutex)(&kvm->slots_lock);
-
/* Once TD is finalized, the initial guest memory is fixed. */
if (kvm_tdx->state == TD_STATE_RUNNABLE)
return -EINVAL;
@@ -3264,7 +3195,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
!vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
return -EINVAL;
- kvm_mmu_reload(vcpu);
ret = 0;
while (region.nr_pages) {
if (signal_pending(current)) {
@@ -3301,28 +3231,57 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
return ret;
}
-int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
{
- struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
struct kvm_tdx_cmd cmd;
- int ret;
+ int r;
- if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
- return -EINVAL;
+ r = tdx_get_cmd(argp, &cmd);
+ if (r)
+ return r;
- if (copy_from_user(&cmd, argp, sizeof(cmd)))
- return -EFAULT;
+ CLASS(tdx_vm_state_guard, guard)(kvm);
+ if (IS_ERR(guard))
+ return PTR_ERR(guard);
- if (cmd.hw_error)
+ if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
return -EINVAL;
+ vcpu_load(vcpu);
+
switch (cmd.id) {
+ case KVM_TDX_INIT_MEM_REGION:
+ r = tdx_vcpu_init_mem_region(vcpu, &cmd);
+ break;
case KVM_TDX_INIT_VCPU:
- ret = tdx_vcpu_init(vcpu, &cmd);
+ r = tdx_vcpu_init(vcpu, &cmd);
break;
- case KVM_TDX_INIT_MEM_REGION:
- ret = tdx_vcpu_init_mem_region(vcpu, &cmd);
+ default:
+ r = -ENOIOCTLCMD;
break;
+ }
+
+ vcpu_put(vcpu);
+
+ return r;
+}
+
+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct kvm_tdx_cmd cmd;
+ int ret;
+
+ if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
+ return -EINVAL;
+
+ ret = tdx_get_cmd(argp, &cmd);
+ if (ret)
+ return ret;
+
+ switch (cmd.id) {
case KVM_TDX_GET_CPUID:
ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
break;
@@ -3447,10 +3406,6 @@ static int __init __tdx_bringup(void)
/*
* Check if MSRs (tdx_uret_msrs) can be saved/restored
* before returning to user space.
- *
- * this_cpu_ptr(user_return_msrs)->registered isn't checked
- * because the registration is done at vcpu runtime by
- * tdx_user_return_msr_update_cache().
*/
tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
if (tdx_uret_msrs[i].slot == -1) {
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index ca39a9391db1..45b5183ccb36 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -36,8 +36,12 @@ struct kvm_tdx {
struct tdx_td td;
- /* For KVM_TDX_INIT_MEM_REGION. */
- atomic64_t nr_premapped;
+ /*
+ * Scratch pointer used to pass the source page to tdx_mem_page_add().
+ * Protected by slots_lock, and non-NULL only when mapping a private
+ * pfn via tdx_gmem_post_populate().
+ */
+ struct page *page_add_src;
/*
* Prevent vCPUs from TD entry to ensure SEPT zap related SEAMCALLs do
@@ -67,7 +71,6 @@ struct vcpu_tdx {
u64 vp_enter_ret;
enum vcpu_tdx_state state;
- bool guest_entered;
u64 map_gpa_next;
u64 map_gpa_end;
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index bc255d709d8a..4426d34811fc 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -71,6 +71,7 @@
* @regs: unsigned long * (to guest registers)
* @flags: VMX_RUN_VMRESUME: use VMRESUME instead of VMLAUNCH
* VMX_RUN_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl
+ * VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO: vCPU can access host MMIO
*
* Returns:
* 0 on VM-Exit, 1 on VM-Fail
@@ -92,7 +93,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
/* Save @vmx for SPEC_CTRL handling */
push %_ASM_ARG1
- /* Save @flags for SPEC_CTRL handling */
+ /* Save @flags (used for VMLAUNCH vs. VMRESUME and mitigations). */
push %_ASM_ARG3
/*
@@ -101,9 +102,6 @@ SYM_FUNC_START(__vmx_vcpu_run)
*/
push %_ASM_ARG2
- /* Copy @flags to EBX, _ASM_ARG3 is volatile. */
- mov %_ASM_ARG3L, %ebx
-
lea (%_ASM_SP), %_ASM_ARG2
call vmx_update_host_rsp
@@ -118,13 +116,23 @@ SYM_FUNC_START(__vmx_vcpu_run)
* and vmentry.
*/
mov 2*WORD_SIZE(%_ASM_SP), %_ASM_DI
- movl VMX_spec_ctrl(%_ASM_DI), %edi
- movl PER_CPU_VAR(x86_spec_ctrl_current), %esi
- cmp %edi, %esi
+#ifdef CONFIG_X86_64
+ mov VMX_spec_ctrl(%rdi), %rdx
+ cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx
je .Lspec_ctrl_done
+ movl %edx, %eax
+ shr $32, %rdx
+#else
+ mov VMX_spec_ctrl(%edi), %eax
+ mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx
+ xor %eax, %ecx
+ mov VMX_spec_ctrl + 4(%edi), %edx
+ mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edi
+ xor %edx, %edi
+ or %edi, %ecx
+ je .Lspec_ctrl_done
+#endif
mov $MSR_IA32_SPEC_CTRL, %ecx
- xor %edx, %edx
- mov %edi, %eax
wrmsr
.Lspec_ctrl_done:
@@ -137,9 +145,6 @@ SYM_FUNC_START(__vmx_vcpu_run)
/* Load @regs to RAX. */
mov (%_ASM_SP), %_ASM_AX
- /* Check if vmlaunch or vmresume is needed */
- bt $VMX_RUN_VMRESUME_SHIFT, %ebx
-
/* Load guest registers. Don't clobber flags. */
mov VCPU_RCX(%_ASM_AX), %_ASM_CX
mov VCPU_RDX(%_ASM_AX), %_ASM_DX
@@ -160,11 +165,23 @@ SYM_FUNC_START(__vmx_vcpu_run)
/* Load guest RAX. This kills the @regs pointer! */
mov VCPU_RAX(%_ASM_AX), %_ASM_AX
- /* Clobbers EFLAGS.ZF */
- CLEAR_CPU_BUFFERS
-
- /* Check EFLAGS.CF from the VMX_RUN_VMRESUME bit test above. */
- jnc .Lvmlaunch
+ /*
+ * Note, ALTERNATIVE_2 works in reverse order. If CLEAR_CPU_BUF_VM is
+ * enabled, do VERW unconditionally. If CPU_BUF_VM_MMIO is enabled,
+ * check @flags to see if the vCPU has access to host MMIO, and if so,
+ * do VERW. Else, do nothing (no mitigations needed/enabled).
+ */
+ ALTERNATIVE_2 "", \
+ __stringify(testl $VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO, WORD_SIZE(%_ASM_SP); \
+ jz .Lskip_mmio_verw; \
+ VERW; \
+ .Lskip_mmio_verw:), \
+ X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO, \
+ __stringify(VERW), X86_FEATURE_CLEAR_CPU_BUF_VM
+
+ /* Check @flags to see if VMLAUNCH or VMRESUME is needed. */
+ testl $VMX_RUN_VMRESUME, WORD_SIZE(%_ASM_SP)
+ jz .Lvmlaunch
/*
* After a successful VMRESUME/VMLAUNCH, control flow "magically"
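With @flags no longer cached in EBX, both the CPU-buffer-clearing mitigation and the VMLAUNCH/VMRESUME selection read it straight off the stack. A condensed C sketch of the logic encoded by the ALTERNATIVE_2 and the testl above; verw(), vmlaunch(), vmresume() and cpu_has() stand in for the instructions and the alternatives patching, they are not real APIs:

	if (cpu_has(X86_FEATURE_CLEAR_CPU_BUF_VM)) {
		verw();					/* clear buffers on every entry */
	} else if (cpu_has(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) &&
		   (flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO)) {
		verw();					/* clear only for MMIO-capable vCPUs */
	}

	if (flags & VMX_RUN_VMRESUME)
		vmresume();
	else
		vmlaunch();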
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 91b6f2f3edc2..4cbe8c84b636 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -203,6 +203,7 @@ module_param(pt_mode, int, S_IRUGO);
struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;
+#ifdef CONFIG_CPU_MITIGATIONS
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);
@@ -225,7 +226,7 @@ static const struct {
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;
-static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
+static int __vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
struct page *page;
unsigned int i;
@@ -302,6 +303,26 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
return 0;
}
+static int vmx_setup_l1d_flush(void)
+{
+ /*
+	 * Hand in the mitigation parameter value that was stored by the
+	 * pre-module-init parser. If no parameter was given, it contains
+	 * 'auto', which is turned into the default 'cond' mitigation mode.
+ */
+ return __vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+}
+
+static void vmx_cleanup_l1d_flush(void)
+{
+ if (vmx_l1d_flush_pages) {
+ free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
+ vmx_l1d_flush_pages = NULL;
+ }
+ /* Restore state so sysfs ignores VMX */
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+}
+
static int vmentry_l1d_flush_parse(const char *s)
{
unsigned int i;
@@ -339,7 +360,7 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
}
mutex_lock(&vmx_l1d_flush_mutex);
- ret = vmx_setup_l1d_flush(l1tf);
+ ret = __vmx_setup_l1d_flush(l1tf);
mutex_unlock(&vmx_l1d_flush_mutex);
return ret;
}
@@ -352,6 +373,101 @@ static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}
+/*
+ * Software-based L1D cache flush, used when the microcode providing the
+ * cache control MSR is not loaded.
+ *
+ * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
+ * flushing it requires reading in 64 KiB because the replacement algorithm
+ * is not exactly LRU. This could be sized at runtime via topology
+ * information, but as all relevant affected CPUs have a 32 KiB L1D cache
+ * there is no point in doing so.
+ */
+static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ if (!static_branch_unlikely(&vmx_l1d_should_flush))
+ return;
+
+ /*
+ * This code is only executed when the flush mode is 'cond' or
+ * 'always'
+ */
+ if (static_branch_likely(&vmx_l1d_flush_cond)) {
+ /*
+		 * Clear the per-CPU flush bit; it gets set again if the vCPU
+		 * is reloaded, i.e. if the vCPU is scheduled out or if KVM
+		 * exits to userspace, if KVM reaches one of the unsafe VMEXIT
+		 * handlers (e.g. if KVM calls into the emulator), or from the
+		 * interrupt handlers.
+ */
+ if (!kvm_get_cpu_l1tf_flush_l1d())
+ return;
+ kvm_clear_cpu_l1tf_flush_l1d();
+ }
+
+ vcpu->stat.l1d_flush++;
+
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ return;
+ }
+
+ asm volatile(
+ /* First ensure the pages are in the TLB */
+ "xorl %%eax, %%eax\n"
+ ".Lpopulate_tlb:\n\t"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $4096, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lpopulate_tlb\n\t"
+ "xorl %%eax, %%eax\n\t"
+ "cpuid\n\t"
+ /* Now fill the cache */
+ "xorl %%eax, %%eax\n"
+ ".Lfill_cache:\n"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $64, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lfill_cache\n\t"
+ "lfence\n"
+ :: [flush_pages] "r" (vmx_l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ebx", "ecx", "edx");
+}
+
+#else /* CONFIG_CPU_MITIGATIONS */
+static int vmx_setup_l1d_flush(void)
+{
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NEVER;
+ return 0;
+}
+static void vmx_cleanup_l1d_flush(void)
+{
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+}
+static __always_inline void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+}
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+ pr_warn_once("Kernel compiled without mitigations, ignoring vmentry_l1d_flush\n");
+ return 0;
+}
+static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
+{
+ return sysfs_emit(s, "never\n");
+}
+#endif
+
+static const struct kernel_param_ops vmentry_l1d_flush_ops = {
+ .set = vmentry_l1d_flush_set,
+ .get = vmentry_l1d_flush_get,
+};
+module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
+
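As a quick reference, the restructured block above yields the following behavior (a condensed reading of the code, no new semantics):

/*
 *   CONFIG_CPU_MITIGATIONS=n : vmx_l1d_flush() is an empty stub; writes to
 *                              vmentry_l1d_flush warn once, reads return "never"
 *   vmentry_l1d_flush=never  : vmx_l1d_flush() returns immediately
 *   vmentry_l1d_flush=cond   : flush only when the per-CPU L1TF flag is set
 *   vmentry_l1d_flush=always : flush on every VM-entry
 *   no parameter ("auto")    : turned into the default 'cond' mode
 */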
static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
{
u64 msr;
@@ -404,12 +520,6 @@ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
vmx->disable_fb_clear = false;
}
-static const struct kernel_param_ops vmentry_l1d_flush_ops = {
- .set = vmentry_l1d_flush_set,
- .get = vmentry_l1d_flush_get,
-};
-module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
-
static u32 vmx_segment_access_rights(struct kvm_segment *var);
void vmx_vmexit(void);
@@ -752,7 +862,7 @@ static void __loaded_vmcs_clear(void *arg)
loaded_vmcs->launched = 0;
}
-void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
+static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
int cpu = loaded_vmcs->cpu;
@@ -903,7 +1013,7 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
flags |= VMX_RUN_SAVE_SPEC_CTRL;
- if (static_branch_unlikely(&cpu_buf_vm_clear) &&
+ if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) &&
kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;
@@ -3219,6 +3329,40 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
return to_vmx(vcpu)->vpid;
}
+static u64 construct_eptp(hpa_t root_hpa)
+{
+ u64 eptp = root_hpa | VMX_EPTP_MT_WB;
+ struct kvm_mmu_page *root;
+
+ if (kvm_mmu_is_dummy_root(root_hpa))
+ return eptp | VMX_EPTP_PWL_4;
+
+ /*
+ * EPT roots should always have an associated MMU page. Return a "bad"
+	 * EPTP to induce VM-Fail instead of continuing on in an unknown state.
+ */
+ root = root_to_sp(root_hpa);
+ if (WARN_ON_ONCE(!root))
+ return INVALID_PAGE;
+
+ eptp |= (root->role.level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+
+ if (enable_ept_ad_bits && !root->role.ad_disabled)
+ eptp |= VMX_EPTP_AD_ENABLE_BIT;
+
+ return eptp;
+}
+
+static void vmx_flush_tlb_ept_root(hpa_t root_hpa)
+{
+ u64 eptp = construct_eptp(root_hpa);
+
+ if (VALID_PAGE(eptp))
+ ept_sync_context(eptp);
+ else
+ ept_sync_global();
+}
+
void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -3229,8 +3373,7 @@ void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
return;
if (enable_ept)
- ept_sync_context(construct_eptp(vcpu, root_hpa,
- mmu->root_role.level));
+ vmx_flush_tlb_ept_root(root_hpa);
else
vpid_sync_context(vmx_get_current_vpid(vcpu));
}
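For context on the fields construct_eptp() composes, the EPTP layout it relies on is roughly as follows (a reminder only; the VMX_EPTP_* definitions are authoritative):

/*
 *   bits  2:0   memory type                    (VMX_EPTP_MT_WB)
 *   bits  5:3   page-walk length minus one     (VMX_EPTP_PWL_4 / VMX_EPTP_PWL_5)
 *   bit   6     accessed/dirty flags enable    (VMX_EPTP_AD_ENABLE_BIT)
 *   bits 63:12  physical address of the EPT root (root_hpa)
 */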
@@ -3396,30 +3539,16 @@ static int vmx_get_max_ept_level(void)
return 4;
}
-u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
-{
- u64 eptp = VMX_EPTP_MT_WB;
-
- eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
-
- if (enable_ept_ad_bits &&
- (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
- eptp |= VMX_EPTP_AD_ENABLE_BIT;
- eptp |= root_hpa;
-
- return eptp;
-}
-
void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
unsigned long guest_cr3;
- u64 eptp;
if (enable_ept) {
- eptp = construct_eptp(vcpu, root_hpa, root_level);
- vmcs_write64(EPT_POINTER, eptp);
+ KVM_MMU_WARN_ON(root_to_sp(root_hpa) &&
+ root_level != root_to_sp(root_hpa)->role.level);
+ vmcs_write64(EPT_POINTER, construct_eptp(root_hpa));
hv_track_root_tdp(vcpu, root_hpa);
@@ -6631,15 +6760,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
unexpected_vmexit:
- vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
- exit_reason.full);
dump_vmcs(vcpu);
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
- KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = exit_reason.full;
- vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ kvm_prepare_unexpected_reason_exit(vcpu, exit_reason.full);
return 0;
}
@@ -6661,77 +6783,6 @@ int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return ret;
}
-/*
- * Software based L1D cache flush which is used when microcode providing
- * the cache control MSR is not loaded.
- *
- * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
- * flush it is required to read in 64 KiB because the replacement algorithm
- * is not exactly LRU. This could be sized at runtime via topology
- * information but as all relevant affected CPUs have 32KiB L1D cache size
- * there is no point in doing so.
- */
-static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
-{
- int size = PAGE_SIZE << L1D_CACHE_ORDER;
-
- /*
- * This code is only executed when the flush mode is 'cond' or
- * 'always'
- */
- if (static_branch_likely(&vmx_l1d_flush_cond)) {
- bool flush_l1d;
-
- /*
- * Clear the per-vcpu flush bit, it gets set again if the vCPU
- * is reloaded, i.e. if the vCPU is scheduled out or if KVM
- * exits to userspace, or if KVM reaches one of the unsafe
- * VMEXIT handlers, e.g. if KVM calls into the emulator.
- */
- flush_l1d = vcpu->arch.l1tf_flush_l1d;
- vcpu->arch.l1tf_flush_l1d = false;
-
- /*
- * Clear the per-cpu flush bit, it gets set again from
- * the interrupt handlers.
- */
- flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
- kvm_clear_cpu_l1tf_flush_l1d();
-
- if (!flush_l1d)
- return;
- }
-
- vcpu->stat.l1d_flush++;
-
- if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
- native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
- return;
- }
-
- asm volatile(
- /* First ensure the pages are in the TLB */
- "xorl %%eax, %%eax\n"
- ".Lpopulate_tlb:\n\t"
- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
- "addl $4096, %%eax\n\t"
- "cmpl %%eax, %[size]\n\t"
- "jne .Lpopulate_tlb\n\t"
- "xorl %%eax, %%eax\n\t"
- "cpuid\n\t"
- /* Now fill the cache */
- "xorl %%eax, %%eax\n"
- ".Lfill_cache:\n"
- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
- "addl $64, %%eax\n\t"
- "cmpl %%eax, %[size]\n\t"
- "jne .Lfill_cache\n\t"
- "lfence\n"
- :: [flush_pages] "r" (vmx_l1d_flush_pages),
- [size] "r" (size)
- : "eax", "ebx", "ecx", "edx");
-}
-
void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -7050,10 +7101,19 @@ void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
if (to_vt(vcpu)->emulation_required)
return;
- if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT)
+ switch (vmx_get_exit_reason(vcpu).basic) {
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
- else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI)
+ break;
+ case EXIT_REASON_EXCEPTION_NMI:
handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
+ break;
+ case EXIT_REASON_MCE_DURING_VMENTRY:
+ kvm_machine_check();
+ break;
+ default:
+ break;
+ }
}
/*
@@ -7328,21 +7388,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
guest_state_enter_irqoff();
- /*
- * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW
- * mitigation for MDS is done late in VMentry and is still
- * executed in spite of L1D Flush. This is because an extra VERW
- * should not matter much after the big hammer L1D Flush.
- *
- * cpu_buf_vm_clear is used when system is not vulnerable to MDS/TAA,
- * and is affected by MMIO Stale Data. In such cases mitigation in only
- * needed against an MMIO capable guest.
- */
- if (static_branch_unlikely(&vmx_l1d_should_flush))
- vmx_l1d_flush(vcpu);
- else if (static_branch_unlikely(&cpu_buf_vm_clear) &&
- (flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO))
- x86_clear_cpu_buffers();
+ vmx_l1d_flush(vcpu);
vmx_disable_fb_clear(vmx);
@@ -7454,8 +7500,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
- kvm_load_guest_xsave_state(vcpu);
-
pt_guest_enter(vmx);
atomic_switch_perf_msrs(vmx);
@@ -7499,8 +7543,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
pt_guest_exit(vmx);
- kvm_load_host_xsave_state(vcpu);
-
if (is_guest_mode(vcpu)) {
/*
* Track VMLAUNCH/VMRESUME that have made past guest state
@@ -7516,9 +7558,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
if (unlikely(vmx->fail))
return EXIT_FASTPATH_NONE;
- if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
- kvm_machine_check();
-
trace_kvm_exit(vcpu, KVM_ISA_VMX);
if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry))
@@ -8679,16 +8718,6 @@ __init int vmx_hardware_setup(void)
return r;
}
-static void vmx_cleanup_l1d_flush(void)
-{
- if (vmx_l1d_flush_pages) {
- free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
- vmx_l1d_flush_pages = NULL;
- }
- /* Restore state so sysfs ignores VMX */
- l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
-}
-
void vmx_exit(void)
{
allow_smaller_maxphyaddr = false;
@@ -8724,14 +8753,8 @@ int __init vmx_init(void)
if (r)
return r;
- /*
- * Must be called after common x86 init so enable_ept is properly set
- * up. Hand the parameter mitigation value in which was stored in
- * the pre module init parser. If no parameter was given, it will
- * contain 'auto' which will be turned into the default 'cond'
- * mitigation mode.
- */
- r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+ /* Must be called after common x86 init so enable_ept is setup. */
+ r = vmx_setup_l1d_flush();
if (r)
goto err_l1d_flush;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index ea93121029f9..bc3ed3145d7e 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -369,7 +369,6 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
void ept_save_pdptrs(struct kvm_vcpu *vcpu);
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
@@ -681,7 +680,6 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
void free_vmcs(struct vmcs *vmcs);
int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
-void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs);
static inline struct vmcs *alloc_vmcs(bool shadow)
{
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 9697368d65b3..d09abeac2b56 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -73,7 +73,6 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
-void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val);
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val);
void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu);
void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg);
@@ -149,6 +148,7 @@ int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
+int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c9c2aa6f4705..0c6d899d53dd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -159,9 +159,6 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(report_ignored_msrs);
unsigned int min_timer_period_us = 200;
module_param(min_timer_period_us, uint, 0644);
-static bool __read_mostly kvmclock_periodic_sync = true;
-module_param(kvmclock_periodic_sync, bool, 0444);
-
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, 0644);
@@ -212,7 +209,7 @@ struct kvm_user_return_msrs {
u32 __read_mostly kvm_nr_uret_msrs;
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_nr_uret_msrs);
static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
-static struct kvm_user_return_msrs __percpu *user_return_msrs;
+static DEFINE_PER_CPU(struct kvm_user_return_msrs, user_return_msrs);
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
@@ -575,24 +572,26 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
vcpu->arch.apf.gfns[i] = ~0;
}
+static void kvm_destroy_user_return_msrs(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ WARN_ON_ONCE(per_cpu(user_return_msrs, cpu).registered);
+
+ kvm_nr_uret_msrs = 0;
+}
+
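This pairs with the conversion of user_return_msrs from a dynamically allocated per-CPU object to a statically defined one; the net change in access pattern, as visible elsewhere in this diff, is:

/*
 *   before: static struct kvm_user_return_msrs __percpu *user_return_msrs;
 *           msrs = this_cpu_ptr(user_return_msrs);     (alloc_percpu() at init)
 *
 *   after:  static DEFINE_PER_CPU(struct kvm_user_return_msrs, user_return_msrs);
 *           msrs = this_cpu_ptr(&user_return_msrs);    (no runtime allocation)
 */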
static void kvm_on_user_return(struct user_return_notifier *urn)
{
unsigned slot;
struct kvm_user_return_msrs *msrs
= container_of(urn, struct kvm_user_return_msrs, urn);
struct kvm_user_return_msr_values *values;
- unsigned long flags;
- /*
- * Disabling irqs at this point since the following code could be
- * interrupted and executed through kvm_arch_disable_virtualization_cpu()
- */
- local_irq_save(flags);
- if (msrs->registered) {
- msrs->registered = false;
- user_return_notifier_unregister(urn);
- }
- local_irq_restore(flags);
+ msrs->registered = false;
+ user_return_notifier_unregister(urn);
+
for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
values = &msrs->values[slot];
if (values->host != values->curr) {
@@ -643,7 +642,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_user_return_msr);
static void kvm_user_return_msr_cpu_online(void)
{
- struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
u64 value;
int i;
@@ -665,7 +664,7 @@ static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs)
int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
{
- struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
int err;
value = (value & mask) | (msrs->values[slot].host & ~mask);
@@ -681,24 +680,15 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr);
-void kvm_user_return_msr_update_cache(unsigned int slot, u64 value)
-{
- struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
-
- msrs->values[slot].curr = value;
- kvm_user_return_register_notifier(msrs);
-}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_user_return_msr_update_cache);
-
u64 kvm_get_user_return_msr(unsigned int slot)
{
- return this_cpu_ptr(user_return_msrs)->values[slot].curr;
+ return this_cpu_ptr(&user_return_msrs)->values[slot].curr;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_user_return_msr);
static void drop_user_return_notifiers(void)
{
- struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
if (msrs->registered)
kvm_on_user_return(&msrs->urn);
@@ -1045,6 +1035,13 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_require_dr);
+static bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
+{
+ u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
+
+ return (vcpu->arch.apf.msr_en_val & mask) == mask;
+}
+
static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
{
return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
@@ -1137,15 +1134,20 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
}
if ((cr0 ^ old_cr0) & X86_CR0_PG) {
- kvm_clear_async_pf_completion_queue(vcpu);
- kvm_async_pf_hash_reset(vcpu);
-
/*
* Clearing CR0.PG is defined to flush the TLB from the guest's
* perspective.
*/
if (!(cr0 & X86_CR0_PG))
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ /*
+ * Check for async #PF completion events when enabling paging,
+ * as the vCPU may have previously encountered async #PFs (it's
+ * entirely legal for the guest to toggle paging on/off without
+ * waiting for the async #PF queue to drain).
+ */
+ else if (kvm_pv_async_pf_enabled(vcpu))
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
}
if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
@@ -1203,20 +1205,27 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw);
-void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
+static void kvm_load_xfeatures(struct kvm_vcpu *vcpu, bool load_guest)
{
if (vcpu->arch.guest_state_protected)
return;
- if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE))
+ return;
- if (vcpu->arch.xcr0 != kvm_host.xcr0)
- xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
+ if (vcpu->arch.xcr0 != kvm_host.xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK,
+ load_guest ? vcpu->arch.xcr0 : kvm_host.xcr0);
- if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
- vcpu->arch.ia32_xss != kvm_host.xss)
- wrmsrq(MSR_IA32_XSS, vcpu->arch.ia32_xss);
- }
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
+ vcpu->arch.ia32_xss != kvm_host.xss)
+ wrmsrq(MSR_IA32_XSS, load_guest ? vcpu->arch.ia32_xss : kvm_host.xss);
+}
+
+static void kvm_load_guest_pkru(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.guest_state_protected)
+ return;
if (cpu_feature_enabled(X86_FEATURE_PKU) &&
vcpu->arch.pkru != vcpu->arch.host_pkru &&
@@ -1224,9 +1233,8 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)))
wrpkru(vcpu->arch.pkru);
}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_guest_xsave_state);
-void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
+static void kvm_load_host_pkru(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.guest_state_protected)
return;
@@ -1238,19 +1246,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
if (vcpu->arch.pkru != vcpu->arch.host_pkru)
wrpkru(vcpu->arch.host_pkru);
}
-
- if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
-
- if (vcpu->arch.xcr0 != kvm_host.xcr0)
- xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
-
- if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
- vcpu->arch.ia32_xss != kvm_host.xss)
- wrmsrq(MSR_IA32_XSS, kvm_host.xss);
- }
-
}
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_host_xsave_state);
#ifdef CONFIG_X86_64
static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
@@ -3505,27 +3501,17 @@ uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
/*
* kvmclock updates which are isolated to a given vcpu, such as
* vcpu->cpu migration, should not allow system_timestamp from
- * the rest of the vcpus to remain static. Otherwise ntp frequency
- * correction applies to one vcpu's system_timestamp but not
- * the others.
+ * the rest of the vcpus to remain static.
*
* So in those cases, request a kvmclock update for all vcpus.
- * We need to rate-limit these requests though, as they can
- * considerably slow guests that have a large number of vcpus.
- * The time for a remote vcpu to update its kvmclock is bound
- * by the delay we use to rate-limit the updates.
+	 * The worst-case delay for a remote vcpu to update its kvmclock is
+	 * then bounded by the maximum nohz sleep latency.
*/
-
-#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
-
-static void kvmclock_update_fn(struct work_struct *work)
+static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
{
unsigned long i;
- struct delayed_work *dwork = to_delayed_work(work);
- struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
- kvmclock_update_work);
- struct kvm *kvm = container_of(ka, struct kvm, arch);
struct kvm_vcpu *vcpu;
+ struct kvm *kvm = v->kvm;
kvm_for_each_vcpu(i, vcpu, kvm) {
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -3533,29 +3519,6 @@ static void kvmclock_update_fn(struct work_struct *work)
}
}
-static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
-{
- struct kvm *kvm = v->kvm;
-
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
- schedule_delayed_work(&kvm->arch.kvmclock_update_work,
- KVMCLOCK_UPDATE_DELAY);
-}
-
-#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
-
-static void kvmclock_sync_fn(struct work_struct *work)
-{
- struct delayed_work *dwork = to_delayed_work(work);
- struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
- kvmclock_sync_work);
- struct kvm *kvm = container_of(ka, struct kvm, arch);
-
- schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
- schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
- KVMCLOCK_SYNC_PERIOD);
-}
-
/* These helpers are safe iff @msr is known to be an MCx bank MSR. */
static bool is_mci_control_msr(u32 msr)
{
@@ -3650,13 +3613,6 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
-static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
-{
- u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
-
- return (vcpu->arch.apf.msr_en_val & mask) == mask;
-}
-
static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
{
gpa_t gpa = data & ~0x3f;
@@ -4182,7 +4138,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
return 1;
if (data & 0x1) {
- vcpu->arch.apf.pageready_pending = false;
+ /*
+ * Pairs with the smp_mb__after_atomic() in
+ * kvm_arch_async_page_present_queued().
+ */
+ smp_store_mb(vcpu->arch.apf.pageready_pending, false);
+
kvm_check_async_pf_completion(vcpu);
}
break;
@@ -5188,7 +5149,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- vcpu->arch.l1tf_flush_l1d = true;
+ kvm_request_l1tf_flush_l1d();
if (vcpu->scheduled_out && pmu->version && pmu->event_count) {
pmu->need_cleanup = true;
@@ -7239,6 +7200,19 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
return 0;
}
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+
+ if (ioctl == KVM_MEMORY_ENCRYPT_OP &&
+ kvm_x86_ops.vcpu_mem_enc_unlocked_ioctl)
+ return kvm_x86_call(vcpu_mem_enc_unlocked_ioctl)(vcpu, argp);
+
+ return -ENOIOCTLCMD;
+}
+
int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
struct kvm *kvm = filp->private_data;
@@ -7998,7 +7972,7 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
unsigned int bytes, struct x86_exception *exception)
{
/* kvm_write_guest_virt_system can pull in tons of pages. */
- vcpu->arch.l1tf_flush_l1d = true;
+ kvm_request_l1tf_flush_l1d();
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
PFERR_WRITE_MASK, exception);
@@ -8842,6 +8816,14 @@ static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt));
}
+static int emulator_get_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr)
+{
+ if (index != XCR_XFEATURE_ENABLED_MASK)
+ return 1;
+ *xcr = emul_to_vcpu(ctxt)->arch.xcr0;
+ return 0;
+}
+
static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
{
return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
@@ -8914,6 +8896,7 @@ static const struct x86_emulate_ops emulate_ops = {
.is_smm = emulator_is_smm,
.leave_smm = emulator_leave_smm,
.triple_fault = emulator_triple_fault,
+ .get_xcr = emulator_get_xcr,
.set_xcr = emulator_set_xcr,
.get_untagged_addr = emulator_get_untagged_addr,
.is_canonical_addr = emulator_is_canonical_addr,
@@ -9109,6 +9092,18 @@ void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_event_vectoring_exit);
+void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason)
+{
+ vcpu_unimpl(vcpu, "unexpected exit reason 0x%llx\n", exit_reason);
+
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = exit_reason;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_unexpected_reason_exit);
+
static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
{
struct kvm *kvm = vcpu->kvm;
@@ -9337,6 +9332,23 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
return false;
}
+static bool is_soft_int_instruction(struct x86_emulate_ctxt *ctxt,
+ int emulation_type)
+{
+ u8 vector = EMULTYPE_GET_SOFT_INT_VECTOR(emulation_type);
+
+ switch (ctxt->b) {
+	case 0xcc:		/* INT3 */
+		return vector == BP_VECTOR;
+	case 0xcd:		/* INT n, vector taken from the immediate */
+		return vector == ctxt->src.val;
+	case 0xce:		/* INTO */
+		return vector == OF_VECTOR;
+ default:
+ return false;
+ }
+}
+
/*
* Decode an instruction for emulation. The caller is responsible for handling
* code breakpoints. Note, manually detecting code breakpoints is unnecessary
@@ -9394,7 +9406,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
return handle_emulation_failure(vcpu, emulation_type);
}
- vcpu->arch.l1tf_flush_l1d = true;
+ kvm_request_l1tf_flush_l1d();
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
kvm_clear_exception_queue(vcpu);
@@ -9447,6 +9459,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* injecting single-step #DBs.
*/
if (emulation_type & EMULTYPE_SKIP) {
+ if (emulation_type & EMULTYPE_SKIP_SOFT_INT &&
+ !is_soft_int_instruction(ctxt, emulation_type))
+ return 0;
+
if (ctxt->mode != X86EMUL_MODE_PROT64)
ctxt->eip = (u32)ctxt->_eip;
else
@@ -10031,17 +10047,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
return -ENOMEM;
}
- user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
- if (!user_return_msrs) {
- pr_err("failed to allocate percpu kvm_user_return_msrs\n");
- r = -ENOMEM;
- goto out_free_x86_emulator_cache;
- }
- kvm_nr_uret_msrs = 0;
-
r = kvm_mmu_vendor_module_init();
if (r)
- goto out_free_percpu;
+ goto out_free_x86_emulator_cache;
kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM);
kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P;
@@ -10066,6 +10074,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
rdmsrq(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities);
+ WARN_ON_ONCE(kvm_nr_uret_msrs);
+
r = ops->hardware_setup();
if (r != 0)
goto out_mmu_exit;
@@ -10138,9 +10148,8 @@ out_unwind_ops:
kvm_x86_ops.enable_virtualization_cpu = NULL;
kvm_x86_call(hardware_unsetup)();
out_mmu_exit:
+ kvm_destroy_user_return_msrs();
kvm_mmu_vendor_module_exit();
-out_free_percpu:
- free_percpu(user_return_msrs);
out_free_x86_emulator_cache:
kmem_cache_destroy(x86_emulator_cache);
return r;
@@ -10168,8 +10177,8 @@ void kvm_x86_vendor_exit(void)
cancel_work_sync(&pvclock_gtod_work);
#endif
kvm_x86_call(hardware_unsetup)();
+ kvm_destroy_user_return_msrs();
kvm_mmu_vendor_module_exit();
- free_percpu(user_return_msrs);
kmem_cache_destroy(x86_emulator_cache);
#ifdef CONFIG_KVM_XEN
static_key_deferred_flush(&kvm_xen_enabled);
@@ -11291,6 +11300,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_fpu.xfd_err)
wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
+ kvm_load_xfeatures(vcpu, true);
+
if (unlikely(vcpu->arch.switch_db_regs &&
!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) {
set_debugreg(DR7_FIXED_1, 7);
@@ -11319,6 +11330,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
guest_timing_enter_irqoff();
+ /*
+ * Swap PKRU with hardware breakpoints disabled to minimize the number
+ * of flows where non-KVM code can run with guest state loaded.
+ */
+ kvm_load_guest_pkru(vcpu);
+
for (;;) {
/*
* Assert that vCPU vs. VM APICv state is consistent. An APICv
@@ -11347,6 +11364,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
++vcpu->stat.exits;
}
+ kvm_load_host_pkru(vcpu);
+
/*
* Do this here before restoring debug registers on the host. And
* since we do this before handling the vmexit, a DR access vmexit
@@ -11377,6 +11396,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
+ kvm_load_xfeatures(vcpu, false);
+
/*
* Sync xfd before calling handle_exit_irqoff() which may
* rely on the fact that guest_fpu::xfd is up-to-date (e.g.
@@ -12734,8 +12755,6 @@ fail_mmu_destroy:
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
- struct kvm *kvm = vcpu->kvm;
-
if (mutex_lock_killable(&vcpu->mutex))
return;
vcpu_load(vcpu);
@@ -12746,10 +12765,6 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
vcpu->arch.msr_kvm_poll_control = 1;
mutex_unlock(&vcpu->mutex);
-
- if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
- schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
- KVMCLOCK_SYNC_PERIOD);
}
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -13088,7 +13103,21 @@ int kvm_arch_enable_virtualization_cpu(void)
void kvm_arch_disable_virtualization_cpu(void)
{
kvm_x86_call(disable_virtualization_cpu)();
- drop_user_return_notifiers();
+
+ /*
+ * Leave the user-return notifiers as-is when disabling virtualization
+ * for reboot, i.e. when disabling via IPI function call, and instead
+ * pin kvm.ko (if it's a module) to defend against use-after-free (in
+ * the *very* unlikely scenario module unload is racing with reboot).
+ * On a forced reboot, tasks aren't frozen before shutdown, and so KVM
+ * could be actively modifying user-return MSR state when the IPI to
+ * disable virtualization arrives. Handle the extreme edge case here
+ * instead of trying to account for it in the normal flows.
+ */
+ if (in_task() || WARN_ON_ONCE(!kvm_rebooting))
+ drop_user_return_notifiers();
+ else
+ __module_get(THIS_MODULE);
}
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
@@ -13160,9 +13189,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.hv_root_tdp = INVALID_PAGE;
#endif
- INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
- INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
-
kvm_apicv_init(kvm);
kvm_hv_init_vm(kvm);
kvm_xen_init_vm(kvm);
@@ -13269,9 +13295,6 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
* is unsafe, i.e. will lead to use-after-free. The PIT also needs to
* be stopped before IRQ routing is freed.
*/
- cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
- cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
-
#ifdef CONFIG_KVM_IOAPIC
kvm_free_pit(kvm);
#endif
@@ -13888,7 +13911,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
if ((work->wakeup_all || work->notpresent_injected) &&
kvm_pv_async_pf_enabled(vcpu) &&
!apf_put_user_ready(vcpu, work->arch.token)) {
- vcpu->arch.apf.pageready_pending = true;
+ WRITE_ONCE(vcpu->arch.apf.pageready_pending, true);
kvm_apic_set_irq(vcpu, &irq, NULL);
}
@@ -13899,7 +13922,11 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
{
kvm_make_request(KVM_REQ_APF_READY, vcpu);
- if (!vcpu->arch.apf.pageready_pending)
+
+ /* Pairs with smp_store_mb() in kvm_set_msr_common(). */
+ smp_mb__after_atomic();
+
+ if (!READ_ONCE(vcpu->arch.apf.pageready_pending))
kvm_vcpu_kick(vcpu);
}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f3dc77f006f9..fdab0ad49098 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -420,6 +420,20 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
return !(kvm->arch.disabled_quirks & quirk);
}
+static __always_inline void kvm_request_l1tf_flush_l1d(void)
+{
+#if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL)
+ /*
+ * Use a raw write to set the per-CPU flag, as KVM will ensure a flush
+	 * even if preemption is currently enabled. If the current vCPU task
+ * is migrated to a different CPU (or userspace runs the vCPU on a
+ * different task) before the next VM-Entry, then kvm_arch_vcpu_load()
+ * will request a flush on the new CPU.
+ */
+ raw_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);
+#endif
+}
+
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
u64 get_kvmclock_ns(struct kvm *kvm);
@@ -622,8 +636,6 @@ static inline void kvm_machine_check(void)
#endif
}
-void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
-void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
int kvm_spec_ctrl_test_value(u64 value);
int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
struct x86_exception *e);