aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAlexander Potapenko <glider@google.com>2025-11-19 15:45:40 +0100
committerAlexander Potapenko <glider@google.com>2025-11-20 09:23:19 +0000
commit5c74d2f73618084fe35322dbb04bf713f7d177f2 (patch)
tree1c3b16cd8fa7e9a93adb462870551f6d99f04e72
parentec988b2fb6c4aada0b3afcd9739ad27ec64b8249 (diff)
executor: sys/linux: implement SYZOS_API_NESTED_VMRESUME
Provide the SYZOS API command to resume L2 execution after a VM exit, using VMRESUME on Intel and VMRUN on AMD. For testing purposes, implement basic handling of the INVD instruction: - enable INVD interception on AMD (set all bits in VMCB 00Ch); - map EXIT_REASON_INVD and VMEXIT_INVD into SYZOS_NESTED_EXIT_REASON_INVD; - advance L2 RIP to skip to the next instruction. While at it, perform minor refactorings of L2 exit reason handling. sys/linux/test/amd64-syz_kvm_nested_vmresume tests the new command by executing two instructions, INVD and HLT, in the nested VM.
-rw-r--r--executor/common_kvm_amd64_syzos.h77
-rw-r--r--executor/kvm.h2
-rw-r--r--sys/linux/dev_kvm_amd64.txt1
-rw-r--r--sys/linux/test/amd64-syz_kvm_nested_vmresume29
4 files changed, 90 insertions, 19 deletions
diff --git a/executor/common_kvm_amd64_syzos.h b/executor/common_kvm_amd64_syzos.h
index dc092409b..27f4be400 100644
--- a/executor/common_kvm_amd64_syzos.h
+++ b/executor/common_kvm_amd64_syzos.h
@@ -30,6 +30,7 @@ typedef enum {
SYZOS_API_NESTED_CREATE_VM = 301,
SYZOS_API_NESTED_LOAD_CODE = 302,
SYZOS_API_NESTED_VMLAUNCH = 303,
+ SYZOS_API_NESTED_VMRESUME = 304,
SYZOS_API_STOP, // Must be the last one
} syzos_api_id;
@@ -102,6 +103,7 @@ GUEST_CODE static void guest_handle_enable_nested(struct api_call_1* cmd, uint64
GUEST_CODE static void guest_handle_nested_create_vm(struct api_call_1* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_load_code(struct api_call_nested_load_code* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_vmlaunch(struct api_call_1* cmd, uint64 cpu_id);
+GUEST_CODE static void guest_handle_nested_vmresume(struct api_call_1* cmd, uint64 cpu_id);
typedef enum {
UEXIT_END = (uint64)-1,
@@ -208,6 +210,9 @@ guest_main(uint64 size, uint64 cpu)
} else if (call == SYZOS_API_NESTED_VMLAUNCH) {
// Launch the nested VM.
guest_handle_nested_vmlaunch((struct api_call_1*)cmd, cpu);
+ } else if (call == SYZOS_API_NESTED_VMRESUME) {
+ // Resume a nested VM.
+ guest_handle_nested_vmresume((struct api_call_1*)cmd, cpu);
}
addr += cmd->size;
size -= cmd->size;
@@ -733,6 +738,7 @@ GUEST_CODE static noinline void init_vmcs_control_fields(uint64 cpu_id, uint64 v
// Common L2 exit reasons for Intel and AMD.
typedef enum {
SYZOS_NESTED_EXIT_REASON_HLT = 1,
+ SYZOS_NESTED_EXIT_REASON_INVD = 2,
SYZOS_NESTED_EXIT_REASON_UNKNOWN = 0xFF,
} syz_nested_exit_reason;
@@ -748,22 +754,37 @@ GUEST_CODE static void guest_uexit_l2(uint64 exit_reason, syz_nested_exit_reason
}
}
-GUEST_CODE static syz_nested_exit_reason map_intel_exit_reason(uint64 reason)
+#define EXIT_REASON_HLT 0xc
+#define EXIT_REASON_INVD 0xd
+
+GUEST_CODE static syz_nested_exit_reason map_intel_exit_reason(uint64 basic_reason)
{
- volatile uint64 basic_reason = reason & 0xFFFF;
- // EXIT_REASON_HLT.
- if (basic_reason == 0xc)
+ // Disable optimizations.
+ volatile uint64 reason = basic_reason;
+ if (reason == EXIT_REASON_HLT)
return SYZOS_NESTED_EXIT_REASON_HLT;
+ if (reason == EXIT_REASON_INVD)
+ return SYZOS_NESTED_EXIT_REASON_INVD;
return SYZOS_NESTED_EXIT_REASON_UNKNOWN;
}
+GUEST_CODE static void advance_l2_rip_intel(uint64 basic_reason)
+{
+ if (basic_reason == EXIT_REASON_INVD) {
+ uint64 rip = vmread(VMCS_GUEST_RIP);
+ vmwrite(VMCS_GUEST_RIP, rip + 2);
+ }
+}
+
// This function is called from inline assembly.
__attribute__((used))
GUEST_CODE static void
nested_vm_exit_handler_intel(uint64 exit_reason, struct l2_guest_regs* regs)
{
- syz_nested_exit_reason mapped_reason = map_intel_exit_reason(exit_reason);
+ uint64 basic_reason = exit_reason & 0xFFFF;
+ syz_nested_exit_reason mapped_reason = map_intel_exit_reason(basic_reason);
guest_uexit_l2(exit_reason, mapped_reason, CPU_VENDOR_INTEL);
+ advance_l2_rip_intel(basic_reason);
}
extern char after_vmentry_label;
@@ -810,20 +831,36 @@ __attribute__((naked)) GUEST_CODE static void nested_vm_exit_handler_intel_asm(v
[vm_exit_reason] "i"(VMCS_VM_EXIT_REASON) : "memory", "cc", "rbx", "rdi", "rsi");
}
-GUEST_CODE static syz_nested_exit_reason map_amd_exit_reason(uint64 reason)
+#define VMEXIT_INVD 0x76
+#define VMEXIT_HLT 0x78
+
+GUEST_CODE static syz_nested_exit_reason map_amd_exit_reason(uint64 basic_reason)
{
- volatile uint64 basic_reason = reason & 0xFFFF;
- // #VMEXIT_HLT.
- if (basic_reason == 0x78)
+ // Disable optimizations.
+ volatile uint64 reason = basic_reason;
+ if (reason == VMEXIT_HLT)
return SYZOS_NESTED_EXIT_REASON_HLT;
+ if (reason == VMEXIT_INVD)
+ return SYZOS_NESTED_EXIT_REASON_INVD;
return SYZOS_NESTED_EXIT_REASON_UNKNOWN;
}
+GUEST_CODE static void advance_l2_rip_amd(uint64 basic_reason, uint64 cpu_id, uint64 vm_id)
+{
+ if (basic_reason == VMEXIT_INVD) {
+ uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
+ uint64 rip = vmcb_read64((volatile uint8*)vmcb_addr, VMCB_GUEST_RIP);
+ vmcb_write64(vmcb_addr, VMCB_GUEST_RIP, rip + 2);
+ }
+}
+
__attribute__((used)) GUEST_CODE static void
nested_vm_exit_handler_amd(uint64 exit_reason, uint64 cpu_id, uint64 vm_id)
{
- syz_nested_exit_reason mapped_reason = map_amd_exit_reason(exit_reason);
+ volatile uint64 basic_reason = exit_reason & 0xFFFF;
+ syz_nested_exit_reason mapped_reason = map_amd_exit_reason(basic_reason);
guest_uexit_l2(exit_reason, mapped_reason, CPU_VENDOR_AMD);
+ advance_l2_rip_amd(basic_reason, cpu_id, vm_id);
}
GUEST_CODE static noinline void init_vmcs_host_state(void)
@@ -1011,7 +1048,7 @@ GUEST_CODE static noinline void init_vmcb_guest_state(uint64 cpu_id, uint64 vm_i
vmcb_write32(vmcb_addr, VMCB_GUEST_IDTR_LIM, idtr.limit);
// Setup VMCB Control Fields.
- vmcb_write32(vmcb_addr, VMCB_CTRL_INTERCEPT_VEC3, VMCB_CTRL_INTERCEPT_HLT);
+ vmcb_write32(vmcb_addr, VMCB_CTRL_INTERCEPT_VEC3, VMCB_CTRL_INTERCEPT_VEC3_ALL);
vmcb_write32(vmcb_addr, VMCB_CTRL_INTERCEPT_VEC4, VMCB_CTRL_INTERCEPT_VEC4_ALL);
// Enable Nested Paging (NPT):
@@ -1078,9 +1115,8 @@ guest_handle_nested_load_code(struct api_call_nested_load_code* cmd, uint64 cpu_
}
GUEST_CODE static noinline void
-guest_handle_nested_vmentry_intel(struct api_call_1* cmd, uint64 cpu_id, bool is_launch)
+guest_handle_nested_vmentry_intel(uint64 vm_id, uint64 cpu_id, bool is_launch)
{
- uint64 vm_id = cmd->arg;
uint64 vmx_error_code = 0;
uint8 fail_flag = 0; // Will be 1 if EITHER CF or ZF is set
@@ -1149,19 +1185,24 @@ guest_run_amd_vm(uint64 cpu_id, uint64 vm_id)
}
GUEST_CODE static noinline void
-guest_handle_nested_vmlaunch_amd(struct api_call_1* cmd, uint64 cpu_id, uint64 vm_id)
+guest_handle_nested_vmlaunch(struct api_call_1* cmd, uint64 cpu_id)
{
- guest_run_amd_vm(cpu_id, vm_id);
+ uint64 vm_id = cmd->arg;
+ if (get_cpu_vendor() == CPU_VENDOR_INTEL) {
+ guest_handle_nested_vmentry_intel(vm_id, cpu_id, true);
+ } else {
+ guest_run_amd_vm(cpu_id, vm_id);
+ }
}
GUEST_CODE static noinline void
-guest_handle_nested_vmlaunch(struct api_call_1* cmd, uint64 cpu_id)
+guest_handle_nested_vmresume(struct api_call_1* cmd, uint64 cpu_id)
{
uint64 vm_id = cmd->arg;
if (get_cpu_vendor() == CPU_VENDOR_INTEL) {
- guest_handle_nested_vmentry_intel(cmd, cpu_id, true);
+ guest_handle_nested_vmentry_intel(vm_id, cpu_id, false);
} else {
- guest_handle_nested_vmlaunch_amd(cmd, cpu_id, vm_id);
+ guest_run_amd_vm(cpu_id, vm_id);
}
}
diff --git a/executor/kvm.h b/executor/kvm.h
index 53ba00888..d7c708b34 100644
--- a/executor/kvm.h
+++ b/executor/kvm.h
@@ -393,7 +393,7 @@
// Control Area
#define VMCB_CTRL_INTERCEPT_VEC3 0x0c
-#define VMCB_CTRL_INTERCEPT_HLT (1 << 24) // Bit 24 in VEC3
+#define VMCB_CTRL_INTERCEPT_VEC3_ALL (0xffffffff)
#define VMCB_CTRL_INTERCEPT_VEC4 0x10
// Bits 0-9: intercept VMRUN, VMMCALL, VMLOAD, VMSAVE, STGI, CLGI, SKINIT, RDTSCP, ICEBP, WBINVD.
#define VMCB_CTRL_INTERCEPT_VEC4_ALL (0x3ff)
diff --git a/sys/linux/dev_kvm_amd64.txt b/sys/linux/dev_kvm_amd64.txt
index cb116574d..d3dac2cf9 100644
--- a/sys/linux/dev_kvm_amd64.txt
+++ b/sys/linux/dev_kvm_amd64.txt
@@ -117,6 +117,7 @@ syzos_api_call$x86 [
nested_create_vm syzos_api$x86[301, syzos_api_vm_id]
nested_load_code syzos_api$x86[302, syzos_api_nested_load_code]
nested_vmlaunch syzos_api$x86[303, syzos_api_vm_id]
+ nested_vmresume syzos_api$x86[304, syzos_api_vm_id]
] [varlen]
kvm_text_x86 [
diff --git a/sys/linux/test/amd64-syz_kvm_nested_vmresume b/sys/linux/test/amd64-syz_kvm_nested_vmresume
new file mode 100644
index 000000000..f26b683bf
--- /dev/null
+++ b/sys/linux/test/amd64-syz_kvm_nested_vmresume
@@ -0,0 +1,29 @@
+#
+# requires: arch=amd64 -threaded
+#
+r0 = openat$kvm(0, &AUTO='/dev/kvm\x00', 0x0, 0x0)
+r1 = ioctl$KVM_CREATE_VM(r0, AUTO, 0x0)
+r2 = syz_kvm_setup_syzos_vm$x86(r1, &(0x7f0000c00000/0x400000)=nil)
+
+# Create a nested VM that performs INVD (0f 08) and HLT (f4) to test vmresume.
+# INVD is one of the few instructions that cause unconditional VM exit on Intel.
+# On AMD, SYZOS also turns on INVD interception.
+#
+r3 = syz_kvm_add_vcpu$x86(r2, &AUTO={0x0, &AUTO=[@enable_nested={AUTO, AUTO, 0x0}, @nested_create_vm={AUTO, AUTO, 0x0}, @nested_load_code={AUTO, AUTO, {0x0, "0f08f4"}}, @nested_vmlaunch={AUTO, AUTO, 0x0}, @nested_vmresume={AUTO, AUTO, 0x0}], AUTO})
+r4 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, AUTO)
+r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, r4, 0x3, 0x1, r3, 0x0)
+
+# L2 VM executes INVD. Exit reason is mapped to 0xe2e20002.
+#
+ioctl$KVM_RUN(r3, AUTO, 0x0)
+syz_kvm_assert_syzos_uexit$x86(r5, 0xe2e20002)
+
+# L1 resumes L2, which executes HLT. Exit reason is mapped to 0xe2e20001.
+#
+ioctl$KVM_RUN(r3, AUTO, 0x0)
+syz_kvm_assert_syzos_uexit$x86(r5, 0xe2e20001)
+
+# guest_main should finish with guest_uexit(-1).
+#
+ioctl$KVM_RUN(r3, AUTO, 0x0)
+syz_kvm_assert_syzos_uexit$x86(r5, 0xffffffff)