From 5c74d2f73618084fe35322dbb04bf713f7d177f2 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 19 Nov 2025 15:45:40 +0100 Subject: executor: sys/linux: implement SYZOS_API_NESTED_VMRESUME Provide the SYZOS API command to resume L2 execution after a VM exit, using VMRESUME on Intel and VMRUN on AMD. For testing purposes, implement basic handling of the INVD instruction: - enable INVD interception on AMD (set all bits in VMCB 00Ch); - map EXIT_REASON_INVD and VMEXIT_INVD into SYZOS_NESTED_EXIT_REASON_INVD; - advance L2 RIP to skip to the next instruction. While at it, perform minor refactorings of L2 exit reason handling. sys/linux/test/amd64-syz_kvm_nested_vmresume tests the new command by executing two instructions, INVD and HLT, in the nested VM. --- executor/common_kvm_amd64_syzos.h | 77 +++++++++++++++++++++------- executor/kvm.h | 2 +- sys/linux/dev_kvm_amd64.txt | 1 + sys/linux/test/amd64-syz_kvm_nested_vmresume | 29 +++++++++++ 4 files changed, 90 insertions(+), 19 deletions(-) create mode 100644 sys/linux/test/amd64-syz_kvm_nested_vmresume diff --git a/executor/common_kvm_amd64_syzos.h b/executor/common_kvm_amd64_syzos.h index dc092409b..27f4be400 100644 --- a/executor/common_kvm_amd64_syzos.h +++ b/executor/common_kvm_amd64_syzos.h @@ -30,6 +30,7 @@ typedef enum { SYZOS_API_NESTED_CREATE_VM = 301, SYZOS_API_NESTED_LOAD_CODE = 302, SYZOS_API_NESTED_VMLAUNCH = 303, + SYZOS_API_NESTED_VMRESUME = 304, SYZOS_API_STOP, // Must be the last one } syzos_api_id; @@ -102,6 +103,7 @@ GUEST_CODE static void guest_handle_enable_nested(struct api_call_1* cmd, uint64 GUEST_CODE static void guest_handle_nested_create_vm(struct api_call_1* cmd, uint64 cpu_id); GUEST_CODE static void guest_handle_nested_load_code(struct api_call_nested_load_code* cmd, uint64 cpu_id); GUEST_CODE static void guest_handle_nested_vmlaunch(struct api_call_1* cmd, uint64 cpu_id); +GUEST_CODE static void guest_handle_nested_vmresume(struct api_call_1* cmd, uint64 cpu_id); typedef enum { 
UEXIT_END = (uint64)-1, @@ -208,6 +210,9 @@ guest_main(uint64 size, uint64 cpu) } else if (call == SYZOS_API_NESTED_VMLAUNCH) { // Launch the nested VM. guest_handle_nested_vmlaunch((struct api_call_1*)cmd, cpu); + } else if (call == SYZOS_API_NESTED_VMRESUME) { + // Resume a nested VM. + guest_handle_nested_vmresume((struct api_call_1*)cmd, cpu); } addr += cmd->size; size -= cmd->size; @@ -733,6 +738,7 @@ GUEST_CODE static noinline void init_vmcs_control_fields(uint64 cpu_id, uint64 v // Common L2 exit reasons for Intel and AMD. typedef enum { SYZOS_NESTED_EXIT_REASON_HLT = 1, + SYZOS_NESTED_EXIT_REASON_INVD = 2, SYZOS_NESTED_EXIT_REASON_UNKNOWN = 0xFF, } syz_nested_exit_reason; @@ -748,22 +754,37 @@ GUEST_CODE static void guest_uexit_l2(uint64 exit_reason, syz_nested_exit_reason } } -GUEST_CODE static syz_nested_exit_reason map_intel_exit_reason(uint64 reason) +#define EXIT_REASON_HLT 0xc +#define EXIT_REASON_INVD 0xd + +GUEST_CODE static syz_nested_exit_reason map_intel_exit_reason(uint64 basic_reason) { - volatile uint64 basic_reason = reason & 0xFFFF; - // EXIT_REASON_HLT. - if (basic_reason == 0xc) + // Disable optimizations. + volatile uint64 reason = basic_reason; + if (reason == EXIT_REASON_HLT) return SYZOS_NESTED_EXIT_REASON_HLT; + if (reason == EXIT_REASON_INVD) + return SYZOS_NESTED_EXIT_REASON_INVD; return SYZOS_NESTED_EXIT_REASON_UNKNOWN; } +GUEST_CODE static void advance_l2_rip_intel(uint64 basic_reason) +{ + if (basic_reason == EXIT_REASON_INVD) { + uint64 rip = vmread(VMCS_GUEST_RIP); + vmwrite(VMCS_GUEST_RIP, rip + 2); + } +} + // This function is called from inline assembly. 
__attribute__((used)) GUEST_CODE static void nested_vm_exit_handler_intel(uint64 exit_reason, struct l2_guest_regs* regs) { - syz_nested_exit_reason mapped_reason = map_intel_exit_reason(exit_reason); + uint64 basic_reason = exit_reason & 0xFFFF; + syz_nested_exit_reason mapped_reason = map_intel_exit_reason(basic_reason); guest_uexit_l2(exit_reason, mapped_reason, CPU_VENDOR_INTEL); + advance_l2_rip_intel(basic_reason); } extern char after_vmentry_label; @@ -810,20 +831,36 @@ __attribute__((naked)) GUEST_CODE static void nested_vm_exit_handler_intel_asm(v [vm_exit_reason] "i"(VMCS_VM_EXIT_REASON) : "memory", "cc", "rbx", "rdi", "rsi"); } -GUEST_CODE static syz_nested_exit_reason map_amd_exit_reason(uint64 reason) +#define VMEXIT_INVD 0x76 +#define VMEXIT_HLT 0x78 + +GUEST_CODE static syz_nested_exit_reason map_amd_exit_reason(uint64 basic_reason) { - volatile uint64 basic_reason = reason & 0xFFFF; - // #VMEXIT_HLT. - if (basic_reason == 0x78) + // Disable optimizations. + volatile uint64 reason = basic_reason; + if (reason == VMEXIT_HLT) return SYZOS_NESTED_EXIT_REASON_HLT; + if (reason == VMEXIT_INVD) + return SYZOS_NESTED_EXIT_REASON_INVD; return SYZOS_NESTED_EXIT_REASON_UNKNOWN; } +GUEST_CODE static void advance_l2_rip_amd(uint64 basic_reason, uint64 cpu_id, uint64 vm_id) +{ + if (basic_reason == VMEXIT_INVD) { + uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id); + uint64 rip = vmcb_read64((volatile uint8*)vmcb_addr, VMCB_GUEST_RIP); + vmcb_write64(vmcb_addr, VMCB_GUEST_RIP, rip + 2); + } +} + __attribute__((used)) GUEST_CODE static void nested_vm_exit_handler_amd(uint64 exit_reason, uint64 cpu_id, uint64 vm_id) { - syz_nested_exit_reason mapped_reason = map_amd_exit_reason(exit_reason); + volatile uint64 basic_reason = exit_reason & 0xFFFF; + syz_nested_exit_reason mapped_reason = map_amd_exit_reason(basic_reason); guest_uexit_l2(exit_reason, mapped_reason, CPU_VENDOR_AMD); + advance_l2_rip_amd(basic_reason, cpu_id, vm_id); } GUEST_CODE static 
noinline void init_vmcs_host_state(void) @@ -1011,7 +1048,7 @@ GUEST_CODE static noinline void init_vmcb_guest_state(uint64 cpu_id, uint64 vm_i vmcb_write32(vmcb_addr, VMCB_GUEST_IDTR_LIM, idtr.limit); // Setup VMCB Control Fields. - vmcb_write32(vmcb_addr, VMCB_CTRL_INTERCEPT_VEC3, VMCB_CTRL_INTERCEPT_HLT); + vmcb_write32(vmcb_addr, VMCB_CTRL_INTERCEPT_VEC3, VMCB_CTRL_INTERCEPT_VEC3_ALL); vmcb_write32(vmcb_addr, VMCB_CTRL_INTERCEPT_VEC4, VMCB_CTRL_INTERCEPT_VEC4_ALL); // Enable Nested Paging (NPT): @@ -1078,9 +1115,8 @@ guest_handle_nested_load_code(struct api_call_nested_load_code* cmd, uint64 cpu_ } GUEST_CODE static noinline void -guest_handle_nested_vmentry_intel(struct api_call_1* cmd, uint64 cpu_id, bool is_launch) +guest_handle_nested_vmentry_intel(uint64 vm_id, uint64 cpu_id, bool is_launch) { - uint64 vm_id = cmd->arg; uint64 vmx_error_code = 0; uint8 fail_flag = 0; // Will be 1 if EITHER CF or ZF is set @@ -1149,19 +1185,24 @@ guest_run_amd_vm(uint64 cpu_id, uint64 vm_id) } GUEST_CODE static noinline void -guest_handle_nested_vmlaunch_amd(struct api_call_1* cmd, uint64 cpu_id, uint64 vm_id) +guest_handle_nested_vmlaunch(struct api_call_1* cmd, uint64 cpu_id) { - guest_run_amd_vm(cpu_id, vm_id); + uint64 vm_id = cmd->arg; + if (get_cpu_vendor() == CPU_VENDOR_INTEL) { + guest_handle_nested_vmentry_intel(vm_id, cpu_id, true); + } else { + guest_run_amd_vm(cpu_id, vm_id); + } } GUEST_CODE static noinline void -guest_handle_nested_vmlaunch(struct api_call_1* cmd, uint64 cpu_id) +guest_handle_nested_vmresume(struct api_call_1* cmd, uint64 cpu_id) { uint64 vm_id = cmd->arg; if (get_cpu_vendor() == CPU_VENDOR_INTEL) { - guest_handle_nested_vmentry_intel(cmd, cpu_id, true); + guest_handle_nested_vmentry_intel(vm_id, cpu_id, false); } else { - guest_handle_nested_vmlaunch_amd(cmd, cpu_id, vm_id); + guest_run_amd_vm(cpu_id, vm_id); } } diff --git a/executor/kvm.h b/executor/kvm.h index 53ba00888..d7c708b34 100644 --- a/executor/kvm.h +++ b/executor/kvm.h @@ -393,7 
+393,7 @@ // Control Area #define VMCB_CTRL_INTERCEPT_VEC3 0x0c -#define VMCB_CTRL_INTERCEPT_HLT (1 << 24) // Bit 24 in VEC3 +#define VMCB_CTRL_INTERCEPT_VEC3_ALL (0xffffffff) #define VMCB_CTRL_INTERCEPT_VEC4 0x10 // Bits 0-9: intercept VMRUN, VMMCALL, VMLOAD, VMSAVE, STGI, CLGI, SKINIT, RDTSCP, ICEBP, WBINVD. #define VMCB_CTRL_INTERCEPT_VEC4_ALL (0x3ff) diff --git a/sys/linux/dev_kvm_amd64.txt b/sys/linux/dev_kvm_amd64.txt index cb116574d..d3dac2cf9 100644 --- a/sys/linux/dev_kvm_amd64.txt +++ b/sys/linux/dev_kvm_amd64.txt @@ -117,6 +117,7 @@ syzos_api_call$x86 [ nested_create_vm syzos_api$x86[301, syzos_api_vm_id] nested_load_code syzos_api$x86[302, syzos_api_nested_load_code] nested_vmlaunch syzos_api$x86[303, syzos_api_vm_id] + nested_vmresume syzos_api$x86[304, syzos_api_vm_id] ] [varlen] kvm_text_x86 [ diff --git a/sys/linux/test/amd64-syz_kvm_nested_vmresume b/sys/linux/test/amd64-syz_kvm_nested_vmresume new file mode 100644 index 000000000..f26b683bf --- /dev/null +++ b/sys/linux/test/amd64-syz_kvm_nested_vmresume @@ -0,0 +1,29 @@ +# +# requires: arch=amd64 -threaded +# +r0 = openat$kvm(0, &AUTO='/dev/kvm\x00', 0x0, 0x0) +r1 = ioctl$KVM_CREATE_VM(r0, AUTO, 0x0) +r2 = syz_kvm_setup_syzos_vm$x86(r1, &(0x7f0000c00000/0x400000)=nil) + +# Create a nested VM that performs INVD (0f 08) and HLT (f4) to test vmresume. +# INVD is one of the few instructions that cause unconditional VM exit on Intel. +# On AMD, SYZOS also turns on INVD interception. +# +r3 = syz_kvm_add_vcpu$x86(r2, &AUTO={0x0, &AUTO=[@enable_nested={AUTO, AUTO, 0x0}, @nested_create_vm={AUTO, AUTO, 0x0}, @nested_load_code={AUTO, AUTO, {0x0, "0f08f4"}}, @nested_vmlaunch={AUTO, AUTO, 0x0}, @nested_vmresume={AUTO, AUTO, 0x0}], AUTO}) +r4 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, AUTO) +r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, r4, 0x3, 0x1, r3, 0x0) + +# L2 VM executes INVD. Exit reason is mapped to 0xe2e20002. 
+# +ioctl$KVM_RUN(r3, AUTO, 0x0) +syz_kvm_assert_syzos_uexit$x86(r5, 0xe2e20002) + +# L1 resumes L2, which executes HLT. Exit reason is mapped to 0xe2e20001. +# +ioctl$KVM_RUN(r3, AUTO, 0x0) +syz_kvm_assert_syzos_uexit$x86(r5, 0xe2e20001) + +# guest_main should finish with guest_uexit(-1). +# +ioctl$KVM_RUN(r3, AUTO, 0x0) +syz_kvm_assert_syzos_uexit$x86(r5, 0xffffffff) -- cgit mrf-deployment