diff options
Diffstat (limited to 'executor')
| -rw-r--r-- | executor/common_kvm_amd64.h | 25 | ||||
| -rw-r--r-- | executor/common_kvm_amd64_syzos.h | 540 | ||||
| -rw-r--r-- | executor/kvm.h | 8 |
3 files changed, 467 insertions, 106 deletions
diff --git a/executor/common_kvm_amd64.h b/executor/common_kvm_amd64.h index 474bc875b..c5d7c6983 100644 --- a/executor/common_kvm_amd64.h +++ b/executor/common_kvm_amd64.h @@ -206,6 +206,7 @@ static void setup_64bit_idt(struct kvm_sregs* sregs, char* host_mem, uintptr_t g #endif #if SYZ_EXECUTOR || __NR_syz_kvm_setup_syzos_vm || __NR_syz_kvm_add_vcpu + // SYZOS guest virtual memory layout (must be in sync with executor/kvm.h): static const struct mem_region syzos_mem_regions[] = { // AMD64 fixed data structures (5 pages: Zero, GDT, PML4, PDP, PD). @@ -250,6 +251,7 @@ struct kvm_syz_vm { void* user_text; void* gpa0_mem; void* pt_pool_mem; + void* globals_mem; }; #endif @@ -1100,19 +1102,16 @@ static volatile long syz_kvm_setup_cpu(volatile long a0, volatile long a1, volat #define RFLAGS_1_BIT (1ULL << 1) #define RFLAGS_IF_BIT (1ULL << 9) -static void reset_cpu_regs(int cpufd, int cpu_id, size_t text_size) +static void reset_cpu_regs(int cpufd, uint64 rip, uint64 cpu_id) { struct kvm_regs regs; memset(&regs, 0, sizeof(regs)); // RFLAGS.1 must be 1, RFLAGS.IF enables interrupts. regs.rflags |= RFLAGS_1_BIT | RFLAGS_IF_BIT; - // PC points to the relative offset of guest_main() within the guest code. - regs.rip = executor_fn_guest_addr(guest_main); + regs.rip = rip; regs.rsp = X86_SYZOS_ADDR_STACK0; - // Pass parameters to guest_main(). - regs.rdi = text_size; - regs.rsi = cpu_id; + regs.rdi = cpu_id; ioctl(cpufd, KVM_SET_REGS, &regs); } @@ -1126,7 +1125,15 @@ static void install_user_code(struct kvm_syz_vm* vm, int cpufd, int cpu_id, cons memcpy(target, text, text_size); setup_gdt_ldt_pg(vm, cpufd, cpu_id); setup_cpuid(cpufd); - reset_cpu_regs(cpufd, cpu_id, text_size); + + uint64 entry_rip = executor_fn_guest_addr(guest_main); + reset_cpu_regs(cpufd, entry_rip, cpu_id); + + // Pass the text size via the shared globals page. 
+ if (vm->globals_mem) { + struct syzos_globals* globals = (struct syzos_globals*)vm->globals_mem; + globals->text_sizes[cpu_id] = text_size; + } } #endif @@ -1196,6 +1203,8 @@ static void setup_vm(int vmfd, struct kvm_syz_vm* vm) vm->gpa0_mem = next.addr; if (r->gpa == X86_SYZOS_ADDR_PT_POOL) vm->pt_pool_mem = next.addr; + if (r->gpa == X86_SYZOS_ADDR_GLOBALS) + vm->globals_mem = next.addr; if (r->gpa == X86_SYZOS_ADDR_BOOT_ARGS) { boot_args = (struct syzos_boot_args*)next.addr; @@ -1326,4 +1335,4 @@ static long syz_kvm_assert_syzos_uexit(volatile long a0, volatile long a1, volat } #endif -#endif // EXECUTOR_COMMON_KVM_AMD64_H
\ No newline at end of file +#endif // EXECUTOR_COMMON_KVM_AMD64_H diff --git a/executor/common_kvm_amd64_syzos.h b/executor/common_kvm_amd64_syzos.h index 62c021766..4b8d08e2a 100644 --- a/executor/common_kvm_amd64_syzos.h +++ b/executor/common_kvm_amd64_syzos.h @@ -32,6 +32,7 @@ typedef enum { SYZOS_API_NESTED_LOAD_CODE = 302, SYZOS_API_NESTED_VMLAUNCH = 303, SYZOS_API_NESTED_VMRESUME = 304, + SYZOS_API_NESTED_LOAD_SYZOS = 310, SYZOS_API_NESTED_INTEL_VMWRITE_MASK = 340, SYZOS_API_NESTED_AMD_VMCB_WRITE_MASK = 380, SYZOS_API_NESTED_AMD_INVLPGA = 381, @@ -65,6 +66,13 @@ struct api_call_nested_load_code { uint8 insns[]; }; +struct api_call_nested_load_syzos { + struct api_call_header header; + uint64 vm_id; + uint64 unused_pages; + uint8 program[]; +}; + struct api_call_cpuid { struct api_call_header header; uint32 eax; @@ -121,6 +129,9 @@ struct syzos_boot_args { struct syzos_globals { uint64 alloc_offset; uint64 total_size; + uint64 text_sizes[KVM_MAX_VCPU]; + struct l2_guest_regs l2_ctx[KVM_MAX_VCPU][KVM_MAX_L2_VMS]; + uint64 active_vm_id[KVM_MAX_VCPU]; }; #ifdef __cplusplus @@ -128,6 +139,7 @@ extern "C" { #endif GUEST_CODE static void guest_uexit(uint64 exit_code); GUEST_CODE static void nested_vm_exit_handler_intel(uint64 exit_reason, struct l2_guest_regs* regs); +GUEST_CODE static void nested_vm_exit_handler_amd(uint64 exit_reason, struct l2_guest_regs* regs); #ifdef __cplusplus } #endif @@ -143,6 +155,7 @@ GUEST_CODE static void guest_handle_set_irq_handler(struct api_call_2* cmd); GUEST_CODE static void guest_handle_enable_nested(struct api_call_1* cmd, uint64 cpu_id); GUEST_CODE static void guest_handle_nested_create_vm(struct api_call_1* cmd, uint64 cpu_id); GUEST_CODE static void guest_handle_nested_load_code(struct api_call_nested_load_code* cmd, uint64 cpu_id); +GUEST_CODE static void guest_handle_nested_load_syzos(struct api_call_nested_load_syzos* cmd, uint64 cpu_id); GUEST_CODE static void guest_handle_nested_vmlaunch(struct api_call_1* cmd, uint64 
cpu_id); GUEST_CODE static void guest_handle_nested_vmresume(struct api_call_1* cmd, uint64 cpu_id); GUEST_CODE static void guest_handle_nested_intel_vmwrite_mask(struct api_call_5* cmd, uint64 cpu_id); @@ -202,8 +215,10 @@ __attribute__((naked)) GUEST_CODE static void uexit_irq_handler() // We add single-line comments to justify having the compound statements below. __attribute__((used)) GUEST_CODE static void -guest_main(uint64 size, uint64 cpu) +guest_main(uint64 cpu) { + volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS; + uint64 size = globals->text_sizes[cpu]; uint64 addr = X86_SYZOS_ADDR_USER_CODE + cpu * KVM_PAGE_SIZE; while (size >= sizeof(struct api_call_header)) { @@ -257,6 +272,9 @@ guest_main(uint64 size, uint64 cpu) } else if (call == SYZOS_API_NESTED_LOAD_CODE) { // Load code into the nested VM. guest_handle_nested_load_code((struct api_call_nested_load_code*)cmd, cpu); + } else if (call == SYZOS_API_NESTED_LOAD_SYZOS) { + // Load SYZOS into the nested VM. + guest_handle_nested_load_syzos((struct api_call_nested_load_syzos*)cmd, cpu); } else if (call == SYZOS_API_NESTED_VMLAUNCH) { // Launch the nested VM. guest_handle_nested_vmlaunch((struct api_call_1*)cmd, cpu); @@ -313,8 +331,11 @@ __attribute__((used)) GUEST_CODE static noinline void guest_uexit(uint64 exit_code) { + // Force exit_code into RAX using inline asm constraints ("a"). + // We write to X86_SYZOS_ADDR_UEXIT (0x40100). + // This allows the L1 hypervisor to reliably read RAX during an EPT violation. 
volatile uint64* ptr = (volatile uint64*)X86_SYZOS_ADDR_UEXIT; - *ptr = exit_code; + asm volatile("movq %0, (%1)" ::"a"(exit_code), "r"(ptr) : "memory"); } GUEST_CODE static noinline void guest_handle_cpuid(uint32 eax, uint32 ecx) @@ -773,15 +794,10 @@ GUEST_CODE static void l2_map_page(uint64 cpu_id, uint64 vm_id, uint64 gpa, uint pt[pt_idx] = (host_pa & ~0xFFF) | flags; } -GUEST_CODE static noinline void setup_l2_page_tables(cpu_vendor_id vendor, uint64 cpu_id, uint64 vm_id) +GUEST_CODE static noinline void setup_l2_page_tables(cpu_vendor_id vendor, uint64 cpu_id, uint64 vm_id, uint64 unused_pages) { - // The Root PML4 remains at the fixed address assigned to this VM. - uint64 l2_pml4_addr = X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id); - - // Clear the root table. - guest_memset((void*)l2_pml4_addr, 0, KVM_PAGE_SIZE); - guest_memset((void*)X86_SYZOS_ADDR_MSR_BITMAP(cpu_id, vm_id), 0, KVM_PAGE_SIZE); - + // Note: PML4 and MSR Bitmap must be zeroed by the caller (nested_create_vm) + // so that this function can be called additively by nested_load_syzos. // Intel EPT: set Read, Write, Execute. // AMD NPT: set Present, Write, User. uint64 flags = X86_PDE64_PRESENT | X86_PDE64_RW | X86_PDE64_USER; @@ -799,10 +815,19 @@ GUEST_CODE static noinline void setup_l2_page_tables(cpu_vendor_id vendor, uint6 r.pages = args->regions[i].pages; r.flags = args->regions[i].flags; - // Skip the huge unused heap for now, map fixed small heap if needed or handled by guest_alloc. - if (r.flags & MEM_REGION_FLAG_REMAINING) + // Skip NO_HOST_MEM regions (like the Exit/UEXIT region). + // This ensures that L2 accesses to these pages cause a nested page fault + // (EPT Violation / NPT Fault), allowing L1 to intercept and modify the exit code. + if (r.flags & MEM_REGION_FLAG_NO_HOST_MEM) continue; + // Skip the huge unused heap for now, map fixed small heap if needed or handled by guest_alloc. + // If unused_pages > 0, we map that many pages from the unused region. 
+ if (r.flags & MEM_REGION_FLAG_REMAINING) { + // Map at least a few pages for the allocator overhead if 0 is passed. + r.pages = (unused_pages < 16) ? 16 : unused_pages; + } + for (int p = 0; p < r.pages; p++) { uint64 gpa = r.gpa + (p * KVM_PAGE_SIZE); uint64 backing; @@ -814,9 +839,9 @@ GUEST_CODE static noinline void setup_l2_page_tables(cpu_vendor_id vendor, uint6 // Map stack to the VM's dedicated stack buffer backing = X86_SYZOS_ADDR_VM_STACK(cpu_id, vm_id); } else if (r.gpa == X86_SYZOS_ADDR_ZERO || - r.gpa == X86_SYZOS_ADDR_VAR_IDT || - r.gpa == X86_SYZOS_ADDR_BOOT_ARGS || - r.gpa == X86_SYZOS_ADDR_PT_POOL) { + r.gpa == X86_SYZOS_ADDR_VAR_IDT || + r.gpa == X86_SYZOS_ADDR_BOOT_ARGS || + r.gpa == X86_SYZOS_ADDR_PT_POOL) { // Critical System Regions: Allocate and COPY from L1. // We must copy the PT POOL because the PD entries in ADDR_ZERO // point to tables allocated here. If we don't copy, L2 sees @@ -824,6 +849,9 @@ GUEST_CODE static noinline void setup_l2_page_tables(cpu_vendor_id vendor, uint6 // GDT/IDT/BootArgs are also copied for valid environment. backing = guest_alloc_page(); guest_memcpy((void*)backing, (void*)gpa, KVM_PAGE_SIZE); + } else if (r.flags & MEM_REGION_FLAG_EXECUTOR_CODE) { + // Identity map the Executor Code. + backing = gpa; } else { // Allocate new backing memory backing = guest_alloc_page(); @@ -906,9 +934,21 @@ typedef enum { SYZOS_NESTED_EXIT_REASON_CPUID = 3, SYZOS_NESTED_EXIT_REASON_RDTSC = 4, SYZOS_NESTED_EXIT_REASON_RDTSCP = 5, + SYZOS_NESTED_EXIT_REASON_EPT_VIOLATION = 6, SYZOS_NESTED_EXIT_REASON_UNKNOWN = 0xFF, } syz_nested_exit_reason; +GUEST_CODE static void handle_nested_uexit(uint64 exit_code) +{ + // Increment the nesting level (top byte). + uint64 level = (exit_code >> 56) + 1; + exit_code = (exit_code & 0x00FFFFFFFFFFFFFFULL) | (level << 56); + + // Perform L1 uexit with the modified code. + guest_uexit(exit_code); + // guest_uexit terminates, so we don't return. 
+} + GUEST_CODE static void guest_uexit_l2(uint64 exit_reason, syz_nested_exit_reason mapped_reason, cpu_vendor_id vendor) { @@ -924,6 +964,7 @@ GUEST_CODE static void guest_uexit_l2(uint64 exit_reason, syz_nested_exit_reason #define EXIT_REASON_CPUID 0xa #define EXIT_REASON_HLT 0xc #define EXIT_REASON_INVD 0xd +#define EXIT_REASON_EPT_VIOLATION 0x30 #define EXIT_REASON_RDTSC 0x10 #define EXIT_REASON_RDTSCP 0x33 @@ -941,6 +982,8 @@ GUEST_CODE static syz_nested_exit_reason map_intel_exit_reason(uint64 basic_reas return SYZOS_NESTED_EXIT_REASON_RDTSC; if (reason == EXIT_REASON_RDTSCP) return SYZOS_NESTED_EXIT_REASON_RDTSCP; + if (reason == EXIT_REASON_EPT_VIOLATION) + return SYZOS_NESTED_EXIT_REASON_EPT_VIOLATION; return SYZOS_NESTED_EXIT_REASON_UNKNOWN; } @@ -964,7 +1007,36 @@ __attribute__((used)) GUEST_CODE static void nested_vm_exit_handler_intel(uint64 exit_reason, struct l2_guest_regs* regs) { + volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS; + // Recover cpu_id from the stack. It was pushed before L1 registers. + // Stack: [cpu_id] [launch] [L1 GPRs x6] [L2 GPRs x15] + // Index: 22 21 15..20 0..14 + // regs points to the start of L2 GPRs. + uint64 cpu_id = *(uint64*)((char*)regs + sizeof(struct l2_guest_regs) + 7 * 8); + uint64 vm_id = globals->active_vm_id[cpu_id]; + + // Persist L2 registers. + guest_memcpy((void*)&globals->l2_ctx[cpu_id][vm_id], regs, sizeof(struct l2_guest_regs)); + uint64 basic_reason = exit_reason & 0xFFFF; + + // Handle EPT Violation (Nested UEXIT). + if (basic_reason == EXIT_REASON_EPT_VIOLATION) { + uint64 gpa = vmread(VMCS_GUEST_PHYSICAL_ADDRESS); + // Only handle violations on the specific UEXIT page. + if ((gpa & ~0xFFF) == X86_SYZOS_ADDR_EXIT) { + // This is a uexit from L2. + // We enforced usage of RAX in guest_uexit. + // Read RAX from the saved L2 guest registers. + // Note: On Intel exit, guest registers are NOT saved to VMCS. + // They are saved to 'regs' by our asm wrapper. 
+ handle_nested_uexit(regs->rax); + // Advance L2 RIP by 3 bytes (movq %rax, (%rdx) is 3 bytes). + vmwrite(VMCS_GUEST_RIP, vmread(VMCS_GUEST_RIP) + 3); + return; + } + } + syz_nested_exit_reason mapped_reason = map_intel_exit_reason(basic_reason); guest_uexit_l2(exit_reason, mapped_reason, CPU_VENDOR_INTEL); advance_l2_rip_intel(basic_reason); @@ -975,22 +1047,22 @@ __attribute__((naked)) GUEST_CODE static void nested_vm_exit_handler_intel_asm(v { asm volatile(R"( // Save L2's GPRs. This creates the 'struct l2_guest_regs' on the stack. - // The order MUST match the struct. - push %%rax - push %%rbx - push %%rcx - push %%rdx - push %%rsi - push %%rdi - push %%rbp - push %%r8 - push %%r9 - push %%r10 - push %%r11 - push %%r12 - push %%r13 - push %%r14 + // We push in reverse order so that RAX ends up at offset 0 (Top of Stack). push %%r15 + push %%r14 + push %%r13 + push %%r12 + push %%r11 + push %%r10 + push %%r9 + push %%r8 + push %%rbp + push %%rdi + push %%rsi + push %%rdx + push %%rcx + push %%rbx + push %%rax // Prepare arguments for the C handler: // arg1 (RDI) = exit_reason @@ -1004,13 +1076,30 @@ __attribute__((naked)) GUEST_CODE static void nested_vm_exit_handler_intel_asm(v // The C handler has processed the exit. Now, return to the L1 command // processing loop. VMX remains enabled. - add %[stack_cleanup_size], %%rsp + + // 1. Discard L2 GPRs. + add %[l2_regs_size], %%rsp + + // 2. Restore L1 callee-saved registers. + // Order must be reverse of push: r15, r14, r13, r12, rbp, rbx. + pop %%r15 + pop %%r14 + pop %%r13 + pop %%r12 + pop %%rbp + pop %%rbx + + // 3. Discard launch flag and cpu_id. + add $16, %%rsp + + // 4. Restore Red Zone. 
+ add $128, %%rsp // Jump to L1 main flow jmp after_vmentry_label )" - : : [stack_cleanup_size] "i"(sizeof(struct l2_guest_regs)), + : : [l2_regs_size] "i"(sizeof(struct l2_guest_regs)), [vm_exit_reason] "i"(VMCS_VM_EXIT_REASON) : "memory", "cc", "rbx", "rdi", "rsi"); } @@ -1018,6 +1107,7 @@ __attribute__((naked)) GUEST_CODE static void nested_vm_exit_handler_intel_asm(v #define VMEXIT_CPUID 0x72 #define VMEXIT_INVD 0x76 #define VMEXIT_HLT 0x78 +#define VMEXIT_NPF 0x400 #define VMEXIT_RDTSCP 0x87 GUEST_CODE static syz_nested_exit_reason map_amd_exit_reason(uint64 basic_reason) @@ -1034,6 +1124,8 @@ GUEST_CODE static syz_nested_exit_reason map_amd_exit_reason(uint64 basic_reason return SYZOS_NESTED_EXIT_REASON_RDTSC; if (reason == VMEXIT_RDTSCP) return SYZOS_NESTED_EXIT_REASON_RDTSCP; + if (reason == VMEXIT_NPF) + return SYZOS_NESTED_EXIT_REASON_EPT_VIOLATION; return SYZOS_NESTED_EXIT_REASON_UNKNOWN; } @@ -1054,9 +1146,36 @@ GUEST_CODE static void advance_l2_rip_amd(uint64 basic_reason, uint64 cpu_id, ui } __attribute__((used)) GUEST_CODE static void -nested_vm_exit_handler_amd(uint64 exit_reason, uint64 cpu_id, uint64 vm_id) +nested_vm_exit_handler_amd(uint64 exit_reason, struct l2_guest_regs* regs) { + volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS; + // Recover cpu_id from the stack. + // Stack: [cpu_id] [vmcb_addr] [6 L1 GPRs] [exit_code] [15 L2 GPRs] + // Index: 23 22 16..21 15 0..14 + // regs points to Index 0. + uint64 cpu_id = *(uint64*)((char*)regs + sizeof(struct l2_guest_regs) + 8 * 8); + uint64 vm_id = globals->active_vm_id[cpu_id]; + + // Persist L2 registers. + guest_memcpy((void*)&globals->l2_ctx[cpu_id][vm_id], regs, sizeof(struct l2_guest_regs)); + volatile uint64 basic_reason = exit_reason & 0xFFFF; + + // Handle NPT Fault (Nested UEXIT). + if (basic_reason == VMEXIT_NPF) { + uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id); + // EXITINFO2 contains the faulting GPA. 
+ uint64 fault_gpa = vmcb_read64((volatile uint8*)vmcb_addr, VMCB_EXITINFO2); + if ((fault_gpa & ~0xFFF) == X86_SYZOS_ADDR_EXIT) { + // RAX is in the saved L2 regs. + handle_nested_uexit(regs->rax); + // Advance L2 RIP by 3 bytes. + uint64 rip = vmcb_read64((volatile uint8*)vmcb_addr, VMCB_GUEST_RIP); + vmcb_write64(vmcb_addr, VMCB_GUEST_RIP, rip + 3); + return; + } + } + syz_nested_exit_reason mapped_reason = map_amd_exit_reason(basic_reason); guest_uexit_l2(exit_reason, mapped_reason, CPU_VENDOR_AMD); advance_l2_rip_amd(basic_reason, cpu_id, vm_id); @@ -1080,10 +1199,7 @@ GUEST_CODE static noinline void init_vmcs_host_state(void) vmwrite(VMCS_HOST_FS_BASE, rdmsr(X86_MSR_FS_BASE)); vmwrite(VMCS_HOST_GS_BASE, rdmsr(X86_MSR_GS_BASE)); - // RIP and RSP. - uint64 tmpreg = 0; // nolint - asm volatile("mov %%rsp, %0" : "=r"(tmpreg)); - vmwrite(VMCS_HOST_RSP, tmpreg); + // Exit handler in RIP. vmwrite(VMCS_HOST_RIP, (uintptr_t)nested_vm_exit_handler_intel_asm); // Control Registers. @@ -1167,6 +1283,8 @@ nested_create_vm_intel(struct api_call_1* cmd, uint64 cpu_id) uint64 vm_id = cmd->arg; uint64 vmcs_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id); uint8 error = 0; // nolint + uint64 l2_pml4_addr = X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id); + uint64 l2_msr_bitmap = X86_SYZOS_ADDR_MSR_BITMAP(cpu_id, vm_id); *(uint32*)vmcs_addr = rdmsr(X86_MSR_IA32_VMX_BASIC); asm volatile("vmclear %1; setna %0" @@ -1179,7 +1297,11 @@ nested_create_vm_intel(struct api_call_1* cmd, uint64 cpu_id) } nested_vmptrld(cpu_id, vm_id); - setup_l2_page_tables(CPU_VENDOR_INTEL, cpu_id, vm_id); + // Zero out critical structures. 
+ guest_memset((void*)l2_pml4_addr, 0, KVM_PAGE_SIZE); + guest_memset((void*)l2_msr_bitmap, 0, KVM_PAGE_SIZE); + + setup_l2_page_tables(CPU_VENDOR_INTEL, cpu_id, vm_id, 0); init_vmcs_control_fields(cpu_id, vm_id); init_vmcs_host_state(); init_vmcs_guest_state(cpu_id, vm_id); @@ -1215,7 +1337,6 @@ GUEST_CODE static noinline void init_vmcb_guest_state(uint64 cpu_id, uint64 vm_i SETUP_L2_SEGMENT_SVM(vmcb_addr, LDTR, 0, 0, 0, SVM_ATTR_LDTR_UNUSABLE); // Setup Guest Control Registers & CPU State. - uint64 efer = rdmsr(X86_MSR_IA32_EFER); vmcb_write64(vmcb_addr, VMCB_GUEST_CR0, read_cr0() | X86_CR0_WP); // L2 will use L1's page tables. vmcb_write64(vmcb_addr, VMCB_GUEST_CR3, read_cr3()); @@ -1224,15 +1345,9 @@ GUEST_CODE static noinline void init_vmcb_guest_state(uint64 cpu_id, uint64 vm_i vmcb_write64(vmcb_addr, VMCB_GUEST_RSP, l2_stack_addr + KVM_PAGE_SIZE - 8); vmcb_write64(vmcb_addr, VMCB_GUEST_RFLAGS, RFLAGS_1_BIT); - // Setup Guest MSRs. - - // SYSCALL/SYSRET MSRs. - vmcb_write64(vmcb_addr, VMCB_GUEST_DEBUGCTL, 0); - vmcb_write64(vmcb_addr, VMCB_GUEST_DR6, 0x0); - vmcb_write64(vmcb_addr, VMCB_GUEST_DR7, 0x0); - - vmcb_write64(vmcb_addr, VMCB_GUEST_EFER, efer & ~X86_EFER_SCE); - vmcb_write64(vmcb_addr, VMCB_GUEST_PAT, rdmsr(X86_MSR_IA32_CR_PAT)); + // Setup Guest EFER. Must have SVME, LME, and LMA for 64-bit nested. + vmcb_write64(vmcb_addr, VMCB_GUEST_EFER, X86_EFER_LME | X86_EFER_LMA | X86_EFER_SVME); + vmcb_write64(vmcb_addr, VMCB_RAX, 0); // Setup Guest Descriptor Tables. 
struct { @@ -1270,12 +1385,16 @@ nested_create_vm_amd(struct api_call_1* cmd, uint64 cpu_id) { uint64 vm_id = cmd->arg; uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id); + uint64 l2_pml4_addr = X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id); + uint64 l2_msr_bitmap = X86_SYZOS_ADDR_MSR_BITMAP(cpu_id, vm_id); guest_memset((void*)vmcb_addr, 0, KVM_PAGE_SIZE); guest_memset((void*)X86_SYZOS_ADDR_VM_ARCH_SPECIFIC(cpu_id), 0, KVM_PAGE_SIZE); + guest_memset((void*)l2_pml4_addr, 0, KVM_PAGE_SIZE); + guest_memset((void*)l2_msr_bitmap, 0, KVM_PAGE_SIZE); // Setup NPT (Nested Page Tables) - setup_l2_page_tables(CPU_VENDOR_AMD, cpu_id, vm_id); + setup_l2_page_tables(CPU_VENDOR_AMD, cpu_id, vm_id, 0); // Initialize VMCB Control and Guest State init_vmcb_guest_state(cpu_id, vm_id); @@ -1291,12 +1410,42 @@ guest_handle_nested_create_vm(struct api_call_1* cmd, uint64 cpu_id) } } +GUEST_CODE static uint64 l2_gpa_to_pa(uint64 cpu_id, uint64 vm_id, uint64 gpa) +{ + uint64 pml4_addr = X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id); + volatile uint64* pml4 = (volatile uint64*)pml4_addr; + uint64 pml4_idx = (gpa >> 39) & 0x1FF; + if (!(pml4[pml4_idx] & X86_PDE64_PRESENT)) + return 0; + + volatile uint64* pdpt = (volatile uint64*)(pml4[pml4_idx] & ~0xFFF); + uint64 pdpt_idx = (gpa >> 30) & 0x1FF; + if (!(pdpt[pdpt_idx] & X86_PDE64_PRESENT)) + return 0; + + volatile uint64* pd = (volatile uint64*)(pdpt[pdpt_idx] & ~0xFFF); + uint64 pd_idx = (gpa >> 21) & 0x1FF; + if (!(pd[pd_idx] & X86_PDE64_PRESENT)) + return 0; + + volatile uint64* pt = (volatile uint64*)(pd[pd_idx] & ~0xFFF); + uint64 pt_idx = (gpa >> 12) & 0x1FF; + if (!(pt[pt_idx] & X86_PDE64_PRESENT)) + return 0; + + return (pt[pt_idx] & ~0xFFF) + (gpa & 0xFFF); +} + GUEST_CODE static noinline void guest_handle_nested_load_code(struct api_call_nested_load_code* cmd, uint64 cpu_id) { uint64 vm_id = cmd->vm_id; // Backing address in L1 for the L2 User Code (mapped at X86_SYZOS_ADDR_USER_CODE) - uint64 l2_code_backing = 
X86_SYZOS_ADDR_VM_CODE(cpu_id, vm_id); + uint64 l2_code_backing = l2_gpa_to_pa(cpu_id, vm_id, X86_SYZOS_ADDR_USER_CODE); + if (!l2_code_backing) { + guest_uexit(0xE2BAD4); + return; + } // Code size = command size - header size - vm_id size. uint64 l2_code_size = cmd->header.size - sizeof(struct api_call_header) - sizeof(uint64); @@ -1317,77 +1466,276 @@ guest_handle_nested_load_code(struct api_call_nested_load_code* cmd, uint64 cpu_ } } -// Clang's LTO may ignore noinline and attempt to inline this function into both callers, -// which results in duplicate declaration of after_vmentry_label. -// Applying __optnone should prevent this behavior. -GUEST_CODE static noinline __optnone void -guest_handle_nested_vmentry_intel(uint64 vm_id, uint64 cpu_id, bool is_launch) +GUEST_CODE static noinline void +guest_handle_nested_load_syzos(struct api_call_nested_load_syzos* cmd, uint64 cpu_id) { - uint64 vmx_error_code = 0; - uint8 fail_flag = 0; // Will be 1 if EITHER CF or ZF is set + uint64 vm_id = cmd->vm_id; + uint64 prog_size = cmd->header.size - __builtin_offsetof(struct api_call_nested_load_syzos, program); + uint64 l2_code_backing = X86_SYZOS_ADDR_VM_CODE(cpu_id, vm_id); + volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS; - nested_vmptrld(cpu_id, vm_id); + if (prog_size > KVM_PAGE_SIZE) + prog_size = KVM_PAGE_SIZE; + + // Copy Payload to Code buffer. + guest_memcpy((void*)l2_code_backing, (void*)cmd->program, prog_size); + + // Populate Globals. + uint64 globals_pa = l2_gpa_to_pa(cpu_id, vm_id, X86_SYZOS_ADDR_GLOBALS); + if (!globals_pa) { + guest_uexit(0xE2BAD3); + return; + } + volatile struct syzos_globals* l2_globals = (volatile struct syzos_globals*)globals_pa; + // Set initial state for ALL possible L2 VCPUs of this VM. 
+ for (int i = 0; i < KVM_MAX_VCPU; i++) { + l2_globals->text_sizes[i] = prog_size; + globals->l2_ctx[i][vm_id].rdi = i; + globals->l2_ctx[i][vm_id].rax = 0; // Default RAX + // Note: RSP and RIP are set in the VMCB/VMCS, but they could also be in l2_ctx + // since the shims load them if we wanted. But currently they are in VMCB/VMCS. + } - if (is_launch) { - asm volatile(R"( - // Attempt to launch the L2 guest. - vmlaunch - // Set AL to 1 if CF=1 (VMfailValid) - setc %%al - // Set BL to 1 if ZF=1 (VMfailInvalid) - setz %%bl - or %%bl, %%al)" - : "=a"(fail_flag) - : - : "rbx", "cc", "memory"); + // Set RIP to guest_main. + uint64 entry_rip = executor_fn_guest_addr(guest_main); + if (get_cpu_vendor() == CPU_VENDOR_INTEL) { + nested_vmptrld(cpu_id, vm_id); + vmwrite(VMCS_GUEST_RIP, entry_rip); + vmwrite(VMCS_GUEST_RSP, X86_SYZOS_ADDR_STACK_BOTTOM + KVM_PAGE_SIZE - 8); } else { - asm volatile(R"( - // Attempt to resume the L2 guest. - vmresume - // Set AL to 1 if CF=1 (VMfailValid) - setc %%al - // Set BL to 1 if ZF=1 (VMfailInvalid) - setz %%bl - or %%bl, %%al)" - : "=a"(fail_flag) - : - : "rbx", "cc", "memory"); + uint64 vmcb = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id); + vmcb_write64(vmcb, VMCB_GUEST_RIP, entry_rip); + vmcb_write64(vmcb, VMCB_GUEST_RSP, X86_SYZOS_ADDR_STACK_BOTTOM + KVM_PAGE_SIZE - 8); } - asm volatile(".globl after_vmentry_label\nafter_vmentry_label:"); +} + +GUEST_CODE static noinline void +guest_handle_nested_vmentry_intel(uint64 vm_id, uint64 cpu_id, bool is_launch) +{ + volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS; + struct l2_guest_regs* l2_regs = (struct l2_guest_regs*)&globals->l2_ctx[cpu_id][vm_id]; + uint64 vmx_error_code = 0; + uint64 fail_flag = 0; // Will be 1 if EITHER CF or ZF is set + nested_vmptrld(cpu_id, vm_id); + + // Mark the VM as active on this CPU. + globals->active_vm_id[cpu_id] = vm_id; + + asm volatile(R"( + // 1. Red Zone protection. + sub $128, %%rsp + + // 2. 
Stack Passthrough for Exit Handler. + push %[cpu_id] + push %[launch] + + // 3. Save L1 callee-saved registers. + push %%rbx + push %%rbp + push %%r12 + push %%r13 + push %%r14 + push %%r15 + + // 4. Update VMCS_HOST_RSP with the current stack pointer. + // This stack contains [RedZone] [cpu_id] [launch] [L1 regs]. + mov %[host_rsp_field], %%r10 + mov %%rsp, %%r11 + vmwrite %%r11, %%r10 + + // 5. Load L2 GPRs from storage. + // We use RAX as a temporary base pointer. + mov %[l2_regs], %%rax + mov 8(%%rax), %%rbx + mov 16(%%rax), %%rcx + mov 24(%%rax), %%rdx + mov 32(%%rax), %%rsi + mov 40(%%rax), %%rdi + mov 48(%%rax), %%rbp + mov 56(%%rax), %%r8 + mov 64(%%rax), %%r9 + mov 72(%%rax), %%r10 + mov 80(%%rax), %%r11 + mov 88(%%rax), %%r12 + mov 96(%%rax), %%r13 + mov 104(%%rax), %%r14 + mov 112(%%rax), %%r15 + // Finally, load RAX (L2 RAX). + mov 0(%%rax), %%rax + + // 6. Execute Launch or Resume. + // Check the launch flag on the stack. + // Stack offset for 'launch': [r15][r14][r13][r12][rbp][rbx] = 6*8 = 48 bytes. + cmpq $0, 48(%%rsp) + je 1f + vmlaunch + jmp 2f + + 1: vmresume + + 2: // 7. Failure path. + // Restore L1 registers to return to C. + pop %%r15 + pop %%r14 + pop %%r13 + pop %%r12 + pop %%rbp + pop %%rbx + // pop launch and cpu_id + add $16, %%rsp + // restore Red Zone + add $128, %%rsp + mov $1, %[ret] + jmp 3f + + // 8. Success path (L2 Exit). + .globl after_vmentry_label + after_vmentry_label: + xor %[ret], %[ret] + + 3: // Final return to C. + )" + : [ret] "=&r"(fail_flag) + : [launch] "r"((uint64)is_launch), + [host_rsp_field] "i"(VMCS_HOST_RSP), + [cpu_id] "r"(cpu_id), + [l2_regs] "r"(l2_regs) + : "cc", "memory", "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11"); + if (fail_flag) { // VMLAUNCH/VMRESUME failed, so VMCS is still valid and can be read. 
vmx_error_code = vmread(VMCS_VM_INSTRUCTION_ERROR); guest_uexit(0xE2E10000 | (uint32)vmx_error_code); return; } - // If we get here, this means VMLAUNCH/VMRESUME truly succeeded (CF=0 and ZF=0) - // and the L2 guest has run and exited. } GUEST_CODE static noinline void guest_run_amd_vm(uint64 cpu_id, uint64 vm_id) { uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id); - volatile uint8* vmcb_ptr = (volatile uint8*)vmcb_addr; + volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS; + globals->active_vm_id[cpu_id] = vm_id; + struct l2_guest_regs* l2_regs = (struct l2_guest_regs*)&globals->l2_ctx[cpu_id][vm_id]; uint8 fail_flag = 0; - asm volatile( - "mov %1, %%rax\n\t" // Load VMCB physical address into RAX - "vmrun\n\t" // Launch or resume L2 guest - "setc %0\n\t" - : "=q"(fail_flag) - : "m"(vmcb_addr) - : "rax", "cc", "memory"); + asm volatile(R"( + // 1. Red Zone protection. + sub $128, %%rsp + + // 2. Stack Passthrough for Exit Handler. + push %[cpu_id] + // Save VMCB address for later use after VMEXIT. + push %[vmcb_addr] + + // 3. Save L1 callee-saved registers. + push %%rbx + push %%rbp + push %%r12 + push %%r13 + push %%r14 + push %%r15 + + // 4. Load L2 GPRs from storage. + mov %[l2_regs], %%rax + // Sync RAX to VMCB (guest RAX). + mov 0(%%rax), %%rbx + mov %[vmcb_addr], %%rcx + mov %%rbx, 0x5f8(%%rcx) + + mov 8(%%rax), %%rbx + mov 16(%%rax), %%rcx + mov 24(%%rax), %%rdx + mov 32(%%rax), %%rsi + mov 40(%%rax), %%rdi + mov 48(%%rax), %%rbp + mov 56(%%rax), %%r8 + mov 64(%%rax), %%r9 + mov 72(%%rax), %%r10 + mov 80(%%rax), %%r11 + mov 88(%%rax), %%r12 + mov 96(%%rax), %%r13 + mov 104(%%rax), %%r14 + mov 112(%%rax), %%r15 + + // 4.5 Note: Host State (RSP and RIP) is saved automatically by VMRUN + // to the HSAVE area pointed to by VM_HSAVE_PA. + // There is no need to manually write it to the VMCB. + + // 5. Execute VMRUN. + clgi + // VMCB address MUST be in RAX. + // It was pushed at Index 6: 6 * 8 = 48. 
+ mov 48(%%rsp), %%rax + vmrun + 1: // Host resumes here. + // Restore RAX as VMRUN clobbers it. + mov 48(%%rsp), %%rax + setc %[fail_flag] + + // 6. Save L2's GPRs. + // exit_code (it will be at Index 15) + pushq 0x70(%%rax) + + // Save L2 GPRs (Index 14 down to 1). + push %%r15 + push %%r14 + push %%r13 + push %%r12 + push %%r11 + push %%r10 + push %%r9 + push %%r8 + push %%rbp + push %%rdi + push %%rsi + push %%rdx + push %%rcx + push %%rbx + + // Save L2 RAX from VMCB (Index 0). + // Since we pushed 16 regs (L2 RAX + 14 GPRs + exit_code), vmcb_addr is at 48 + 16 * 8 = 176(%%rsp). + mov 176(%%rsp), %%rax + pushq 0x5f8(%%rax) + + // 7. Call the C handler. + // arg1 (RDI) = exit reason (at Index 15: 15 * 8 = 120 bytes) + mov 120(%%rsp), %%rdi + // arg2 (RSI) = pointer to the saved registers + mov %%rsp, %%rsi + call nested_vm_exit_handler_amd + + // 8. Restore L1 state. + // Discard L2 GPRs (15 regs) + exit_code = 16 regs in total. + add $128, %%rsp + + // Restore L1 callee-saved registers. + pop %%r15 + pop %%r14 + pop %%r13 + pop %%r12 + pop %%rbp + pop %%rbx + + // 9. Discard vmcb_addr and cpu_id. + add $16, %%rsp + + // 10. Restore Red Zone. + add $128, %%rsp + + stgi + after_vmentry_label_amd: + )" + : [fail_flag] "=m"(fail_flag) + : [cpu_id] "r"(cpu_id), [vmcb_addr] "r"(vmcb_addr), [l2_regs] "r"(l2_regs), + [l2_regs_size] "i"(sizeof(struct l2_guest_regs)) + : "cc", "memory", "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11"); if (fail_flag) { // VMRUN failed. guest_uexit(0xE2E10000 | 0xFFFF); return; } - - // VMRUN succeeded and we have a VM-exit. - uint64 exit_reason = vmcb_read64(vmcb_ptr, VMCB_EXIT_CODE); - nested_vm_exit_handler_amd(exit_reason, cpu_id, vm_id); } GUEST_CODE static noinline void diff --git a/executor/kvm.h b/executor/kvm.h index 024f70351..1e0ceae06 100644 --- a/executor/kvm.h +++ b/executor/kvm.h @@ -64,7 +64,7 @@ #define X86_SYZOS_ADDR_STACK0 0x60f80 // Base address for all per-L1-VCPU regions. 
-#define X86_SYZOS_PER_VCPU_REGIONS_BASE 0x70000 +#define X86_SYZOS_PER_VCPU_REGIONS_BASE 0x400000 // Size of the entire memory block allocated for a single L1 VCPU to manage its L2 VMs. // We need space for 1 VMXON page + 4 L2 VMs. Let's allocate 256KB per L1 VCPU for ample space. #define X86_SYZOS_L1_VCPU_REGION_SIZE 0x40000 @@ -98,7 +98,7 @@ #define X86_SYZOS_L2_VM_OFFSET_MSR_BITMAP 0x7000 // Subsequent addresses are shifted to accommodate all L1 VCPU regions. -#define X86_SYZOS_ADDR_UNUSED 0x200000 +#define X86_SYZOS_ADDR_UNUSED 0x1000000 #define X86_SYZOS_ADDR_IOAPIC 0xfec00000 #define X86_SYZOS_ADDR_VMCS_VMCB(cpu, vm) \ @@ -344,6 +344,7 @@ // VMCS Guest State Fields. #define VMCS_GUEST_INTR_STATUS 0x00000810 #define VMCS_GUEST_PML_INDEX 0x00000812 +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 #define VMCS_GUEST_IA32_DEBUGCTL 0x00002802 #define VMCS_GUEST_IA32_PAT 0x00002804 #define VMCS_GUEST_IA32_EFER 0x00002806 @@ -410,6 +411,7 @@ #define VMCB_CTRL_ASID 0x058 #define VMCB_EXIT_CODE 0x070 +#define VMCB_EXITINFO2 0x080 // NP_ENABLE is actually 1 byte, but the 7 following bytes are reserved, so it's okay #define VMCB_CTRL_NP_ENABLE 0x090 @@ -471,6 +473,7 @@ #define VMCB_GUEST_RSP 0x5d8 #define VMCB_GUEST_PAT 0x668 #define VMCB_GUEST_DEBUGCTL 0x670 +#define VMCB_RAX 0x5f8 // SVM Segment Attribute Defines #define SVM_ATTR_G (1 << 15) @@ -498,6 +501,7 @@ #endif // x86-specific definitions. #define KVM_MAX_VCPU 4 +#define KVM_MAX_L2_VMS 4 #define KVM_PAGE_SIZE (1 << 12) #define KVM_GUEST_PAGES 1024 #define KVM_GUEST_MEM_SIZE (KVM_GUEST_PAGES * KVM_PAGE_SIZE) |
