aboutsummaryrefslogtreecommitdiffstats
path: root/executor
diff options
context:
space:
mode:
Diffstat (limited to 'executor')
-rw-r--r--executor/common_kvm_amd64.h25
-rw-r--r--executor/common_kvm_amd64_syzos.h540
-rw-r--r--executor/kvm.h8
3 files changed, 467 insertions, 106 deletions
diff --git a/executor/common_kvm_amd64.h b/executor/common_kvm_amd64.h
index 474bc875b..c5d7c6983 100644
--- a/executor/common_kvm_amd64.h
+++ b/executor/common_kvm_amd64.h
@@ -206,6 +206,7 @@ static void setup_64bit_idt(struct kvm_sregs* sregs, char* host_mem, uintptr_t g
#endif
#if SYZ_EXECUTOR || __NR_syz_kvm_setup_syzos_vm || __NR_syz_kvm_add_vcpu
+
// SYZOS guest virtual memory layout (must be in sync with executor/kvm.h):
static const struct mem_region syzos_mem_regions[] = {
// AMD64 fixed data structures (5 pages: Zero, GDT, PML4, PDP, PD).
@@ -250,6 +251,7 @@ struct kvm_syz_vm {
void* user_text;
void* gpa0_mem;
void* pt_pool_mem;
+ void* globals_mem;
};
#endif
@@ -1100,19 +1102,16 @@ static volatile long syz_kvm_setup_cpu(volatile long a0, volatile long a1, volat
#define RFLAGS_1_BIT (1ULL << 1)
#define RFLAGS_IF_BIT (1ULL << 9)
-static void reset_cpu_regs(int cpufd, int cpu_id, size_t text_size)
+static void reset_cpu_regs(int cpufd, uint64 rip, uint64 cpu_id)
{
struct kvm_regs regs;
memset(&regs, 0, sizeof(regs));
// RFLAGS.1 must be 1, RFLAGS.IF enables interrupts.
regs.rflags |= RFLAGS_1_BIT | RFLAGS_IF_BIT;
- // PC points to the relative offset of guest_main() within the guest code.
- regs.rip = executor_fn_guest_addr(guest_main);
+ regs.rip = rip;
regs.rsp = X86_SYZOS_ADDR_STACK0;
- // Pass parameters to guest_main().
- regs.rdi = text_size;
- regs.rsi = cpu_id;
+ regs.rdi = cpu_id;
ioctl(cpufd, KVM_SET_REGS, &regs);
}
@@ -1126,7 +1125,15 @@ static void install_user_code(struct kvm_syz_vm* vm, int cpufd, int cpu_id, cons
memcpy(target, text, text_size);
setup_gdt_ldt_pg(vm, cpufd, cpu_id);
setup_cpuid(cpufd);
- reset_cpu_regs(cpufd, cpu_id, text_size);
+
+ uint64 entry_rip = executor_fn_guest_addr(guest_main);
+ reset_cpu_regs(cpufd, entry_rip, cpu_id);
+
+ // Pass the text size via the shared globals page.
+ if (vm->globals_mem) {
+ struct syzos_globals* globals = (struct syzos_globals*)vm->globals_mem;
+ globals->text_sizes[cpu_id] = text_size;
+ }
}
#endif
@@ -1196,6 +1203,8 @@ static void setup_vm(int vmfd, struct kvm_syz_vm* vm)
vm->gpa0_mem = next.addr;
if (r->gpa == X86_SYZOS_ADDR_PT_POOL)
vm->pt_pool_mem = next.addr;
+ if (r->gpa == X86_SYZOS_ADDR_GLOBALS)
+ vm->globals_mem = next.addr;
if (r->gpa == X86_SYZOS_ADDR_BOOT_ARGS) {
boot_args = (struct syzos_boot_args*)next.addr;
@@ -1326,4 +1335,4 @@ static long syz_kvm_assert_syzos_uexit(volatile long a0, volatile long a1, volat
}
#endif
-#endif // EXECUTOR_COMMON_KVM_AMD64_H \ No newline at end of file
+#endif // EXECUTOR_COMMON_KVM_AMD64_H
diff --git a/executor/common_kvm_amd64_syzos.h b/executor/common_kvm_amd64_syzos.h
index 62c021766..4b8d08e2a 100644
--- a/executor/common_kvm_amd64_syzos.h
+++ b/executor/common_kvm_amd64_syzos.h
@@ -32,6 +32,7 @@ typedef enum {
SYZOS_API_NESTED_LOAD_CODE = 302,
SYZOS_API_NESTED_VMLAUNCH = 303,
SYZOS_API_NESTED_VMRESUME = 304,
+ SYZOS_API_NESTED_LOAD_SYZOS = 310,
SYZOS_API_NESTED_INTEL_VMWRITE_MASK = 340,
SYZOS_API_NESTED_AMD_VMCB_WRITE_MASK = 380,
SYZOS_API_NESTED_AMD_INVLPGA = 381,
@@ -65,6 +66,13 @@ struct api_call_nested_load_code {
uint8 insns[];
};
+struct api_call_nested_load_syzos {
+ struct api_call_header header;
+ uint64 vm_id;
+ uint64 unused_pages;
+ uint8 program[];
+};
+
struct api_call_cpuid {
struct api_call_header header;
uint32 eax;
@@ -121,6 +129,9 @@ struct syzos_boot_args {
struct syzos_globals {
uint64 alloc_offset;
uint64 total_size;
+ uint64 text_sizes[KVM_MAX_VCPU];
+ struct l2_guest_regs l2_ctx[KVM_MAX_VCPU][KVM_MAX_L2_VMS];
+ uint64 active_vm_id[KVM_MAX_VCPU];
};
#ifdef __cplusplus
@@ -128,6 +139,7 @@ extern "C" {
#endif
GUEST_CODE static void guest_uexit(uint64 exit_code);
GUEST_CODE static void nested_vm_exit_handler_intel(uint64 exit_reason, struct l2_guest_regs* regs);
+GUEST_CODE static void nested_vm_exit_handler_amd(uint64 exit_reason, struct l2_guest_regs* regs);
#ifdef __cplusplus
}
#endif
@@ -143,6 +155,7 @@ GUEST_CODE static void guest_handle_set_irq_handler(struct api_call_2* cmd);
GUEST_CODE static void guest_handle_enable_nested(struct api_call_1* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_create_vm(struct api_call_1* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_load_code(struct api_call_nested_load_code* cmd, uint64 cpu_id);
+GUEST_CODE static void guest_handle_nested_load_syzos(struct api_call_nested_load_syzos* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_vmlaunch(struct api_call_1* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_vmresume(struct api_call_1* cmd, uint64 cpu_id);
GUEST_CODE static void guest_handle_nested_intel_vmwrite_mask(struct api_call_5* cmd, uint64 cpu_id);
@@ -202,8 +215,10 @@ __attribute__((naked)) GUEST_CODE static void uexit_irq_handler()
// We add single-line comments to justify having the compound statements below.
__attribute__((used))
GUEST_CODE static void
-guest_main(uint64 size, uint64 cpu)
+guest_main(uint64 cpu)
{
+ volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS;
+ uint64 size = globals->text_sizes[cpu];
uint64 addr = X86_SYZOS_ADDR_USER_CODE + cpu * KVM_PAGE_SIZE;
while (size >= sizeof(struct api_call_header)) {
@@ -257,6 +272,9 @@ guest_main(uint64 size, uint64 cpu)
} else if (call == SYZOS_API_NESTED_LOAD_CODE) {
// Load code into the nested VM.
guest_handle_nested_load_code((struct api_call_nested_load_code*)cmd, cpu);
+ } else if (call == SYZOS_API_NESTED_LOAD_SYZOS) {
+ // Load SYZOS into the nested VM.
+ guest_handle_nested_load_syzos((struct api_call_nested_load_syzos*)cmd, cpu);
} else if (call == SYZOS_API_NESTED_VMLAUNCH) {
// Launch the nested VM.
guest_handle_nested_vmlaunch((struct api_call_1*)cmd, cpu);
@@ -313,8 +331,11 @@ __attribute__((used))
GUEST_CODE static noinline void
guest_uexit(uint64 exit_code)
{
+ // Force exit_code into RAX using inline asm constraints ("a").
+ // We write to X86_SYZOS_ADDR_UEXIT (0x40100).
+ // This allows the L1 hypervisor to reliably read RAX during an EPT violation.
volatile uint64* ptr = (volatile uint64*)X86_SYZOS_ADDR_UEXIT;
- *ptr = exit_code;
+ asm volatile("movq %0, (%1)" ::"a"(exit_code), "r"(ptr) : "memory");
}
GUEST_CODE static noinline void guest_handle_cpuid(uint32 eax, uint32 ecx)
@@ -773,15 +794,10 @@ GUEST_CODE static void l2_map_page(uint64 cpu_id, uint64 vm_id, uint64 gpa, uint
pt[pt_idx] = (host_pa & ~0xFFF) | flags;
}
-GUEST_CODE static noinline void setup_l2_page_tables(cpu_vendor_id vendor, uint64 cpu_id, uint64 vm_id)
+GUEST_CODE static noinline void setup_l2_page_tables(cpu_vendor_id vendor, uint64 cpu_id, uint64 vm_id, uint64 unused_pages)
{
- // The Root PML4 remains at the fixed address assigned to this VM.
- uint64 l2_pml4_addr = X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id);
-
- // Clear the root table.
- guest_memset((void*)l2_pml4_addr, 0, KVM_PAGE_SIZE);
- guest_memset((void*)X86_SYZOS_ADDR_MSR_BITMAP(cpu_id, vm_id), 0, KVM_PAGE_SIZE);
-
+ // Note: PML4 and MSR Bitmap must be zeroed by the caller (nested_create_vm)
+ // so that this function can be called additively by nested_load_syzos.
// Intel EPT: set Read, Write, Execute.
// AMD NPT: set Present, Write, User.
uint64 flags = X86_PDE64_PRESENT | X86_PDE64_RW | X86_PDE64_USER;
@@ -799,10 +815,19 @@ GUEST_CODE static noinline void setup_l2_page_tables(cpu_vendor_id vendor, uint6
r.pages = args->regions[i].pages;
r.flags = args->regions[i].flags;
- // Skip the huge unused heap for now, map fixed small heap if needed or handled by guest_alloc.
- if (r.flags & MEM_REGION_FLAG_REMAINING)
+ // Skip NO_HOST_MEM regions (like the Exit/UEXIT region).
+ // This ensures that L2 accesses to these pages cause a nested page fault
+ // (EPT Violation / NPT Fault), allowing L1 to intercept and modify the exit code.
+ if (r.flags & MEM_REGION_FLAG_NO_HOST_MEM)
continue;
+ // Skip the huge unused heap for now, map fixed small heap if needed or handled by guest_alloc.
+ // If unused_pages > 0, we map that many pages from the unused region.
+ if (r.flags & MEM_REGION_FLAG_REMAINING) {
+ // Clamp to a minimum of 16 pages to cover allocator overhead (applies to any request below 16, not only 0).
+ r.pages = (unused_pages < 16) ? 16 : unused_pages;
+ }
+
for (int p = 0; p < r.pages; p++) {
uint64 gpa = r.gpa + (p * KVM_PAGE_SIZE);
uint64 backing;
@@ -814,9 +839,9 @@ GUEST_CODE static noinline void setup_l2_page_tables(cpu_vendor_id vendor, uint6
// Map stack to the VM's dedicated stack buffer
backing = X86_SYZOS_ADDR_VM_STACK(cpu_id, vm_id);
} else if (r.gpa == X86_SYZOS_ADDR_ZERO ||
- r.gpa == X86_SYZOS_ADDR_VAR_IDT ||
- r.gpa == X86_SYZOS_ADDR_BOOT_ARGS ||
- r.gpa == X86_SYZOS_ADDR_PT_POOL) {
+ r.gpa == X86_SYZOS_ADDR_VAR_IDT ||
+ r.gpa == X86_SYZOS_ADDR_BOOT_ARGS ||
+ r.gpa == X86_SYZOS_ADDR_PT_POOL) {
// Critical System Regions: Allocate and COPY from L1.
// We must copy the PT POOL because the PD entries in ADDR_ZERO
// point to tables allocated here. If we don't copy, L2 sees
@@ -824,6 +849,9 @@ GUEST_CODE static noinline void setup_l2_page_tables(cpu_vendor_id vendor, uint6
// GDT/IDT/BootArgs are also copied for valid environment.
backing = guest_alloc_page();
guest_memcpy((void*)backing, (void*)gpa, KVM_PAGE_SIZE);
+ } else if (r.flags & MEM_REGION_FLAG_EXECUTOR_CODE) {
+ // Identity map the Executor Code.
+ backing = gpa;
} else {
// Allocate new backing memory
backing = guest_alloc_page();
@@ -906,9 +934,21 @@ typedef enum {
SYZOS_NESTED_EXIT_REASON_CPUID = 3,
SYZOS_NESTED_EXIT_REASON_RDTSC = 4,
SYZOS_NESTED_EXIT_REASON_RDTSCP = 5,
+ SYZOS_NESTED_EXIT_REASON_EPT_VIOLATION = 6,
SYZOS_NESTED_EXIT_REASON_UNKNOWN = 0xFF,
} syz_nested_exit_reason;
+GUEST_CODE static void handle_nested_uexit(uint64 exit_code)
+{
+ // Increment the nesting level (top byte).
+ uint64 level = (exit_code >> 56) + 1;
+ exit_code = (exit_code & 0x00FFFFFFFFFFFFFFULL) | (level << 56);
+
+ // Perform L1 uexit with the modified code.
+ guest_uexit(exit_code);
+ // NOTE(review): callers continue after this call (they go on to advance L2 RIP), which assumes guest_uexit returns once the host handles the exit — confirm "terminates" is not accurate here.
+}
+
GUEST_CODE static void guest_uexit_l2(uint64 exit_reason, syz_nested_exit_reason mapped_reason,
cpu_vendor_id vendor)
{
@@ -924,6 +964,7 @@ GUEST_CODE static void guest_uexit_l2(uint64 exit_reason, syz_nested_exit_reason
#define EXIT_REASON_CPUID 0xa
#define EXIT_REASON_HLT 0xc
#define EXIT_REASON_INVD 0xd
+#define EXIT_REASON_EPT_VIOLATION 0x30
#define EXIT_REASON_RDTSC 0x10
#define EXIT_REASON_RDTSCP 0x33
@@ -941,6 +982,8 @@ GUEST_CODE static syz_nested_exit_reason map_intel_exit_reason(uint64 basic_reas
return SYZOS_NESTED_EXIT_REASON_RDTSC;
if (reason == EXIT_REASON_RDTSCP)
return SYZOS_NESTED_EXIT_REASON_RDTSCP;
+ if (reason == EXIT_REASON_EPT_VIOLATION)
+ return SYZOS_NESTED_EXIT_REASON_EPT_VIOLATION;
return SYZOS_NESTED_EXIT_REASON_UNKNOWN;
}
@@ -964,7 +1007,36 @@ __attribute__((used))
GUEST_CODE static void
nested_vm_exit_handler_intel(uint64 exit_reason, struct l2_guest_regs* regs)
{
+ volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS;
+ // Recover cpu_id from the stack. It was pushed before L1 registers.
+ // Stack: [cpu_id] [launch] [L1 GPRs x6] [L2 GPRs x15]
+ // Index: 22 21 15..20 0..14
+ // regs points to the start of L2 GPRs.
+ uint64 cpu_id = *(uint64*)((char*)regs + sizeof(struct l2_guest_regs) + 7 * 8);
+ uint64 vm_id = globals->active_vm_id[cpu_id];
+
+ // Persist L2 registers.
+ guest_memcpy((void*)&globals->l2_ctx[cpu_id][vm_id], regs, sizeof(struct l2_guest_regs));
+
uint64 basic_reason = exit_reason & 0xFFFF;
+
+ // Handle EPT Violation (Nested UEXIT).
+ if (basic_reason == EXIT_REASON_EPT_VIOLATION) {
+ uint64 gpa = vmread(VMCS_GUEST_PHYSICAL_ADDRESS);
+ // Only handle violations on the specific UEXIT page.
+ if ((gpa & ~0xFFF) == X86_SYZOS_ADDR_EXIT) {
+ // This is a uexit from L2.
+ // We enforced usage of RAX in guest_uexit.
+ // Read RAX from the saved L2 guest registers.
+ // Note: On Intel exit, guest registers are NOT saved to VMCS.
+ // They are saved to 'regs' by our asm wrapper.
+ handle_nested_uexit(regs->rax);
+ // Advance L2 RIP past the uexit store (mov %rax, (reg) — 3 bytes; NOTE(review): the "r" constraint in guest_uexit does not pin the base register, and rsp/rbp/r12/r13 bases encode to 4 bytes — confirm).
+ vmwrite(VMCS_GUEST_RIP, vmread(VMCS_GUEST_RIP) + 3);
+ return;
+ }
+ }
+
syz_nested_exit_reason mapped_reason = map_intel_exit_reason(basic_reason);
guest_uexit_l2(exit_reason, mapped_reason, CPU_VENDOR_INTEL);
advance_l2_rip_intel(basic_reason);
@@ -975,22 +1047,22 @@ __attribute__((naked)) GUEST_CODE static void nested_vm_exit_handler_intel_asm(v
{
asm volatile(R"(
// Save L2's GPRs. This creates the 'struct l2_guest_regs' on the stack.
- // The order MUST match the struct.
- push %%rax
- push %%rbx
- push %%rcx
- push %%rdx
- push %%rsi
- push %%rdi
- push %%rbp
- push %%r8
- push %%r9
- push %%r10
- push %%r11
- push %%r12
- push %%r13
- push %%r14
+ // We push in reverse order so that RAX ends up at offset 0 (Top of Stack).
push %%r15
+ push %%r14
+ push %%r13
+ push %%r12
+ push %%r11
+ push %%r10
+ push %%r9
+ push %%r8
+ push %%rbp
+ push %%rdi
+ push %%rsi
+ push %%rdx
+ push %%rcx
+ push %%rbx
+ push %%rax
// Prepare arguments for the C handler:
// arg1 (RDI) = exit_reason
@@ -1004,13 +1076,30 @@ __attribute__((naked)) GUEST_CODE static void nested_vm_exit_handler_intel_asm(v
// The C handler has processed the exit. Now, return to the L1 command
// processing loop. VMX remains enabled.
- add %[stack_cleanup_size], %%rsp
+
+ // 1. Discard L2 GPRs.
+ add %[l2_regs_size], %%rsp
+
+ // 2. Restore L1 callee-saved registers.
+ // Order must be reverse of push: r15, r14, r13, r12, rbp, rbx.
+ pop %%r15
+ pop %%r14
+ pop %%r13
+ pop %%r12
+ pop %%rbp
+ pop %%rbx
+
+ // 3. Discard launch flag and cpu_id.
+ add $16, %%rsp
+
+ // 4. Restore Red Zone.
+ add $128, %%rsp
// Jump to L1 main flow
jmp after_vmentry_label
)"
- : : [stack_cleanup_size] "i"(sizeof(struct l2_guest_regs)),
+ : : [l2_regs_size] "i"(sizeof(struct l2_guest_regs)),
[vm_exit_reason] "i"(VMCS_VM_EXIT_REASON) : "memory", "cc", "rbx", "rdi", "rsi");
}
@@ -1018,6 +1107,7 @@ __attribute__((naked)) GUEST_CODE static void nested_vm_exit_handler_intel_asm(v
#define VMEXIT_CPUID 0x72
#define VMEXIT_INVD 0x76
#define VMEXIT_HLT 0x78
+#define VMEXIT_NPF 0x400
#define VMEXIT_RDTSCP 0x87
GUEST_CODE static syz_nested_exit_reason map_amd_exit_reason(uint64 basic_reason)
@@ -1034,6 +1124,8 @@ GUEST_CODE static syz_nested_exit_reason map_amd_exit_reason(uint64 basic_reason
return SYZOS_NESTED_EXIT_REASON_RDTSC;
if (reason == VMEXIT_RDTSCP)
return SYZOS_NESTED_EXIT_REASON_RDTSCP;
+ if (reason == VMEXIT_NPF)
+ return SYZOS_NESTED_EXIT_REASON_EPT_VIOLATION;
return SYZOS_NESTED_EXIT_REASON_UNKNOWN;
}
@@ -1054,9 +1146,36 @@ GUEST_CODE static void advance_l2_rip_amd(uint64 basic_reason, uint64 cpu_id, ui
}
__attribute__((used)) GUEST_CODE static void
-nested_vm_exit_handler_amd(uint64 exit_reason, uint64 cpu_id, uint64 vm_id)
+nested_vm_exit_handler_amd(uint64 exit_reason, struct l2_guest_regs* regs)
{
+ volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS;
+ // Recover cpu_id from the stack.
+ // Stack: [cpu_id] [vmcb_addr] [6 L1 GPRs] [exit_code] [15 L2 GPRs]
+ // Index: 23 22 16..21 15 0..14
+ // regs points to Index 0.
+ uint64 cpu_id = *(uint64*)((char*)regs + sizeof(struct l2_guest_regs) + 8 * 8);
+ uint64 vm_id = globals->active_vm_id[cpu_id];
+
+ // Persist L2 registers.
+ guest_memcpy((void*)&globals->l2_ctx[cpu_id][vm_id], regs, sizeof(struct l2_guest_regs));
+
volatile uint64 basic_reason = exit_reason & 0xFFFF;
+
+ // Handle NPT Fault (Nested UEXIT).
+ if (basic_reason == VMEXIT_NPF) {
+ uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
+ // EXITINFO2 contains the faulting GPA.
+ uint64 fault_gpa = vmcb_read64((volatile uint8*)vmcb_addr, VMCB_EXITINFO2);
+ if ((fault_gpa & ~0xFFF) == X86_SYZOS_ADDR_EXIT) {
+ // RAX is in the saved L2 regs.
+ handle_nested_uexit(regs->rax);
+ // Advance L2 RIP by 3 bytes (NOTE(review): assumes the uexit store encodes to 3 bytes; see the Intel-path comment — a rsp/rbp/r12/r13 base would be 4 bytes).
+ uint64 rip = vmcb_read64((volatile uint8*)vmcb_addr, VMCB_GUEST_RIP);
+ vmcb_write64(vmcb_addr, VMCB_GUEST_RIP, rip + 3);
+ return;
+ }
+ }
+
syz_nested_exit_reason mapped_reason = map_amd_exit_reason(basic_reason);
guest_uexit_l2(exit_reason, mapped_reason, CPU_VENDOR_AMD);
advance_l2_rip_amd(basic_reason, cpu_id, vm_id);
@@ -1080,10 +1199,7 @@ GUEST_CODE static noinline void init_vmcs_host_state(void)
vmwrite(VMCS_HOST_FS_BASE, rdmsr(X86_MSR_FS_BASE));
vmwrite(VMCS_HOST_GS_BASE, rdmsr(X86_MSR_GS_BASE));
- // RIP and RSP.
- uint64 tmpreg = 0; // nolint
- asm volatile("mov %%rsp, %0" : "=r"(tmpreg));
- vmwrite(VMCS_HOST_RSP, tmpreg);
+ // Exit handler in RIP.
vmwrite(VMCS_HOST_RIP, (uintptr_t)nested_vm_exit_handler_intel_asm);
// Control Registers.
@@ -1167,6 +1283,8 @@ nested_create_vm_intel(struct api_call_1* cmd, uint64 cpu_id)
uint64 vm_id = cmd->arg;
uint64 vmcs_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
uint8 error = 0; // nolint
+ uint64 l2_pml4_addr = X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id);
+ uint64 l2_msr_bitmap = X86_SYZOS_ADDR_MSR_BITMAP(cpu_id, vm_id);
*(uint32*)vmcs_addr = rdmsr(X86_MSR_IA32_VMX_BASIC);
asm volatile("vmclear %1; setna %0"
@@ -1179,7 +1297,11 @@ nested_create_vm_intel(struct api_call_1* cmd, uint64 cpu_id)
}
nested_vmptrld(cpu_id, vm_id);
- setup_l2_page_tables(CPU_VENDOR_INTEL, cpu_id, vm_id);
+ // Zero out critical structures.
+ guest_memset((void*)l2_pml4_addr, 0, KVM_PAGE_SIZE);
+ guest_memset((void*)l2_msr_bitmap, 0, KVM_PAGE_SIZE);
+
+ setup_l2_page_tables(CPU_VENDOR_INTEL, cpu_id, vm_id, 0);
init_vmcs_control_fields(cpu_id, vm_id);
init_vmcs_host_state();
init_vmcs_guest_state(cpu_id, vm_id);
@@ -1215,7 +1337,6 @@ GUEST_CODE static noinline void init_vmcb_guest_state(uint64 cpu_id, uint64 vm_i
SETUP_L2_SEGMENT_SVM(vmcb_addr, LDTR, 0, 0, 0, SVM_ATTR_LDTR_UNUSABLE);
// Setup Guest Control Registers & CPU State.
- uint64 efer = rdmsr(X86_MSR_IA32_EFER);
vmcb_write64(vmcb_addr, VMCB_GUEST_CR0, read_cr0() | X86_CR0_WP);
// L2 will use L1's page tables.
vmcb_write64(vmcb_addr, VMCB_GUEST_CR3, read_cr3());
@@ -1224,15 +1345,9 @@ GUEST_CODE static noinline void init_vmcb_guest_state(uint64 cpu_id, uint64 vm_i
vmcb_write64(vmcb_addr, VMCB_GUEST_RSP, l2_stack_addr + KVM_PAGE_SIZE - 8);
vmcb_write64(vmcb_addr, VMCB_GUEST_RFLAGS, RFLAGS_1_BIT);
- // Setup Guest MSRs.
-
- // SYSCALL/SYSRET MSRs.
- vmcb_write64(vmcb_addr, VMCB_GUEST_DEBUGCTL, 0);
- vmcb_write64(vmcb_addr, VMCB_GUEST_DR6, 0x0);
- vmcb_write64(vmcb_addr, VMCB_GUEST_DR7, 0x0);
-
- vmcb_write64(vmcb_addr, VMCB_GUEST_EFER, efer & ~X86_EFER_SCE);
- vmcb_write64(vmcb_addr, VMCB_GUEST_PAT, rdmsr(X86_MSR_IA32_CR_PAT));
+ // Setup Guest EFER. Must have SVME, LME, and LMA for 64-bit nested.
+ vmcb_write64(vmcb_addr, VMCB_GUEST_EFER, X86_EFER_LME | X86_EFER_LMA | X86_EFER_SVME);
+ vmcb_write64(vmcb_addr, VMCB_RAX, 0);
// Setup Guest Descriptor Tables.
struct {
@@ -1270,12 +1385,16 @@ nested_create_vm_amd(struct api_call_1* cmd, uint64 cpu_id)
{
uint64 vm_id = cmd->arg;
uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
+ uint64 l2_pml4_addr = X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id);
+ uint64 l2_msr_bitmap = X86_SYZOS_ADDR_MSR_BITMAP(cpu_id, vm_id);
guest_memset((void*)vmcb_addr, 0, KVM_PAGE_SIZE);
guest_memset((void*)X86_SYZOS_ADDR_VM_ARCH_SPECIFIC(cpu_id), 0, KVM_PAGE_SIZE);
+ guest_memset((void*)l2_pml4_addr, 0, KVM_PAGE_SIZE);
+ guest_memset((void*)l2_msr_bitmap, 0, KVM_PAGE_SIZE);
// Setup NPT (Nested Page Tables)
- setup_l2_page_tables(CPU_VENDOR_AMD, cpu_id, vm_id);
+ setup_l2_page_tables(CPU_VENDOR_AMD, cpu_id, vm_id, 0);
// Initialize VMCB Control and Guest State
init_vmcb_guest_state(cpu_id, vm_id);
@@ -1291,12 +1410,42 @@ guest_handle_nested_create_vm(struct api_call_1* cmd, uint64 cpu_id)
}
}
+GUEST_CODE static uint64 l2_gpa_to_pa(uint64 cpu_id, uint64 vm_id, uint64 gpa)
+{
+ uint64 pml4_addr = X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id);
+ volatile uint64* pml4 = (volatile uint64*)pml4_addr;
+ uint64 pml4_idx = (gpa >> 39) & 0x1FF;
+ if (!(pml4[pml4_idx] & X86_PDE64_PRESENT))
+ return 0;
+
+ volatile uint64* pdpt = (volatile uint64*)(pml4[pml4_idx] & ~0xFFF);
+ uint64 pdpt_idx = (gpa >> 30) & 0x1FF;
+ if (!(pdpt[pdpt_idx] & X86_PDE64_PRESENT))
+ return 0;
+
+ volatile uint64* pd = (volatile uint64*)(pdpt[pdpt_idx] & ~0xFFF);
+ uint64 pd_idx = (gpa >> 21) & 0x1FF;
+ if (!(pd[pd_idx] & X86_PDE64_PRESENT))
+ return 0;
+
+ volatile uint64* pt = (volatile uint64*)(pd[pd_idx] & ~0xFFF);
+ uint64 pt_idx = (gpa >> 12) & 0x1FF;
+ if (!(pt[pt_idx] & X86_PDE64_PRESENT))
+ return 0;
+
+ return (pt[pt_idx] & ~0xFFF) + (gpa & 0xFFF);
+}
+
GUEST_CODE static noinline void
guest_handle_nested_load_code(struct api_call_nested_load_code* cmd, uint64 cpu_id)
{
uint64 vm_id = cmd->vm_id;
// Backing address in L1 for the L2 User Code (mapped at X86_SYZOS_ADDR_USER_CODE)
- uint64 l2_code_backing = X86_SYZOS_ADDR_VM_CODE(cpu_id, vm_id);
+ uint64 l2_code_backing = l2_gpa_to_pa(cpu_id, vm_id, X86_SYZOS_ADDR_USER_CODE);
+ if (!l2_code_backing) {
+ guest_uexit(0xE2BAD4);
+ return;
+ }
// Code size = command size - header size - vm_id size.
uint64 l2_code_size = cmd->header.size - sizeof(struct api_call_header) - sizeof(uint64);
@@ -1317,77 +1466,276 @@ guest_handle_nested_load_code(struct api_call_nested_load_code* cmd, uint64 cpu_
}
}
-// Clang's LTO may ignore noinline and attempt to inline this function into both callers,
-// which results in duplicate declaration of after_vmentry_label.
-// Applying __optnone should prevent this behavior.
-GUEST_CODE static noinline __optnone void
-guest_handle_nested_vmentry_intel(uint64 vm_id, uint64 cpu_id, bool is_launch)
+GUEST_CODE static noinline void
+guest_handle_nested_load_syzos(struct api_call_nested_load_syzos* cmd, uint64 cpu_id)
{
- uint64 vmx_error_code = 0;
- uint8 fail_flag = 0; // Will be 1 if EITHER CF or ZF is set
+ uint64 vm_id = cmd->vm_id;
+ uint64 prog_size = cmd->header.size - __builtin_offsetof(struct api_call_nested_load_syzos, program);
+ uint64 l2_code_backing = X86_SYZOS_ADDR_VM_CODE(cpu_id, vm_id);
+ volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS;
- nested_vmptrld(cpu_id, vm_id);
+ if (prog_size > KVM_PAGE_SIZE)
+ prog_size = KVM_PAGE_SIZE;
+
+ // Copy Payload to Code buffer.
+ guest_memcpy((void*)l2_code_backing, (void*)cmd->program, prog_size);
+
+ // Populate Globals.
+ uint64 globals_pa = l2_gpa_to_pa(cpu_id, vm_id, X86_SYZOS_ADDR_GLOBALS);
+ if (!globals_pa) {
+ guest_uexit(0xE2BAD3);
+ return;
+ }
+ volatile struct syzos_globals* l2_globals = (volatile struct syzos_globals*)globals_pa;
+ // Set initial state for ALL possible L2 VCPUs of this VM.
+ for (int i = 0; i < KVM_MAX_VCPU; i++) {
+ l2_globals->text_sizes[i] = prog_size;
+ globals->l2_ctx[i][vm_id].rdi = i;
+ globals->l2_ctx[i][vm_id].rax = 0; // Default RAX
+ // Note: RSP and RIP are set in the VMCB/VMCS, but they could also be in l2_ctx
+ // since the shims load them if we wanted. But currently they are in VMCB/VMCS.
+ }
- if (is_launch) {
- asm volatile(R"(
- // Attempt to launch the L2 guest.
- vmlaunch
- // Set AL to 1 if CF=1 (VMfailValid)
- setc %%al
- // Set BL to 1 if ZF=1 (VMfailInvalid)
- setz %%bl
- or %%bl, %%al)"
- : "=a"(fail_flag)
- :
- : "rbx", "cc", "memory");
+ // Set RIP to guest_main.
+ uint64 entry_rip = executor_fn_guest_addr(guest_main);
+ if (get_cpu_vendor() == CPU_VENDOR_INTEL) {
+ nested_vmptrld(cpu_id, vm_id);
+ vmwrite(VMCS_GUEST_RIP, entry_rip);
+ vmwrite(VMCS_GUEST_RSP, X86_SYZOS_ADDR_STACK_BOTTOM + KVM_PAGE_SIZE - 8);
} else {
- asm volatile(R"(
- // Attempt to resume the L2 guest.
- vmresume
- // Set AL to 1 if CF=1 (VMfailValid)
- setc %%al
- // Set BL to 1 if ZF=1 (VMfailInvalid)
- setz %%bl
- or %%bl, %%al)"
- : "=a"(fail_flag)
- :
- : "rbx", "cc", "memory");
+ uint64 vmcb = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
+ vmcb_write64(vmcb, VMCB_GUEST_RIP, entry_rip);
+ vmcb_write64(vmcb, VMCB_GUEST_RSP, X86_SYZOS_ADDR_STACK_BOTTOM + KVM_PAGE_SIZE - 8);
}
- asm volatile(".globl after_vmentry_label\nafter_vmentry_label:");
+}
+
+GUEST_CODE static noinline void
+guest_handle_nested_vmentry_intel(uint64 vm_id, uint64 cpu_id, bool is_launch)
+{
+ volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS;
+ struct l2_guest_regs* l2_regs = (struct l2_guest_regs*)&globals->l2_ctx[cpu_id][vm_id];
+ uint64 vmx_error_code = 0;
+ uint64 fail_flag = 0; // Will be 1 if EITHER CF or ZF is set
+ nested_vmptrld(cpu_id, vm_id);
+
+ // Mark the VM as active on this CPU.
+ globals->active_vm_id[cpu_id] = vm_id;
+
+ asm volatile(R"(
+ // 1. Red Zone protection.
+ sub $128, %%rsp
+
+ // 2. Stack Passthrough for Exit Handler.
+ push %[cpu_id]
+ push %[launch]
+
+ // 3. Save L1 callee-saved registers.
+ push %%rbx
+ push %%rbp
+ push %%r12
+ push %%r13
+ push %%r14
+ push %%r15
+
+ // 4. Update VMCS_HOST_RSP with the current stack pointer.
+ // This stack contains [RedZone] [cpu_id] [launch] [L1 regs].
+ mov %[host_rsp_field], %%r10
+ mov %%rsp, %%r11
+ vmwrite %%r11, %%r10
+
+ // 5. Load L2 GPRs from storage.
+ // We use RAX as a temporary base pointer.
+ mov %[l2_regs], %%rax
+ mov 8(%%rax), %%rbx
+ mov 16(%%rax), %%rcx
+ mov 24(%%rax), %%rdx
+ mov 32(%%rax), %%rsi
+ mov 40(%%rax), %%rdi
+ mov 48(%%rax), %%rbp
+ mov 56(%%rax), %%r8
+ mov 64(%%rax), %%r9
+ mov 72(%%rax), %%r10
+ mov 80(%%rax), %%r11
+ mov 88(%%rax), %%r12
+ mov 96(%%rax), %%r13
+ mov 104(%%rax), %%r14
+ mov 112(%%rax), %%r15
+ // Finally, load RAX (L2 RAX).
+ mov 0(%%rax), %%rax
+
+ // 6. Execute Launch or Resume.
+ // Check the launch flag on the stack.
+ // Stack offset for 'launch': [r15][r14][r13][r12][rbp][rbx] = 6*8 = 48 bytes.
+ cmpq $0, 48(%%rsp)
+ je 1f
+ vmlaunch
+ jmp 2f
+
+ 1: vmresume
+
+ 2: // 7. Failure path.
+ // Restore L1 registers to return to C.
+ pop %%r15
+ pop %%r14
+ pop %%r13
+ pop %%r12
+ pop %%rbp
+ pop %%rbx
+ // pop launch and cpu_id
+ add $16, %%rsp
+ // restore Red Zone
+ add $128, %%rsp
+ mov $1, %[ret]
+ jmp 3f
+
+ // 8. Success path (L2 Exit).
+ .globl after_vmentry_label
+ after_vmentry_label:
+ xor %[ret], %[ret]
+
+ 3: // Final return to C.
+ )"
+ : [ret] "=&r"(fail_flag)
+ : [launch] "r"((uint64)is_launch),
+ [host_rsp_field] "i"(VMCS_HOST_RSP),
+ [cpu_id] "r"(cpu_id),
+ [l2_regs] "r"(l2_regs)
+ : "cc", "memory", "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11");
+
if (fail_flag) {
// VMLAUNCH/VMRESUME failed, so VMCS is still valid and can be read.
vmx_error_code = vmread(VMCS_VM_INSTRUCTION_ERROR);
guest_uexit(0xE2E10000 | (uint32)vmx_error_code);
return;
}
- // If we get here, this means VMLAUNCH/VMRESUME truly succeeded (CF=0 and ZF=0)
- // and the L2 guest has run and exited.
}
GUEST_CODE static noinline void
guest_run_amd_vm(uint64 cpu_id, uint64 vm_id)
{
uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
- volatile uint8* vmcb_ptr = (volatile uint8*)vmcb_addr;
+ volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS;
+ globals->active_vm_id[cpu_id] = vm_id;
+ struct l2_guest_regs* l2_regs = (struct l2_guest_regs*)&globals->l2_ctx[cpu_id][vm_id];
uint8 fail_flag = 0;
- asm volatile(
- "mov %1, %%rax\n\t" // Load VMCB physical address into RAX
- "vmrun\n\t" // Launch or resume L2 guest
- "setc %0\n\t"
- : "=q"(fail_flag)
- : "m"(vmcb_addr)
- : "rax", "cc", "memory");
+ asm volatile(R"(
+ // 1. Red Zone protection.
+ sub $128, %%rsp
+
+ // 2. Stack Passthrough for Exit Handler.
+ push %[cpu_id]
+ // Save VMCB address for later use after VMEXIT.
+ push %[vmcb_addr]
+
+ // 3. Save L1 callee-saved registers.
+ push %%rbx
+ push %%rbp
+ push %%r12
+ push %%r13
+ push %%r14
+ push %%r15
+
+ // 4. Load L2 GPRs from storage.
+ mov %[l2_regs], %%rax
+ // Sync RAX to VMCB (guest RAX).
+ mov 0(%%rax), %%rbx
+ mov %[vmcb_addr], %%rcx
+ mov %%rbx, 0x5f8(%%rcx)
+
+ mov 8(%%rax), %%rbx
+ mov 16(%%rax), %%rcx
+ mov 24(%%rax), %%rdx
+ mov 32(%%rax), %%rsi
+ mov 40(%%rax), %%rdi
+ mov 48(%%rax), %%rbp
+ mov 56(%%rax), %%r8
+ mov 64(%%rax), %%r9
+ mov 72(%%rax), %%r10
+ mov 80(%%rax), %%r11
+ mov 88(%%rax), %%r12
+ mov 96(%%rax), %%r13
+ mov 104(%%rax), %%r14
+ mov 112(%%rax), %%r15
+
+ // 4.5 Note: Host State (RSP and RIP) is saved automatically by VMRUN
+ // to the HSAVE area pointed to by VM_HSAVE_PA.
+ // There is no need to manually write it to the VMCB.
+
+ // 5. Execute VMRUN.
+ clgi
+ // VMCB address MUST be in RAX.
+ // It was pushed at Index 6: 6 * 8 = 48.
+ mov 48(%%rsp), %%rax
+ vmrun
+ 1: // Host resumes here.
+ // Restore RAX as VMRUN clobbers it.
+ mov 48(%%rsp), %%rax
+ setc %[fail_flag]
+
+ // 6. Save L2's GPRs.
+ // exit_code (it will be at Index 15)
+ pushq 0x70(%%rax)
+
+ // Save L2 GPRs (Index 14 down to 1).
+ push %%r15
+ push %%r14
+ push %%r13
+ push %%r12
+ push %%r11
+ push %%r10
+ push %%r9
+ push %%r8
+ push %%rbp
+ push %%rdi
+ push %%rsi
+ push %%rdx
+ push %%rcx
+ push %%rbx
+
+ // Save L2 RAX from VMCB (Index 0).
+ // Since we pushed 16 regs (L2 RAX + 14 GPRs + exit_code), vmcb_addr is at 48 + 16 * 8 = 176(%%rsp).
+ mov 176(%%rsp), %%rax
+ pushq 0x5f8(%%rax)
+
+ // 7. Call the C handler.
+ // arg1 (RDI) = exit reason (at Index 15: 15 * 8 = 120 bytes)
+ mov 120(%%rsp), %%rdi
+ // arg2 (RSI) = pointer to the saved registers
+ mov %%rsp, %%rsi
+ call nested_vm_exit_handler_amd
+
+ // 8. Restore L1 state.
+ // Discard L2 GPRs (15 regs) + exit_code = 16 regs in total.
+ add $128, %%rsp
+
+ // Restore L1 callee-saved registers.
+ pop %%r15
+ pop %%r14
+ pop %%r13
+ pop %%r12
+ pop %%rbp
+ pop %%rbx
+
+ // 9. Discard vmcb_addr and cpu_id.
+ add $16, %%rsp
+
+ // 10. Restore Red Zone.
+ add $128, %%rsp
+
+ stgi
+ after_vmentry_label_amd:
+ )"
+ : [fail_flag] "=m"(fail_flag)
+ : [cpu_id] "r"(cpu_id), [vmcb_addr] "r"(vmcb_addr), [l2_regs] "r"(l2_regs),
+ [l2_regs_size] "i"(sizeof(struct l2_guest_regs))
+ : "cc", "memory", "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11");
if (fail_flag) {
// VMRUN failed.
guest_uexit(0xE2E10000 | 0xFFFF);
return;
}
-
- // VMRUN succeeded and we have a VM-exit.
- uint64 exit_reason = vmcb_read64(vmcb_ptr, VMCB_EXIT_CODE);
- nested_vm_exit_handler_amd(exit_reason, cpu_id, vm_id);
}
GUEST_CODE static noinline void
diff --git a/executor/kvm.h b/executor/kvm.h
index 024f70351..1e0ceae06 100644
--- a/executor/kvm.h
+++ b/executor/kvm.h
@@ -64,7 +64,7 @@
#define X86_SYZOS_ADDR_STACK0 0x60f80
// Base address for all per-L1-VCPU regions.
-#define X86_SYZOS_PER_VCPU_REGIONS_BASE 0x70000
+#define X86_SYZOS_PER_VCPU_REGIONS_BASE 0x400000
// Size of the entire memory block allocated for a single L1 VCPU to manage its L2 VMs.
// We need space for 1 VMXON page + 4 L2 VMs. Let's allocate 256KB per L1 VCPU for ample space.
#define X86_SYZOS_L1_VCPU_REGION_SIZE 0x40000
@@ -98,7 +98,7 @@
#define X86_SYZOS_L2_VM_OFFSET_MSR_BITMAP 0x7000
// Subsequent addresses are shifted to accommodate all L1 VCPU regions.
-#define X86_SYZOS_ADDR_UNUSED 0x200000
+#define X86_SYZOS_ADDR_UNUSED 0x1000000
#define X86_SYZOS_ADDR_IOAPIC 0xfec00000
#define X86_SYZOS_ADDR_VMCS_VMCB(cpu, vm) \
@@ -344,6 +344,7 @@
// VMCS Guest State Fields.
#define VMCS_GUEST_INTR_STATUS 0x00000810
#define VMCS_GUEST_PML_INDEX 0x00000812
+#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802
#define VMCS_GUEST_IA32_PAT 0x00002804
#define VMCS_GUEST_IA32_EFER 0x00002806
@@ -410,6 +411,7 @@
#define VMCB_CTRL_ASID 0x058
#define VMCB_EXIT_CODE 0x070
+#define VMCB_EXITINFO2 0x080
// NP_ENABLE is actually 1 byte, but the 7 following bytes are reserved, so it's okay
#define VMCB_CTRL_NP_ENABLE 0x090
@@ -471,6 +473,7 @@
#define VMCB_GUEST_RSP 0x5d8
#define VMCB_GUEST_PAT 0x668
#define VMCB_GUEST_DEBUGCTL 0x670
+#define VMCB_RAX 0x5f8
// SVM Segment Attribute Defines
#define SVM_ATTR_G (1 << 15)
@@ -498,6 +501,7 @@
#endif // x86-specific definitions.
#define KVM_MAX_VCPU 4
+#define KVM_MAX_L2_VMS 4
#define KVM_PAGE_SIZE (1 << 12)
#define KVM_GUEST_PAGES 1024
#define KVM_GUEST_MEM_SIZE (KVM_GUEST_PAGES * KVM_PAGE_SIZE)