From a4c52dd6fa17ba8bb0a3d6a08de46c61035f693c Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 23 Jan 2026 18:28:30 +0100 Subject: executor: implement dynamic L2 page table allocation in SYZOS Enable the SYZOS guest (L1) to dynamically allocate memory for nested L2 page tables, replacing the previous rigid static layout. Move the mem_region and syzos_boot_args struct definitions to the guest header (common_kvm_amd64_syzos.h) to allow the guest to parse the memory map injected by the host. Introduce a bump allocator, guest_alloc_page(), which targets the X86_SYZOS_ADDR_UNUSED heap. This allocator relies on a new struct syzos_globals located at X86_SYZOS_ADDR_GLOBALS to track the allocation offset. Refactor setup_l2_page_tables() to allocate intermediate paging levels (PDPT, PD, PT) via guest_alloc_page() instead of using fixed contiguous offsets relative to the PML4. This allows for disjoint memory usage and supports future recursion requirements. --- executor/common_kvm_amd64.h | 12 ------ executor/common_kvm_amd64_syzos.h | 77 +++++++++++++++++++++++++++++++++------ 2 files changed, 66 insertions(+), 23 deletions(-) (limited to 'executor') diff --git a/executor/common_kvm_amd64.h b/executor/common_kvm_amd64.h index 12bc8958d..a181e302e 100644 --- a/executor/common_kvm_amd64.h +++ b/executor/common_kvm_amd64.h @@ -215,18 +215,6 @@ static void setup_64bit_idt(struct kvm_sregs* sregs, char* host_mem, uintptr_t g #define MEM_REGION_FLAG_NO_HOST_MEM (1 << 6) #define MEM_REGION_FLAG_REMAINING (1 << 7) -struct mem_region { - uint64 gpa; - int pages; - uint32 flags; -}; - -struct syzos_boot_args { - uint32 region_count; - uint32 reserved; - struct mem_region regions[]; -}; - // SYZOS guest virtual memory layout (must be in sync with executor/kvm.h): static const struct mem_region syzos_mem_regions[] = { // AMD64 fixed data structures (5 pages: Zero, GDT, PML4, PDP, PD). diff --git a/executor/common_kvm_amd64_syzos.h b/executor/common_kvm_amd64_syzos.h index 5584a62b9..897f3ef8c 100644 --- a/executor/common_kvm_amd64_syzos.h +++ b/executor/common_kvm_amd64_syzos.h @@ -96,6 +96,23 @@ struct l2_guest_regs { uint64 r8, r9, r10, r11, r12, r13, r14, r15; }; +struct mem_region { + uint64 gpa; + int pages; + uint32 flags; +}; + +struct syzos_boot_args { + uint32 region_count; + uint32 reserved; + struct mem_region regions[]; +}; + +struct syzos_globals { + uint64 alloc_offset; + uint64 total_size; +}; + #ifdef __cplusplus extern "C" { #endif @@ -672,35 +689,73 @@ guest_handle_enable_nested(struct api_call_1* cmd, uint64 cpu_id) } } +// Calculate the size of the unused memory region from the boot arguments. +GUEST_CODE static uint64 get_unused_memory_size() +{ + volatile struct syzos_boot_args* args = (volatile struct syzos_boot_args*)X86_SYZOS_ADDR_BOOT_ARGS; + for (uint32 i = 0; i < args->region_count; i++) { + if (args->regions[i].gpa == X86_SYZOS_ADDR_UNUSED) + return args->regions[i].pages * KVM_PAGE_SIZE; + } + return 0; +} + +// Allocate a page from the X86_SYZOS_ADDR_UNUSED region using a non-reclaiming bump allocator. +GUEST_CODE static uint64 guest_alloc_page() +{ + volatile struct syzos_globals* globals = (volatile struct syzos_globals*)X86_SYZOS_ADDR_GLOBALS; + + // Lazy initialization of total_size using CAS to prevent races. + if (globals->total_size == 0) { + uint64 size = get_unused_memory_size(); + // Attempt to swap 0 with the calculated size. + // If another CPU beat us to it, this does nothing (which is fine). + __sync_val_compare_and_swap(&globals->total_size, 0, size); + } + + // Atomic fetch-and-add to reserve space. + uint64 offset = __sync_fetch_and_add(&globals->alloc_offset, KVM_PAGE_SIZE); + + if (offset >= globals->total_size) + guest_uexit(UEXIT_ASSERT); + + uint64 ptr = X86_SYZOS_ADDR_UNUSED + offset; + guest_memset((void*)ptr, 0, KVM_PAGE_SIZE); + return ptr; +} + GUEST_CODE static noinline void setup_l2_page_tables(cpu_vendor_id vendor, uint64 cpu_id, uint64 vm_id) { + // The Root PML4 remains at the fixed address assigned to this VM. uint64 l2_pml4_addr = X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id); - uint64 l2_pdpt_addr = l2_pml4_addr + KVM_PAGE_SIZE; - uint64 l2_pd_addr = l2_pml4_addr + 2 * KVM_PAGE_SIZE; - uint64 l2_pt_addr = l2_pml4_addr + 3 * KVM_PAGE_SIZE; + + // Allocate subsequent levels dynamically. + uint64 l2_pdpt_addr = guest_alloc_page(); + uint64 l2_pd_addr = guest_alloc_page(); + uint64 l2_pt_addr = guest_alloc_page(); volatile uint64* pml4 = (volatile uint64*)l2_pml4_addr; volatile uint64* pdpt = (volatile uint64*)l2_pdpt_addr; volatile uint64* pd = (volatile uint64*)l2_pd_addr; volatile uint64* pt = (volatile uint64*)l2_pt_addr; + // Clear the root table (the others are cleared by guest_alloc_page). guest_memset((void*)l2_pml4_addr, 0, KVM_PAGE_SIZE); - guest_memset((void*)l2_pdpt_addr, 0, KVM_PAGE_SIZE); - guest_memset((void*)l2_pd_addr, 0, KVM_PAGE_SIZE); - guest_memset((void*)l2_pt_addr, 0, KVM_PAGE_SIZE); guest_memset((void*)X86_SYZOS_ADDR_MSR_BITMAP(cpu_id, vm_id), 0, KVM_PAGE_SIZE); // Intel EPT: set Read, Write, Execute. // AMD NPT: set Present, Write, User. uint64 flags = X86_PDE64_PRESENT | X86_PDE64_RW | X86_PDE64_USER; - // Create the 4-level page table entries using 4KB pages: - // PML4[0] -> points to PDPT + + // Setup Hierarchy: + // PML4[0] -> PDPT pml4[0] = l2_pdpt_addr | flags; - // PDPT[0] -> points to Page Directory (PD) + // PDPT[0] -> PD pdpt[0] = l2_pd_addr | flags; - // PD[0] -> points to Page Table (PT) (NO X86_PDE64_PS) + // PD[0] -> PT pd[0] = l2_pt_addr | flags; - // PT[0..511] -> maps 512 4KB pages (2MB total) identity + + // PT[0..511] -> Maps 2MB identity uint64 pt_flags = flags; if (vendor == CPU_VENDOR_INTEL) { pt_flags |= EPT_MEMTYPE_WB | EPT_ACCESSED | EPT_DIRTY; -- cgit mrf-deployment