// Copyright 2020 syzkaller project authors. All rights reserved. // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. #ifndef EXECUTOR_COMMON_KVM_PPC64_H #define EXECUTOR_COMMON_KVM_PPC64_H // This file is shared between executor and csource package. // Implementation of syz_kvm_setup_cpu pseudo-syscall. #include "kvm_ppc64le.S.h" #define BOOK3S_INTERRUPT_SYSTEM_RESET 0x100 #define BOOK3S_INTERRUPT_MACHINE_CHECK 0x200 #define BOOK3S_INTERRUPT_DATA_STORAGE 0x300 #define BOOK3S_INTERRUPT_DATA_SEGMENT 0x380 #define BOOK3S_INTERRUPT_INST_STORAGE 0x400 #define BOOK3S_INTERRUPT_INST_SEGMENT 0x480 #define BOOK3S_INTERRUPT_EXTERNAL 0x500 #define BOOK3S_INTERRUPT_EXTERNAL_HV 0x502 #define BOOK3S_INTERRUPT_ALIGNMENT 0x600 #define BOOK3S_INTERRUPT_PROGRAM 0x700 #define BOOK3S_INTERRUPT_FP_UNAVAIL 0x800 #define BOOK3S_INTERRUPT_DECREMENTER 0x900 #define BOOK3S_INTERRUPT_HV_DECREMENTER 0x980 #define BOOK3S_INTERRUPT_DOORBELL 0xa00 #define BOOK3S_INTERRUPT_SYSCALL 0xc00 #define BOOK3S_INTERRUPT_TRACE 0xd00 #define BOOK3S_INTERRUPT_H_DATA_STORAGE 0xe00 #define BOOK3S_INTERRUPT_H_INST_STORAGE 0xe20 #define BOOK3S_INTERRUPT_H_EMUL_ASSIST 0xe40 #define BOOK3S_INTERRUPT_HMI 0xe60 #define BOOK3S_INTERRUPT_H_DOORBELL 0xe80 #define BOOK3S_INTERRUPT_H_VIRT 0xea0 #define BOOK3S_INTERRUPT_PERFMON 0xf00 #define BOOK3S_INTERRUPT_ALTIVEC 0xf20 #define BOOK3S_INTERRUPT_VSX 0xf40 #define BOOK3S_INTERRUPT_FAC_UNAVAIL 0xf60 #define BOOK3S_INTERRUPT_H_FAC_UNAVAIL 0xf80 #define BITS_PER_LONG 64 #define PPC_BITLSHIFT(be) (BITS_PER_LONG - 1 - (be)) #define PPC_BIT(bit) (1ULL << PPC_BITLSHIFT(bit)) #define PPC_BITMASK(bs, be) ((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs)) #define RADIX_PTE_INDEX_SIZE 5 // size: 8B << 5 = 256B, maps 2^5 x 64K = 2MB #define RADIX_PMD_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps 2^9 x 2MB = 1GB #define RADIX_PUD_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps 2^9 x 1GB = 512GB #define RADIX_PGD_INDEX_SIZE 13 // size: 8B << 13 = 64KB, maps 2^13 x 512GB = 4PB #define cpu_to_be32(x) __builtin_bswap32(x) #define cpu_to_be64(x) __builtin_bswap64(x) #define be64_to_cpu(x) __builtin_bswap64(x) #define LPCR_ILE PPC_BIT(38) #define LPCR_UPRT PPC_BIT(41) // Use Process Table #define LPCR_EVIRT PPC_BIT(42) // Enhanced Virtualisation #define LPCR_HR PPC_BIT(43) // Host Radix #ifndef KVM_REG_PPC_LPCR_64 #define KVM_REG_PPC_LPCR_64 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb5) #endif #define PRTB_SIZE_SHIFT 12 // log2((64 << 10) / 16) #define PATB_GR (1UL << 63) // guest uses radix; must match HR #define PATB_HR (1UL << 63) #define PRTB_MASK 0x0ffffffffffff000UL #define ALIGNUP(p, q) ((void*)(((unsigned long)(p) + (q) - 1) & ~((q) - 1))) #define MAX(a, b) (((a) > (b)) ? (a) : (b)) #ifndef KVM_REG_PPC_DEC_EXPIRY #define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe) #endif #ifndef KVM_PPC_CONFIGURE_V3_MMU // Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 #define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) // Flag values for KVM_PPC_CONFIGURE_V3_MMU #define KVM_PPC_MMUV3_RADIX 1 // 1 = radix mode, 0 = HPT #define KVM_PPC_MMUV3_GTSE 2 // global translation shootdown enb #endif #ifndef KVM_CAP_PPC_NESTED_HV #define KVM_CAP_PPC_NESTED_HV 160 #endif struct kvm_text { uintptr_t typ; const void* text; uintptr_t size; }; static int kvmppc_define_rtas_kernel_token(int vmfd, unsigned token, const char* func) { struct kvm_rtas_token_args args; args.token = token; strncpy(args.name, func, sizeof(args.name) - 1); return ioctl(vmfd, KVM_PPC_RTAS_DEFINE_TOKEN, &args); } static int kvmppc_get_one_reg(int cpufd, uint64 id, void* target) { struct kvm_one_reg reg = {.id = id, .addr = (uintptr_t)target}; return ioctl(cpufd, KVM_GET_ONE_REG, ®); } static int kvmppc_set_one_reg(int cpufd, uint64 id, void* target) { struct kvm_one_reg reg = {.id = id, .addr = (uintptr_t)target}; return ioctl(cpufd, KVM_SET_ONE_REG, ®); } static int kvm_vcpu_enable_cap(int cpufd, uint32 capability) { struct kvm_enable_cap cap = { .cap = capability, }; return ioctl(cpufd, KVM_ENABLE_CAP, &cap); } static int kvm_vm_enable_cap(int vmfd, uint32 capability, uint64 p1, uint64 p2) { struct kvm_enable_cap cap = { .cap = capability, .flags = 0, .args = {p1, p2}, }; return ioctl(vmfd, KVM_ENABLE_CAP, &cap); } static void dump_text(const char* mem, unsigned start, unsigned cw, uint32 debug_inst_opcode) { #ifdef DEBUG printf("Text @%x: ", start); for (unsigned i = 0; i < cw; ++i) { uint32 w = ((uint32*)(mem + start))[i]; printf(" %08x", w); if (debug_inst_opcode && debug_inst_opcode == w) break; } printf("\n"); #endif } // Flags #define KVM_SETUP_PPC64_LE (1 << 0) // Little endian #define KVM_SETUP_PPC64_IR (1 << 1) // Paging for instructions #define KVM_SETUP_PPC64_DR (1 << 2) // Paging for data #define KVM_SETUP_PPC64_PR (1 << 3) // Run with MSR_PR (==usermode) #define KVM_SETUP_PPC64_PID1 (1 << 4) // Set PID=1 i.e. not kernel's PID // syz_kvm_setup_cpu(fd fd_kvmvm, cpufd fd_kvmcpu, usermem vma[24], text ptr[in, array[kvm_text, 1]], ntext len[text], flags flags[kvm_setup_flags_ppc64], opts ptr[in, array[kvm_setup_opt, 0:2]], nopt len[opts]) static volatile long syz_kvm_setup_cpu(volatile long a0, volatile long a1, volatile long a2, volatile long a3, volatile long a4, volatile long a5, volatile long a6, volatile long a7) { const int vmfd = a0; const int cpufd = a1; char* const host_mem = (char*)a2; const struct kvm_text* const text_array_ptr = (struct kvm_text*)a3; const uintptr_t text_count = a4; uintptr_t flags = a5; const uintptr_t page_size = 0x10000; // SYZ_PAGE_SIZE const uintptr_t guest_mem_size = 24 * page_size; // vma[24] from dev_kvm.txt unsigned long gpa_off = 0; uint32 debug_inst_opcode = 0; (void)text_count; // fuzzer can spoof count and we need just 1 text, so ignore text_count const void* text = 0; uintptr_t text_size = 0; uint64 pid = 0; uint64 lpcr = 0; NONFAILING(text = text_array_ptr[0].text); NONFAILING(text_size = text_array_ptr[0].size); if (kvm_vcpu_enable_cap(cpufd, KVM_CAP_PPC_PAPR)) return -1; if (kvm_vm_enable_cap(vmfd, KVM_CAP_PPC_NESTED_HV, 1, 0)) return -1; for (uintptr_t i = 0; i < guest_mem_size / page_size; i++) { struct kvm_userspace_memory_region memreg; memreg.slot = i; memreg.flags = 0; // can be KVM_MEM_LOG_DIRTY_PAGES but not KVM_MEM_READONLY memreg.guest_phys_addr = i * page_size; memreg.memory_size = page_size; memreg.userspace_addr = (uintptr_t)host_mem + i * page_size; if (ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &memreg)) return -1; } struct kvm_regs regs; struct kvm_sregs sregs; if (ioctl(cpufd, KVM_GET_SREGS, &sregs)) return -1; if (ioctl(cpufd, KVM_GET_REGS, ®s)) return -1; regs.msr = PPC_BIT(0); // MSR_SF == Sixty Four == 64bit if (flags & KVM_SETUP_PPC64_LE) regs.msr |= PPC_BIT(63); // Little endian // PR == "problem state" == non priveledged == userspace if (flags & KVM_SETUP_PPC64_PR) { regs.msr |= PPC_BIT(49); // When PR=1, the hardware enforces IR and DR as well. flags |= KVM_SETUP_PPC64_IR | KVM_SETUP_PPC64_DR | KVM_SETUP_PPC64_PID1; } if (flags & KVM_SETUP_PPC64_IR) regs.msr |= PPC_BIT(58); // IR - MMU=on for instructions if (flags & KVM_SETUP_PPC64_DR) regs.msr |= PPC_BIT(59); // DR - MMU=on for data if (flags & KVM_SETUP_PPC64_PID1) pid = 1; // KVM HV on POWER is hard to force to exit, it will bounce between // the fault handlers in KVM and the VM. Forcing all exception // vectors to do software debug breakpoint ensures the exit from KVM. if (kvmppc_get_one_reg(cpufd, KVM_REG_PPC_DEBUG_INST, &debug_inst_opcode)) return -1; #define VEC(x) (*((uint32*)(host_mem + (x)))) VEC(BOOK3S_INTERRUPT_SYSTEM_RESET) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_MACHINE_CHECK) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_DATA_STORAGE) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_DATA_SEGMENT) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_INST_STORAGE) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_INST_SEGMENT) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_EXTERNAL) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_EXTERNAL_HV) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_ALIGNMENT) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_PROGRAM) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_FP_UNAVAIL) = debug_inst_opcode; memcpy(host_mem + BOOK3S_INTERRUPT_DECREMENTER, kvm_ppc64_recharge_dec, sizeof(kvm_ppc64_recharge_dec) - 1); VEC(BOOK3S_INTERRUPT_DECREMENTER + sizeof(kvm_ppc64_recharge_dec) - 1) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_HV_DECREMENTER) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_DOORBELL) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_SYSCALL) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_TRACE) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_H_DATA_STORAGE) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_H_INST_STORAGE) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_H_EMUL_ASSIST) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_HMI) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_H_DOORBELL) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_H_VIRT) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_PERFMON) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_ALTIVEC) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_VSX) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_FAC_UNAVAIL) = debug_inst_opcode; VEC(BOOK3S_INTERRUPT_H_FAC_UNAVAIL) = debug_inst_opcode; struct kvm_guest_debug dbg = {0}; dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP; if (ioctl(cpufd, KVM_SET_GUEST_DEBUG, &dbg)) return -1; // Exception vector occupy 128K, including "System Call Vectored" gpa_off = 128 << 10; // Set up a radix page table, the hash mode is not supported if (flags & (KVM_SETUP_PPC64_IR | KVM_SETUP_PPC64_DR)) { uintptr_t process_tb_off = gpa_off; unsigned long process_tb_size = 1UL << (PRTB_SIZE_SHIFT + 4); struct prtb_entry { __be64 prtb0; __be64 prtb1; }* process_tb = (struct prtb_entry*)(host_mem + gpa_off); memset(process_tb, 0xcc, process_tb_size); // PRTB_SIZE_SHIFT is defined to use 64K for the process table gpa_off += process_tb_size; unsigned long *pgd, *pud, *pmd, *pte, i; // Create 4 level page table, just like Linux does for PAGE_SIZE==64K, // put each level to a separate page including the last level which won't // need more than as we only allocate 24 pages for the entire VM. uintptr_t pgd_off = gpa_off; pgd = (unsigned long*)(host_mem + pgd_off); gpa_off += page_size; uintptr_t pud_off = gpa_off; pud = (unsigned long*)(host_mem + pud_off); gpa_off += page_size; uintptr_t pmd_off = gpa_off; pmd = (unsigned long*)(host_mem + pmd_off); gpa_off += page_size; uintptr_t pte_off = gpa_off; pte = (unsigned long*)(host_mem + pte_off); gpa_off += page_size; memset(pgd, 0, page_size); memset(pud, 0, page_size); memset(pmd, 0, page_size); memset(pte, 0, page_size); pgd[0] = cpu_to_be64(PPC_BIT(0) | // Valid (pud_off & PPC_BITMASK(4, 55)) | RADIX_PUD_INDEX_SIZE); pud[0] = cpu_to_be64(PPC_BIT(0) | // Valid (pmd_off & PPC_BITMASK(4, 55)) | RADIX_PMD_INDEX_SIZE); pmd[0] = cpu_to_be64(PPC_BIT(0) | // Valid (pte_off & PPC_BITMASK(4, 55)) | RADIX_PTE_INDEX_SIZE); // Map all 24 pages and allow write+execute for better coverage. for (i = 0; i < 24; ++i) pte[i] = cpu_to_be64(PPC_BIT(0) | // Valid PPC_BIT(1) | // Leaf ((i * page_size) & PPC_BITMASK(7, 51)) | PPC_BIT(55) | // Reference PPC_BIT(56) | // Change PPC_BIT(61) | // Read permitted PPC_BIT(62) | // Write permitted PPC_BIT(63)); // Execute permitted const long max_shift = 52; const unsigned long rts = (max_shift - 31) & 0x1f; const unsigned long rts1 = (rts >> 3) << PPC_BITLSHIFT(2); const unsigned long rts2 = (rts & 7) << PPC_BITLSHIFT(58); process_tb[0].prtb0 = cpu_to_be64(PATB_HR | rts1 | pgd_off | rts2 | RADIX_PGD_INDEX_SIZE); if (pid) process_tb[pid].prtb0 = cpu_to_be64(PATB_HR | rts1 | pgd_off | rts2 | RADIX_PGD_INDEX_SIZE); // PATB_GR is not in the spec but KVM HV wants it for some reason struct kvm_ppc_mmuv3_cfg cfg = { .flags = KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE, .process_table = (process_tb_off & PRTB_MASK) | (PRTB_SIZE_SHIFT - 12) | PATB_GR, }; if (ioctl(vmfd, KVM_PPC_CONFIGURE_V3_MMU, &cfg)) return -1; lpcr |= LPCR_UPRT | LPCR_HR; #ifdef DEBUG printf("MMUv3: flags=%lx %016lx\n", cfg.flags, cfg.process_table); printf("PTRB0=%016lx PGD0=%016lx PUD0=%016lx PMD0=%016lx\n", be64_to_cpu((unsigned long)process_tb[0].prtb0), be64_to_cpu((unsigned long)pgd[0]), be64_to_cpu((unsigned long)pud[0]), be64_to_cpu((unsigned long)pmd[0])); printf("PTEs @%lx:\n %016lx %016lx %016lx %016lx\n %016lx %016lx %016lx %016lx\n", pte_off, be64_to_cpu((unsigned long)pte[0]), be64_to_cpu((unsigned long)pte[1]), be64_to_cpu((unsigned long)pte[2]), be64_to_cpu((unsigned long)pte[3]), be64_to_cpu((unsigned long)pte[4]), be64_to_cpu((unsigned long)pte[5]), be64_to_cpu((unsigned long)pte[6]), be64_to_cpu((unsigned long)pte[7])); #endif } memcpy(host_mem + gpa_off, text, text_size); regs.pc = gpa_off; uintptr_t end_of_text = gpa_off + ((text_size + 3) & ~3); memcpy(host_mem + end_of_text, &debug_inst_opcode, sizeof(debug_inst_opcode)); // The code generator produces little endian instructions so swap bytes here if (!(flags & KVM_SETUP_PPC64_LE)) { uint32* p = (uint32*)(host_mem + gpa_off); for (unsigned long i = 0; i < text_size / sizeof(*p); ++i) p[i] = cpu_to_be32(p[i]); p = (uint32*)(host_mem + BOOK3S_INTERRUPT_DECREMENTER); for (unsigned long i = 0; i < sizeof(kvm_ppc64_recharge_dec) / sizeof(*p); ++i) p[i] = cpu_to_be32(p[i]); } else { // PPC by default calls exception handlers in big endian unless ILE lpcr |= LPCR_ILE; } if (ioctl(cpufd, KVM_SET_SREGS, &sregs)) return -1; if (ioctl(cpufd, KVM_SET_REGS, ®s)) return -1; if (kvmppc_set_one_reg(cpufd, KVM_REG_PPC_LPCR_64, &lpcr)) return -1; if (kvmppc_set_one_reg(cpufd, KVM_REG_PPC_PID, &pid)) return -1; // Hypercalls need to be enable so we enable them all here to // allow fuzzing #define MAX_HCALL 0x450 for (unsigned hcall = 4; hcall < MAX_HCALL; hcall += 4) kvm_vm_enable_cap(vmfd, KVM_CAP_PPC_ENABLE_HCALL, hcall, 1); for (unsigned hcall = 0xf000; hcall < 0xf810; hcall += 4) kvm_vm_enable_cap(vmfd, KVM_CAP_PPC_ENABLE_HCALL, hcall, 1); for (unsigned hcall = 0xef00; hcall < 0xef20; hcall += 4) kvm_vm_enable_cap(vmfd, KVM_CAP_PPC_ENABLE_HCALL, hcall, 1); // Only a few of many RTAS calls are actually in the KVM and the rest // are handled in QEMU, enable the KVM handling for those 4 here. kvmppc_define_rtas_kernel_token(vmfd, 1, "ibm,set-xive"); kvmppc_define_rtas_kernel_token(vmfd, 2, "ibm,get-xive"); kvmppc_define_rtas_kernel_token(vmfd, 3, "ibm,int-on"); kvmppc_define_rtas_kernel_token(vmfd, 4, "ibm,int-off"); dump_text(host_mem, regs.pc, 8, debug_inst_opcode); dump_text(host_mem, BOOK3S_INTERRUPT_DECREMENTER, 16, debug_inst_opcode); uint64 decr = 0x7fffffff; if (kvmppc_set_one_reg(cpufd, KVM_REG_PPC_DEC_EXPIRY, &decr)) return -1; return 0; } #endif // EXECUTOR_COMMON_KVM_PPC64_H