From 32a0e5edfeef60e894c453dc9e9ae45528626ef1 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Thu, 7 Aug 2025 15:26:06 +0200
Subject: executor: arm64: syzos: add flush_cache_range()

ARMv8-A architecture mandates how caches should be flushed when
writing self-modifying code.
Although it would be nice to catch some bugs caused by omitting this
synchronization, we want it to happen in most cases, so that our code
actually works.
---
 executor/common_kvm_arm64_syzos.h | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

(limited to 'executor')

diff --git a/executor/common_kvm_arm64_syzos.h b/executor/common_kvm_arm64_syzos.h
index 0253b6b96..41a7f184b 100644
--- a/executor/common_kvm_arm64_syzos.h
+++ b/executor/common_kvm_arm64_syzos.h
@@ -193,8 +193,38 @@ guest_main(uint64 size, uint64 cpu)
 	guest_uexit((uint64)-1);
 }
 
+// Some ARM chips use 128-byte cache lines. Pick 256 to be on the safe side.
+#define MAX_CACHE_LINE_SIZE 256
+
+GUEST_CODE static noinline void
+flush_cache_range(void* addr, uint64 size)
+{
+	uint64 start = (uint64)addr;
+	uint64 end = start + size;
+
+	// For self-modifying code, we must clean the D-cache and invalidate the
+	// I-cache for the memory range that was modified. This is the sequence
+	// mandated by the ARMv8-A architecture.
+
+	// 1. Clean D-cache over the whole range to the Point of Unification.
+	for (uint64 i = start; i < end; i += MAX_CACHE_LINE_SIZE)
+		asm volatile("dc cvau, %[addr]" : : [addr] "r"(i) : "memory");
+	// 2. Wait for the D-cache clean to complete.
+	asm volatile("dsb sy" : : : "memory");
+
+	// 3. Invalidate I-cache over the whole range.
+	for (uint64 i = start; i < end; i += MAX_CACHE_LINE_SIZE)
+		asm volatile("ic ivau, %[addr]" : : [addr] "r"(i) : "memory");
+	// 4. Wait for the I-cache invalidate to complete.
+	asm volatile("dsb sy" : : : "memory");
+
+	// 5. Flush pipeline to force re-fetch of new instruction.
+	asm volatile("isb" : : : "memory");
+}
+
 GUEST_CODE static noinline void guest_execute_code(uint32* insns, uint64 size)
 {
+	flush_cache_range(insns, size);
 	volatile void (*fn)() = (volatile void (*)())insns;
 	fn();
 }
@@ -236,9 +266,6 @@ GUEST_CODE static uint32 get_cpu_id()
 	return (uint32)val;
 }
 
-// Some ARM chips use 128-byte cache lines. Pick 256 to be on the safe side.
-#define MAX_CACHE_LINE_SIZE 256
-
 // Read the value from a system register using an MRS instruction.
 GUEST_CODE static noinline void
 guest_handle_mrs(uint64 reg)
@@ -249,6 +276,7 @@ guest_handle_mrs(uint64 reg)
 	uint32* insn = (uint32*)((uint64)ARM64_ADDR_SCRATCH_CODE + cpu_id * MAX_CACHE_LINE_SIZE);
 	insn[0] = mrs;
 	insn[1] = 0xd65f03c0; // RET
+	flush_cache_range(insn, 8);
 	// Make a call to the generated MSR instruction and clobber x0.
 	asm("blr %[pc]\n"
 	    :
@@ -273,6 +301,7 @@ guest_handle_msr(uint64 reg, uint64 val)
 	uint32* insn = (uint32*)((uint64)ARM64_ADDR_SCRATCH_CODE + cpu_id * MAX_CACHE_LINE_SIZE);
 	insn[0] = msr;
 	insn[1] = 0xd65f03c0; // RET
+	flush_cache_range(insn, 8);
 	// Put `val` into x0 and make a call to the generated MSR instruction.
 	asm("mov x0, %[val]\nblr %[pc]\n"
 	    :
-- 
cgit mrf-deployment