aboutsummaryrefslogtreecommitdiffstats
path: root/executor/snapshot.h
diff options
context:
space:
mode:
authorDmitry Vyukov <dvyukov@google.com>2024-07-05 17:20:13 +0200
committerDmitry Vyukov <dvyukov@google.com>2024-07-25 13:12:57 +0000
commit4d77b9fe7da3d014943a16cb4b9a4ca3a531521a (patch)
treec37fbf8b50205eb8b830595a621ad4b355e32e9a /executor/snapshot.h
parent206f31df2861c47b13a8c05a105afa94bcc7106c (diff)
all: add qemu snapshotting mode
Diffstat (limited to 'executor/snapshot.h')
-rw-r--r--executor/snapshot.h252
1 files changed, 252 insertions, 0 deletions
diff --git a/executor/snapshot.h b/executor/snapshot.h
new file mode 100644
index 000000000..5479a162f
--- /dev/null
+++ b/executor/snapshot.h
@@ -0,0 +1,252 @@
+// Copyright 2024 syzkaller project authors. All rights reserved.
+// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
+
+#include <dirent.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <string>
+#include <utility>
+
+#ifndef MADV_POPULATE_WRITE
+#define MADV_POPULATE_WRITE 23
+#endif
+
+// Size of qemu snapshots and time required to restore a snapshot depend on the amount of memory
+// the VM touches after boot. For example, a 132 MB snapshot takes around 150ms to restore,
+// while a 260 MB snapshot takes around 275 ms to restore.
+//
+// To reduce size of the snapshot it's recommended to use smaller kernel and setup fewer devices.
+// For example the following cmdline arguments:
+// "loop.max_loop=1 dummy_hcd.num=1 vivid.n_devs=2 vivid.multiplanar=1,2 netrom.nr_ndevs=1 rose.rose_ndevs=1"
+// and CONFIG_USBIP_VHCI_NR_HCS=1 help to reduce snapshot by about 20 MB. Note: we have only 1 proc
+// in snapshot mode, so we don't need lots of devices. However, our descriptions rely on vivid.n_devs=16
+// since they hardcode names like /dev/video36 which follow after these 16 pre-created devices.
+//
+// Additionally we could try to use executor as init process, this should remove dhcpd/sshd/udevd/klogd/etc.
+// We don't need even networking in snapshot mode since we communicate via shared memory.
+
+static struct {
+ // Ivshmem interrupt doorbell register.
+ volatile uint32* doorbell;
+ volatile rpc::SnapshotHeaderT* hdr;
+ void* input;
+} ivs;
+
+// Finds qemu ivshmem device, see:
+// https://www.qemu.org/docs/master/specs/ivshmem-spec.html
+static void FindIvshmemDevices()
+{
+ std::string result;
+ DIR* devices = opendir("/sys/bus/pci/devices");
+ if (!devices)
+ fail("opendir(/sys/bus/pci/devices) failed");
+ void* regs = nullptr;
+ void* input = nullptr;
+ void* output = nullptr;
+ while (auto* dev = readdir(devices)) {
+ if (dev->d_name[0] == '.')
+ continue;
+ const std::string& vendor = ReadTextFile("/sys/bus/pci/devices/%s/vendor", dev->d_name);
+ const std::string& device = ReadTextFile("/sys/bus/pci/devices/%s/device", dev->d_name);
+ debug("PCI device %s: vendor=%s device=%s\n", dev->d_name, vendor.c_str(), device.c_str());
+ if (vendor != "0x1af4" || device != "0x1110")
+ continue;
+ char filename[1024];
+ snprintf(filename, sizeof(filename), "/sys/bus/pci/devices/%s/resource2", dev->d_name);
+ int res2 = open(filename, O_RDWR);
+ if (res2 == -1)
+ fail("failed to open ivshmem resource2");
+ struct stat statbuf;
+ if (fstat(res2, &statbuf))
+ fail("failed to fstat ivshmem resource2");
+ debug("ivshmem resource2 size %zu\n", static_cast<size_t>(statbuf.st_size));
+ // The only way to distinguish the 2 ivshmem regions is by size.
+ if (statbuf.st_size == static_cast<uint64>(rpc::Const::SnapshotDoorbellSize)) {
+ snprintf(filename, sizeof(filename), "/sys/bus/pci/devices/%s/resource0", dev->d_name);
+ int res0 = open(filename, O_RDWR);
+ if (res0 == -1)
+ fail("failed to open ivshmem resource0");
+ regs = mmap(nullptr, getpagesize(), PROT_READ | PROT_WRITE, MAP_SHARED, res0, 0);
+ close(res0);
+ if (regs == MAP_FAILED)
+ fail("failed to mmap ivshmem resource0");
+ debug("mapped doorbell registers at %p\n", regs);
+ } else if (statbuf.st_size == static_cast<uint64>(rpc::Const::SnapshotShmemSize)) {
+ input = mmap(nullptr, static_cast<uint64>(rpc::Const::MaxInputSize),
+ PROT_READ, MAP_SHARED, res2, 0);
+ output = mmap(nullptr, static_cast<uint64>(rpc::Const::MaxOutputSize),
+ PROT_READ | PROT_WRITE, MAP_SHARED, res2,
+ static_cast<uint64>(rpc::Const::MaxInputSize));
+ if (input == MAP_FAILED || output == MAP_FAILED)
+ fail("failed to mmap ivshmem resource2");
+ debug("mapped shmem input at at %p/%llu\n",
+ input, static_cast<uint64>(rpc::Const::MaxInputSize));
+ debug("mapped shmem output at at %p/%llu\n",
+ output, static_cast<uint64>(rpc::Const::MaxOutputSize));
+ }
+ close(res2);
+ }
+ closedir(devices);
+ if (regs == nullptr || input == nullptr)
+ fail("cannot find ivshmem PCI devices");
+ ivs.doorbell = static_cast<uint32*>(regs) + 3;
+ ivs.hdr = static_cast<rpc::SnapshotHeaderT*>(output);
+ ivs.input = input;
+ output_data = reinterpret_cast<OutputData*>(static_cast<char*>(output) + sizeof(rpc::SnapshotHeaderT));
+ output_size = static_cast<uint64>(rpc::Const::MaxOutputSize) - sizeof(rpc::SnapshotHeaderT);
+}
+
+static void SnapshotSetup(char** argv, int argc)
+{
+ flag_snapshot = true;
+ // This allows to see debug output during early setup.
+ // If debug is not actually enabled, it will be turned off in parse_handshake.
+ flag_debug = true;
+#if GOOS_linux
+ // In snapshot mode executor output is redirected to /dev/kmsg.
+ // This is required to turn off rate limiting of writes.
+ write_file("/proc/sys/kernel/printk_devkmsg", "on\n");
+#endif
+ FindIvshmemDevices();
+ // Wait for the host to write handshake_req into input memory.
+ while (ivs.hdr->state != rpc::SnapshotState::Handshake)
+ sleep_ms(10);
+ auto msg = flatbuffers::GetRoot<rpc::SnapshotHandshake>(ivs.input);
+ handshake_req req = {
+ .magic = kInMagic,
+ .use_cover_edges = msg->cover_edges(),
+ .is_kernel_64_bit = msg->kernel_64_bit(),
+ .flags = msg->env_flags(),
+ .pid = 0,
+ .sandbox_arg = static_cast<uint64>(msg->sandbox_arg()),
+ .syscall_timeout_ms = static_cast<uint64>(msg->syscall_timeout_ms()),
+ .program_timeout_ms = static_cast<uint64>(msg->program_timeout_ms()),
+ .slowdown_scale = static_cast<uint64>(msg->slowdown()),
+ };
+ parse_handshake(req);
+ for (const auto& feat : features) {
+ if (!(msg->features() & feat.id))
+ continue;
+ debug("setting up feature %s\n", rpc::EnumNameFeature(feat.id));
+ const char* reason = feat.setup();
+ if (reason)
+ failmsg("feature setup failed", "reason: %s", reason);
+ }
+}
+
+constexpr size_t kOutputPopulate = 256 << 10;
+constexpr size_t kInputPopulate = 64 << 10;
+constexpr size_t kGlobalsPopulate = 4 << 10;
+constexpr size_t kDataPopulate = 8 << 10;
+constexpr size_t kCoveragePopulate = 32 << 10;
+constexpr size_t kThreadsPopulate = 2;
+
+static void SnapshotSetState(rpc::SnapshotState state)
+{
+ debug("changing stapshot state %s -> %s\n",
+ rpc::EnumNameSnapshotState(ivs.hdr->state), rpc::EnumNameSnapshotState(state));
+ std::atomic_signal_fence(std::memory_order_seq_cst);
+ ivs.hdr->state = state;
+ // The register contains VM index shifted by 16 (the host part is VM index 1)
+ // + interrup vector index (0 in our case).
+ *ivs.doorbell = 1 << 16;
+}
+
+// PopulateMemory prefaults anon memory (we want to avoid minor page faults as well).
+static void PopulateMemory(void* ptr, size_t size)
+{
+ ptr = (void*)(uintptr_t(ptr) & ~(getpagesize() - 1));
+ if (madvise(ptr, size, MADV_POPULATE_WRITE))
+ failmsg("populate madvise failed", "ptr=%p size=%zu", ptr, size);
+}
+
+// TouchMemory prefaults non-anon shared memory.
+static void TouchMemory(void* ptr, size_t size)
+{
+ size_t const kPageSize = getpagesize();
+ for (size_t i = 0; i < size; i += kPageSize)
+ (void)((volatile char*)ptr)[i];
+}
+
+#if SYZ_EXECUTOR_USES_FORK_SERVER
+static void SnapshotPrepareParent()
+{
+ TouchMemory((char*)output_data + output_size - kOutputPopulate, kOutputPopulate);
+ // Notify SnapshotStart that we finished prefaulting memory in the parent.
+ output_data->completed = 1;
+ // Wait for the request to come, so that we give it full time slice to execute.
+ // This process will start waiting for the child as soon as we return.
+ while (ivs.hdr->state != rpc::SnapshotState::Execute)
+ ;
+}
+#endif
+
+static void SnapshotStart()
+{
+ debug("SnapshotStart\n");
+ // Prefault as much memory as we can before the snapshot is taken.
+ // Also pre-create some threads and let them block.
+ // This is intended to make execution after each snapshot restore faster,
+ // as we won't need to do that duplicate work again and again.
+ flag_threaded = true;
+ for (size_t i = 0; i < kThreadsPopulate; i++) {
+ thread_t* th = &threads[i];
+ thread_create(th, i, flag_coverage);
+ if (flag_coverage)
+ PopulateMemory(th->cov.data, kCoveragePopulate);
+ }
+ TouchMemory((char*)output_data + output_size - kOutputPopulate, kOutputPopulate);
+ TouchMemory(ivs.input, kInputPopulate);
+ PopulateMemory(&flag_coverage, kGlobalsPopulate);
+ PopulateMemory((void*)SYZ_DATA_OFFSET, kDataPopulate);
+ sleep_ms(100); // let threads start and block
+ // Wait for the parent process to prefault as well.
+ while (!output_data->completed)
+ sleep_ms(1);
+ // Notify host that we are ready to be snapshotted.
+ SnapshotSetState(rpc::SnapshotState::Ready);
+ // Snapshot is restored here.
+ // First time we may loop here while the snapshot is taken,
+ // but afterwards we should be restored when the state is already Execute.
+ // Note: we don't use sleep in the loop because we may be snapshotted while in the sleep syscall.
+ // As the result each execution after snapshot restore will be slower as it will need to finish
+ // the sleep and return from the syscall.
+ while (ivs.hdr->state == rpc::SnapshotState::Ready)
+ ;
+ if (ivs.hdr->state == rpc::SnapshotState::Snapshotted) {
+ // First time around, just acknowledge and wait for snapshot restart.
+ SnapshotSetState(rpc::SnapshotState::Executed);
+ for (;;)
+ sleep(1000);
+ }
+ // Resumed for program execution.
+ output_data->Reset();
+ auto msg = flatbuffers::GetRoot<rpc::SnapshotRequest>(ivs.input);
+ execute_req req = {
+ .magic = kInMagic,
+ .id = 0,
+ .exec_flags = static_cast<uint64>(msg->exec_flags()),
+ .all_call_signal = msg->all_call_signal(),
+ .all_extra_signal = msg->all_extra_signal(),
+ };
+ parse_execute(req);
+ output_data->num_calls.store(msg->num_calls(), std::memory_order_relaxed);
+ input_data = const_cast<uint8*>(msg->prog_data()->Data());
+}
+
+NORETURN static void SnapshotDone(bool failed)
+{
+ debug("SnapshotDone\n");
+ uint32 num_calls = output_data->num_calls.load(std::memory_order_relaxed);
+ auto data = finish_output(output_data, 0, 0, num_calls, 0, 0, failed ? kFailStatus : 0, nullptr);
+ ivs.hdr->output_offset = data.data() - reinterpret_cast<volatile uint8_t*>(ivs.hdr);
+ ivs.hdr->output_size = data.size();
+ SnapshotSetState(failed ? rpc::SnapshotState::Failed : rpc::SnapshotState::Executed);
+ // Wait to be restarted from the snapshot.
+ for (;;)
+ sleep(1000);
+}