aboutsummaryrefslogtreecommitdiffstats
path: root/executor
diff options
context:
space:
mode:
authorDmitry Vyukov <dvyukov@google.com>2024-07-05 17:20:13 +0200
committerDmitry Vyukov <dvyukov@google.com>2024-07-25 13:12:57 +0000
commit4d77b9fe7da3d014943a16cb4b9a4ca3a531521a (patch)
treec37fbf8b50205eb8b830595a621ad4b355e32e9a /executor
parent206f31df2861c47b13a8c05a105afa94bcc7106c (diff)
all: add qemu snapshotting mode
Diffstat (limited to 'executor')
-rw-r--r--executor/common.h11
-rw-r--r--executor/executor.cc96
-rw-r--r--executor/executor_runner.h3
-rw-r--r--executor/files.h16
-rw-r--r--executor/snapshot.h252
5 files changed, 341 insertions, 37 deletions
diff --git a/executor/common.h b/executor/common.h
index a38768536..123723e5a 100644
--- a/executor/common.h
+++ b/executor/common.h
@@ -613,7 +613,8 @@ static void loop(void)
#endif
#if SYZ_EXECUTOR
// Tell parent that we are ready to serve.
- reply_execute(0);
+ if (!flag_snapshot)
+ reply_execute(0);
#endif
int iter = 0;
#if SYZ_REPEAT_TIMES
@@ -632,7 +633,8 @@ static void loop(void)
reset_loop();
#endif
#if SYZ_EXECUTOR
- receive_execute();
+ if (!flag_snapshot)
+ receive_execute();
#endif
int pid = fork();
if (pid < 0)
@@ -663,6 +665,11 @@ static void loop(void)
}
debug("spawned worker pid %d\n", pid);
+#if SYZ_EXECUTOR
+ if (flag_snapshot)
+ SnapshotPrepareParent();
+#endif
+
// We used to use sigtimedwait(SIGCHLD) to wait for the subprocess.
// But SIGCHLD is also delivered when a process stops/continues,
// so it would require a loop with status analysis and timeout recalculation.
diff --git a/executor/executor.cc b/executor/executor.cc
index 17de4e87d..055957e9f 100644
--- a/executor/executor.cc
+++ b/executor/executor.cc
@@ -113,6 +113,8 @@ static void reply_execute(uint32 status);
static void receive_handshake();
#if SYZ_EXECUTOR_USES_FORK_SERVER
+static void SnapshotPrepareParent();
+
// Allocating (and forking) virtual memory for each executed process is expensive, so we only mmap
// the amount we might possibly need for the specific received prog.
const int kMaxOutputComparisons = 14 << 20; // executions with comparsions enabled are usually < 1% of all executions
@@ -143,6 +145,7 @@ struct alignas(8) OutputData {
std::atomic<uint32> size;
std::atomic<uint32> consumed;
std::atomic<uint32> completed;
+ std::atomic<uint32> num_calls;
struct {
// Call index in the test program (they may be out-of-order is some syscalls block).
int index;
@@ -155,6 +158,7 @@ struct alignas(8) OutputData {
size.store(0, std::memory_order_relaxed);
consumed.store(0, std::memory_order_relaxed);
completed.store(0, std::memory_order_relaxed);
+ num_calls.store(0, std::memory_order_relaxed);
}
};
@@ -248,6 +252,7 @@ static bool dedup(uint8 index, uint64 sig);
static uint64 start_time_ms = 0;
static bool flag_debug;
+static bool flag_snapshot;
static bool flag_coverage;
static bool flag_sandbox_none;
static bool flag_sandbox_setuid;
@@ -463,8 +468,10 @@ static bool copyout(char* addr, uint64 size, uint64* res);
static void setup_control_pipes();
static bool coverage_filter(uint64 pc);
static rpc::ComparisonRaw convert(const kcov_comparison_t& cmp);
-static flatbuffers::span<uint8_t> finish_output(OutputData* output, int proc_id, uint64 req_id,
+static flatbuffers::span<uint8_t> finish_output(OutputData* output, int proc_id, uint64 req_id, uint32 num_calls,
uint64 elapsed, uint64 freshness, uint32 status, const std::vector<uint8_t>* process_output);
+static void parse_execute(const execute_req& req);
+static void parse_handshake(const handshake_req& req);
#include "syscalls.h"
@@ -495,6 +502,8 @@ static feature_t features[] = {};
#include "files.h"
#include "subprocess.h"
+#include "snapshot.h"
+
#include "executor_runner.h"
#include "test.h"
@@ -535,44 +544,50 @@ int main(int argc, char** argv)
start_time_ms = current_time_ms();
os_init(argc, argv, (char*)SYZ_DATA_OFFSET, SYZ_NUM_PAGES * SYZ_PAGE_SIZE);
+ use_temporary_dir();
+ install_segv_handler();
current_thread = &threads[0];
- void* mmap_out = mmap(NULL, kMaxInput, PROT_READ, MAP_SHARED, kInFd, 0);
- if (mmap_out == MAP_FAILED)
- fail("mmap of input file failed");
- input_data = static_cast<uint8*>(mmap_out);
+ if (argc > 2 && strcmp(argv[2], "snapshot") == 0) {
+ SnapshotSetup(argv, argc);
+ } else {
+ void* mmap_out = mmap(NULL, kMaxInput, PROT_READ, MAP_SHARED, kInFd, 0);
+ if (mmap_out == MAP_FAILED)
+ fail("mmap of input file failed");
+ input_data = static_cast<uint8*>(mmap_out);
+
+ mmap_output(kInitialOutput);
- mmap_output(kInitialOutput);
- // Prevent test programs to mess with these fds.
- // Due to races in collider mode, a program can e.g. ftruncate one of these fds,
- // which will cause fuzzer to crash.
- close(kInFd);
+ // Prevent test programs to mess with these fds.
+ // Due to races in collider mode, a program can e.g. ftruncate one of these fds,
+ // which will cause fuzzer to crash.
+ close(kInFd);
#if !SYZ_EXECUTOR_USES_FORK_SERVER
- close(kOutFd);
+ // For SYZ_EXECUTOR_USES_FORK_SERVER, close(kOutFd) is invoked in the forked child,
+ // after the program has been received.
+ close(kOutFd);
#endif
- // For SYZ_EXECUTOR_USES_FORK_SERVER, close(kOutFd) is invoked in the forked child,
- // after the program has been received.
- if (fcntl(kMaxSignalFd, F_GETFD) != -1) {
- // Use random addresses for coverage filters to not collide with output_data.
- max_signal.emplace(kMaxSignalFd, reinterpret_cast<void*>(0x110c230000ull));
- close(kMaxSignalFd);
- }
- if (fcntl(kCoverFilterFd, F_GETFD) != -1) {
- cover_filter.emplace(kCoverFilterFd, reinterpret_cast<void*>(0x110f230000ull));
- close(kCoverFilterFd);
- }
+ if (fcntl(kMaxSignalFd, F_GETFD) != -1) {
+ // Use random addresses for coverage filters to not collide with output_data.
+ max_signal.emplace(kMaxSignalFd, reinterpret_cast<void*>(0x110c230000ull));
+ close(kMaxSignalFd);
+ }
+ if (fcntl(kCoverFilterFd, F_GETFD) != -1) {
+ cover_filter.emplace(kCoverFilterFd, reinterpret_cast<void*>(0x110f230000ull));
+ close(kCoverFilterFd);
+ }
- use_temporary_dir();
- install_segv_handler();
- setup_control_pipes();
- receive_handshake();
+ setup_control_pipes();
+ receive_handshake();
#if !SYZ_EXECUTOR_USES_FORK_SERVER
- // We receive/reply handshake when fork server is disabled just to simplify runner logic.
- // It's a bit suboptimal, but no fork server is much slower anyway.
- reply_execute(0);
- receive_execute();
+ // We receive/reply handshake when fork server is disabled just to simplify runner logic.
+ // It's a bit suboptimal, but no fork server is much slower anyway.
+ reply_execute(0);
+ receive_execute();
#endif
+ }
+
if (flag_coverage) {
int create_count = kCoverDefaultCount, mmap_count = create_count;
if (flag_delay_kcov_mmap) {
@@ -694,6 +709,11 @@ void receive_handshake()
ssize_t n = read(kInPipeFd, &req, sizeof(req));
if (n != sizeof(req))
failmsg("handshake read failed", "read=%zu", n);
+ parse_handshake(req);
+}
+
+void parse_handshake(const handshake_req& req)
+{
if (req.magic != kInMagic)
failmsg("bad handshake magic", "magic=0x%llx", req.magic);
#if SYZ_HAVE_SANDBOX_ANDROID
@@ -732,6 +752,11 @@ void receive_execute()
;
if (n != (ssize_t)sizeof(req))
failmsg("control pipe read failed", "read=%zd want=%zd", n, sizeof(req));
+ parse_execute(req);
+}
+
+void parse_execute(const execute_req& req)
+{
request_id = req.id;
flag_collect_signal = req.exec_flags & (1 << 0);
flag_collect_cover = req.exec_flags & (1 << 1);
@@ -759,6 +784,8 @@ bool cover_collection_required()
void reply_execute(uint32 status)
{
+ if (flag_snapshot)
+ SnapshotDone(status == kFailStatus);
if (write(kOutPipeFd, &status, sizeof(status)) != sizeof(status))
fail("control pipe write failed");
}
@@ -781,7 +808,10 @@ void realloc_output_data()
void execute_one()
{
in_execute_one = true;
- realloc_output_data();
+ if (flag_snapshot)
+ SnapshotStart();
+ else
+ realloc_output_data();
output_builder.emplace(output_data, output_size);
uint64 start = current_time_ms();
uint8* input_pos = input_data;
@@ -1272,11 +1302,9 @@ void write_extra_output()
cover_reset(&extra_cov);
}
-flatbuffers::span<uint8_t> finish_output(OutputData* output, int proc_id, uint64 req_id, uint64 elapsed,
+flatbuffers::span<uint8_t> finish_output(OutputData* output, int proc_id, uint64 req_id, uint32 num_calls, uint64 elapsed,
uint64 freshness, uint32 status, const std::vector<uint8_t>* process_output)
{
- uint8* prog_data = input_data;
- uint32 num_calls = read_input(&prog_data);
int output_size = output->size.load(std::memory_order_relaxed) ?: kMaxOutput;
uint32 completed = output->completed.load(std::memory_order_relaxed);
completed = std::min(completed, kMaxCalls);
diff --git a/executor/executor_runner.h b/executor/executor_runner.h
index 260d4a5de..96fcc9b44 100644
--- a/executor/executor_runner.h
+++ b/executor/executor_runner.h
@@ -352,7 +352,8 @@ private:
output_.insert(output_.end(), tmp, tmp + strlen(tmp));
}
}
- auto data = finish_output(resp_mem_, id_, msg_->id, elapsed, freshness_++, status, output);
+ uint32 num_calls = read_input(&prog_data);
+ auto data = finish_output(resp_mem_, id_, msg_->id, num_calls, elapsed, freshness_++, status, output);
conn_.Send(data.data(), data.size());
resp_mem_->Reset();
diff --git a/executor/files.h b/executor/files.h
index f952a07dc..470157e84 100644
--- a/executor/files.h
+++ b/executor/files.h
@@ -9,6 +9,7 @@
#include <errno.h>
#include <fcntl.h>
#include <glob.h>
+#include <stdarg.h>
#include <string.h>
#include <unistd.h>
@@ -58,6 +59,21 @@ static std::unique_ptr<rpc::FileInfoRawT> ReadFile(const std::string& file)
return info;
}
+static std::string ReadTextFile(const char* file_fmt, ...)
+{
+ char file[1024];
+ va_list args;
+ va_start(args, file_fmt);
+ vsnprintf(file, sizeof(file), file_fmt, args);
+ va_end(args);
+ file[sizeof(file) - 1] = 0;
+ auto data = ReadFile(file)->data;
+ std::string str(data.begin(), data.end());
+ while (!str.empty() && (str.back() == '\n' || str.back() == 0))
+ str.resize(str.size() - 1);
+ return str;
+}
+
static std::vector<std::unique_ptr<rpc::FileInfoRawT>> ReadFiles(const std::vector<std::string>& files)
{
std::vector<std::unique_ptr<rpc::FileInfoRawT>> results;
diff --git a/executor/snapshot.h b/executor/snapshot.h
new file mode 100644
index 000000000..5479a162f
--- /dev/null
+++ b/executor/snapshot.h
@@ -0,0 +1,252 @@
+// Copyright 2024 syzkaller project authors. All rights reserved.
+// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
+
+#include <dirent.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <string>
+#include <utility>
+
+#ifndef MADV_POPULATE_WRITE
+#define MADV_POPULATE_WRITE 23
+#endif
+
+// Size of qemu snapshots and time required to restore a snapshot depend on the amount of memory
+// the VM touches after boot. For example, a 132 MB snapshot takes around 150ms to restore,
+// while a 260 MB snapshot takes around 275 ms to restore.
+//
+// To reduce size of the snapshot it's recommended to use smaller kernel and setup fewer devices.
+// For example the following cmdline arguments:
+// "loop.max_loop=1 dummy_hcd.num=1 vivid.n_devs=2 vivid.multiplanar=1,2 netrom.nr_ndevs=1 rose.rose_ndevs=1"
+// and CONFIG_USBIP_VHCI_NR_HCS=1 help to reduce snapshot by about 20 MB. Note: we have only 1 proc
+// in snapshot mode, so we don't need lots of devices. However, our descriptions rely on vivid.n_devs=16
+// since they hardcode names like /dev/video36 which follow after these 16 pre-created devices.
+//
+// Additionally we could try to use executor as init process, this should remove dhcpd/sshd/udevd/klogd/etc.
+// We don't need even networking in snapshot mode since we communicate via shared memory.
+
+static struct {
+ // Ivshmem interrupt doorbell register.
+ volatile uint32* doorbell;
+ volatile rpc::SnapshotHeaderT* hdr;
+ void* input;
+} ivs;
+
+// Finds qemu ivshmem device, see:
+// https://www.qemu.org/docs/master/specs/ivshmem-spec.html
+static void FindIvshmemDevices()
+{
+ std::string result;
+ DIR* devices = opendir("/sys/bus/pci/devices");
+ if (!devices)
+ fail("opendir(/sys/bus/pci/devices) failed");
+ void* regs = nullptr;
+ void* input = nullptr;
+ void* output = nullptr;
+ while (auto* dev = readdir(devices)) {
+ if (dev->d_name[0] == '.')
+ continue;
+ const std::string& vendor = ReadTextFile("/sys/bus/pci/devices/%s/vendor", dev->d_name);
+ const std::string& device = ReadTextFile("/sys/bus/pci/devices/%s/device", dev->d_name);
+ debug("PCI device %s: vendor=%s device=%s\n", dev->d_name, vendor.c_str(), device.c_str());
+ if (vendor != "0x1af4" || device != "0x1110")
+ continue;
+ char filename[1024];
+ snprintf(filename, sizeof(filename), "/sys/bus/pci/devices/%s/resource2", dev->d_name);
+ int res2 = open(filename, O_RDWR);
+ if (res2 == -1)
+ fail("failed to open ivshmem resource2");
+ struct stat statbuf;
+ if (fstat(res2, &statbuf))
+ fail("failed to fstat ivshmem resource2");
+ debug("ivshmem resource2 size %zu\n", static_cast<size_t>(statbuf.st_size));
+ // The only way to distinguish the 2 ivshmem regions is by size.
+ if (statbuf.st_size == static_cast<uint64>(rpc::Const::SnapshotDoorbellSize)) {
+ snprintf(filename, sizeof(filename), "/sys/bus/pci/devices/%s/resource0", dev->d_name);
+ int res0 = open(filename, O_RDWR);
+ if (res0 == -1)
+ fail("failed to open ivshmem resource0");
+ regs = mmap(nullptr, getpagesize(), PROT_READ | PROT_WRITE, MAP_SHARED, res0, 0);
+ close(res0);
+ if (regs == MAP_FAILED)
+ fail("failed to mmap ivshmem resource0");
+ debug("mapped doorbell registers at %p\n", regs);
+ } else if (statbuf.st_size == static_cast<uint64>(rpc::Const::SnapshotShmemSize)) {
+ input = mmap(nullptr, static_cast<uint64>(rpc::Const::MaxInputSize),
+ PROT_READ, MAP_SHARED, res2, 0);
+ output = mmap(nullptr, static_cast<uint64>(rpc::Const::MaxOutputSize),
+ PROT_READ | PROT_WRITE, MAP_SHARED, res2,
+ static_cast<uint64>(rpc::Const::MaxInputSize));
+ if (input == MAP_FAILED || output == MAP_FAILED)
+ fail("failed to mmap ivshmem resource2");
+ debug("mapped shmem input at %p/%llu\n",
+ input, static_cast<uint64>(rpc::Const::MaxInputSize));
+ debug("mapped shmem output at %p/%llu\n",
+ output, static_cast<uint64>(rpc::Const::MaxOutputSize));
+ }
+ close(res2);
+ }
+ closedir(devices);
+ if (regs == nullptr || input == nullptr)
+ fail("cannot find ivshmem PCI devices");
+ ivs.doorbell = static_cast<uint32*>(regs) + 3;
+ ivs.hdr = static_cast<rpc::SnapshotHeaderT*>(output);
+ ivs.input = input;
+ output_data = reinterpret_cast<OutputData*>(static_cast<char*>(output) + sizeof(rpc::SnapshotHeaderT));
+ output_size = static_cast<uint64>(rpc::Const::MaxOutputSize) - sizeof(rpc::SnapshotHeaderT);
+}
+
+static void SnapshotSetup(char** argv, int argc)
+{
+ flag_snapshot = true;
+ // This allows to see debug output during early setup.
+ // If debug is not actually enabled, it will be turned off in parse_handshake.
+ flag_debug = true;
+#if GOOS_linux
+ // In snapshot mode executor output is redirected to /dev/kmsg.
+ // This is required to turn off rate limiting of writes.
+ write_file("/proc/sys/kernel/printk_devkmsg", "on\n");
+#endif
+ FindIvshmemDevices();
+ // Wait for the host to write handshake_req into input memory.
+ while (ivs.hdr->state != rpc::SnapshotState::Handshake)
+ sleep_ms(10);
+ auto msg = flatbuffers::GetRoot<rpc::SnapshotHandshake>(ivs.input);
+ handshake_req req = {
+ .magic = kInMagic,
+ .use_cover_edges = msg->cover_edges(),
+ .is_kernel_64_bit = msg->kernel_64_bit(),
+ .flags = msg->env_flags(),
+ .pid = 0,
+ .sandbox_arg = static_cast<uint64>(msg->sandbox_arg()),
+ .syscall_timeout_ms = static_cast<uint64>(msg->syscall_timeout_ms()),
+ .program_timeout_ms = static_cast<uint64>(msg->program_timeout_ms()),
+ .slowdown_scale = static_cast<uint64>(msg->slowdown()),
+ };
+ parse_handshake(req);
+ for (const auto& feat : features) {
+ if (!(msg->features() & feat.id))
+ continue;
+ debug("setting up feature %s\n", rpc::EnumNameFeature(feat.id));
+ const char* reason = feat.setup();
+ if (reason)
+ failmsg("feature setup failed", "reason: %s", reason);
+ }
+}
+
+constexpr size_t kOutputPopulate = 256 << 10;
+constexpr size_t kInputPopulate = 64 << 10;
+constexpr size_t kGlobalsPopulate = 4 << 10;
+constexpr size_t kDataPopulate = 8 << 10;
+constexpr size_t kCoveragePopulate = 32 << 10;
+constexpr size_t kThreadsPopulate = 2;
+
+static void SnapshotSetState(rpc::SnapshotState state)
+{
+ debug("changing snapshot state %s -> %s\n",
+ rpc::EnumNameSnapshotState(ivs.hdr->state), rpc::EnumNameSnapshotState(state));
+ std::atomic_signal_fence(std::memory_order_seq_cst);
+ ivs.hdr->state = state;
+ // The register contains VM index shifted by 16 (the host part is VM index 1)
+ // + interrupt vector index (0 in our case).
+ *ivs.doorbell = 1 << 16;
+}
+
+// PopulateMemory prefaults anon memory (we want to avoid minor page faults as well).
+static void PopulateMemory(void* ptr, size_t size)
+{
+ ptr = (void*)(uintptr_t(ptr) & ~(getpagesize() - 1));
+ if (madvise(ptr, size, MADV_POPULATE_WRITE))
+ failmsg("populate madvise failed", "ptr=%p size=%zu", ptr, size);
+}
+
+// TouchMemory prefaults non-anon shared memory.
+static void TouchMemory(void* ptr, size_t size)
+{
+ size_t const kPageSize = getpagesize();
+ for (size_t i = 0; i < size; i += kPageSize)
+ (void)((volatile char*)ptr)[i];
+}
+
+#if SYZ_EXECUTOR_USES_FORK_SERVER
+static void SnapshotPrepareParent()
+{
+ TouchMemory((char*)output_data + output_size - kOutputPopulate, kOutputPopulate);
+ // Notify SnapshotStart that we finished prefaulting memory in the parent.
+ output_data->completed = 1;
+ // Wait for the request to come, so that we give it full time slice to execute.
+ // This process will start waiting for the child as soon as we return.
+ while (ivs.hdr->state != rpc::SnapshotState::Execute)
+ ;
+}
+#endif
+
+static void SnapshotStart()
+{
+ debug("SnapshotStart\n");
+ // Prefault as much memory as we can before the snapshot is taken.
+ // Also pre-create some threads and let them block.
+ // This is intended to make execution after each snapshot restore faster,
+ // as we won't need to do that duplicate work again and again.
+ flag_threaded = true;
+ for (size_t i = 0; i < kThreadsPopulate; i++) {
+ thread_t* th = &threads[i];
+ thread_create(th, i, flag_coverage);
+ if (flag_coverage)
+ PopulateMemory(th->cov.data, kCoveragePopulate);
+ }
+ TouchMemory((char*)output_data + output_size - kOutputPopulate, kOutputPopulate);
+ TouchMemory(ivs.input, kInputPopulate);
+ PopulateMemory(&flag_coverage, kGlobalsPopulate);
+ PopulateMemory((void*)SYZ_DATA_OFFSET, kDataPopulate);
+ sleep_ms(100); // let threads start and block
+ // Wait for the parent process to prefault as well.
+ while (!output_data->completed)
+ sleep_ms(1);
+ // Notify host that we are ready to be snapshotted.
+ SnapshotSetState(rpc::SnapshotState::Ready);
+ // Snapshot is restored here.
+ // First time we may loop here while the snapshot is taken,
+ // but afterwards we should be restored when the state is already Execute.
+ // Note: we don't use sleep in the loop because we may be snapshotted while in the sleep syscall.
+ // As a result, each execution after snapshot restore will be slower as it will need to finish
+ // the sleep and return from the syscall.
+ while (ivs.hdr->state == rpc::SnapshotState::Ready)
+ ;
+ if (ivs.hdr->state == rpc::SnapshotState::Snapshotted) {
+ // First time around, just acknowledge and wait for snapshot restart.
+ SnapshotSetState(rpc::SnapshotState::Executed);
+ for (;;)
+ sleep(1000);
+ }
+ // Resumed for program execution.
+ output_data->Reset();
+ auto msg = flatbuffers::GetRoot<rpc::SnapshotRequest>(ivs.input);
+ execute_req req = {
+ .magic = kInMagic,
+ .id = 0,
+ .exec_flags = static_cast<uint64>(msg->exec_flags()),
+ .all_call_signal = msg->all_call_signal(),
+ .all_extra_signal = msg->all_extra_signal(),
+ };
+ parse_execute(req);
+ output_data->num_calls.store(msg->num_calls(), std::memory_order_relaxed);
+ input_data = const_cast<uint8*>(msg->prog_data()->Data());
+}
+
+NORETURN static void SnapshotDone(bool failed)
+{
+ debug("SnapshotDone\n");
+ uint32 num_calls = output_data->num_calls.load(std::memory_order_relaxed);
+ auto data = finish_output(output_data, 0, 0, num_calls, 0, 0, failed ? kFailStatus : 0, nullptr);
+ ivs.hdr->output_offset = data.data() - reinterpret_cast<volatile uint8_t*>(ivs.hdr);
+ ivs.hdr->output_size = data.size();
+ SnapshotSetState(failed ? rpc::SnapshotState::Failed : rpc::SnapshotState::Executed);
+ // Wait to be restarted from the snapshot.
+ for (;;)
+ sleep(1000);
+}