diff options
| author | Dmitry Vyukov <dvyukov@google.com> | 2024-07-05 17:20:13 +0200 |
|---|---|---|
| committer | Dmitry Vyukov <dvyukov@google.com> | 2024-07-25 13:12:57 +0000 |
| commit | 4d77b9fe7da3d014943a16cb4b9a4ca3a531521a (patch) | |
| tree | c37fbf8b50205eb8b830595a621ad4b355e32e9a /executor | |
| parent | 206f31df2861c47b13a8c05a105afa94bcc7106c (diff) | |
all: add qemu snapshotting mode
Diffstat (limited to 'executor')
| -rw-r--r-- | executor/common.h | 11 | ||||
| -rw-r--r-- | executor/executor.cc | 96 | ||||
| -rw-r--r-- | executor/executor_runner.h | 3 | ||||
| -rw-r--r-- | executor/files.h | 16 | ||||
| -rw-r--r-- | executor/snapshot.h | 252 |
5 files changed, 341 insertions, 37 deletions
diff --git a/executor/common.h b/executor/common.h index a38768536..123723e5a 100644 --- a/executor/common.h +++ b/executor/common.h @@ -613,7 +613,8 @@ static void loop(void) #endif #if SYZ_EXECUTOR // Tell parent that we are ready to serve. - reply_execute(0); + if (!flag_snapshot) + reply_execute(0); #endif int iter = 0; #if SYZ_REPEAT_TIMES @@ -632,7 +633,8 @@ static void loop(void) reset_loop(); #endif #if SYZ_EXECUTOR - receive_execute(); + if (!flag_snapshot) + receive_execute(); #endif int pid = fork(); if (pid < 0) @@ -663,6 +665,11 @@ static void loop(void) } debug("spawned worker pid %d\n", pid); +#if SYZ_EXECUTOR + if (flag_snapshot) + SnapshotPrepareParent(); +#endif + // We used to use sigtimedwait(SIGCHLD) to wait for the subprocess. // But SIGCHLD is also delivered when a process stops/continues, // so it would require a loop with status analysis and timeout recalculation. diff --git a/executor/executor.cc b/executor/executor.cc index 17de4e87d..055957e9f 100644 --- a/executor/executor.cc +++ b/executor/executor.cc @@ -113,6 +113,8 @@ static void reply_execute(uint32 status); static void receive_handshake(); #if SYZ_EXECUTOR_USES_FORK_SERVER +static void SnapshotPrepareParent(); + // Allocating (and forking) virtual memory for each executed process is expensive, so we only mmap // the amount we might possibly need for the specific received prog. const int kMaxOutputComparisons = 14 << 20; // executions with comparsions enabled are usually < 1% of all executions @@ -143,6 +145,7 @@ struct alignas(8) OutputData { std::atomic<uint32> size; std::atomic<uint32> consumed; std::atomic<uint32> completed; + std::atomic<uint32> num_calls; struct { // Call index in the test program (they may be out-of-order is some syscalls block). int index; @@ -155,6 +158,7 @@ struct alignas(8) OutputData { size.store(0, std::memory_order_relaxed); consumed.store(0, std::memory_order_relaxed); completed.store(0, std::memory_order_relaxed); + num_calls.store(0, std::memory_order_relaxed); } }; @@ -248,6 +252,7 @@ static bool dedup(uint8 index, uint64 sig); static uint64 start_time_ms = 0; static bool flag_debug; +static bool flag_snapshot; static bool flag_coverage; static bool flag_sandbox_none; static bool flag_sandbox_setuid; @@ -463,8 +468,10 @@ static bool copyout(char* addr, uint64 size, uint64* res); static void setup_control_pipes(); static bool coverage_filter(uint64 pc); static rpc::ComparisonRaw convert(const kcov_comparison_t& cmp); -static flatbuffers::span<uint8_t> finish_output(OutputData* output, int proc_id, uint64 req_id, +static flatbuffers::span<uint8_t> finish_output(OutputData* output, int proc_id, uint64 req_id, uint32 num_calls, uint64 elapsed, uint64 freshness, uint32 status, const std::vector<uint8_t>* process_output); +static void parse_execute(const execute_req& req); +static void parse_handshake(const handshake_req& req); #include "syscalls.h" @@ -495,6 +502,8 @@ static feature_t features[] = {}; #include "files.h" #include "subprocess.h" +#include "snapshot.h" + #include "executor_runner.h" #include "test.h" @@ -535,44 +544,50 @@ int main(int argc, char** argv) start_time_ms = current_time_ms(); os_init(argc, argv, (char*)SYZ_DATA_OFFSET, SYZ_NUM_PAGES * SYZ_PAGE_SIZE); + use_temporary_dir(); + install_segv_handler(); current_thread = &threads[0]; - void* mmap_out = mmap(NULL, kMaxInput, PROT_READ, MAP_SHARED, kInFd, 0); - if (mmap_out == MAP_FAILED) - fail("mmap of input file failed"); - input_data = static_cast<uint8*>(mmap_out); + if (argc > 2 && strcmp(argv[2], "snapshot") == 0) { + SnapshotSetup(argv, argc); + } else { + void* mmap_out = mmap(NULL, kMaxInput, PROT_READ, MAP_SHARED, kInFd, 0); + if (mmap_out == MAP_FAILED) + fail("mmap of input file failed"); + input_data = static_cast<uint8*>(mmap_out); + + mmap_output(kInitialOutput); - mmap_output(kInitialOutput); - // Prevent test programs to mess with these fds. - // Due to races in collider mode, a program can e.g. ftruncate one of these fds, - // which will cause fuzzer to crash. - close(kInFd); + // Prevent test programs to mess with these fds. + // Due to races in collider mode, a program can e.g. ftruncate one of these fds, + // which will cause fuzzer to crash. + close(kInFd); #if !SYZ_EXECUTOR_USES_FORK_SERVER - close(kOutFd); + // For SYZ_EXECUTOR_USES_FORK_SERVER, close(kOutFd) is invoked in the forked child, + // after the program has been received. + close(kOutFd); #endif - // For SYZ_EXECUTOR_USES_FORK_SERVER, close(kOutFd) is invoked in the forked child, - // after the program has been received. - if (fcntl(kMaxSignalFd, F_GETFD) != -1) { - // Use random addresses for coverage filters to not collide with output_data. - max_signal.emplace(kMaxSignalFd, reinterpret_cast<void*>(0x110c230000ull)); - close(kMaxSignalFd); - } - if (fcntl(kCoverFilterFd, F_GETFD) != -1) { - cover_filter.emplace(kCoverFilterFd, reinterpret_cast<void*>(0x110f230000ull)); - close(kCoverFilterFd); - } + if (fcntl(kMaxSignalFd, F_GETFD) != -1) { + // Use random addresses for coverage filters to not collide with output_data. + max_signal.emplace(kMaxSignalFd, reinterpret_cast<void*>(0x110c230000ull)); + close(kMaxSignalFd); + } + if (fcntl(kCoverFilterFd, F_GETFD) != -1) { + cover_filter.emplace(kCoverFilterFd, reinterpret_cast<void*>(0x110f230000ull)); + close(kCoverFilterFd); + } - use_temporary_dir(); - install_segv_handler(); - setup_control_pipes(); - receive_handshake(); + setup_control_pipes(); + receive_handshake(); #if !SYZ_EXECUTOR_USES_FORK_SERVER - // We receive/reply handshake when fork server is disabled just to simplify runner logic. - // It's a bit suboptimal, but no fork server is much slower anyway. - reply_execute(0); - receive_execute(); + // We receive/reply handshake when fork server is disabled just to simplify runner logic. + // It's a bit suboptimal, but no fork server is much slower anyway. + reply_execute(0); + receive_execute(); #endif + } + if (flag_coverage) { int create_count = kCoverDefaultCount, mmap_count = create_count; if (flag_delay_kcov_mmap) { @@ -694,6 +709,11 @@ void receive_handshake() ssize_t n = read(kInPipeFd, &req, sizeof(req)); if (n != sizeof(req)) failmsg("handshake read failed", "read=%zu", n); + parse_handshake(req); +} + +void parse_handshake(const handshake_req& req) +{ if (req.magic != kInMagic) failmsg("bad handshake magic", "magic=0x%llx", req.magic); #if SYZ_HAVE_SANDBOX_ANDROID @@ -732,6 +752,11 @@ void receive_execute() ; if (n != (ssize_t)sizeof(req)) failmsg("control pipe read failed", "read=%zd want=%zd", n, sizeof(req)); + parse_execute(req); +} + +void parse_execute(const execute_req& req) +{ request_id = req.id; flag_collect_signal = req.exec_flags & (1 << 0); flag_collect_cover = req.exec_flags & (1 << 1); @@ -759,6 +784,8 @@ bool cover_collection_required() void reply_execute(uint32 status) { + if (flag_snapshot) + SnapshotDone(status == kFailStatus); if (write(kOutPipeFd, &status, sizeof(status)) != sizeof(status)) fail("control pipe write failed"); } @@ -781,7 +808,10 @@ void realloc_output_data() void execute_one() { in_execute_one = true; - realloc_output_data(); + if (flag_snapshot) + SnapshotStart(); + else + realloc_output_data(); output_builder.emplace(output_data, output_size); uint64 start = current_time_ms(); uint8* input_pos = input_data; @@ -1272,11 +1302,9 @@ void write_extra_output() cover_reset(&extra_cov); } -flatbuffers::span<uint8_t> finish_output(OutputData* output, int proc_id, uint64 req_id, uint64 elapsed, +flatbuffers::span<uint8_t> finish_output(OutputData* output, int proc_id, uint64 req_id, uint32 num_calls, uint64 elapsed, uint64 freshness, uint32 status, const std::vector<uint8_t>* process_output) { - uint8* prog_data = input_data; - uint32 num_calls = read_input(&prog_data); int output_size = output->size.load(std::memory_order_relaxed) ?: kMaxOutput; uint32 completed = output->completed.load(std::memory_order_relaxed); completed = std::min(completed, kMaxCalls); diff --git a/executor/executor_runner.h b/executor/executor_runner.h index 260d4a5de..96fcc9b44 100644 --- a/executor/executor_runner.h +++ b/executor/executor_runner.h @@ -352,7 +352,8 @@ private: output_.insert(output_.end(), tmp, tmp + strlen(tmp)); } } - auto data = finish_output(resp_mem_, id_, msg_->id, elapsed, freshness_++, status, output); + uint32 num_calls = read_input(&prog_data); + auto data = finish_output(resp_mem_, id_, msg_->id, num_calls, elapsed, freshness_++, status, output); conn_.Send(data.data(), data.size()); resp_mem_->Reset(); diff --git a/executor/files.h b/executor/files.h index f952a07dc..470157e84 100644 --- a/executor/files.h +++ b/executor/files.h @@ -9,6 +9,7 @@ #include <errno.h> #include <fcntl.h> #include <glob.h> +#include <stdarg.h> #include <string.h> #include <unistd.h> @@ -58,6 +59,21 @@ static std::unique_ptr<rpc::FileInfoRawT> ReadFile(const std::string& file) return info; } +static std::string ReadTextFile(const char* file_fmt, ...) +{ + char file[1024]; + va_list args; + va_start(args, file_fmt); + vsnprintf(file, sizeof(file), file_fmt, args); + va_end(args); + file[sizeof(file) - 1] = 0; + auto data = ReadFile(file)->data; + std::string str(data.begin(), data.end()); + while (!str.empty() && (str.back() == '\n' || str.back() == 0)) + str.resize(str.size() - 1); + return str; +} + static std::vector<std::unique_ptr<rpc::FileInfoRawT>> ReadFiles(const std::vector<std::string>& files) { std::vector<std::unique_ptr<rpc::FileInfoRawT>> results; diff --git a/executor/snapshot.h b/executor/snapshot.h new file mode 100644 index 000000000..5479a162f --- /dev/null +++ b/executor/snapshot.h @@ -0,0 +1,252 @@ +// Copyright 2024 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +#include <dirent.h> +#include <stdlib.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> + +#include <atomic> +#include <string> +#include <utility> + +#ifndef MADV_POPULATE_WRITE +#define MADV_POPULATE_WRITE 23 +#endif + +// Size of qemu snapshots and time required to restore a snapshot depend on the amount of memory +// the VM touches after boot. For example, a 132 MB snapshot takes around 150ms to restore, +// while a 260 MB snapshot takes around 275 ms to restore. +// +// To reduce size of the snapshot it's recommended to use smaller kernel and setup fewer devices. +// For example the following cmdline arguments: +// "loop.max_loop=1 dummy_hcd.num=1 vivid.n_devs=2 vivid.multiplanar=1,2 netrom.nr_ndevs=1 rose.rose_ndevs=1" +// and CONFIG_USBIP_VHCI_NR_HCS=1 help to reduce snapshot by about 20 MB. Note: we have only 1 proc +// in snapshot mode, so we don't need lots of devices. However, our descriptions rely on vivid.n_devs=16 +// since they hardcode names like /dev/video36 which follow after these 16 pre-created devices. +// +// Additionally we could try to use executor as init process, this should remove dhcpd/sshd/udevd/klogd/etc. +// We don't need even networking in snapshot mode since we communicate via shared memory. + +static struct { + // Ivshmem interrupt doorbell register. + volatile uint32* doorbell; + volatile rpc::SnapshotHeaderT* hdr; + void* input; +} ivs; + +// Finds qemu ivshmem device, see: +// https://www.qemu.org/docs/master/specs/ivshmem-spec.html +static void FindIvshmemDevices() +{ + std::string result; + DIR* devices = opendir("/sys/bus/pci/devices"); + if (!devices) + fail("opendir(/sys/bus/pci/devices) failed"); + void* regs = nullptr; + void* input = nullptr; + void* output = nullptr; + while (auto* dev = readdir(devices)) { + if (dev->d_name[0] == '.') + continue; + const std::string& vendor = ReadTextFile("/sys/bus/pci/devices/%s/vendor", dev->d_name); + const std::string& device = ReadTextFile("/sys/bus/pci/devices/%s/device", dev->d_name); + debug("PCI device %s: vendor=%s device=%s\n", dev->d_name, vendor.c_str(), device.c_str()); + if (vendor != "0x1af4" || device != "0x1110") + continue; + char filename[1024]; + snprintf(filename, sizeof(filename), "/sys/bus/pci/devices/%s/resource2", dev->d_name); + int res2 = open(filename, O_RDWR); + if (res2 == -1) + fail("failed to open ivshmem resource2"); + struct stat statbuf; + if (fstat(res2, &statbuf)) + fail("failed to fstat ivshmem resource2"); + debug("ivshmem resource2 size %zu\n", static_cast<size_t>(statbuf.st_size)); + // The only way to distinguish the 2 ivshmem regions is by size. + if (statbuf.st_size == static_cast<uint64>(rpc::Const::SnapshotDoorbellSize)) { + snprintf(filename, sizeof(filename), "/sys/bus/pci/devices/%s/resource0", dev->d_name); + int res0 = open(filename, O_RDWR); + if (res0 == -1) + fail("failed to open ivshmem resource0"); + regs = mmap(nullptr, getpagesize(), PROT_READ | PROT_WRITE, MAP_SHARED, res0, 0); + close(res0); + if (regs == MAP_FAILED) + fail("failed to mmap ivshmem resource0"); + debug("mapped doorbell registers at %p\n", regs); + } else if (statbuf.st_size == static_cast<uint64>(rpc::Const::SnapshotShmemSize)) { + input = mmap(nullptr, static_cast<uint64>(rpc::Const::MaxInputSize), + PROT_READ, MAP_SHARED, res2, 0); + output = mmap(nullptr, static_cast<uint64>(rpc::Const::MaxOutputSize), + PROT_READ | PROT_WRITE, MAP_SHARED, res2, + static_cast<uint64>(rpc::Const::MaxInputSize)); + if (input == MAP_FAILED || output == MAP_FAILED) + fail("failed to mmap ivshmem resource2"); + debug("mapped shmem input at at %p/%llu\n", + input, static_cast<uint64>(rpc::Const::MaxInputSize)); + debug("mapped shmem output at at %p/%llu\n", + output, static_cast<uint64>(rpc::Const::MaxOutputSize)); + } + close(res2); + } + closedir(devices); + if (regs == nullptr || input == nullptr) + fail("cannot find ivshmem PCI devices"); + ivs.doorbell = static_cast<uint32*>(regs) + 3; + ivs.hdr = static_cast<rpc::SnapshotHeaderT*>(output); + ivs.input = input; + output_data = reinterpret_cast<OutputData*>(static_cast<char*>(output) + sizeof(rpc::SnapshotHeaderT)); + output_size = static_cast<uint64>(rpc::Const::MaxOutputSize) - sizeof(rpc::SnapshotHeaderT); +} + +static void SnapshotSetup(char** argv, int argc) +{ + flag_snapshot = true; + // This allows to see debug output during early setup. + // If debug is not actually enabled, it will be turned off in parse_handshake. + flag_debug = true; +#if GOOS_linux + // In snapshot mode executor output is redirected to /dev/kmsg. + // This is required to turn off rate limiting of writes. + write_file("/proc/sys/kernel/printk_devkmsg", "on\n"); +#endif + FindIvshmemDevices(); + // Wait for the host to write handshake_req into input memory. + while (ivs.hdr->state != rpc::SnapshotState::Handshake) + sleep_ms(10); + auto msg = flatbuffers::GetRoot<rpc::SnapshotHandshake>(ivs.input); + handshake_req req = { + .magic = kInMagic, + .use_cover_edges = msg->cover_edges(), + .is_kernel_64_bit = msg->kernel_64_bit(), + .flags = msg->env_flags(), + .pid = 0, + .sandbox_arg = static_cast<uint64>(msg->sandbox_arg()), + .syscall_timeout_ms = static_cast<uint64>(msg->syscall_timeout_ms()), + .program_timeout_ms = static_cast<uint64>(msg->program_timeout_ms()), + .slowdown_scale = static_cast<uint64>(msg->slowdown()), + }; + parse_handshake(req); + for (const auto& feat : features) { + if (!(msg->features() & feat.id)) + continue; + debug("setting up feature %s\n", rpc::EnumNameFeature(feat.id)); + const char* reason = feat.setup(); + if (reason) + failmsg("feature setup failed", "reason: %s", reason); + } +} + +constexpr size_t kOutputPopulate = 256 << 10; +constexpr size_t kInputPopulate = 64 << 10; +constexpr size_t kGlobalsPopulate = 4 << 10; +constexpr size_t kDataPopulate = 8 << 10; +constexpr size_t kCoveragePopulate = 32 << 10; +constexpr size_t kThreadsPopulate = 2; + +static void SnapshotSetState(rpc::SnapshotState state) +{ + debug("changing stapshot state %s -> %s\n", + rpc::EnumNameSnapshotState(ivs.hdr->state), rpc::EnumNameSnapshotState(state)); + std::atomic_signal_fence(std::memory_order_seq_cst); + ivs.hdr->state = state; + // The register contains VM index shifted by 16 (the host part is VM index 1) + // + interrup vector index (0 in our case). + *ivs.doorbell = 1 << 16; +} + +// PopulateMemory prefaults anon memory (we want to avoid minor page faults as well). +static void PopulateMemory(void* ptr, size_t size) +{ + ptr = (void*)(uintptr_t(ptr) & ~(getpagesize() - 1)); + if (madvise(ptr, size, MADV_POPULATE_WRITE)) + failmsg("populate madvise failed", "ptr=%p size=%zu", ptr, size); +} + +// TouchMemory prefaults non-anon shared memory. +static void TouchMemory(void* ptr, size_t size) +{ + size_t const kPageSize = getpagesize(); + for (size_t i = 0; i < size; i += kPageSize) + (void)((volatile char*)ptr)[i]; +} + +#if SYZ_EXECUTOR_USES_FORK_SERVER +static void SnapshotPrepareParent() +{ + TouchMemory((char*)output_data + output_size - kOutputPopulate, kOutputPopulate); + // Notify SnapshotStart that we finished prefaulting memory in the parent. + output_data->completed = 1; + // Wait for the request to come, so that we give it full time slice to execute. + // This process will start waiting for the child as soon as we return. + while (ivs.hdr->state != rpc::SnapshotState::Execute) + ; +} +#endif + +static void SnapshotStart() +{ + debug("SnapshotStart\n"); + // Prefault as much memory as we can before the snapshot is taken. + // Also pre-create some threads and let them block. + // This is intended to make execution after each snapshot restore faster, + // as we won't need to do that duplicate work again and again. + flag_threaded = true; + for (size_t i = 0; i < kThreadsPopulate; i++) { + thread_t* th = &threads[i]; + thread_create(th, i, flag_coverage); + if (flag_coverage) + PopulateMemory(th->cov.data, kCoveragePopulate); + } + TouchMemory((char*)output_data + output_size - kOutputPopulate, kOutputPopulate); + TouchMemory(ivs.input, kInputPopulate); + PopulateMemory(&flag_coverage, kGlobalsPopulate); + PopulateMemory((void*)SYZ_DATA_OFFSET, kDataPopulate); + sleep_ms(100); // let threads start and block + // Wait for the parent process to prefault as well. + while (!output_data->completed) + sleep_ms(1); + // Notify host that we are ready to be snapshotted. + SnapshotSetState(rpc::SnapshotState::Ready); + // Snapshot is restored here. + // First time we may loop here while the snapshot is taken, + // but afterwards we should be restored when the state is already Execute. + // Note: we don't use sleep in the loop because we may be snapshotted while in the sleep syscall. + // As the result each execution after snapshot restore will be slower as it will need to finish + // the sleep and return from the syscall. + while (ivs.hdr->state == rpc::SnapshotState::Ready) + ; + if (ivs.hdr->state == rpc::SnapshotState::Snapshotted) { + // First time around, just acknowledge and wait for snapshot restart. + SnapshotSetState(rpc::SnapshotState::Executed); + for (;;) + sleep(1000); + } + // Resumed for program execution. + output_data->Reset(); + auto msg = flatbuffers::GetRoot<rpc::SnapshotRequest>(ivs.input); + execute_req req = { + .magic = kInMagic, + .id = 0, + .exec_flags = static_cast<uint64>(msg->exec_flags()), + .all_call_signal = msg->all_call_signal(), + .all_extra_signal = msg->all_extra_signal(), + }; + parse_execute(req); + output_data->num_calls.store(msg->num_calls(), std::memory_order_relaxed); + input_data = const_cast<uint8*>(msg->prog_data()->Data()); +} + +NORETURN static void SnapshotDone(bool failed) +{ + debug("SnapshotDone\n"); + uint32 num_calls = output_data->num_calls.load(std::memory_order_relaxed); + auto data = finish_output(output_data, 0, 0, num_calls, 0, 0, failed ? kFailStatus : 0, nullptr); + ivs.hdr->output_offset = data.data() - reinterpret_cast<volatile uint8_t*>(ivs.hdr); + ivs.hdr->output_size = data.size(); + SnapshotSetState(failed ? rpc::SnapshotState::Failed : rpc::SnapshotState::Executed); + // Wait to be restarted from the snapshot. + for (;;) + sleep(1000); +} |
