3 files changed, 75 insertions, 16 deletions
diff --git a/executor/executor.cc b/executor/executor.cc
index 305675bce..5b5e06422 100644
--- a/executor/executor.cc
+++ b/executor/executor.cc
@@ -470,7 +470,8 @@ static void setup_control_pipes();
 static bool coverage_filter(uint64 pc);
 static rpc::ComparisonRaw convert(const kcov_comparison_t& cmp);
 static flatbuffers::span<uint8_t> finish_output(OutputData* output, int proc_id, uint64 req_id, uint32 num_calls,
-						uint64 elapsed, uint64 freshness, uint32 status, const std::vector<uint8_t>* process_output);
+						uint64 elapsed, uint64 freshness, uint32 status, bool hanged,
+						const std::vector<uint8_t>* process_output);
 static void parse_execute(const execute_req& req);
 static void parse_handshake(const handshake_req& req);
 
@@ -1343,7 +1344,7 @@ void write_extra_output()
 }
 
 flatbuffers::span<uint8_t> finish_output(OutputData* output, int proc_id, uint64 req_id, uint32 num_calls, uint64 elapsed,
-					 uint64 freshness, uint32 status, const std::vector<uint8_t>* process_output)
+					 uint64 freshness, uint32 status, bool hanged, const std::vector<uint8_t>* process_output)
 {
 	// In snapshot mode the output size is fixed and output_size is always initialized, so use it.
 	int out_size = flag_snapshot ? output_size : output->size.load(std::memory_order_relaxed) ?
@@ -1376,7 +1377,7 @@ flatbuffers::span<uint8_t> finish_output(OutputData* output, int proc_id, uint64
 	flatbuffers::Offset<flatbuffers::Vector<uint8_t>> output_off = 0;
 	if (process_output)
 		output_off = fbb.CreateVector(*process_output);
-	auto exec_off = rpc::CreateExecResultRaw(fbb, req_id, proc_id, output_off, error_off, prog_info_off);
+	auto exec_off = rpc::CreateExecResultRaw(fbb, req_id, proc_id, output_off, hanged, error_off, prog_info_off);
 	auto msg_off = rpc::CreateExecutorMessageRaw(fbb, rpc::ExecutorMessagesRaw::ExecResult,
 						     flatbuffers::Offset<void>(exec_off.o));
 	fbb.FinishSizePrefixed(msg_off);
diff --git a/executor/executor_runner.h b/executor/executor_runner.h
index 30a5f3a6d..5680f8fc2 100644
--- a/executor/executor_runner.h
+++ b/executor/executor_runner.h
@@ -27,16 +27,62 @@ inline std::ostream& operator<<(std::ostream& ss, const rpc::ExecRequestRawT& re
 		  << "\n";
 }
 
+// ProcIDPool allows to reuse a set of unique proc IDs across a set of subprocesses.
+//
+// When a subprocess hangs, it's a bit unclear what to do (we don't have means to kill
+// the whole tree of its children, and waiting for all them will presumably hang as well).
+// Later there may appear a "task hung" report from the kernel, so we don't want to terminate
+// the VM immidiatly. But the "task hung" report may also not appear at all, so we can't
+// just wait for a hanged subprocesses forever.
+//
+// So in that case we kill/wait just the top subprocesses, and give it a new proc ID
+// (since some resources associated with the old proc ID may still be used by the old
+// unterminated test processes). However, we don't have infinite number of proc IDs,
+// so we recycle them in FIFO order. This is not ideal, but it looks like the best
+// practical solution.
+class ProcIDPool
+{
+public:
+	ProcIDPool(int num_procs)
+	{
+		// Theoretically we have 32 procs (prog.MaxPids), but there are some limitations in descriptions
+		// that make them work well only for up to 10 procs. For example, we form /dev/loopN
+		// device name using proc['0', 1, int8]. When these limitations are fixed,
+		// we can use all 32 here (prog.MaxPids)
+		constexpr int kNumGoodProcs = 10;
+		for (int i = 0; i < std::max(num_procs, kNumGoodProcs); i++)
+			ids_.push_back(i);
+	}
+
+	int Alloc(int old = -1)
+	{
+		if (old >= 0)
+			ids_.push_back(old);
+		if (ids_.empty())
+			fail("out of proc ids");
+		int id = ids_.front();
+		ids_.pop_front();
+		return id;
+	}
+
+private:
+	std::deque<int> ids_;
+
+	ProcIDPool(const ProcIDPool&) = delete;
+	ProcIDPool& operator=(const ProcIDPool&) = delete;
+};
+
 // Proc represents one subprocess that runs tests (re-execed syz-executor with 'exec' argument).
 // The object is persistent and re-starts subprocess when it crashes.
 class Proc
 {
 public:
-	Proc(Connection& conn, const char* bin, int id, int& restarting, const bool& corpus_triaged, int max_signal_fd, int cover_filter_fd,
+	Proc(Connection& conn, const char* bin, int id, ProcIDPool& proc_id_pool, int& restarting, const bool& corpus_triaged, int max_signal_fd, int cover_filter_fd,
 	     bool use_cover_edges, bool is_kernel_64_bit, uint32 slowdown, uint32 syscall_timeout_ms, uint32 program_timeout_ms)
 	    : conn_(conn),
 	      bin_(bin),
-	      id_(id),
+	      proc_id_pool_(proc_id_pool),
+	      id_(proc_id_pool.Alloc()),
 	      restarting_(restarting),
 	      corpus_triaged_(corpus_triaged),
 	      max_signal_fd_(max_signal_fd),
@@ -92,7 +138,7 @@ public:
 			// fork server is enabled, so we use quite large timeout. Child process can be slow
 			// due to global locks in namespaces and other things, so let's better wait than
 			// report false misleading crashes.
-			uint64 timeout = 2 * program_timeout_ms_;
+			uint64 timeout = 3 * program_timeout_ms_;
 #else
 			uint64 timeout = program_timeout_ms_;
 #endif
@@ -134,7 +180,8 @@ private:
 
 	Connection& conn_;
 	const char* const bin_;
-	const int id_;
+	ProcIDPool& proc_id_pool_;
+	int id_;
 	int& restarting_;
 	const bool& corpus_triaged_;
 	const int max_signal_fd_;
@@ -215,7 +262,15 @@ private:
 				while (ReadOutput())
 					;
 			}
-			HandleCompletion(status);
+			bool hanged = SYZ_EXECUTOR_USES_FORK_SERVER && state_ == State::Executing;
+			HandleCompletion(status, hanged);
+			if (hanged) {
+				// If the process has hanged, it may still be using per-proc resources,
+				// so allocate a fresh proc id.
+				int new_id = proc_id_pool_.Alloc(id_);
+				debug("proc %d: changing proc id to %d\n", id_, new_id);
+				id_ = new_id;
+			}
 		} else if (attempts_ > 3)
 			sleep_ms(100 * attempts_);
 		Start();
@@ -350,7 +405,7 @@ private:
 		}
 	}
 
-	void HandleCompletion(uint32 status)
+	void HandleCompletion(uint32 status, bool hanged = false)
 	{
 		if (!msg_)
 			fail("don't have executed msg");
@@ -370,7 +425,7 @@ private:
 			}
 		}
 		uint32 num_calls = read_input(&prog_data);
-		auto data = finish_output(resp_mem_, id_, msg_->id, num_calls, elapsed, freshness_++, status, output);
+		auto data = finish_output(resp_mem_, id_, msg_->id, num_calls, elapsed, freshness_++, status, hanged, output);
 		conn_.Send(data.data(), data.size());
 
 		resp_mem_->Reset();
@@ -458,12 +513,14 @@ public:
 	    : conn_(conn),
 	      vm_index_(vm_index)
 	{
-		size_t num_procs = Handshake();
+		int num_procs = Handshake();
+		proc_id_pool_.emplace(num_procs);
 		int max_signal_fd = max_signal_ ? max_signal_->FD() : -1;
 		int cover_filter_fd = cover_filter_ ? cover_filter_->FD() : -1;
-		for (size_t i = 0; i < num_procs; i++)
-			procs_.emplace_back(new Proc(conn, bin, i, restarting_, corpus_triaged_, max_signal_fd, cover_filter_fd,
-						     use_cover_edges_, is_kernel_64_bit_, slowdown_, syscall_timeout_ms_, program_timeout_ms_));
+		for (int i = 0; i < num_procs; i++)
+			procs_.emplace_back(new Proc(conn, bin, i, *proc_id_pool_, restarting_, corpus_triaged_,
+						     max_signal_fd, cover_filter_fd, use_cover_edges_, is_kernel_64_bit_, slowdown_,
+						     syscall_timeout_ms_, program_timeout_ms_));
 
 		for (;;)
 			Loop();
@@ -474,6 +531,7 @@ private:
 	const int vm_index_;
 	std::optional<CoverFilter> max_signal_;
 	std::optional<CoverFilter> cover_filter_;
+	std::optional<ProcIDPool> proc_id_pool_;
 	std::vector<std::unique_ptr<Proc>> procs_;
 	std::deque<rpc::ExecRequestRawT> requests_;
 	std::vector<std::string> leak_frames_;
@@ -545,7 +603,7 @@ private:
 			failmsg("bad restarting", "restarting=%d", restarting_);
 	}
 
-	size_t Handshake()
+	int Handshake()
 	{
 		rpc::ConnectRequestRawT conn_req;
 		conn_req.id = vm_index_;
diff --git a/executor/snapshot.h b/executor/snapshot.h
index ce3e355b0..462fb2c56 100644
--- a/executor/snapshot.h
+++ b/executor/snapshot.h
@@ -251,7 +251,7 @@ NORETURN static void SnapshotDone(bool failed)
 	debug("SnapshotDone\n");
 	CoverAccessScope scope(nullptr);
 	uint32 num_calls = output_data->num_calls.load(std::memory_order_relaxed);
-	auto data = finish_output(output_data, 0, 0, num_calls, 0, 0, failed ? kFailStatus : 0, nullptr);
+	auto data = finish_output(output_data, 0, 0, num_calls, 0, 0, failed ? kFailStatus : 0, false, nullptr);
 	ivs.hdr->output_offset = data.data() - reinterpret_cast<volatile uint8_t*>(ivs.hdr);
 	ivs.hdr->output_size = data.size();
 	SnapshotSetState(failed ? rpc::SnapshotState::Failed : rpc::SnapshotState::Executed);