executor: better handling for hanged test processes

Currently we kill hanged processes and consider the corresponding test finished. We don't kill/wait for the actual test subprocess (we don't know its pid to kill, and waiting will presumably hang). This has 2 problems: 1. If the hanged process causes "task hung" report, we can't reproduce it, since the test finished too long ago (manager thinks its finished and discards the request). 2. The test process still consumed per-pid resources. Explicitly detect and handle such cases: Manager keeps these hanged tests forever, and we assign a new proc id for future processes (don't reuse the hanged one).
author: Dmitry Vyukov <dvyukov@google.com> 2024-10-21 11:53:44 +0200
committer: Dmitry Vyukov <dvyukov@google.com> 2024-10-24 09:34:38 +0000
commit: 9fc8fe026baab9959459256f2d47f4bbf21d405a (patch)
tree: 6d97a7ac2b8e69f5fa7a92a4b3824b1ad9e571c7 /pkg/fuzzer/fuzzer.go
parent: a85e9d5032fdf305457a6400bd3af4a8df6c45c4 (diff)
1 files changed, 5 insertions, 4 deletions
diff --git a/pkg/fuzzer/fuzzer.go b/pkg/fuzzer/fuzzer.go
index c5f5526ea..1f2fd90a8 100644
--- a/pkg/fuzzer/fuzzer.go
+++ b/pkg/fuzzer/fuzzer.go
@@ -122,13 +122,14 @@ func (fuzzer *Fuzzer) enqueue(executor queue.Executor, req *queue.Request, flags
 }
 
 func (fuzzer *Fuzzer) processResult(req *queue.Request, res *queue.Result, flags ProgFlags, attempt int) bool {
-	inTriage := flags&progInTriage > 0
+	// If we are already triaging this exact prog, this is flaky coverage.
+	// Hanged programs are harmful as they consume executor procs.
+	dontTriage := flags&progInTriage > 0 || res.Status == queue.Hanged
 	// Triage the program.
 	// We do it before unblocking the waiting threads because
 	// it may result it concurrent modification of req.Prog.
-	// If we are already triaging this exact prog, this is flaky coverage.
 	var triage map[int]*triageCall
-	if req.ExecOpts.ExecFlags&flatrpc.ExecFlagCollectSignal > 0 && res.Info != nil && !inTriage {
+	if req.ExecOpts.ExecFlags&flatrpc.ExecFlagCollectSignal > 0 && res.Info != nil && !dontTriage {
 		for call, info := range res.Info.Calls {
 			fuzzer.triageProgCall(req.Prog, info, call, &triage)
 		}
@@ -168,7 +169,7 @@ func (fuzzer *Fuzzer) processResult(req *queue.Request, res *queue.Result, flags
 		// In non-snapshot mode usually we are not sure which exactly input caused the crash,
 		// so give it one more chance. In snapshot mode we know for sure, so don't retry.
 		maxCandidateAttempts = 2
-		if fuzzer.Config.Snapshot {
+		if fuzzer.Config.Snapshot || res.Status == queue.Hanged {
 			maxCandidateAttempts = 0
 		}
 	}
author	Dmitry Vyukov <dvyukov@google.com>	2024-10-21 11:53:44 +0200
committer	Dmitry Vyukov <dvyukov@google.com>	2024-10-24 09:34:38 +0000
commit	9fc8fe026baab9959459256f2d47f4bbf21d405a (patch)
tree	6d97a7ac2b8e69f5fa7a92a4b3824b1ad9e571c7 /pkg/fuzzer/fuzzer.go
parent	a85e9d5032fdf305457a6400bd3af4a8df6c45c4 (diff)