diff options
| author | Dmitry Vyukov <dvyukov@google.com> | 2024-10-21 11:53:44 +0200 |
|---|---|---|
| committer | Dmitry Vyukov <dvyukov@google.com> | 2024-10-24 09:34:38 +0000 |
| commit | 9fc8fe026baab9959459256f2d47f4bbf21d405a (patch) | |
| tree | 6d97a7ac2b8e69f5fa7a92a4b3824b1ad9e571c7 /pkg/fuzzer | |
| parent | a85e9d5032fdf305457a6400bd3af4a8df6c45c4 (diff) | |
executor: better handling for hanged test processes
Currently we kill hanged processes and consider the corresponding test finished.
We don't kill/wait for the actual test subprocess (we don't know its pid to kill,
and waiting will presumably hang). This has 2 problems:
1. If the hanged process causes "task hung" report, we can't reproduce it,
since the test finished too long ago (manager thinks its finished and
discards the request).
2. The test process still consumed per-pid resources.
Explicitly detect and handle such cases:
Manager keeps these hanged tests forever,
and we assign a new proc id for future processes
(don't reuse the hanged one).
Diffstat (limited to 'pkg/fuzzer')
| -rw-r--r-- | pkg/fuzzer/fuzzer.go | 9 | ||||
| -rw-r--r-- | pkg/fuzzer/queue/queue.go | 10 | ||||
| -rw-r--r-- | pkg/fuzzer/queue/retry.go | 28 |
3 files changed, 33 insertions, 14 deletions
diff --git a/pkg/fuzzer/fuzzer.go b/pkg/fuzzer/fuzzer.go index c5f5526ea..1f2fd90a8 100644 --- a/pkg/fuzzer/fuzzer.go +++ b/pkg/fuzzer/fuzzer.go @@ -122,13 +122,14 @@ func (fuzzer *Fuzzer) enqueue(executor queue.Executor, req *queue.Request, flags } func (fuzzer *Fuzzer) processResult(req *queue.Request, res *queue.Result, flags ProgFlags, attempt int) bool { - inTriage := flags&progInTriage > 0 + // If we are already triaging this exact prog, this is flaky coverage. + // Hanged programs are harmful as they consume executor procs. + dontTriage := flags&progInTriage > 0 || res.Status == queue.Hanged // Triage the program. // We do it before unblocking the waiting threads because // it may result it concurrent modification of req.Prog. - // If we are already triaging this exact prog, this is flaky coverage. var triage map[int]*triageCall - if req.ExecOpts.ExecFlags&flatrpc.ExecFlagCollectSignal > 0 && res.Info != nil && !inTriage { + if req.ExecOpts.ExecFlags&flatrpc.ExecFlagCollectSignal > 0 && res.Info != nil && !dontTriage { for call, info := range res.Info.Calls { fuzzer.triageProgCall(req.Prog, info, call, &triage) } @@ -168,7 +169,7 @@ func (fuzzer *Fuzzer) processResult(req *queue.Request, res *queue.Result, flags // In non-snapshot mode usually we are not sure which exactly input caused the crash, // so give it one more chance. In snapshot mode we know for sure, so don't retry. maxCandidateAttempts = 2 - if fuzzer.Config.Snapshot { + if fuzzer.Config.Snapshot || res.Status == queue.Hanged { maxCandidateAttempts = 0 } } diff --git a/pkg/fuzzer/queue/queue.go b/pkg/fuzzer/queue/queue.go index cbdb2ba19..0a56c76d3 100644 --- a/pkg/fuzzer/queue/queue.go +++ b/pkg/fuzzer/queue/queue.go @@ -161,7 +161,14 @@ func (r *Result) clone() *Result { } func (r *Result) Stop() bool { - return r.Status == ExecFailure || r.Status == Crashed + switch r.Status { + case Success, Restarted: + return false + case ExecFailure, Crashed, Hanged: + return true + default: + panic(fmt.Sprintf("unhandled status %v", r.Status)) + } } type Status int @@ -171,6 +178,7 @@ const ( ExecFailure // For e.g. serialization errors. Crashed // The VM crashed holding the request. Restarted // The VM was restarted holding the request. + Hanged // The program has hanged (can't be killed/waited). ) // Executor describes the interface wanted by the producers of requests. diff --git a/pkg/fuzzer/queue/retry.go b/pkg/fuzzer/queue/retry.go index c59a2c048..186850b5b 100644 --- a/pkg/fuzzer/queue/retry.go +++ b/pkg/fuzzer/queue/retry.go @@ -3,6 +3,10 @@ package queue +import ( + "fmt" +) + type retryer struct { pq *PlainQueue base Source @@ -28,16 +32,22 @@ func (r *retryer) Next() *Request { } func (r *retryer) done(req *Request, res *Result) bool { - // The input was on a restarted VM. - if res.Status == Restarted { - r.pq.Submit(req) - return false - } - // Retry important requests from crashed VMs once. - if res.Status == Crashed && req.Important && !req.onceCrashed { - req.onceCrashed = true + switch res.Status { + case Success, ExecFailure, Hanged: + return true + case Restarted: + // The input was on a restarted VM. r.pq.Submit(req) return false + case Crashed: + // Retry important requests from crashed VMs once. + if req.Important && !req.onceCrashed { + req.onceCrashed = true + r.pq.Submit(req) + return false + } + return true + default: + panic(fmt.Sprintf("unhandled status %v", res.Status)) } - return true } |
