diff options
| author | Aleksandr Nogikh <nogikh@google.com> | 2025-04-11 14:08:10 +0200 |
|---|---|---|
| committer | Aleksandr Nogikh <nogikh@google.com> | 2025-04-15 08:18:26 +0000 |
| commit | 81b407fe4e14e1e40fa060509157143e51ec1739 (patch) | |
| tree | ccaf1229e9ce952bbb832d81c9c5d0501ebdb0de /pkg | |
| parent | 0bd6db418098e2d98a2edf948b41410d3d9f9e70 (diff) | |
pkg/manager: wait until corpus is triaged for diff fuzzing
Track more precisely the right moment to start bug reproductions:
1) Either once 90% of the corpus is triaged (*).
2) Or once we are past 50% of the time dedicated for fuzzing.
Whatever happens earlier.
(*) The last few percent are usually quite slow to triage, and they bring far
fewer covered PCs than all the previous ones.
Diffstat (limited to 'pkg')
| -rw-r--r-- | pkg/fuzzer/fuzzer.go | 6 | ||||
| -rw-r--r-- | pkg/manager/diff.go | 61 |
2 files changed, 61 insertions, 6 deletions
diff --git a/pkg/fuzzer/fuzzer.go b/pkg/fuzzer/fuzzer.go index 3e5f94fc3..17c5af47b 100644 --- a/pkg/fuzzer/fuzzer.go +++ b/pkg/fuzzer/fuzzer.go @@ -106,8 +106,12 @@ func newExecQueues(fuzzer *Fuzzer) execQueues { return ret } +func (fuzzer *Fuzzer) CandidatesToTriage() int { + return fuzzer.statCandidates.Val() + fuzzer.statJobsTriageCandidate.Val() +} + func (fuzzer *Fuzzer) CandidateTriageFinished() bool { - return fuzzer.statCandidates.Val()+fuzzer.statJobsTriageCandidate.Val() == 0 + return fuzzer.CandidatesToTriage() == 0 } func (fuzzer *Fuzzer) execute(executor queue.Executor, req *queue.Request) *queue.Result { diff --git a/pkg/manager/diff.go b/pkg/manager/diff.go index 93922859b..0fc9b2bad 100644 --- a/pkg/manager/diff.go +++ b/pkg/manager/diff.go @@ -40,6 +40,11 @@ type DiffFuzzerConfig struct { Debug bool PatchedOnly chan *UniqueBug ArtifactsDir string // Where to store the artifacts that supplement the logs. + // The fuzzer waits no more than MaxTriageTime time until it starts taking VMs away + // for bug reproduction. + // The option may help find a balance between spending too much time triaging + // the corpus and not reaching a proper kernel coverage. + MaxTriageTime time.Duration } type UniqueBug struct { @@ -76,6 +81,7 @@ func RunDiffFuzzer(ctx context.Context, baseCfg, newCfg *mgrconfig.Config, cfg D store := &DiffFuzzerStore{BasePath: cfg.ArtifactsDir} diffCtx := &diffContext{ + cfg: cfg, doneRepro: make(chan *ReproResult), base: base, new: new, @@ -102,6 +108,7 @@ func RunDiffFuzzer(ctx context.Context, baseCfg, newCfg *mgrconfig.Config, cfg D } type diffContext struct { + cfg DiffFuzzerConfig store *DiffFuzzerStore http *HTTPServer @@ -126,10 +133,8 @@ func (dc *diffContext) Loop(baseCtx context.Context) error { g.Go(func() error { // Let both base and patched instances somewhat progress in fuzzing before we take // VMs away for bug reproduction. - // TODO: determine the exact moment of corpus triage. 
- select { - case <-time.After(15 * time.Minute): - case <-ctx.Done(): + dc.waitCorpusTriage(ctx) + if ctx.Err() != nil { return nil } log.Logf(0, "starting bug reproductions") @@ -202,6 +207,35 @@ loop: return g.Wait() } +func (dc *diffContext) waitCorpusTriage(ctx context.Context) { + // Wait either until we have triaged 90% of the candidates or + // once MaxTriageTime has passed. + // We don't want to wait for 100% of the candidates because there's usually a long + // tail of slow triage jobs + the value of the candidates diminishes over time. + const triagedThreshold = 0.9 + const backOffTime = 30 * time.Second + + startedAt := time.Now() + for { + select { + case <-time.After(backOffTime): + case <-ctx.Done(): + return + } + triaged := dc.new.triageProgress() + if triaged >= triagedThreshold { + log.Logf(0, "triaged %.1f%% of the corpus", triaged*100.0) + return + } + if dc.cfg.MaxTriageTime != 0 && + time.Since(startedAt) >= dc.cfg.MaxTriageTime { + log.Logf(0, "timed out waiting for %.1f%% triage (have %.1f%%)", + triagedThreshold*100.0, triaged*100.0) + return + } + } +} + // TODO: instead of this limit, consider expotentially growing delays between reproduction attempts. const maxReproAttempts = 6 @@ -268,6 +302,8 @@ type kernelContext struct { pool *vm.Dispatcher features flatrpc.Feature candidates chan []fuzzer.Candidate + // Once candidates is assigned, candidatesCount holds their original count. + candidatesCount atomic.Int64 coverFilters CoverageFilters reportGenerator *ReportGeneratorWrapper @@ -381,7 +417,6 @@ func (kc *kernelContext) setupFuzzer(features flatrpc.Feature, syscalls map[*pro log.Logf(level, msg, args...) }, }, rnd, kc.cfg.Target) - kc.fuzzer.Store(fuzzerObj) if kc.http != nil { kc.http.Fuzzer.Store(fuzzerObj) @@ -396,6 +431,9 @@ func (kc *kernelContext) setupFuzzer(features flatrpc.Feature, syscalls map[*pro // The loop will be aborted later. 
break } + // We assign kc.fuzzer after kc.candidatesCount to simplify the triageProgress implementation. + kc.candidatesCount.Store(int64(len(candidates))) + kc.fuzzer.Store(fuzzerObj) filtered := FilterCandidates(candidates, syscalls, false).Candidates log.Logf(0, "%s: adding %d seeds", kc.name, len(filtered)) @@ -485,6 +523,19 @@ func (kc *kernelContext) runInstance(ctx context.Context, inst *vm.Instance, return rep, err } +func (kc *kernelContext) triageProgress() float64 { + fuzzer := kc.fuzzer.Load() + if fuzzer == nil { + return 0 + } + total := kc.candidatesCount.Load() + if total == 0.0 { + // There were no candidates in the first place. + return 1 + } + return 1.0 - float64(fuzzer.CandidatesToTriage())/float64(total) +} + // reproRunner is used to run reproducers on the base kernel to determine whether it is affected. type reproRunner struct { done chan reproRunnerResult |
