diff options
| author | Aleksandr Nogikh <nogikh@google.com> | 2026-01-23 17:18:49 +0100 |
|---|---|---|
| committer | Aleksandr Nogikh <nogikh@google.com> | 2026-01-23 20:35:29 +0000 |
| commit | 4f25b9b48bdfbc7adeaf320f8d92067afe509b49 (patch) | |
| tree | e66a51fc3786e8bb8abba5ca89cfc152a59e971f /pkg/manager/diff/manager.go | |
| parent | b4afeb6fb8cde041ba03048f5e123ed3f304a5e6 (diff) | |
pkg/manager: split off diff fuzzer functionality
Move the code to a separate pkg/manager/diff package. Split the
code into several files.
Diffstat (limited to 'pkg/manager/diff/manager.go')
| -rw-r--r-- | pkg/manager/diff/manager.go | 416 |
1 file changed, 416 insertions, 0 deletions
diff --git a/pkg/manager/diff/manager.go b/pkg/manager/diff/manager.go new file mode 100644 index 000000000..cc4d8e0d1 --- /dev/null +++ b/pkg/manager/diff/manager.go @@ -0,0 +1,416 @@ +// Copyright 2026 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +package diff + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "math/rand" + "strings" + "sync" + "time" + + "github.com/google/syzkaller/pkg/fuzzer/queue" + "github.com/google/syzkaller/pkg/log" + "github.com/google/syzkaller/pkg/manager" + "github.com/google/syzkaller/pkg/mgrconfig" + "github.com/google/syzkaller/pkg/report" + "github.com/google/syzkaller/pkg/repro" + "github.com/google/syzkaller/pkg/stat" + "github.com/google/syzkaller/vm" + "golang.org/x/sync/errgroup" +) + +type Config struct { + Debug bool + PatchedOnly chan *Bug + BaseCrashes chan string + Store *manager.DiffFuzzerStore + ArtifactsDir string // Where to store the artifacts that supplement the logs. + // The fuzzer waits no more than MaxTriageTime time until it starts taking VMs away + // for bug reproduction. + // The option may help find a balance between spending too much time triaging + // the corpus and not reaching a proper kernel coverage. + MaxTriageTime time.Duration + // If non-empty, the fuzzer will spend no more than this amount of time + // trying to reach the modified code. The time is counted since the moment + // 99% of the corpus is triaged. + FuzzToReachPatched time.Duration + // The callback may be used to consult external systems on whether + // the crash should be ignored. E.g. because it doesn't match the filter or + // the particular base kernel has already been seen to crash with the given title. + // It helps reduce the number of unnecessary reproductions. 
+ IgnoreCrash func(context.Context, string) (bool, error) +} + +func (cfg *Config) TriageDeadline() <-chan time.Time { + if cfg.MaxTriageTime == 0 { + return nil + } + return time.After(cfg.MaxTriageTime) +} + +type Bug struct { + // The report from the patched kernel. + Report *report.Report + Repro *repro.Result +} + +func Run(ctx context.Context, baseCfg, newCfg *mgrconfig.Config, cfg Config) error { + if cfg.PatchedOnly == nil { + return fmt.Errorf("you must set up a patched only channel") + } + base, err := setup("base", baseCfg, cfg.Debug) + if err != nil { + return err + } + new, err := setup("new", newCfg, cfg.Debug) + if err != nil { + return err + } + eg, ctx := errgroup.WithContext(ctx) + eg.Go(func() error { + info, err := manager.LoadSeeds(newCfg, true) + if err != nil { + return err + } + select { + case new.candidates <- info.Candidates: + case <-ctx.Done(): + } + return nil + }) + + stream := queue.NewRandomQueue(4096, rand.New(rand.NewSource(time.Now().UnixNano()))) + base.source = stream + new.duplicateInto = stream + + diffCtx := &diffContext{ + cfg: cfg, + doneRepro: make(chan *manager.ReproResult), + base: base, + new: new, + store: cfg.Store, + reproAttempts: map[string]int{}, + patchedOnly: cfg.PatchedOnly, + } + if newCfg.HTTP != "" { + diffCtx.http = &manager.HTTPServer{ + Cfg: newCfg, + StartTime: time.Now(), + DiffStore: cfg.Store, + Pools: map[string]*vm.Dispatcher{ + new.name: new.pool, + base.name: base.pool, + }, + } + new.http = diffCtx.http + } + eg.Go(func() error { + return diffCtx.Loop(ctx) + }) + return eg.Wait() +} + +type diffContext struct { + cfg Config + store *manager.DiffFuzzerStore + http *manager.HTTPServer + + doneRepro chan *manager.ReproResult + base *kernelContext + new *kernelContext + patchedOnly chan *Bug + + mu sync.Mutex + reproAttempts map[string]int +} + +const ( + // Don't start reproductions until 90% of the corpus has been triaged. 
+ corpusTriageToRepro = 0.9 + // Start to monitor whether we reached the modified files only after triaging 99%. + corpusTriageToMonitor = 0.99 +) + +func (dc *diffContext) Loop(ctx context.Context) error { + g, groupCtx := errgroup.WithContext(ctx) + reproLoop := manager.NewReproLoop(dc, dc.new.pool.Total()-dc.new.cfg.FuzzingVMs, false) + if dc.http != nil { + dc.http.ReproLoop = reproLoop + g.Go(func() error { + return dc.http.Serve(groupCtx) + }) + } + + g.Go(func() error { + select { + case <-groupCtx.Done(): + return nil + case <-dc.waitCorpusTriage(groupCtx, corpusTriageToRepro): + case <-dc.cfg.TriageDeadline(): + log.Logf(0, "timed out waiting for coprus triage") + } + log.Logf(0, "starting bug reproductions") + reproLoop.Loop(groupCtx) + return nil + }) + + g.Go(func() error { return dc.monitorPatchedCoverage(groupCtx) }) + g.Go(func() error { return dc.base.Loop(groupCtx) }) + g.Go(func() error { return dc.new.Loop(groupCtx) }) + + runner := &reproRunner{done: make(chan reproRunnerResult, 2), kernel: dc.base} + statTimer := time.NewTicker(5 * time.Minute) +loop: + for { + select { + case <-groupCtx.Done(): + break loop + case <-statTimer.C: + vals := make(map[string]int) + for _, stat := range stat.Collect(stat.All) { + vals[stat.Name] = stat.V + } + data, _ := json.MarshalIndent(vals, "", " ") + log.Logf(0, "STAT %s", data) + case rep := <-dc.base.crashes: + log.Logf(1, "base crash: %v", rep.Title) + dc.reportBaseCrash(groupCtx, rep) + case ret := <-runner.done: + dc.handleReproResult(groupCtx, ret, reproLoop) + case ret := <-dc.doneRepro: + // We have finished reproducing a crash from the patched instance. 
+ if ret.Repro != nil && ret.Repro.Report != nil { + origTitle := ret.Crash.Report.Title + if ret.Repro.Report.Title == origTitle { + origTitle = "-SAME-" + } + log.Logf(1, "found repro for %q (orig title: %q, reliability: %2.f), took %.2f minutes", + ret.Repro.Report.Title, origTitle, ret.Repro.Reliability, ret.Stats.TotalTime.Minutes()) + g.Go(func() error { + runner.Run(groupCtx, ret.Repro, ret.Crash.FullRepro) + return nil + }) + } else { + origTitle := ret.Crash.Report.Title + log.Logf(1, "failed repro for %q, err=%s", origTitle, ret.Err) + } + dc.store.SaveRepro(ret) + case rep := <-dc.new.crashes: + // A new crash is found on the patched instance. + crash := &manager.Crash{Report: rep} + need := dc.NeedRepro(crash) + log.Logf(0, "patched crashed: %v [need repro = %v]", + rep.Title, need) + dc.store.PatchedCrashed(rep.Title, rep.Report, rep.Output) + if need { + reproLoop.Enqueue(crash) + } + } + } + return g.Wait() +} + +func (dc *diffContext) handleReproResult(ctx context.Context, ret reproRunnerResult, reproLoop *manager.ReproLoop) { + // We have run the reproducer on the base instance. + + // A sanity check: the base kernel might have crashed with the same title + // since the moment we have stared the reproduction / running on the repro base. + ignored := dc.ignoreCrash(ctx, ret.reproReport.Title) + if ret.crashReport == nil && ignored { + // Report it as error so that we could at least find it in the logs. + log.Errorf("resulting crash of an approved repro result is to be ignored: %s", + ret.reproReport.Title) + } else if ret.crashReport == nil { + dc.store.BaseNotCrashed(ret.reproReport.Title) + select { + case <-ctx.Done(): + case dc.patchedOnly <- &Bug{ + Report: ret.reproReport, + Repro: ret.repro, + }: + } + log.Logf(0, "patched-only: %s", ret.reproReport.Title) + // Now that we know this bug only affects the patch kernel, we can spend more time + // generating a minimalistic repro and a C repro. 
+ if !ret.fullRepro { + reproLoop.Enqueue(&manager.Crash{ + Report: &report.Report{ + Title: ret.reproReport.Title, + Output: ret.repro.Prog.Serialize(), + }, + FullRepro: true, + }) + } + } else { + dc.reportBaseCrash(ctx, ret.crashReport) + log.Logf(0, "crashes both: %s / %s", ret.reproReport.Title, ret.crashReport.Title) + } +} + +func (dc *diffContext) ignoreCrash(ctx context.Context, title string) bool { + if dc.store.EverCrashedBase(title) { + return true + } + // Let's try to ask the external systems about it as well. + if dc.cfg.IgnoreCrash != nil { + ignore, err := dc.cfg.IgnoreCrash(ctx, title) + if err != nil { + log.Logf(0, "a call to IgnoreCrash failed: %v", err) + } else { + if ignore { + log.Logf(0, "base crash %q is to be ignored", title) + } + return ignore + } + } + return false +} + +func (dc *diffContext) reportBaseCrash(ctx context.Context, rep *report.Report) { + dc.store.BaseCrashed(rep.Title, rep.Report) + if dc.cfg.BaseCrashes == nil { + return + } + select { + case dc.cfg.BaseCrashes <- rep.Title: + case <-ctx.Done(): + } +} + +func (dc *diffContext) waitCorpusTriage(ctx context.Context, threshold float64) chan struct{} { + const backOffTime = 30 * time.Second + ret := make(chan struct{}) + go func() { + for { + select { + case <-time.After(backOffTime): + case <-ctx.Done(): + return + } + triaged := dc.new.triageProgress() + if triaged >= threshold { + log.Logf(0, "triaged %.1f%% of the corpus", triaged*100.0) + close(ret) + return + } + } + }() + return ret +} + +var ErrPatchedAreaNotReached = errors.New("fuzzer has not reached the patched area") + +func (dc *diffContext) monitorPatchedCoverage(ctx context.Context) error { + if dc.cfg.FuzzToReachPatched == 0 { + // The feature is disabled. + return nil + } + + // First wait until we have almost triaged all of the corpus. 
+ select { + case <-ctx.Done(): + return nil + case <-dc.waitCorpusTriage(ctx, corpusTriageToMonitor): + } + + // By this moment, we must have coverage filters already filled out. + focusPCs := 0 + // The last one is "everything else", so it's not of interest. + coverFilters := dc.new.coverFilters + for i := 0; i < len(coverFilters.Areas)-1; i++ { + focusPCs += len(coverFilters.Areas[i].CoverPCs) + } + if focusPCs == 0 { + // No areas were configured. + log.Logf(1, "no PCs in the areas of focused fuzzing, skipping the zero patched coverage check") + return nil + } + + // Then give the fuzzer some change to get through. + select { + case <-time.After(dc.cfg.FuzzToReachPatched): + case <-ctx.Done(): + return nil + } + focusAreaStats := dc.new.progsPerArea() + if focusAreaStats[symbolsArea]+focusAreaStats[filesArea]+focusAreaStats[includesArea] > 0 { + log.Logf(0, "fuzzer has reached the modified code (%d + %d + %d), continuing fuzzing", + focusAreaStats[symbolsArea], focusAreaStats[filesArea], focusAreaStats[includesArea]) + return nil + } + log.Logf(0, "fuzzer has not reached the modified code in %s, aborting", + dc.cfg.FuzzToReachPatched) + return ErrPatchedAreaNotReached +} + +// TODO: instead of this limit, consider expotentially growing delays between reproduction attempts. +const maxReproAttempts = 6 + +func needReproForTitle(title string) bool { + if strings.Contains(title, "no output") || + strings.Contains(title, "lost connection") || + strings.Contains(title, "detected stall") || + strings.Contains(title, "SYZ") { + // Don't waste time reproducing these. 
+ return false + } + return true +} + +func (dc *diffContext) NeedRepro(crash *manager.Crash) bool { + if crash.FullRepro { + return true + } + if !needReproForTitle(crash.Title) { + return false + } + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + if dc.ignoreCrash(ctx, crash.Title) { + return false + } + dc.mu.Lock() + defer dc.mu.Unlock() + return dc.reproAttempts[crash.Title] <= maxReproAttempts +} + +func (dc *diffContext) RunRepro(ctx context.Context, crash *manager.Crash) *manager.ReproResult { + dc.mu.Lock() + dc.reproAttempts[crash.Title]++ + dc.mu.Unlock() + + res, stats, err := repro.Run(ctx, crash.Output, repro.Environment{ + Config: dc.new.cfg, + Features: dc.new.features, + Reporter: dc.new.reporter, + Pool: dc.new.pool, + Fast: !crash.FullRepro, + }) + if res != nil && res.Report != nil { + dc.mu.Lock() + dc.reproAttempts[res.Report.Title] = maxReproAttempts + dc.mu.Unlock() + } + ret := &manager.ReproResult{ + Crash: crash, + Repro: res, + Stats: stats, + Err: err, + } + + select { + case dc.doneRepro <- ret: + case <-ctx.Done(): + // If the context is cancelled, no one may be listening on doneRepro. + } + return ret +} + +func (dc *diffContext) ResizeReproPool(size int) { + dc.new.pool.ReserveForRun(size) +} |
