// Copyright 2026 syzkaller project authors. All rights reserved. // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. package diff import ( "context" "encoding/json" "errors" "fmt" "math/rand" "strings" "sync" "time" "github.com/google/syzkaller/pkg/fuzzer/queue" "github.com/google/syzkaller/pkg/log" "github.com/google/syzkaller/pkg/manager" "github.com/google/syzkaller/pkg/mgrconfig" "github.com/google/syzkaller/pkg/report" "github.com/google/syzkaller/pkg/repro" "github.com/google/syzkaller/pkg/stat" "github.com/google/syzkaller/vm" "golang.org/x/sync/errgroup" ) type Config struct { Debug bool PatchedOnly chan *Bug BaseCrashes chan string Store *manager.DiffFuzzerStore ArtifactsDir string // Where to store the artifacts that supplement the logs. // The fuzzer waits no more than MaxTriageTime time until it starts taking VMs away // for bug reproduction. // The option may help find a balance between spending too much time triaging // the corpus and not reaching a proper kernel coverage. MaxTriageTime time.Duration // If non-empty, the fuzzer will spend no more than this amount of time // trying to reach the modified code. The time is counted since the moment // 99% of the corpus is triaged. FuzzToReachPatched time.Duration // The callback may be used to consult external systems on whether // the crash should be ignored. E.g. because it doesn't match the filter or // the particular base kernel has already been seen to crash with the given title. // It helps reduce the number of unnecessary reproductions. IgnoreCrash func(context.Context, string) (bool, error) } func (cfg *Config) TriageDeadline() <-chan time.Time { if cfg.MaxTriageTime == 0 { return nil } return time.After(cfg.MaxTriageTime) } type Bug struct { // The report from the patched kernel. Report *report.Report Repro *repro.Result } func Run(ctx context.Context, baseCfg, newCfg *mgrconfig.Config, cfg Config) error { if cfg.PatchedOnly == nil { return fmt.Errorf("you must set up a patched only channel") } base, err := setup("base", baseCfg, cfg.Debug) if err != nil { return err } new, err := setup("new", newCfg, cfg.Debug) if err != nil { return err } eg, ctx := errgroup.WithContext(ctx) eg.Go(func() error { info, err := manager.LoadSeeds(newCfg, true) if err != nil { return err } select { case new.candidates <- info.Candidates: case <-ctx.Done(): } return nil }) stream := queue.NewRandomQueue(4096, rand.New(rand.NewSource(time.Now().UnixNano()))) base.source = stream new.duplicateInto = stream diffCtx := &diffContext{ cfg: cfg, doneRepro: make(chan *manager.ReproResult), base: base, new: new, store: cfg.Store, reproAttempts: map[string]int{}, patchedOnly: cfg.PatchedOnly, } if newCfg.HTTP != "" { diffCtx.http = &manager.HTTPServer{ Cfg: newCfg, StartTime: time.Now(), DiffStore: cfg.Store, Pools: map[string]*vm.Dispatcher{ new.name: new.pool, base.name: base.pool, }, } new.http = diffCtx.http } eg.Go(func() error { return diffCtx.Loop(ctx) }) return eg.Wait() } type diffContext struct { cfg Config store *manager.DiffFuzzerStore http *manager.HTTPServer doneRepro chan *manager.ReproResult base *kernelContext new *kernelContext patchedOnly chan *Bug mu sync.Mutex reproAttempts map[string]int } const ( // Don't start reproductions until 90% of the corpus has been triaged. corpusTriageToRepro = 0.9 // Start to monitor whether we reached the modified files only after triaging 99%. corpusTriageToMonitor = 0.99 ) func (dc *diffContext) Loop(ctx context.Context) error { g, groupCtx := errgroup.WithContext(ctx) reproLoop := manager.NewReproLoop(dc, dc.new.pool.Total()-dc.new.cfg.FuzzingVMs, false) if dc.http != nil { dc.http.ReproLoop = reproLoop g.Go(func() error { return dc.http.Serve(groupCtx) }) } g.Go(func() error { select { case <-groupCtx.Done(): return nil case <-dc.waitCorpusTriage(groupCtx, corpusTriageToRepro): case <-dc.cfg.TriageDeadline(): log.Logf(0, "timed out waiting for coprus triage") } log.Logf(0, "starting bug reproductions") reproLoop.Loop(groupCtx) return nil }) g.Go(func() error { return dc.monitorPatchedCoverage(groupCtx) }) g.Go(func() error { return dc.base.Loop(groupCtx) }) g.Go(func() error { return dc.new.Loop(groupCtx) }) runner := &reproRunner{done: make(chan reproRunnerResult, 2), kernel: dc.base} statTimer := time.NewTicker(5 * time.Minute) loop: for { select { case <-groupCtx.Done(): break loop case <-statTimer.C: vals := make(map[string]int) for _, stat := range stat.Collect(stat.All) { vals[stat.Name] = stat.V } data, _ := json.MarshalIndent(vals, "", " ") log.Logf(0, "STAT %s", data) case rep := <-dc.base.crashes: log.Logf(1, "base crash: %v", rep.Title) dc.reportBaseCrash(groupCtx, rep) case ret := <-runner.done: dc.handleReproResult(groupCtx, ret, reproLoop) case ret := <-dc.doneRepro: // We have finished reproducing a crash from the patched instance. if ret.Repro != nil && ret.Repro.Report != nil { origTitle := ret.Crash.Report.Title if ret.Repro.Report.Title == origTitle { origTitle = "-SAME-" } log.Logf(1, "found repro for %q (orig title: %q, reliability: %2.f), took %.2f minutes", ret.Repro.Report.Title, origTitle, ret.Repro.Reliability, ret.Stats.TotalTime.Minutes()) g.Go(func() error { runner.Run(groupCtx, ret.Repro, ret.Crash.FullRepro) return nil }) } else { origTitle := ret.Crash.Report.Title log.Logf(1, "failed repro for %q, err=%s", origTitle, ret.Err) } dc.store.SaveRepro(ret) case rep := <-dc.new.crashes: // A new crash is found on the patched instance. crash := &manager.Crash{Report: rep} need := dc.NeedRepro(crash) log.Logf(0, "patched crashed: %v [need repro = %v]", rep.Title, need) dc.store.PatchedCrashed(rep.Title, rep.Report, rep.Output) if need { reproLoop.Enqueue(crash) } } } return g.Wait() } func (dc *diffContext) handleReproResult(ctx context.Context, ret reproRunnerResult, reproLoop *manager.ReproLoop) { // We have run the reproducer on the base instance. // A sanity check: the base kernel might have crashed with the same title // since the moment we have stared the reproduction / running on the repro base. ignored := dc.ignoreCrash(ctx, ret.reproReport.Title) if ret.crashReport == nil && ignored { // Report it as error so that we could at least find it in the logs. log.Errorf("resulting crash of an approved repro result is to be ignored: %s", ret.reproReport.Title) } else if ret.crashReport == nil { dc.store.BaseNotCrashed(ret.reproReport.Title) select { case <-ctx.Done(): case dc.patchedOnly <- &Bug{ Report: ret.reproReport, Repro: ret.repro, }: } log.Logf(0, "patched-only: %s", ret.reproReport.Title) // Now that we know this bug only affects the patch kernel, we can spend more time // generating a minimalistic repro and a C repro. if !ret.fullRepro { reproLoop.Enqueue(&manager.Crash{ Report: &report.Report{ Title: ret.reproReport.Title, Output: ret.repro.Prog.Serialize(), }, FullRepro: true, }) } } else { dc.reportBaseCrash(ctx, ret.crashReport) log.Logf(0, "crashes both: %s / %s", ret.reproReport.Title, ret.crashReport.Title) } } func (dc *diffContext) ignoreCrash(ctx context.Context, title string) bool { if dc.store.EverCrashedBase(title) { return true } // Let's try to ask the external systems about it as well. if dc.cfg.IgnoreCrash != nil { ignore, err := dc.cfg.IgnoreCrash(ctx, title) if err != nil { log.Logf(0, "a call to IgnoreCrash failed: %v", err) } else { if ignore { log.Logf(0, "base crash %q is to be ignored", title) } return ignore } } return false } func (dc *diffContext) reportBaseCrash(ctx context.Context, rep *report.Report) { dc.store.BaseCrashed(rep.Title, rep.Report) if dc.cfg.BaseCrashes == nil { return } select { case dc.cfg.BaseCrashes <- rep.Title: case <-ctx.Done(): } } func (dc *diffContext) waitCorpusTriage(ctx context.Context, threshold float64) chan struct{} { const backOffTime = 30 * time.Second ret := make(chan struct{}) go func() { for { select { case <-time.After(backOffTime): case <-ctx.Done(): return } triaged := dc.new.triageProgress() if triaged >= threshold { log.Logf(0, "triaged %.1f%% of the corpus", triaged*100.0) close(ret) return } } }() return ret } var ErrPatchedAreaNotReached = errors.New("fuzzer has not reached the patched area") func (dc *diffContext) monitorPatchedCoverage(ctx context.Context) error { if dc.cfg.FuzzToReachPatched == 0 { // The feature is disabled. return nil } // First wait until we have almost triaged all of the corpus. select { case <-ctx.Done(): return nil case <-dc.waitCorpusTriage(ctx, corpusTriageToMonitor): } // By this moment, we must have coverage filters already filled out. focusPCs := 0 // The last one is "everything else", so it's not of interest. coverFilters := dc.new.coverFilters for i := 0; i < len(coverFilters.Areas)-1; i++ { focusPCs += len(coverFilters.Areas[i].CoverPCs) } if focusPCs == 0 { // No areas were configured. log.Logf(1, "no PCs in the areas of focused fuzzing, skipping the zero patched coverage check") return nil } // Then give the fuzzer some change to get through. select { case <-time.After(dc.cfg.FuzzToReachPatched): case <-ctx.Done(): return nil } focusAreaStats := dc.new.progsPerArea() if focusAreaStats[symbolsArea]+focusAreaStats[filesArea]+focusAreaStats[includesArea] > 0 { log.Logf(0, "fuzzer has reached the modified code (%d + %d + %d), continuing fuzzing", focusAreaStats[symbolsArea], focusAreaStats[filesArea], focusAreaStats[includesArea]) return nil } log.Logf(0, "fuzzer has not reached the modified code in %s, aborting", dc.cfg.FuzzToReachPatched) return ErrPatchedAreaNotReached } // TODO: instead of this limit, consider expotentially growing delays between reproduction attempts. const maxReproAttempts = 6 func needReproForTitle(title string) bool { if strings.Contains(title, "no output") || strings.Contains(title, "lost connection") || strings.Contains(title, "detected stall") || strings.Contains(title, "SYZ") { // Don't waste time reproducing these. return false } return true } func (dc *diffContext) NeedRepro(crash *manager.Crash) bool { if crash.FullRepro { return true } if !needReproForTitle(crash.Title) { return false } ctx, cancel := context.WithTimeout(context.Background(), time.Minute) defer cancel() if dc.ignoreCrash(ctx, crash.Title) { return false } dc.mu.Lock() defer dc.mu.Unlock() return dc.reproAttempts[crash.Title] <= maxReproAttempts } func (dc *diffContext) RunRepro(ctx context.Context, crash *manager.Crash) *manager.ReproResult { dc.mu.Lock() dc.reproAttempts[crash.Title]++ dc.mu.Unlock() res, stats, err := repro.Run(ctx, crash.Output, repro.Environment{ Config: dc.new.cfg, Features: dc.new.features, Reporter: dc.new.reporter, Pool: dc.new.pool, Fast: !crash.FullRepro, }) if res != nil && res.Report != nil { dc.mu.Lock() dc.reproAttempts[res.Report.Title] = maxReproAttempts dc.mu.Unlock() } ret := &manager.ReproResult{ Crash: crash, Repro: res, Stats: stats, Err: err, } select { case dc.doneRepro <- ret: case <-ctx.Done(): // If the context is cancelled, no one may be listening on doneRepro. } return ret } func (dc *diffContext) ResizeReproPool(size int) { dc.new.pool.ReserveForRun(size) }