aboutsummaryrefslogtreecommitdiffstats
path: root/pkg/manager/diff/manager.go
diff options
context:
space:
mode:
authorAleksandr Nogikh <nogikh@google.com>2026-01-23 17:18:49 +0100
committerAleksandr Nogikh <nogikh@google.com>2026-01-23 20:35:29 +0000
commit4f25b9b48bdfbc7adeaf320f8d92067afe509b49 (patch)
treee66a51fc3786e8bb8abba5ca89cfc152a59e971f /pkg/manager/diff/manager.go
parentb4afeb6fb8cde041ba03048f5e123ed3f304a5e6 (diff)
pkg/manager: split off diff fuzzer functionality
Move the code to a separate pkg/manager/diff package. Split the code into several files.
Diffstat (limited to 'pkg/manager/diff/manager.go')
-rw-r--r--pkg/manager/diff/manager.go416
1 files changed, 416 insertions, 0 deletions
diff --git a/pkg/manager/diff/manager.go b/pkg/manager/diff/manager.go
new file mode 100644
index 000000000..cc4d8e0d1
--- /dev/null
+++ b/pkg/manager/diff/manager.go
@@ -0,0 +1,416 @@
+// Copyright 2026 syzkaller project authors. All rights reserved.
+// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
+
+package diff
+
+import (
+ "context"
+ "encoding/json"
+ "errors"
+ "fmt"
+ "math/rand"
+ "strings"
+ "sync"
+ "time"
+
+ "github.com/google/syzkaller/pkg/fuzzer/queue"
+ "github.com/google/syzkaller/pkg/log"
+ "github.com/google/syzkaller/pkg/manager"
+ "github.com/google/syzkaller/pkg/mgrconfig"
+ "github.com/google/syzkaller/pkg/report"
+ "github.com/google/syzkaller/pkg/repro"
+ "github.com/google/syzkaller/pkg/stat"
+ "github.com/google/syzkaller/vm"
+ "golang.org/x/sync/errgroup"
+)
+
// Config holds the diff-fuzzer-specific settings, on top of the per-kernel
// mgrconfig.Config objects that are passed to Run separately.
type Config struct {
	// Debug enables debug mode of the underlying kernel contexts.
	Debug bool
	// PatchedOnly receives the bugs whose repro crashed the patched kernel,
	// but not the base one. It must be non-nil (Run fails otherwise).
	PatchedOnly chan *Bug
	// BaseCrashes, if non-nil, receives the titles of the crashes observed
	// on the base kernel.
	BaseCrashes chan string
	// Store records the diff fuzzing results (crashes, repro outcomes).
	Store        *manager.DiffFuzzerStore
	ArtifactsDir string // Where to store the artifacts that supplement the logs.
	// The fuzzer waits no more than MaxTriageTime time until it starts taking VMs away
	// for bug reproduction.
	// The option may help find a balance between spending too much time triaging
	// the corpus and not reaching a proper kernel coverage.
	MaxTriageTime time.Duration
	// If non-zero, the fuzzer will spend no more than this amount of time
	// trying to reach the modified code. The time is counted since the moment
	// 99% of the corpus is triaged.
	FuzzToReachPatched time.Duration
	// The callback may be used to consult external systems on whether
	// the crash should be ignored. E.g. because it doesn't match the filter or
	// the particular base kernel has already been seen to crash with the given title.
	// It helps reduce the number of unnecessary reproductions.
	IgnoreCrash func(context.Context, string) (bool, error)
}
+
+func (cfg *Config) TriageDeadline() <-chan time.Time {
+ if cfg.MaxTriageTime == 0 {
+ return nil
+ }
+ return time.After(cfg.MaxTriageTime)
+}
+
// Bug describes a crash that, as far as the diff fuzzer could determine,
// affects only the patched kernel.
type Bug struct {
	// The report from the patched kernel.
	Report *report.Report
	// Repro is the reproducer for the crash.
	Repro *repro.Result
}
+
+func Run(ctx context.Context, baseCfg, newCfg *mgrconfig.Config, cfg Config) error {
+ if cfg.PatchedOnly == nil {
+ return fmt.Errorf("you must set up a patched only channel")
+ }
+ base, err := setup("base", baseCfg, cfg.Debug)
+ if err != nil {
+ return err
+ }
+ new, err := setup("new", newCfg, cfg.Debug)
+ if err != nil {
+ return err
+ }
+ eg, ctx := errgroup.WithContext(ctx)
+ eg.Go(func() error {
+ info, err := manager.LoadSeeds(newCfg, true)
+ if err != nil {
+ return err
+ }
+ select {
+ case new.candidates <- info.Candidates:
+ case <-ctx.Done():
+ }
+ return nil
+ })
+
+ stream := queue.NewRandomQueue(4096, rand.New(rand.NewSource(time.Now().UnixNano())))
+ base.source = stream
+ new.duplicateInto = stream
+
+ diffCtx := &diffContext{
+ cfg: cfg,
+ doneRepro: make(chan *manager.ReproResult),
+ base: base,
+ new: new,
+ store: cfg.Store,
+ reproAttempts: map[string]int{},
+ patchedOnly: cfg.PatchedOnly,
+ }
+ if newCfg.HTTP != "" {
+ diffCtx.http = &manager.HTTPServer{
+ Cfg: newCfg,
+ StartTime: time.Now(),
+ DiffStore: cfg.Store,
+ Pools: map[string]*vm.Dispatcher{
+ new.name: new.pool,
+ base.name: base.pool,
+ },
+ }
+ new.http = diffCtx.http
+ }
+ eg.Go(func() error {
+ return diffCtx.Loop(ctx)
+ })
+ return eg.Wait()
+}
+
// diffContext ties together the base and the patched kernel fuzzing contexts
// and the shared bug reproduction state.
type diffContext struct {
	cfg   Config
	store *manager.DiffFuzzerStore
	http  *manager.HTTPServer // nil unless HTTP serving is configured

	doneRepro chan *manager.ReproResult // results of patched-kernel crash reproductions
	base      *kernelContext            // the base kernel
	new       *kernelContext            // the patched kernel
	patchedOnly chan *Bug               // see Config.PatchedOnly

	mu sync.Mutex
	// reproAttempts counts started reproduction attempts per crash title.
	// Protected by mu.
	reproAttempts map[string]int
}
+
// Corpus triage progress thresholds (as a share of the corpus, 0.0-1.0)
// that gate the later stages of a diff fuzzing session.
const (
	// Don't start reproductions until 90% of the corpus has been triaged.
	corpusTriageToRepro = 0.9
	// Start to monitor whether we reached the modified files only after triaging 99%.
	corpusTriageToMonitor = 0.99
)
+
+func (dc *diffContext) Loop(ctx context.Context) error {
+ g, groupCtx := errgroup.WithContext(ctx)
+ reproLoop := manager.NewReproLoop(dc, dc.new.pool.Total()-dc.new.cfg.FuzzingVMs, false)
+ if dc.http != nil {
+ dc.http.ReproLoop = reproLoop
+ g.Go(func() error {
+ return dc.http.Serve(groupCtx)
+ })
+ }
+
+ g.Go(func() error {
+ select {
+ case <-groupCtx.Done():
+ return nil
+ case <-dc.waitCorpusTriage(groupCtx, corpusTriageToRepro):
+ case <-dc.cfg.TriageDeadline():
+ log.Logf(0, "timed out waiting for coprus triage")
+ }
+ log.Logf(0, "starting bug reproductions")
+ reproLoop.Loop(groupCtx)
+ return nil
+ })
+
+ g.Go(func() error { return dc.monitorPatchedCoverage(groupCtx) })
+ g.Go(func() error { return dc.base.Loop(groupCtx) })
+ g.Go(func() error { return dc.new.Loop(groupCtx) })
+
+ runner := &reproRunner{done: make(chan reproRunnerResult, 2), kernel: dc.base}
+ statTimer := time.NewTicker(5 * time.Minute)
+loop:
+ for {
+ select {
+ case <-groupCtx.Done():
+ break loop
+ case <-statTimer.C:
+ vals := make(map[string]int)
+ for _, stat := range stat.Collect(stat.All) {
+ vals[stat.Name] = stat.V
+ }
+ data, _ := json.MarshalIndent(vals, "", " ")
+ log.Logf(0, "STAT %s", data)
+ case rep := <-dc.base.crashes:
+ log.Logf(1, "base crash: %v", rep.Title)
+ dc.reportBaseCrash(groupCtx, rep)
+ case ret := <-runner.done:
+ dc.handleReproResult(groupCtx, ret, reproLoop)
+ case ret := <-dc.doneRepro:
+ // We have finished reproducing a crash from the patched instance.
+ if ret.Repro != nil && ret.Repro.Report != nil {
+ origTitle := ret.Crash.Report.Title
+ if ret.Repro.Report.Title == origTitle {
+ origTitle = "-SAME-"
+ }
+ log.Logf(1, "found repro for %q (orig title: %q, reliability: %2.f), took %.2f minutes",
+ ret.Repro.Report.Title, origTitle, ret.Repro.Reliability, ret.Stats.TotalTime.Minutes())
+ g.Go(func() error {
+ runner.Run(groupCtx, ret.Repro, ret.Crash.FullRepro)
+ return nil
+ })
+ } else {
+ origTitle := ret.Crash.Report.Title
+ log.Logf(1, "failed repro for %q, err=%s", origTitle, ret.Err)
+ }
+ dc.store.SaveRepro(ret)
+ case rep := <-dc.new.crashes:
+ // A new crash is found on the patched instance.
+ crash := &manager.Crash{Report: rep}
+ need := dc.NeedRepro(crash)
+ log.Logf(0, "patched crashed: %v [need repro = %v]",
+ rep.Title, need)
+ dc.store.PatchedCrashed(rep.Title, rep.Report, rep.Output)
+ if need {
+ reproLoop.Enqueue(crash)
+ }
+ }
+ }
+ return g.Wait()
+}
+
// handleReproResult processes the outcome of replaying a patched-kernel repro
// on the base kernel and decides whether the bug is patched-only.
func (dc *diffContext) handleReproResult(ctx context.Context, ret reproRunnerResult, reproLoop *manager.ReproLoop) {
	// We have run the reproducer on the base instance.

	// A sanity check: the base kernel might have crashed with the same title
	// since the moment we have started the reproduction / started running the repro on base.
	ignored := dc.ignoreCrash(ctx, ret.reproReport.Title)
	if ret.crashReport == nil && ignored {
		// Report it as error so that we could at least find it in the logs.
		log.Errorf("resulting crash of an approved repro result is to be ignored: %s",
			ret.reproReport.Title)
	} else if ret.crashReport == nil {
		// The base kernel did not crash: report the bug as patched-only.
		dc.store.BaseNotCrashed(ret.reproReport.Title)
		select {
		case <-ctx.Done():
		case dc.patchedOnly <- &Bug{
			Report: ret.reproReport,
			Repro:  ret.repro,
		}:
		}
		log.Logf(0, "patched-only: %s", ret.reproReport.Title)
		// Now that we know this bug only affects the patched kernel, we can spend more time
		// generating a minimalistic repro and a C repro.
		if !ret.fullRepro {
			reproLoop.Enqueue(&manager.Crash{
				Report: &report.Report{
					Title:  ret.reproReport.Title,
					Output: ret.repro.Prog.Serialize(),
				},
				FullRepro: true,
			})
		}
	} else {
		// The repro crashed the base kernel as well.
		dc.reportBaseCrash(ctx, ret.crashReport)
		log.Logf(0, "crashes both: %s / %s", ret.reproReport.Title, ret.crashReport.Title)
	}
}
+
+func (dc *diffContext) ignoreCrash(ctx context.Context, title string) bool {
+ if dc.store.EverCrashedBase(title) {
+ return true
+ }
+ // Let's try to ask the external systems about it as well.
+ if dc.cfg.IgnoreCrash != nil {
+ ignore, err := dc.cfg.IgnoreCrash(ctx, title)
+ if err != nil {
+ log.Logf(0, "a call to IgnoreCrash failed: %v", err)
+ } else {
+ if ignore {
+ log.Logf(0, "base crash %q is to be ignored", title)
+ }
+ return ignore
+ }
+ }
+ return false
+}
+
+func (dc *diffContext) reportBaseCrash(ctx context.Context, rep *report.Report) {
+ dc.store.BaseCrashed(rep.Title, rep.Report)
+ if dc.cfg.BaseCrashes == nil {
+ return
+ }
+ select {
+ case dc.cfg.BaseCrashes <- rep.Title:
+ case <-ctx.Done():
+ }
+}
+
+func (dc *diffContext) waitCorpusTriage(ctx context.Context, threshold float64) chan struct{} {
+ const backOffTime = 30 * time.Second
+ ret := make(chan struct{})
+ go func() {
+ for {
+ select {
+ case <-time.After(backOffTime):
+ case <-ctx.Done():
+ return
+ }
+ triaged := dc.new.triageProgress()
+ if triaged >= threshold {
+ log.Logf(0, "triaged %.1f%% of the corpus", triaged*100.0)
+ close(ret)
+ return
+ }
+ }
+ }()
+ return ret
+}
+
// ErrPatchedAreaNotReached is returned when the fuzzer failed to reach any of
// the modified code within the Config.FuzzToReachPatched time budget.
var ErrPatchedAreaNotReached = errors.New("fuzzer has not reached the patched area")

// monitorPatchedCoverage aborts the fuzzing session (by returning
// ErrPatchedAreaNotReached) if, FuzzToReachPatched after the corpus is almost
// fully triaged, no program has reached the patched code areas.
// Returns nil if the feature is disabled, no focus areas are configured,
// the patched code was reached, or ctx is cancelled.
func (dc *diffContext) monitorPatchedCoverage(ctx context.Context) error {
	if dc.cfg.FuzzToReachPatched == 0 {
		// The feature is disabled.
		return nil
	}

	// First wait until we have almost triaged all of the corpus.
	select {
	case <-ctx.Done():
		return nil
	case <-dc.waitCorpusTriage(ctx, corpusTriageToMonitor):
	}

	// By this moment, we must have coverage filters already filled out.
	focusPCs := 0
	// The last one is "everything else", so it's not of interest.
	coverFilters := dc.new.coverFilters
	for i := 0; i < len(coverFilters.Areas)-1; i++ {
		focusPCs += len(coverFilters.Areas[i].CoverPCs)
	}
	if focusPCs == 0 {
		// No areas were configured.
		log.Logf(1, "no PCs in the areas of focused fuzzing, skipping the zero patched coverage check")
		return nil
	}

	// Then give the fuzzer some chance to get through.
	select {
	case <-time.After(dc.cfg.FuzzToReachPatched):
	case <-ctx.Done():
		return nil
	}
	focusAreaStats := dc.new.progsPerArea()
	if focusAreaStats[symbolsArea]+focusAreaStats[filesArea]+focusAreaStats[includesArea] > 0 {
		log.Logf(0, "fuzzer has reached the modified code (%d + %d + %d), continuing fuzzing",
			focusAreaStats[symbolsArea], focusAreaStats[filesArea], focusAreaStats[includesArea])
		return nil
	}
	log.Logf(0, "fuzzer has not reached the modified code in %s, aborting",
		dc.cfg.FuzzToReachPatched)
	return ErrPatchedAreaNotReached
}
+
// Maximum number of reproduction attempts per crash title.
// TODO: instead of this limit, consider exponentially growing delays between reproduction attempts.
const maxReproAttempts = 6
+
// needReproForTitle reports whether a crash with the given title is worth a
// reproduction attempt at all.
func needReproForTitle(title string) bool {
	// These crash kinds are flaky or uninformative.
	// Don't waste time reproducing these.
	skipMarkers := []string{"no output", "lost connection", "detected stall", "SYZ"}
	for _, marker := range skipMarkers {
		if strings.Contains(title, marker) {
			return false
		}
	}
	return true
}
+
// NeedRepro decides whether the given patched-kernel crash should be queued
// for reproduction (it serves as a callback for manager.ReproLoop).
func (dc *diffContext) NeedRepro(crash *manager.Crash) bool {
	if crash.FullRepro {
		// A follow-up request for a better repro of a confirmed patched-only
		// bug (see handleReproResult) -- always proceed.
		return true
	}
	if !needReproForTitle(crash.Title) {
		return false
	}
	// The IgnoreCrash callback may consult external systems, so bound its run time.
	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()
	if dc.ignoreCrash(ctx, crash.Title) {
		return false
	}
	dc.mu.Lock()
	defer dc.mu.Unlock()
	// NOTE(review): `<=` permits maxReproAttempts+1 attempts, and it also lets
	// RunRepro's "reproAttempts[title] = maxReproAttempts" cap allow one more
	// run -- confirm whether `<` was intended here.
	return dc.reproAttempts[crash.Title] <= maxReproAttempts
}
+
// RunRepro runs crash reproduction on the patched kernel pool and forwards
// the result to the main Loop via the doneRepro channel (it serves as a
// callback for manager.ReproLoop).
func (dc *diffContext) RunRepro(ctx context.Context, crash *manager.Crash) *manager.ReproResult {
	// Count the attempt up front, before the (long) reproduction run.
	dc.mu.Lock()
	dc.reproAttempts[crash.Title]++
	dc.mu.Unlock()

	res, stats, err := repro.Run(ctx, crash.Output, repro.Environment{
		Config:   dc.new.cfg,
		Features: dc.new.features,
		Reporter: dc.new.reporter,
		Pool:     dc.new.pool,
		// Only FullRepro requests get the slower, more thorough pipeline.
		Fast: !crash.FullRepro,
	})
	if res != nil && res.Report != nil {
		// A repro was found -- raise the counter to the cap so that crashes
		// with the resulting title are not reproduced over and over.
		dc.mu.Lock()
		dc.reproAttempts[res.Report.Title] = maxReproAttempts
		dc.mu.Unlock()
	}
	ret := &manager.ReproResult{
		Crash: crash,
		Repro: res,
		Stats: stats,
		Err:   err,
	}

	select {
	case dc.doneRepro <- ret:
	case <-ctx.Done():
		// If the context is cancelled, no one may be listening on doneRepro.
	}
	return ret
}
+
// ResizeReproPool reserves the given number of patched-kernel pool VMs for
// reproduction runs (it serves as a callback for manager.ReproLoop).
func (dc *diffContext) ResizeReproPool(size int) {
	dc.new.pool.ReserveForRun(size)
}