diff options
| author | Aleksandr Nogikh <nogikh@google.com> | 2026-01-23 17:18:49 +0100 |
|---|---|---|
| committer | Aleksandr Nogikh <nogikh@google.com> | 2026-01-23 20:35:29 +0000 |
| commit | 4f25b9b48bdfbc7adeaf320f8d92067afe509b49 (patch) | |
| tree | e66a51fc3786e8bb8abba5ca89cfc152a59e971f /pkg/manager/diff/manager.go | |
| parent | b4afeb6fb8cde041ba03048f5e123ed3f304a5e6 (diff) | |
pkg/manager: split off diff fuzzer functionality
Move the code to a separate pkg/manager/diff package. Split the
code into several files.
Diffstat (limited to 'pkg/manager/diff/manager.go')
| -rw-r--r-- | pkg/manager/diff/manager.go | 416 |
1 file changed, 416 insertions, 0 deletions
diff --git a/pkg/manager/diff/manager.go b/pkg/manager/diff/manager.go new file mode 100644 index 000000000..cc4d8e0d1 --- /dev/null +++ b/pkg/manager/diff/manager.go @@ -0,0 +1,416 @@ +// Copyright 2026 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +package diff + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "math/rand" + "strings" + "sync" + "time" + + "github.com/google/syzkaller/pkg/fuzzer/queue" + "github.com/google/syzkaller/pkg/log" + "github.com/google/syzkaller/pkg/manager" + "github.com/google/syzkaller/pkg/mgrconfig" + "github.com/google/syzkaller/pkg/report" + "github.com/google/syzkaller/pkg/repro" + "github.com/google/syzkaller/pkg/stat" + "github.com/google/syzkaller/vm" + "golang.org/x/sync/errgroup" +) + +type Config struct { + Debug bool + PatchedOnly chan *Bug + BaseCrashes chan string + Store *manager.DiffFuzzerStore + ArtifactsDir string // Where to store the artifacts that supplement the logs. + // The fuzzer waits no more than MaxTriageTime time until it starts taking VMs away + // for bug reproduction. + // The option may help find a balance between spending too much time triaging + // the corpus and not reaching a proper kernel coverage. + MaxTriageTime time.Duration + // If non-empty, the fuzzer will spend no more than this amount of time + // trying to reach the modified code. The time is counted since the moment + // 99% of the corpus is triaged. + FuzzToReachPatched time.Duration + // The callback may be used to consult external systems on whether + // the crash should be ignored. E.g. because it doesn't match the filter or + // the particular base kernel has already been seen to crash with the given title. + // It helps reduce the number of unnecessary reproductions. 
+ IgnoreCrash func(context.Context, string) (bool, error) +} + +func (cfg *Config) TriageDeadline() <-chan time.Time { + if cfg.MaxTriageTime == 0 { + return nil + } + return time.After(cfg.MaxTriageTime) +} + +type Bug struct { + // The report from the patched kernel. + Report *report.Report + Repro *repro.Result +} + +func Run(ctx context.Context, baseCfg, newCfg *mgrconfig.Config, cfg Config) error { + if cfg.PatchedOnly == nil { + return fmt.Errorf("you must set up a patched only channel") + } + base, err := setup("base", baseCfg, cfg.Debug) + if err != nil { + return err + } + new, err := setup("new", newCfg, cfg.Debug) + if err != nil { + return err + } + eg, ctx := errgroup.WithContext(ctx) + eg.Go(func() error { + info, err := manager.LoadSeeds(newCfg, true) + if err != nil { + return err + } + select { + case new.candidates <- info.Candidates: + case <-ctx.Done(): + } + return nil + }) + + stream := queue.NewRandomQueue(4096, rand.New(rand.NewSource(time.Now().UnixNano()))) + base.source = stream + new.duplicateInto = stream + + diffCtx := &diffContext{ + cfg: cfg, + doneRepro: make(chan *manager.ReproResult), + base: base, + new: new, + store: cfg.Store, + reproAttempts: map[string]int{}, + patchedOnly: cfg.PatchedOnly, + } + if newCfg.HTTP != "" { + diffCtx.http = &manager.HTTPServer{ + Cfg: newCfg, + StartTime: time.Now(), + DiffStore: cfg.Store, + Pools: map[string]*vm.Dispatcher{ + new.name: new.pool, + base.name: base.pool, + }, + } + new.http = diffCtx.http + } + eg.Go(func() error { + return diffCtx.Loop(ctx) + }) + return eg.Wait() +} + +type diffContext struct { + cfg Config + store *manager.DiffFuzzerStore + http *manager.HTTPServer + + doneRepro chan *manager.ReproResult + base *kernelContext + new *kernelContext + patchedOnly chan *Bug + + mu sync.Mutex + reproAttempts map[string]int +} + +const ( + // Don't start reproductions until 90% of the corpus has been triaged. 
+ corpusTriageToRepro = 0.9 + // Start to monitor whether we reached the modified files only after triaging 99%. + corpusTriageToMonitor = 0.99 +) + +func (dc *diffContext) Loop(ctx context.Context) error { + g, groupCtx := errgroup.WithContext(ctx) + reproLoop := manager.NewReproLoop(dc, dc.new.pool.Total()-dc.new.cfg.FuzzingVMs, false) + if dc.http != nil { + dc.http.ReproLoop = reproLoop + g.Go(func() error { + return dc.http.Serve(groupCtx) + }) + } + + g.Go(func() error { + select { + case <-groupCtx.Done(): + return nil + case <-dc.waitCorpusTriage(groupCtx, corpusTriageToRepro): + case <-dc.cfg.TriageDeadline(): + log.Logf(0, "timed out waiting for coprus triage") + } + log.Logf(0, "starting bug reproductions") + reproLoop.Loop(groupCtx) + return nil + }) + + g.Go(func() error { return dc.monitorPatchedCoverage(groupCtx) }) + g.Go(func() error { return dc.base.Loop(groupCtx) }) + g.Go(func() error { return dc.new.Loop(groupCtx) }) + + runner := &reproRunner{done: make(chan reproRunnerResult, 2), kernel: dc.base} + statTimer := time.NewTicker(5 * time.Minute) +loop: + for { + select { + case <-groupCtx.Done(): + break loop + case <-statTimer.C: + vals := make(map[string]int) + for _, stat := range stat.Collect(stat.All) { + vals[stat.Name] = stat.V + } + data, _ := json.MarshalIndent(vals, "", " ") + log.Logf(0, "STAT %s", data) + case rep := <-dc.base.crashes: + log.Logf(1, "base crash: %v", rep.Title) + dc.reportBaseCrash(groupCtx, rep) + case ret := <-runner.done: + dc.handleReproResult(groupCtx, ret, reproLoop) + case ret := <-dc.doneRepro: + // We have finished reproducing a crash from the patched instance. 
+ if ret.Repro != nil && ret.Repro.Report != nil { + origTitle := ret.Crash.Report.Title + if ret.Repro.Report.Title == origTitle { + origTitle = "-SAME-" + } + log.Logf(1, "found repro for %q (orig title: %q, reliability: %2.f), took %.2f minutes", + ret.Repro.Report.Title, origTitle, ret.Repro.Reliability, ret.Stats.TotalTime.Minutes()) + g.Go(func() error { + runner.Run(groupCtx, ret.Repro, ret.Crash.FullRepro) + return nil + }) + } else { + origTitle := ret.Crash.Report.Title + log.Logf(1, "failed repro for %q, err=%s", origTitle, ret.Err) + } + dc.store.SaveRepro(ret) + case rep := <-dc.new.crashes: + // A new crash is found on the patched instance. + crash := &manager.Crash{Report: rep} + need := dc.NeedRepro(crash) + log.Logf(0, "patched crashed: %v [need repro = %v]", + rep.Title, need) + dc.store.PatchedCrashed(rep.Title, rep.Report, rep.Output) + if need { + reproLoop.Enqueue(crash) + } + } + } + return g.Wait() +} + +func (dc *diffContext) handleReproResult(ctx context.Context, ret reproRunnerResult, reproLoop *manager.ReproLoop) { + // We have run the reproducer on the base instance. + + // A sanity check: the base kernel might have crashed with the same title + // since the moment we have stared the reproduction / running on the repro base. + ignored := dc.ignoreCrash(ctx, ret.reproReport.Title) + if ret.crashReport == nil && ignored { + // Report it as error so that we could at least find it in the logs. + log.Errorf("resulting crash of an approved repro result is to be ignored: %s", + ret.reproReport.Title) + } else if ret.crashReport == nil { + dc.store.BaseNotCrashed(ret.reproReport.Title) + select { + case <-ctx.Done(): + case dc.patchedOnly <- &Bug{ + Report: ret.reproReport, + Repro: ret.repro, + }: + } + log.Logf(0, "patched-only: %s", ret.reproReport.Title) + // Now that we know this bug only affects the patch kernel, we can spend more time + // generating a minimalistic repro and a C repro. 
+ if !ret.fullRepro { + reproLoop.Enqueue(&manager.Crash{ + Report: &report.Report{ + Title: ret.reproReport.Title, + Output: ret.repro.Prog.Serialize(), + }, + FullRepro: true, + }) + } + } else { + dc.reportBaseCrash(ctx, ret.crashReport) + log.Logf(0, "crashes both: %s / %s", ret.reproReport.Title, ret.crashReport.Title) + } +} + +func (dc *diffContext) ignoreCrash(ctx context.Context, title string) bool { + if dc.store.EverCrashedBase(title) { + return true + } + // Let's try to ask the external systems about it as well. + if dc.cfg.IgnoreCrash != nil { + ignore, err := dc.cfg.IgnoreCrash(ctx, title) + if err != nil { + log.Logf(0, "a call to IgnoreCrash failed: %v", err) + } else { + if ignore { + log.Logf(0, "base crash %q is to be ignored", title) + } + return ignore + } + } + return false +} + +func (dc *diffContext) reportBaseCrash(ctx context.Context, rep *report.Report) { + dc.store.BaseCrashed(rep.Title, rep.Report) + if dc.cfg.BaseCrashes == nil { + return + } + select { + case dc.cfg.BaseCrashes <- rep.Title: + case <-ctx.Done(): + } +} + +func (dc *diffContext) waitCorpusTriage(ctx context.Context, threshold float64) chan struct{} { + const backOffTime = 30 * time.Second + ret := make(chan struct{}) + go func() { + for { + select { + case <-time.After(backOffTime): + case <-ctx.Done(): + return + } + triaged := dc.new.triageProgress() + if triaged >= threshold { + log.Logf(0, "triaged %.1f%% of the corpus", triaged*100.0) + close(ret) + return + } + } + }() + return ret +} + +var ErrPatchedAreaNotReached = errors.New("fuzzer has not reached the patched area") + +func (dc *diffContext) monitorPatchedCoverage(ctx context.Context) error { + if dc.cfg.FuzzToReachPatched == 0 { + // The feature is disabled. + return nil + } + + // First wait until we have almost triaged all of the corpus. 
+ select { + case <-ctx.Done(): + return nil + case <-dc.waitCorpusTriage(ctx, corpusTriageToMonitor): + } + + // By this moment, we must have coverage filters already filled out. + focusPCs := 0 + // The last one is "everything else", so it's not of interest. + coverFilters := dc.new.coverFilters + for i := 0; i < len(coverFilters.Areas)-1; i++ { + focusPCs += len(coverFilters.Areas[i].CoverPCs) + } + if focusPCs == 0 { + // No areas were configured. + log.Logf(1, "no PCs in the areas of focused fuzzing, skipping the zero patched coverage check") + return nil + } + + // Then give the fuzzer some change to get through. + select { + case <-time.After(dc.cfg.FuzzToReachPatched): + case <-ctx.Done(): + return nil + } + focusAreaStats := dc.new.progsPerArea() + if focusAreaStats[symbolsArea]+focusAreaStats[filesArea]+focusAreaStats[includesArea] > 0 { + log.Logf(0, "fuzzer has reached the modified code (%d + %d + %d), continuing fuzzing", + focusAreaStats[symbolsArea], focusAreaStats[filesArea], focusAreaStats[includesArea]) + return nil + } + log.Logf(0, "fuzzer has not reached the modified code in %s, aborting", + dc.cfg.FuzzToReachPatched) + return ErrPatchedAreaNotReached +} + +// TODO: instead of this limit, consider expotentially growing delays between reproduction attempts. +const maxReproAttempts = 6 + +func needReproForTitle(title string) bool { + if strings.Contains(title, "no output") || + strings.Contains(title, "lost connection") || + strings.Contains(title, "detected stall") || + strings.Contains(title, "SYZ") { + // Don't waste time reproducing these. 
+ return false + } + return true +} + +func (dc *diffContext) NeedRepro(crash *manager.Crash) bool { + if crash.FullRepro { + return true + } + if !needReproForTitle(crash.Title) { + return false + } + ctx, cancel := context.WithTimeout(context.Background(), time.Minute) + defer cancel() + if dc.ignoreCrash(ctx, crash.Title) { + return false + } + dc.mu.Lock() + defer dc.mu.Unlock() + return dc.reproAttempts[crash.Title] <= maxReproAttempts +} + +func (dc *diffContext) RunRepro(ctx context.Context, crash *manager.Crash) *manager.ReproResult { + dc.mu.Lock() + dc.reproAttempts[crash.Title]++ + dc.mu.Unlock() + + res, stats, err := repro.Run(ctx, crash.Output, repro.Environment{ + Config: dc.new.cfg, + Features: dc.new.features, + Reporter: dc.new.reporter, + Pool: dc.new.pool, + Fast: !crash.FullRepro, + }) + if res != nil && res.Report != nil { + dc.mu.Lock() + dc.reproAttempts[res.Report.Title] = maxReproAttempts + dc.mu.Unlock() + } + ret := &manager.ReproResult{ + Crash: crash, + Repro: res, + Stats: stats, + Err: err, + } + + select { + case dc.doneRepro <- ret: + case <-ctx.Done(): + // If the context is cancelled, no one may be listening on doneRepro. + } + return ret +} + +func (dc *diffContext) ResizeReproPool(size int) { + dc.new.pool.ReserveForRun(size) +} |
