aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDmitry Vyukov <dvyukov@google.com>2024-05-28 16:43:16 +0200
committerDmitry Vyukov <dvyukov@google.com>2024-06-03 15:04:36 +0000
commit05abdec478c0e0186e19d301ba5bd67e553a0737 (patch)
tree12a9d8fc9a5a68d1faf0471c27830a07d7edd2ad
parent701ad974ccc0ba0a61d2a2bd58db569f794bd037 (diff)
pkg/fuzzer: smarter retry for corpus candidates
Currently we always retry corpus candidates 2 times. Retry up to 3 times if we did not get new signal. This should both reduce the total number of runs (if we get new signal on the first run) and reduce signal loss after a restart (since we try up to 3 times instead of 2).
-rw-r--r--pkg/fuzzer/fuzzer.go55
-rw-r--r--syz-manager/manager.go14
2 files changed, 38 insertions, 31 deletions
diff --git a/pkg/fuzzer/fuzzer.go b/pkg/fuzzer/fuzzer.go
index b0ceaf9c0..44aad6010 100644
--- a/pkg/fuzzer/fuzzer.go
+++ b/pkg/fuzzer/fuzzer.go
@@ -101,42 +101,56 @@ func (fuzzer *Fuzzer) execute(executor queue.Executor, req *queue.Request) *queu
}
func (fuzzer *Fuzzer) executeWithFlags(executor queue.Executor, req *queue.Request, flags ProgFlags) *queue.Result {
- executor.Submit(req)
- res := req.Wait(fuzzer.ctx)
- fuzzer.processResult(req, res, flags)
- return res
+ fuzzer.enqueue(executor, req, flags, 0)
+ return req.Wait(fuzzer.ctx)
}
-func (fuzzer *Fuzzer) prepare(req *queue.Request, flags ProgFlags) {
+func (fuzzer *Fuzzer) prepare(req *queue.Request, flags ProgFlags, attempt int) {
req.OnDone(func(req *queue.Request, res *queue.Result) bool {
- fuzzer.processResult(req, res, flags)
- return true
+ return fuzzer.processResult(req, res, flags, attempt)
})
}
-func (fuzzer *Fuzzer) enqueue(executor queue.Executor, req *queue.Request, flags ProgFlags) {
- fuzzer.prepare(req, flags)
+func (fuzzer *Fuzzer) enqueue(executor queue.Executor, req *queue.Request, flags ProgFlags, attempt int) {
+ fuzzer.prepare(req, flags, attempt)
executor.Submit(req)
}
-func (fuzzer *Fuzzer) processResult(req *queue.Request, res *queue.Result, flags ProgFlags) {
+func (fuzzer *Fuzzer) processResult(req *queue.Request, res *queue.Result, flags ProgFlags, attempt int) bool {
inTriage := flags&progInTriage > 0
// Triage individual calls.
// We do it before unblocking the waiting threads because
// it may result in concurrent modification of req.Prog.
// If we are already triaging this exact prog, this is flaky coverage.
+ triaging := false
if req.ExecOpts.ExecFlags&flatrpc.ExecFlagCollectSignal > 0 && res.Info != nil && !inTriage {
for call, info := range res.Info.Calls {
- fuzzer.triageProgCall(req.Prog, info, call, flags)
+ if fuzzer.triageProgCall(req.Prog, info, call, flags) {
+ triaging = true
+ }
+ }
+ if fuzzer.triageProgCall(req.Prog, res.Info.Extra, -1, flags) {
+ triaging = true
}
- fuzzer.triageProgCall(req.Prog, res.Info.Extra, -1, flags)
}
+
if res.Info != nil {
fuzzer.statExecTime.Add(int(res.Info.Elapsed / 1e6))
}
+
+ // Corpus candidates may have flaky coverage, so we give them a second chance.
+ maxCandidateAttempts := 3
+ if req.Risky() {
+ maxCandidateAttempts = 2
+ }
+ if !triaging && flags&ProgFromCorpus != 0 && attempt < maxCandidateAttempts {
+ fuzzer.enqueue(fuzzer.candidateQueue, req, flags, attempt+1)
+ return false
+ }
if flags&progCandidate != 0 {
fuzzer.statCandidates.Add(-1)
}
+ return true
}
type Config struct {
@@ -154,17 +168,17 @@ type Config struct {
NewInputFilter func(call string) bool
}
-func (fuzzer *Fuzzer) triageProgCall(p *prog.Prog, info *flatrpc.CallInfo, call int, flags ProgFlags) {
+func (fuzzer *Fuzzer) triageProgCall(p *prog.Prog, info *flatrpc.CallInfo, call int, flags ProgFlags) bool {
if info == nil {
- return
+ return false
}
prio := signalPrio(p, info, call)
newMaxSignal := fuzzer.Cover.addRawMaxSignal(info.Signal, prio)
if newMaxSignal.Empty() {
- return
+ return false
}
if !fuzzer.Config.NewInputFilter(p.CallName(call)) {
- return
+ return false
}
fuzzer.Logf(2, "found new signal in call %d in %s", call, p)
@@ -180,6 +194,7 @@ func (fuzzer *Fuzzer) triageProgCall(p *prog.Prog, info *flatrpc.CallInfo, call
flags: flags,
queue: queue.Append(),
})
+ return true
}
func signalPrio(p *prog.Prog, info *flatrpc.CallInfo, call int) (prio uint8) {
@@ -211,7 +226,7 @@ func (fuzzer *Fuzzer) genFuzz() *queue.Request {
if req == nil {
req = genProgRequest(fuzzer, rnd)
}
- fuzzer.prepare(req, 0)
+ fuzzer.prepare(req, 0, 0)
return req
}
@@ -246,7 +261,9 @@ func (fuzzer *Fuzzer) Logf(level int, msg string, args ...interface{}) {
type ProgFlags int
const (
- ProgMinimized ProgFlags = 1 << iota
+ // The candidate was loaded from our local corpus rather than come from hub.
+ ProgFromCorpus ProgFlags = 1 << iota
+ ProgMinimized
ProgSmashed
progCandidate
@@ -267,7 +284,7 @@ func (fuzzer *Fuzzer) AddCandidates(candidates []Candidate) {
Stat: fuzzer.statExecCandidate,
Important: true,
}
- fuzzer.enqueue(fuzzer.candidateQueue, req, candidate.Flags|progCandidate)
+ fuzzer.enqueue(fuzzer.candidateQueue, req, candidate.Flags|progCandidate, 0)
}
}
diff --git a/syz-manager/manager.go b/syz-manager/manager.go
index 0837e0251..bbc915556 100644
--- a/syz-manager/manager.go
+++ b/syz-manager/manager.go
@@ -646,7 +646,7 @@ func (mgr *Manager) loadCorpus() {
// By default we don't re-minimize/re-smash programs from corpus,
// it takes lots of time on start and is unnecessary.
// However, on version bumps we can selectively re-minimize/re-smash.
- flags := fuzzer.ProgMinimized | fuzzer.ProgSmashed
+ flags := fuzzer.ProgFromCorpus | fuzzer.ProgMinimized | fuzzer.ProgSmashed
switch mgr.corpusDB.Version {
case 0:
// Version 0 had broken minimization, so we need to re-minimize.
@@ -681,7 +681,7 @@ func (mgr *Manager) loadCorpus() {
mgr.fresh = len(mgr.corpusDB.Records) == 0
seeds := 0
for _, seed := range mgr.seeds {
- _, item := mgr.loadProg(seed, fuzzer.ProgMinimized)
+ _, item := mgr.loadProg(seed, fuzzer.ProgFromCorpus|fuzzer.ProgMinimized)
if item != nil {
candidates = append(candidates, *item)
seeds++
@@ -690,16 +690,6 @@ func (mgr *Manager) loadCorpus() {
log.Logf(0, "%-24v: %v (%v broken, %v seeds)", "corpus", len(candidates), broken, seeds)
mgr.seeds = nil
- // We duplicate all inputs in the corpus and shuffle the second part.
- // This solves the following problem. A fuzzer can crash while triaging candidates,
- // in such case it will also lost all cached candidates. Or, the input can be somewhat flaky
- // and doesn't give the coverage on first try. So we give each input the second chance.
- // Shuffling should alleviate deterministically losing the same inputs on fuzzer crashing.
- candidates = append(candidates, candidates...)
- shuffle := candidates[len(candidates)/2:]
- rand.Shuffle(len(shuffle), func(i, j int) {
- shuffle[i], shuffle[j] = shuffle[j], shuffle[i]
- })
if mgr.phase != phaseInit {
panic(fmt.Sprintf("loadCorpus: bad phase %v", mgr.phase))
}