| author | Dmitry Vyukov <dvyukov@google.com> | 2024-09-12 15:53:24 +0200 |
|---|---|---|
| committer | Dmitry Vyukov <dvyukov@google.com> | 2024-09-12 16:29:23 +0000 |
| commit | 2a91a78df9ed766fac414f94e9d3cc5fa71add55 | |
| tree | 91e80536ed38be1e7b3a7b7a19bb2c61ad1a614a /syz-manager | |
| parent | 41b5d1787276981454609f1e9ca17f335e8223b5 | |
syz-manager: send new inputs to the hub only once
We used to send corpus updates (added/removed elements) to the hub on every sync.
But that produced too much churn, since the hub algorithm is O(N^2) (it distributes everything
to everybody) and lots of new inputs are later removed (either we can't reproduce their coverage
after a restart, or they are dropped during corpus minimization). So now we don't send new inputs
on every sync; instead we aim to send the corpus once, after initial triage. This solves
the problem with non-reproducible/removed inputs. A typical instance lifetime on syzbot is <24h,
and for such instances we send the corpus exactly once. If an instance somehow lives longer
(e.g. a local long-running instance), we re-connect and re-send once in a while.
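
The net effect: the corpus travels to the hub only at connect time, and the periodic sync carries just repros and candidate requests; a long-lived instance forces a full re-send by dropping the connection every 30 hours. Below is a distilled sketch of that loop; `hubConn`, `connect`, `syncOnce`, and `minimizedCorpus` are stand-ins for the real RPC plumbing in the diff further down, with error handling and the `Cover` check elided:

```go
package main

import (
	"fmt"
	"time"
)

// hubConn stands in for the real rpctype.RPCClient connection.
type hubConn struct{}

func (h *hubConn) Close() {}

// connect sends the full minimized corpus exactly once, at connect time.
func connect(corpus [][]byte) *hubConn {
	fmt.Printf("connected, sent corpus of %d programs\n", len(corpus))
	return &hubConn{}
}

// syncOnce exchanges only repros and candidate requests, never corpus updates.
func syncOnce(h *hubConn) {
	fmt.Println("sync: repros/candidates only")
}

func loop(minimizedCorpus func() [][]byte) {
	var hub *hubConn
	var connectTime time.Time
	for ; ; time.Sleep(10 * time.Minute) {
		if hub == nil {
			hub = connect(minimizedCorpus())
			connectTime = time.Now()
		}
		syncOnce(hub)
		// A long-lived instance drops the connection after 30h so that the
		// next iteration re-connects and re-sends the whole corpus.
		if time.Since(connectTime) > 30*time.Hour {
			hub.Close()
			hub = nil
		}
	}
}

func main() {
	loop(func() [][]byte {
		return [][]byte{[]byte("prog-a"), []byte("prog-b")}
	})
}
```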
Diffstat (limited to 'syz-manager')
| -rw-r--r-- | syz-manager/hub.go | 55 |
|---|---|---|
| -rw-r--r-- | syz-manager/manager.go | 13 |

2 files changed, 35 insertions, 33 deletions
```diff
diff --git a/syz-manager/hub.go b/syz-manager/hub.go
index 6acedf37e..9db988484 100644
--- a/syz-manager/hub.go
+++ b/syz-manager/hub.go
@@ -74,7 +74,6 @@ type HubConnector struct {
 	enabledCalls   map[*prog.Syscall]bool
 	leak           bool
 	fresh          bool
-	hubCorpus      map[string]bool
 	newRepros      [][]byte
 	hubReproQueue  chan *manager.Crash
 	needMoreRepros func() bool
@@ -91,7 +90,8 @@ type HubConnector struct {
 
 // HubManagerView restricts interface between HubConnector and Manager.
 type HubManagerView interface {
-	getMinimizedCorpus() (corpus []*corpus.Item, repros [][]byte)
+	getMinimizedCorpus() []*corpus.Item
+	getNewRepros() [][]byte
 	addNewCandidates(candidates []fuzzer.Candidate)
 	needMoreCandidates() bool
 	hubIsUnreachable()
@@ -100,25 +100,28 @@ type HubManagerView interface {
 func (hc *HubConnector) loop() {
 	var hub *rpctype.RPCClient
 	var doneOnce bool
+	var connectTime time.Time
 	for query := 0; ; time.Sleep(10 * time.Minute) {
-		corpus, repros := hc.mgr.getMinimizedCorpus()
-		if !hc.cfg.Cover {
+		if hub == nil {
+			var corpus []*corpus.Item
 			// If we are using fake coverage, don't send our corpus to the hub.
 			// It should be lower quality than coverage-guided corpus.
 			// However still send repros and accept new inputs.
-			corpus = nil
-		}
-		hc.newRepros = append(hc.newRepros, repros...)
-		if hub == nil {
+			if hc.cfg.Cover {
+				corpus = hc.mgr.getMinimizedCorpus()
+			}
 			var err error
 			if hub, err = hc.connect(corpus); err != nil {
 				log.Logf(0, "failed to connect to hub at %v: %v", hc.cfg.HubAddr, err)
 			} else {
 				log.Logf(0, "connected to hub at %v, corpus %v", hc.cfg.HubAddr, len(corpus))
+				connectTime = time.Now()
 			}
 		}
 		if hub != nil && hc.mgr.needMoreCandidates() {
-			if err := hc.sync(hub, corpus); err != nil {
+			repros := hc.mgr.getNewRepros()
+			hc.newRepros = append(hc.newRepros, repros...)
+			if err := hc.sync(hub); err != nil {
 				log.Logf(0, "hub sync failed: %v", err)
 				hub.Close()
 				hub = nil
@@ -131,6 +134,19 @@ func (hc *HubConnector) loop() {
 		if hub == nil && query >= maxAttempts && !doneOnce {
 			hc.mgr.hubIsUnreachable()
 		}
+		// We used to send corpus updates (added/removed elements) to the hub in each sync.
+		// But that produced too much churn since hub algorithm is O(N^2) (distributing everything
+		// to everybody), and lots of new inputs are later removed (either we can't reproduce coverage
+		// after restart, or inputs removed during corpus minimization). So now we don't send new inputs
+		// in each sync, instead we aim at sending corpus once after initial triage. This solves
+		// the problem with non-reproducible/removed inputs. Typical instance life-time on syzbot is <24h,
+		// for such instances we send the corpus once. If an instance somehow lives for longer, then we
+		// re-connect and re-send once in a while (e.g. a local long-running instance).
+		if hub != nil && time.Since(connectTime) > 30*time.Hour {
+			log.Logf(0, "re-syncing with hub")
+			hub.Close()
+			hub = nil
+		}
 	}
 }
 
@@ -153,9 +169,7 @@ func (hc *HubConnector) connect(corpus []*corpus.Item) (*rpctype.RPCClient, erro
 	for call := range hc.enabledCalls {
 		a.Calls = append(a.Calls, call.Name)
 	}
-	hubCorpus := make(map[string]bool)
 	for _, inp := range corpus {
-		hubCorpus[inp.Sig] = true
 		a.Corpus = append(a.Corpus, inp.Prog.Serialize())
 	}
 	// Never send more than this, this is never healthy but happens episodically
@@ -176,12 +190,11 @@ func (hc *HubConnector) connect(corpus []*corpus.Item) (*rpctype.RPCClient, erro
 	if err != nil {
 		return nil, err
 	}
-	hc.hubCorpus = hubCorpus
 	hc.fresh = false
 	return hub, nil
 }
 
-func (hc *HubConnector) sync(hub *rpctype.RPCClient, corpus []*corpus.Item) error {
+func (hc *HubConnector) sync(hub *rpctype.RPCClient) error {
 	key, err := hc.keyGet()
 	if err != nil {
 		return err
@@ -191,22 +204,6 @@ func (hc *HubConnector) sync(hub *rpctype.RPCClient, corpus []*corpus.Item) erro
 		Key:     key,
 		Manager: hc.cfg.Name,
 	}
-	sigs := make(map[string]bool)
-	for _, inp := range corpus {
-		sigs[inp.Sig] = true
-		if hc.hubCorpus[inp.Sig] {
-			continue
-		}
-		hc.hubCorpus[inp.Sig] = true
-		a.Add = append(a.Add, inp.Prog.Serialize())
-	}
-	for sig := range hc.hubCorpus {
-		if sigs[sig] {
-			continue
-		}
-		delete(hc.hubCorpus, sig)
-		a.Del = append(a.Del, sig)
-	}
 	if hc.needMoreRepros != nil {
 		a.NeedRepros = hc.needMoreRepros()
 	}
diff --git a/syz-manager/manager.go b/syz-manager/manager.go
index c090d1a0d..71168fb82 100644
--- a/syz-manager/manager.go
+++ b/syz-manager/manager.go
@@ -974,14 +974,19 @@ func (mgr *Manager) corpusInputHandler(updates <-chan corpus.NewItemEvent) {
 	}
 }
 
-func (mgr *Manager) getMinimizedCorpus() (corpus []*corpus.Item, repros [][]byte) {
+func (mgr *Manager) getMinimizedCorpus() []*corpus.Item {
 	mgr.mu.Lock()
 	defer mgr.mu.Unlock()
 	mgr.minimizeCorpusLocked()
-	corpus = mgr.corpus.Items()
-	repros = mgr.newRepros
+	return mgr.corpus.Items()
+}
+
+func (mgr *Manager) getNewRepros() [][]byte {
+	mgr.mu.Lock()
+	defer mgr.mu.Unlock()
+	repros := mgr.newRepros
 	mgr.newRepros = nil
-	return
+	return repros
 }
 
 func (mgr *Manager) addNewCandidates(candidates []fuzzer.Candidate) {
```
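
On the manager side, the old combined getter is split so that repros can be drained independently of corpus snapshots. The sketch below isolates the drain-under-lock pattern that the new `getNewRepros` uses; the `reproBuffer` type is a hypothetical stand-in, the real method lives on `Manager`:

```go
package main

import (
	"fmt"
	"sync"
)

// reproBuffer is a hypothetical stand-in for the Manager fields involved:
// a mutex plus an accumulating slice of serialized repros.
type reproBuffer struct {
	mu     sync.Mutex
	repros [][]byte
}

// add is called by crash-handling code as new repros arrive.
func (b *reproBuffer) add(r []byte) {
	b.mu.Lock()
	defer b.mu.Unlock()
	b.repros = append(b.repros, r)
}

// drain mirrors Manager.getNewRepros: it hands back everything accumulated
// so far and resets the buffer, so each repro is returned exactly once.
func (b *reproBuffer) drain() [][]byte {
	b.mu.Lock()
	defer b.mu.Unlock()
	repros := b.repros
	b.repros = nil
	return repros
}

func main() {
	var buf reproBuffer
	buf.add([]byte("repro-1"))
	buf.add([]byte("repro-2"))
	fmt.Println(len(buf.drain())) // 2: both repros handed off
	fmt.Println(len(buf.drain())) // 0: nothing accumulated since
}
```

Taking the slice and resetting the field under the same lock means concurrent `add` calls can never be lost or delivered twice: each repro either lands before a drain (and is returned by it) or after (and waits for the next one).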
