From b840654d3ada1d4c3ade855affaa8b53cdd92346 Mon Sep 17 00:00:00 2001
From: Sabyrzhan Tasbolatov <snovitoll@gmail.com>
Date: Tue, 20 Aug 2024 12:42:25 +0500
Subject: syz-manager: sync saveRepro() and needLocalRepro()

There is a race condition of saving and accessing "repro.prog" file in
processFuzzingResults() goroutine.

needRepro() which is called from reproMgr.Loop() or upon <-externalReproQueue,
or at the end of saveCrash() can be called before saveRepro().

Removed reproMgr.Done channel and moved to post-processing of
*ReproResult to runRepro(). This should guarantee the order of RW
operations on "repro.prog" file.

$ stress ./syz-manager.test -test.run=TestReproRWRace
5s: 16 runs so far, 0 failures, 8 active
10s: 40 runs so far, 0 failures, 8 active
15s: 64 runs so far, 0 failures, 8 active
20s: 80 runs so far, 0 failures, 8 active
25s: 96 runs so far, 0 failures, 8 active
30s: 112 runs so far, 0 failures, 8 active
35s: 136 runs so far, 0 failures, 8 active
40s: 152 runs so far, 0 failures, 8 active
45s: 168 runs so far, 0 failures, 8 active
50s: 184 runs so far, 0 failures, 8 active
55s: 200 runs so far, 0 failures, 8 active
1m0s: 216 runs so far, 0 failures, 8 active

Fixes: https://github.com/google/syzkaller/issues/5157
---
 syz-manager/manager.go    | 35 +++++++++++++-----------
 syz-manager/repro.go      |  5 ----
 syz-manager/repro_test.go | 68 ++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 76 insertions(+), 32 deletions(-)

diff --git a/syz-manager/manager.go b/syz-manager/manager.go
index 704129b1b..6657ed2f6 100644
--- a/syz-manager/manager.go
+++ b/syz-manager/manager.go
@@ -401,21 +401,6 @@ func (mgr *Manager) processFuzzingResults(ctx context.Context) {
 			if crash != nil {
 				mgr.saveCrash(crash)
 			}
-		case res := <-mgr.reproMgr.Done:
-			if res.err != nil {
-				reportReproError(res.err)
-			}
-			if res.repro == nil {
-				if res.crash.Title == "" {
-					log.Logf(1, "repro '%v' not from dashboard, so not reporting the failure",
-						res.crash.FullTitle())
-				} else {
-					log.Logf(1, "report repro failure of '%v'", res.crash.Title)
-					mgr.saveFailedRepro(res.crash.Report, res.stats)
-				}
-			} else {
-				mgr.saveRepro(res)
-			}
 		case crash := <-mgr.externalReproQueue:
 			if mgr.needRepro(crash) {
 				mgr.reproMgr.Enqueue(crash)
@@ -491,9 +476,29 @@ func (mgr *Manager) runRepro(crash *Crash) *ReproResult {
 			}
 		}
 	}
+
+	mgr.processRepro(ret)
+
 	return ret
 }
 
+func (mgr *Manager) processRepro(res *ReproResult) {
+	if res.err != nil {
+		reportReproError(res.err)
+	}
+	if res.repro == nil {
+		if res.crash.Title == "" {
+			log.Logf(1, "repro '%v' not from dashboard, so not reporting the failure",
+				res.crash.FullTitle())
+		} else {
+			log.Logf(1, "report repro failure of '%v'", res.crash.Title)
+			mgr.saveFailedRepro(res.crash.Report, res.stats)
+		}
+	} else {
+		mgr.saveRepro(res)
+	}
+}
+
 func (mgr *Manager) preloadCorpus() {
 	corpusDB, err := db.Open(filepath.Join(mgr.cfg.Workdir, "corpus.db"), true)
 	if err != nil {
diff --git a/syz-manager/repro.go b/syz-manager/repro.go
index 11812b83b..3232bd9f3 100644
--- a/syz-manager/repro.go
+++ b/syz-manager/repro.go
@@ -20,8 +20,6 @@ type reproManagerView interface {
 }
 
 type reproManager struct {
-	Done chan *ReproResult
-
 	statNumReproducing *stat.Val
 	statPending        *stat.Val
 
@@ -39,8 +37,6 @@ type reproManager struct {
 
 func newReproManager(mgr reproManagerView, reproVMs int, onlyOnce bool) *reproManager {
 	ret := &reproManager{
-		Done: make(chan *ReproResult, 10),
-
 		mgr:         mgr,
 		onlyOnce:    onlyOnce,
 		parallel:    make(chan struct{}, reproVMs),
@@ -217,7 +213,6 @@ func (m *reproManager) handle(crash *Crash) {
 	log.Logf(0, "repro finished '%v', repro=%v crepro=%v desc='%v' hub=%v from_dashboard=%v",
 		crash.FullTitle(), res.repro != nil, crepro, title, crash.fromHub, crash.fromDashboard,
 	)
-	m.Done <- res
 }
 
 func (m *reproManager) adjustPoolSizeLocked() {
diff --git a/syz-manager/repro_test.go b/syz-manager/repro_test.go
index 8042c7042..84ab94a6b 100644
--- a/syz-manager/repro_test.go
+++ b/syz-manager/repro_test.go
@@ -56,15 +56,7 @@ func TestReproManager(t *testing.T) {
 	called.ret <- &ReproResult{crash: &Crash{fromHub: true}}
 	called2.ret <- &ReproResult{crash: &Crash{fromHub: true}}
 
-	// Wait until the number of reserved VMs goes to 0.
-	for i := 0; i < 100; i++ {
-		if mock.reserved.Load() == 0 {
-			assert.True(t, obj.CanReproMore())
-			return
-		}
-		time.Sleep(10 * time.Millisecond)
-	}
-	t.Fatal("reserved VMs must have dropped to 0")
+	mock.onVMShutdown(t, obj)
 }
 
 func TestReproOrder(t *testing.T) {
@@ -105,9 +97,48 @@ func TestReproOrder(t *testing.T) {
 	assert.Equal(t, crashes[2], obj.popCrash())
 }
 
+func TestReproRWRace(t *testing.T) {
+	mock := &reproMgrMock{
+		run: make(chan runCallback),
+	}
+	obj := newReproManager(mock, 3, false)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	go func() {
+		obj.Loop(ctx) // calls runRepro()
+	}()
+
+	assert.False(t, obj.CanReproMore())
+	obj.StartReproduction()
+	assert.True(t, obj.CanReproMore())
+
+	obj.Enqueue(&Crash{Report: &report.Report{Title: "A"}})
+	obj.Enqueue(&Crash{Report: &report.Report{Title: "A"}})
+
+	assert.True(t, mock.needRepro(nil))
+	called := <-mock.run
+	called.ret <- &ReproResult{}
+	// Pretend that processRepro() is finished and
+	// we've written "repro.prog" to the disk.
+	mock.reproProgExist.Store(true)
+	assert.False(t, mock.needRepro(nil))
+	assert.True(t, obj.CanReproMore())
+
+	called2 := <-mock.run
+	called2.ret <- &ReproResult{}
+	assert.False(t, mock.needRepro(nil))
+	assert.True(t, obj.CanReproMore())
+
+	// Reproducers may be still running.
+	mock.onVMShutdown(t, obj)
+}
+
 type reproMgrMock struct {
-	reserved atomic.Int64
-	run      chan runCallback
+	reserved       atomic.Int64
+	run            chan runCallback
+	reproProgExist atomic.Bool
 }
 
 type runCallback struct {
@@ -115,6 +146,19 @@ type runCallback struct {
 	ret   chan *ReproResult
 }
 
+// Wait until the number of reserved VMs goes to 0.
+func (m *reproMgrMock) onVMShutdown(t *testing.T, reproMgr *reproManager) {
+	for i := 0; i < 100; i++ {
+		if m.reserved.Load() == 0 {
+			assert.True(t, reproMgr.CanReproMore())
+			assert.True(t, reproMgr.Empty())
+			return
+		}
+		time.Sleep(10 * time.Millisecond)
+	}
+	t.Fatal("reserved VMs must have dropped to 0")
+}
+
 func (m *reproMgrMock) runRepro(crash *Crash) *ReproResult {
 	retCh := make(chan *ReproResult)
 	m.run <- runCallback{crash: crash, ret: retCh}
@@ -124,7 +168,7 @@ func (m *reproMgrMock) runRepro(crash *Crash) *ReproResult {
 }
 
 func (m *reproMgrMock) needRepro(crash *Crash) bool {
-	return true
+	return !m.reproProgExist.Load()
 }
 
 func (m *reproMgrMock) resizeReproPool(VMs int) {
-- 
cgit mrf-deployment