From b840654d3ada1d4c3ade855affaa8b53cdd92346 Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Tue, 20 Aug 2024 12:42:25 +0500 Subject: syz-manager: sync saveRepro() and needLocalRepro() There is a race condition of saving and accessing "repro.prog" file in processFuzzingResults() goroutine. needRepro() which is called from reproMgr.Loop() or upon <-externalReproQueue, or at the end of saveCrash() can be called before saveRepro(). Removed reproMgr.Done channel and moved to post-processing of *ReproResult to runRepro(). This should guarantee the order of RW operations on "repro.prog" file. $ stress ./syz-manager.test -test.run=TestReproRWRace 5s: 16 runs so far, 0 failures, 8 active 10s: 40 runs so far, 0 failures, 8 active 15s: 64 runs so far, 0 failures, 8 active 20s: 80 runs so far, 0 failures, 8 active 25s: 96 runs so far, 0 failures, 8 active 30s: 112 runs so far, 0 failures, 8 active 35s: 136 runs so far, 0 failures, 8 active 40s: 152 runs so far, 0 failures, 8 active 45s: 168 runs so far, 0 failures, 8 active 50s: 184 runs so far, 0 failures, 8 active 55s: 200 runs so far, 0 failures, 8 active 1m0s: 216 runs so far, 0 failures, 8 active Fixes: https://github.com/google/syzkaller/issues/5157 --- syz-manager/manager.go | 35 +++++++++++++----------- syz-manager/repro.go | 5 ---- syz-manager/repro_test.go | 68 ++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 76 insertions(+), 32 deletions(-) diff --git a/syz-manager/manager.go b/syz-manager/manager.go index 704129b1b..6657ed2f6 100644 --- a/syz-manager/manager.go +++ b/syz-manager/manager.go @@ -401,21 +401,6 @@ func (mgr *Manager) processFuzzingResults(ctx context.Context) { if crash != nil { mgr.saveCrash(crash) } - case res := <-mgr.reproMgr.Done: - if res.err != nil { - reportReproError(res.err) - } - if res.repro == nil { - if res.crash.Title == "" { - log.Logf(1, "repro '%v' not from dashboard, so not reporting the failure", - res.crash.FullTitle()) - } else { - log.Logf(1, "report repro failure of '%v'", res.crash.Title) - mgr.saveFailedRepro(res.crash.Report, res.stats) - } - } else { - mgr.saveRepro(res) - } case crash := <-mgr.externalReproQueue: if mgr.needRepro(crash) { mgr.reproMgr.Enqueue(crash) @@ -491,9 +476,29 @@ func (mgr *Manager) runRepro(crash *Crash) *ReproResult { } } } + + mgr.processRepro(ret) + return ret } +func (mgr *Manager) processRepro(res *ReproResult) { + if res.err != nil { + reportReproError(res.err) + } + if res.repro == nil { + if res.crash.Title == "" { + log.Logf(1, "repro '%v' not from dashboard, so not reporting the failure", + res.crash.FullTitle()) + } else { + log.Logf(1, "report repro failure of '%v'", res.crash.Title) + mgr.saveFailedRepro(res.crash.Report, res.stats) + } + } else { + mgr.saveRepro(res) + } +} + func (mgr *Manager) preloadCorpus() { corpusDB, err := db.Open(filepath.Join(mgr.cfg.Workdir, "corpus.db"), true) if err != nil { diff --git a/syz-manager/repro.go b/syz-manager/repro.go index 11812b83b..3232bd9f3 100644 --- a/syz-manager/repro.go +++ b/syz-manager/repro.go @@ -20,8 +20,6 @@ type reproManagerView interface { } type reproManager struct { - Done chan *ReproResult - statNumReproducing *stat.Val statPending *stat.Val @@ -39,8 +37,6 @@ type reproManager struct { func newReproManager(mgr reproManagerView, reproVMs int, onlyOnce bool) *reproManager { ret := &reproManager{ - Done: make(chan *ReproResult, 10), - mgr: mgr, onlyOnce: onlyOnce, parallel: make(chan struct{}, reproVMs), @@ -217,7 +213,6 @@ func (m *reproManager) handle(crash *Crash) { log.Logf(0, "repro finished '%v', repro=%v crepro=%v desc='%v' hub=%v from_dashboard=%v", crash.FullTitle(), res.repro != nil, crepro, title, crash.fromHub, crash.fromDashboard, ) - m.Done <- res } func (m *reproManager) adjustPoolSizeLocked() { diff --git a/syz-manager/repro_test.go b/syz-manager/repro_test.go index 8042c7042..84ab94a6b 100644 --- a/syz-manager/repro_test.go +++ b/syz-manager/repro_test.go @@ -56,15 +56,7 @@ func TestReproManager(t *testing.T) { called.ret <- &ReproResult{crash: &Crash{fromHub: true}} called2.ret <- &ReproResult{crash: &Crash{fromHub: true}} - // Wait until the number of reserved VMs goes to 0. - for i := 0; i < 100; i++ { - if mock.reserved.Load() == 0 { - assert.True(t, obj.CanReproMore()) - return - } - time.Sleep(10 * time.Millisecond) - } - t.Fatal("reserved VMs must have dropped to 0") + mock.onVMShutdown(t, obj) } func TestReproOrder(t *testing.T) { @@ -105,9 +97,48 @@ func TestReproOrder(t *testing.T) { assert.Equal(t, crashes[2], obj.popCrash()) } +func TestReproRWRace(t *testing.T) { + mock := &reproMgrMock{ + run: make(chan runCallback), + } + obj := newReproManager(mock, 3, false) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + go func() { + obj.Loop(ctx) // calls runRepro() + }() + + assert.False(t, obj.CanReproMore()) + obj.StartReproduction() + assert.True(t, obj.CanReproMore()) + + obj.Enqueue(&Crash{Report: &report.Report{Title: "A"}}) + obj.Enqueue(&Crash{Report: &report.Report{Title: "A"}}) + + assert.True(t, mock.needRepro(nil)) + called := <-mock.run + called.ret <- &ReproResult{} + // Pretend that processRepro() is finished and + // we've written "repro.prog" to the disk. + mock.reproProgExist.Store(true) + assert.False(t, mock.needRepro(nil)) + assert.True(t, obj.CanReproMore()) + + called2 := <-mock.run + called2.ret <- &ReproResult{} + assert.False(t, mock.needRepro(nil)) + assert.True(t, obj.CanReproMore()) + + // Reproducers may be still running. + mock.onVMShutdown(t, obj) +} + type reproMgrMock struct { - reserved atomic.Int64 - run chan runCallback + reserved atomic.Int64 + run chan runCallback + reproProgExist atomic.Bool } type runCallback struct { @@ -115,6 +146,19 @@ type runCallback struct { ret chan *ReproResult } +// Wait until the number of reserved VMs goes to 0. +func (m *reproMgrMock) onVMShutdown(t *testing.T, reproMgr *reproManager) { + for i := 0; i < 100; i++ { + if m.reserved.Load() == 0 { + assert.True(t, reproMgr.CanReproMore()) + assert.True(t, reproMgr.Empty()) + return + } + time.Sleep(10 * time.Millisecond) + } + t.Fatal("reserved VMs must have dropped to 0") +} + func (m *reproMgrMock) runRepro(crash *Crash) *ReproResult { retCh := make(chan *ReproResult) m.run <- runCallback{crash: crash, ret: retCh} @@ -124,7 +168,7 @@ func (m *reproMgrMock) runRepro(crash *Crash) *ReproResult { } func (m *reproMgrMock) needRepro(crash *Crash) bool { - return true + return !m.reproProgExist.Load() } func (m *reproMgrMock) resizeReproPool(VMs int) { -- cgit mrf-deployment