pkg/manager: better handle the reproduction queue

As pingQueue holds maximum 1 element, it's not a very good indicator of whether there are more elements in the reproduction queue. If the first few crashes happen to be no longer needed (which is quite likely when reproduction loop is started long after we have begun to collect the crashes), the loop becomes reactive - new reproductions will be started only once new bugs are submitted to the queue. Fix it by polling the queue until it returns nil. Add a test that exposes the previously existing bug.
author: Aleksandr Nogikh <nogikh@google.com> 2025-04-22 17:24:38 +0200
committer: Aleksandr Nogikh <nogikh@google.com> 2025-04-23 09:01:11 +0000
commit: d00e39af8a96f7fe18fa5664936a7d658f80eeef (patch)
tree: 01ed97e71880c4ad070ab95b5df6c0f36470c87f
parent: 6eb12242337d83872574660e27cfe2e8db463e48 (diff)
2 files changed, 45 insertions, 6 deletions
diff --git a/pkg/manager/repro.go b/pkg/manager/repro.go
index f9fb2d0e2..28b676752 100644
--- a/pkg/manager/repro.go
+++ b/pkg/manager/repro.go
@@ -188,11 +188,15 @@ func (r *ReproLoop) Loop(ctx context.Context) {
 		for {
 			if crash != nil && !r.mgr.NeedRepro(crash) {
 				log.Logf(1, "reproduction of %q aborted: it's no longer needed", crash.FullTitle())
-				crash = nil
 				// Now we might not need that many VMs.
 				r.mu.Lock()
 				r.adjustPoolSizeLocked()
 				r.mu.Unlock()
+
+				// Immediately check if there was any other crash in the queue, so that we fall back
+				// to waiting on pingQueue only if there were really no other crashes in the queue.
+				crash = r.popCrash()
+				continue
 			}
 			if crash != nil {
 				break
diff --git a/pkg/manager/repro_test.go b/pkg/manager/repro_test.go
index d66bee8d0..909d6ffeb 100644
--- a/pkg/manager/repro_test.go
+++ b/pkg/manager/repro_test.go
@@ -97,8 +97,12 @@ func TestReproOrder(t *testing.T) {
 }
 
 func TestReproRWRace(t *testing.T) {
+	var reproProgExist atomic.Bool
 	mock := &reproMgrMock{
 		run: make(chan runCallback),
+		needReproCb: func(_ *Crash) bool {
+			return !reproProgExist.Load()
+		},
 	}
 	obj := NewReproLoop(mock, 3, false)
 
@@ -114,7 +118,7 @@ func TestReproRWRace(t *testing.T) {
 	called := <-mock.run
 	// Pretend that processRepro() is finished and
 	// we've written "repro.prog" to the disk.
-	mock.reproProgExist.Store(true)
+	reproProgExist.Store(true)
 	assert.False(t, mock.NeedRepro(nil))
 	called.ret <- &ReproResult{}
 	assert.True(t, obj.CanReproMore())
@@ -145,10 +149,38 @@ func TestCancelRunningRepro(t *testing.T) {
 	done()
 }
 
+func TestEnqueueTriggersRepro(t *testing.T) {
+	mock := &reproMgrMock{
+		run: make(chan runCallback),
+		needReproCb: func(crash *Crash) bool {
+			return crash.FullTitle() == "C"
+		},
+	}
+	obj := NewReproLoop(mock, 1, false)
+	obj.Enqueue(&Crash{Report: &report.Report{Title: "A"}, Manual: true})
+	obj.Enqueue(&Crash{Report: &report.Report{Title: "B"}, Manual: true})
+	obj.Enqueue(&Crash{Report: &report.Report{Title: "C"}})
+
+	ctx, done := context.WithCancel(context.Background())
+	complete := make(chan struct{})
+	go func() {
+		obj.Loop(ctx)
+		close(complete)
+	}()
+
+	defer func() {
+		<-complete
+	}()
+	// The test will hang if the loop never picks up the title C.
+	crash := <-mock.run
+	assert.Equal(t, "C", crash.crash.FullTitle())
+	done()
+}
+
 type reproMgrMock struct {
-	reserved       atomic.Int64
-	run            chan runCallback
-	reproProgExist atomic.Bool
+	reserved    atomic.Int64
+	run         chan runCallback
+	needReproCb func(*Crash) bool
 }
 
 type runCallback struct {
@@ -187,7 +219,10 @@ func (m *reproMgrMock) RunRepro(ctx context.Context, crash *Crash) *ReproResult
 }
 
 func (m *reproMgrMock) NeedRepro(crash *Crash) bool {
-	return !m.reproProgExist.Load()
+	if m.needReproCb != nil {
+		return m.needReproCb(crash)
+	}
+	return true
 }
 
 func (m *reproMgrMock) ResizeReproPool(VMs int) {
author	Aleksandr Nogikh <nogikh@google.com>	2025-04-22 17:24:38 +0200
committer	Aleksandr Nogikh <nogikh@google.com>	2025-04-23 09:01:11 +0000
commit	d00e39af8a96f7fe18fa5664936a7d658f80eeef (patch)
tree	01ed97e71880c4ad070ab95b5df6c0f36470c87f
parent	6eb12242337d83872574660e27cfe2e8db463e48 (diff)