2 files changed, 77 insertions, 15 deletions
diff --git a/pkg/repro/repro.go b/pkg/repro/repro.go
index daebeed18..dc680cbb5 100644
--- a/pkg/repro/repro.go
+++ b/pkg/repro/repro.go
@@ -516,12 +516,14 @@ func (ctx *context) testProg(p *prog.Prog, duration time.Duration, opts csource.
 
 func (ctx *context) testWithInstance(callback func(execInterface) (rep *instance.RunResult,
 	err error)) (bool, error) {
-	inst := <-ctx.instances
-	if inst == nil {
-		return false, fmt.Errorf("all VMs failed to boot")
+	result, err := ctx.runOnInstance(callback)
+	if err != nil {
+		// It's hard to classify all kinds of errors into the one worth repeating
+		// and not. So let's just retry run for all errors.
+		// If the problem is transient, it will likely go away.
+		// If the problem is permanent, it will just be the same.
+		result, err = ctx.runOnInstance(callback)
 	}
-	defer ctx.returnInstance(inst)
-	result, err := callback(inst.execProg)
 	if err != nil {
 		return false, err
 	}
@@ -541,6 +543,17 @@ func (ctx *context) testWithInstance(callback func(execInterface) (rep *instance
 	return true, nil
 }
 
+// A helper method for testWithInstance.
+func (ctx *context) runOnInstance(callback func(execInterface) (rep *instance.RunResult,
+	err error)) (*instance.RunResult, error) {
+	inst := <-ctx.instances
+	if inst == nil {
+		return nil, fmt.Errorf("all VMs failed to boot")
+	}
+	defer ctx.returnInstance(inst)
+	return callback(inst.execProg)
+}
+
 func encodeEntries(entries []*prog.LogEntry) []byte {
 	buf := new(bytes.Buffer)
 	for _, ent := range entries {
diff --git a/pkg/repro/repro_test.go b/pkg/repro/repro_test.go
index ebe3809e0..c6c4cbe8d 100644
--- a/pkg/repro/repro_test.go
+++ b/pkg/repro/repro_test.go
@@ -4,6 +4,7 @@
 package repro
 
 import (
+	"fmt"
 	"math/rand"
 	"regexp"
 	"sync"
@@ -178,24 +179,53 @@ alarm(0xa)
 getpid()
 `
 
+// Only crash if `pause()` is followed by `alarm(0xa)`.
+var testCrashCondition = regexp.MustCompile(`(?s)pause\(\).*alarm\(0xa\)`)
+
+func testExecRunner(log []byte) (*instance.RunResult, error) {
+	crash := testCrashCondition.Match(log)
+	if crash {
+		ret := &instance.RunResult{}
+		ret.Report = &report.Report{
+			Title: `some crash`,
+		}
+		return ret, nil
+	}
+	return &instance.RunResult{}, nil
+}
+
 // Just a pkg/repro smoke test: check that we can extract a two-call reproducer.
 // No focus on error handling and minor corner cases.
 func TestPlainRepro(t *testing.T) {
 	ctx := prepareTestCtx(t, testReproLog)
-	// Only crash if `pause()` is followed by `alarm(0xa)`.
-	var match = regexp.MustCompile(`(?s)pause\(\).*alarm\(0xa\)`)
+	go generateTestInstances(ctx, 3, &testExecInterface{
+		t:   t,
+		run: testExecRunner,
+	})
+	result, _, err := ctx.run()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if diff := cmp.Diff(`pause()
+alarm(0xa)
+`, string(result.Prog.Serialize())); diff != "" {
+		t.Fatal(diff)
+	}
+}
+
+// There happen to be transient errors like ssh/scp connection failures.
+// Ensure that the code just retries.
+func TestVMErrorResilience(t *testing.T) {
+	ctx := prepareTestCtx(t, testReproLog)
+	fail := false
 	go generateTestInstances(ctx, 3, &testExecInterface{
 		t: t,
 		run: func(log []byte) (*instance.RunResult, error) {
-			crash := match.Match(log)
-			if crash {
-				ret := &instance.RunResult{}
-				ret.Report = &report.Report{
-					Title: `some crash`,
-				}
-				return ret, nil
+			fail = !fail
+			if fail {
+				return nil, fmt.Errorf("some random error")
 			}
-			return &instance.RunResult{}, nil
+			return testExecRunner(log)
 		},
 	})
 	result, _, err := ctx.run()
@@ -208,3 +238,22 @@ alarm(0xa)
 		t.Fatal(diff)
 	}
 }
+
+func TestTooManyErrors(t *testing.T) {
+	ctx := prepareTestCtx(t, testReproLog)
+	counter := 0
+	go generateTestInstances(ctx, 3, &testExecInterface{
+		t: t,
+		run: func(log []byte) (*instance.RunResult, error) {
+			counter++
+			if counter%3 != 0 {
+				return nil, fmt.Errorf("some random error")
+			}
+			return testExecRunner(log)
+		},
+	})
+	_, _, err := ctx.run()
+	if err == nil {
+		t.Fatalf("expected an error")
+	}
+}