From 7b630fdbfd1d53f913e0aff0dfa8ebfbaf86652b Mon Sep 17 00:00:00 2001 From: Aleksandr Nogikh Date: Wed, 19 Jul 2023 16:40:52 +0200 Subject: pkg/repro: tolerate two consecutive run errors Retrying once has greatly reduced the number of "failed to copy prog to VM" errors, but they still periodically pop up. The underlying problem is still not 100% known. Supposedly, if a booted VM with an instrumented kernel has to wait too long, it can just hang or crash by itself. At least on some problematic revisions. Investigation would be quite time-consuming -- we need to do a complicated refactoring in order to also capture serial output for Copy() failures. So far it does not seem to be totally worth it. Let's do 3 runOnInstance() attempts. If the problem still persists, there's no point in doing more runs -- we'd have to determine the exact root cause. --- pkg/repro/repro.go | 12 +++++++++--- pkg/repro/repro_test.go | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'pkg/repro') diff --git a/pkg/repro/repro.go b/pkg/repro/repro.go index b774705e6..1da74830f 100644 --- a/pkg/repro/repro.go +++ b/pkg/repro/repro.go @@ -527,13 +527,19 @@ func (ctx *context) testProg(p *prog.Prog, duration time.Duration, opts csource. func (ctx *context) testWithInstance(callback func(execInterface) (rep *instance.RunResult, err error)) (bool, error) { - result, err := ctx.runOnInstance(callback) - if err != nil { + var result *instance.RunResult + var err error + + const attempts = 3 + for i := 0; i < attempts; i++ { // It's hard to classify all kinds of errors into the one worth repeating - // and not. So let's just retry run for all errors. + // and not. So let's just retry runs for all errors. // If the problem is transient, it will likely go away. // If the problem is permanent, it will just be the same. 
result, err = ctx.runOnInstance(callback) + if err == nil { + break + } } if err != nil { return false, err diff --git a/pkg/repro/repro_test.go b/pkg/repro/repro_test.go index 0e58685b7..fc8d9cad0 100644 --- a/pkg/repro/repro_test.go +++ b/pkg/repro/repro_test.go @@ -247,7 +247,7 @@ func TestTooManyErrors(t *testing.T) { t: t, run: func(log []byte) (*instance.RunResult, error) { counter++ - if counter%3 != 0 { + if counter%4 != 0 { return nil, fmt.Errorf("some random error") } return testExecRunner(log) -- cgit mrf-deployment