pkg/bisect: estimate confidence in the result

Estimate the reproducer's flakiness more carefully, as it can help us get better results. During config minimization, do not let reproducibility drop too low. For each test() run, estimate our confidence in the result. For now, only consider the false negative case: if we got no crashes, the bug may still be there, but the reproducer was not lucky enough to trigger it. Given an estimate of the per-run reproduction likelihood, we can easily calculate the chance of such an invalid result: (1-probability)^runs. Combine all individual test() confidences into Result.Confidence. If the resulting Confidence is too low, ignore the bisection result.
author: Aleksandr Nogikh <nogikh@google.com> 2023-07-12 20:32:59 +0200
committer: Aleksandr Nogikh <nogikh@google.com> 2023-07-14 12:36:52 +0000
commit: 35d9ecc508aef508b67ee7986a7abb0864e74f8e (patch)
tree: 5b044a39374c6e3b7c91d14974b7abe1268d62b0 /pkg/bisect/bisect_test.go
parent: d624500f3877323fae8eb084872c5ef9a8ce3ef9 (diff)
1 files changed, 16 insertions, 2 deletions
diff --git a/pkg/bisect/bisect_test.go b/pkg/bisect/bisect_test.go
index 0c4775b0c..05b67008d 100644
--- a/pkg/bisect/bisect_test.go
+++ b/pkg/bisect/bisect_test.go
@@ -214,6 +214,9 @@ func testBisection(t *testing.T, baseDir string, test BisectionTest) {
 		} else {
 			checkBisectionResult(t, test, res)
 		}
+		if test.extraTest != nil {
+			test.extraTest(t, res)
+		}
 	}
 
 	res, err := runImpl(cfg, r, inst)
@@ -289,6 +292,8 @@ type BisectionTest struct {
 	baselineConfig  string
 	resultingConfig string
 	crossTree       bool
+
+	extraTest func(t *testing.T, res *Result)
 }
 
 var bisectionTests = []BisectionTest{
@@ -299,14 +304,23 @@ var bisectionTests = []BisectionTest{
 		commitLen:   1,
 		expectRep:   true,
 		culprit:     602,
+		extraTest: func(t *testing.T, res *Result) {
+			assert.Greater(t, res.Confidence, 0.99)
+		},
 	},
 	{
-		name:        "cause-finds-cause",
+		name:        "cause-finds-cause-flaky",
 		startCommit: 905,
 		commitLen:   1,
 		expectRep:   true,
 		flaky:       true,
-		culprit:     602,
+		culprit:     605,
+		extraTest: func(t *testing.T, res *Result) {
+			// False negative probability of each run is ~35%.
+			// We get two "good" results, so our accumulated confidence is ~42%.
+			assert.Less(t, res.Confidence, 0.5)
+			assert.Greater(t, res.Confidence, 0.4)
+		},
 	},
 	// Test bisection returns correct cause with different baseline/config combinations.
 	{
author	Aleksandr Nogikh <nogikh@google.com>	2023-07-12 20:32:59 +0200
committer	Aleksandr Nogikh <nogikh@google.com>	2023-07-14 12:36:52 +0000
commit	35d9ecc508aef508b67ee7986a7abb0864e74f8e (patch)
tree	5b044a39374c6e3b7c91d14974b7abe1268d62b0 /pkg/bisect/bisect_test.go
parent	d624500f3877323fae8eb084872c5ef9a8ce3ef9 (diff)