From e8709b21d7c474a0fb6b8ff13039702865fd83bb Mon Sep 17 00:00:00 2001
From: Aleksandr Nogikh <nogikh@google.com>
Date: Mon, 15 Jul 2024 17:31:34 +0200
Subject: pkg/bisect: set a lower bound for BisectBad verdict

The "1 crashed, 9 OK" cases are a frequent cause of invalid bisection
results on syzbot.

Let's define a cutoff for a BisectBad verdict and use it to prevent such
obvious outliers. We cannot safely declare such results as BisectGood
either, so let's return BisectSkip in this case.
---
 pkg/bisect/bisect.go      |  3 ++-
 pkg/bisect/bisect_test.go | 28 +++++++++++++++++++---------
 2 files changed, 21 insertions(+), 10 deletions(-)

(limited to 'pkg/bisect')

diff --git a/pkg/bisect/bisect.go b/pkg/bisect/bisect.go
index b75b2d2e0..2654b5dec 100644
--- a/pkg/bisect/bisect.go
+++ b/pkg/bisect/bisect.go
@@ -830,6 +830,7 @@ func (env *env) bisectionDecision(total, bad, good, infra int) (vcs.BisectResult
 	// Boot errors, image test errors, skipped crashes.
 	skip := total - bad - good - infra
 
+	wantBadRuns := max(2, (total-infra)/6) // For 10 runs, require 2 crashes. For 20, require 3.
 	wantGoodRuns := total / 2
 	wantTotalRuns := total / 2
 	if env.flaky {
@@ -840,7 +841,7 @@ func (env *env) bisectionDecision(total, bad, good, infra int) (vcs.BisectResult
 		// We need a big enough number of good results, otherwise the chance of a false
 		// positive is too high.
 		return vcs.BisectGood, nil
-	} else if bad > 0 && (good+bad) >= wantTotalRuns {
+	} else if bad >= wantBadRuns && (good+bad) >= wantTotalRuns {
 		// We need enough (good+bad) results to conclude that the kernel revision itself
 		// is not too broken.
 		return vcs.BisectBad, nil
diff --git a/pkg/bisect/bisect_test.go b/pkg/bisect/bisect_test.go
index d44532fe0..eead42335 100644
--- a/pkg/bisect/bisect_test.go
+++ b/pkg/bisect/bisect_test.go
@@ -105,7 +105,8 @@ func (env *testEnv) Test(numVMs int, reproSyz, reproOpts, reproC []byte) ([]inst
 	if (env.config == "baseline-repro" || env.config == "new-minimized-config" || env.config == "original config") &&
 		introduced && !fixed {
 		if env.test.flaky {
-			ret = crashErrors(1, numVMs-1, "crash occurs", env.test.reportType)
+			crashed := max(2, numVMs/6)
+			ret = crashErrors(crashed, numVMs-crashed, "crash occurs", env.test.reportType)
 		} else {
 			ret = crashErrors(numVMs, 0, "crash occurs", env.test.reportType)
 		}
@@ -352,10 +353,10 @@ var bisectionTests = []BisectionTest{
 		flaky:       true,
 		introduced:  "605",
 		extraTest: func(t *testing.T, res *Result) {
-			// False negative probability of each run is ~35%.
-			// We get three "good" results, so our accumulated confidence is ~27%.
-			assert.Less(t, res.Confidence, 0.3)
-			assert.Greater(t, res.Confidence, 0.2)
+			// False negative probability of each run is ~4%.
+			// We get three "good" results, so our accumulated confidence is ~85%.
+			assert.Less(t, res.Confidence, 0.9)
+			assert.Greater(t, res.Confidence, 0.8)
 		},
 	},
 	// Test bisection returns correct cause with different baseline/config combinations.
@@ -797,8 +798,8 @@ func TestBisectVerdict(t *testing.T) {
 		{
 			name:    "many-total-and-infra",
 			total:   10,
-			good:    5,
-			bad:     1,
+			good:    4,
+			bad:     2,
 			infra:   2,
 			skip:    2,
 			verdict: vcs.BisectBad,
@@ -846,12 +847,21 @@ func TestBisectVerdict(t *testing.T) {
 			name:    "flaky-many-skips",
 			flaky:   true,
 			total:   20,
-			good:    9,
-			bad:     1,
+			good:    7,
+			bad:     3,
 			infra:   0,
 			skip:    10,
 			verdict: vcs.BisectBad,
 		},
+		{
+			name:    "outlier-bad",
+			total:   10,
+			good:    9,
+			bad:     1,
+			infra:   0,
+			skip:    0,
+			verdict: vcs.BisectSkip,
+		},
 	}
 
 	for _, test := range tests {
-- 
cgit mrf-deployment