From f9e341e30b4f3faa468a0b885775a4fbf7825016 Mon Sep 17 00:00:00 2001 From: Mara Mihali Date: Thu, 5 Aug 2021 16:56:46 +0000 Subject: pkg/rpctype, syz-runner, syz-verifier: add reruns to syz-verifier architecture When a mismatch is found in the results returned for a program, the program will be rerun on all the kernels to ensure the mismatch is not flaky (i.e. it didn't occur because of some background activity or external state and will always be returned when running the program). If the same mismatch occurs in all reruns, syz-verifier creates a report for the program, otherwise it discards the program as being flaky --- syz-verifier/main.go | 142 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 106 insertions(+), 36 deletions(-) (limited to 'syz-verifier/main.go') diff --git a/syz-verifier/main.go b/syz-verifier/main.go index 62de2cdf1..eea9d92ed 100755 --- a/syz-verifier/main.go +++ b/syz-verifier/main.go @@ -60,18 +60,20 @@ type Verifier struct { stats *Stats statsWrite io.Writer newEnv bool + reruns int } // RPCServer is a wrapper around the rpc.Server. It communicates with Runners, // generates programs and sends complete Results for verification. type RPCServer struct { - vrf *Verifier - port int - mu sync.Mutex - cond *sync.Cond - pools map[int]*poolInfo - progs map[int]*progInfo - notChecked int + vrf *Verifier + port int + mu sync.Mutex + cond *sync.Cond + pools map[int]*poolInfo + progs map[int]*progInfo + notChecked int + rerunsAvailable *sync.Cond } // poolInfo contains kernel-specific information for spawning virtual machines @@ -82,12 +84,14 @@ type poolInfo struct { cfg *mgrconfig.Config pool *vm.Pool Reporter report.Reporter - // vmRunners keeps track of what programs have been sent to each Runner. + // runners keeps track of what programs have been sent to each Runner. // There is one Runner executing per VM instance. 
- vmRunners map[int]runnerProgs + runners map[int]runnerProgs // progs stores the programs that haven't been sent to this kernel yet but // have been sent to at least one other kernel. progs []*progInfo + // toRerun stores the programs that still need to be rerun by this kernel. + toRerun []*progInfo // checked is set to true when the set of system calls not supported on the // kernel is known. checked bool @@ -97,9 +101,12 @@ type progInfo struct { prog *prog.Prog idx int serialized []byte - res []*Result + res [][]*Result // received stores the number of results received for this program. received int + + runIdx int + report *ResultReport } type runnerProgs map[int]*progInfo @@ -111,6 +118,7 @@ func main() { flagStats := flag.String("stats", "", "where stats will be written when"+ "execution of syz-verifier finishes, defaults to stdout") flagEnv := flag.Bool("new-env", true, "create a new environment for each program") + flagReruns := flag.Int("rerun", 3, "number of time program is rerun when a mismatch is found") flag.Parse() pools := make(map[int]*poolInfo) @@ -190,7 +198,7 @@ func main() { if err != nil { log.Fatalf("failed to create reporter for instance-%d: %v", idx, err) } - pi.vmRunners = make(map[int]runnerProgs) + pi.runners = make(map[int]runnerProgs) } calls := make(map[*prog.Syscall]bool) @@ -215,6 +223,7 @@ func main() { reportReasons: len(cfg.EnabledSyscalls) != 0 || len(cfg.DisabledSyscalls) != 0, statsWrite: sw, newEnv: *flagEnv, + reruns: *flagReruns, } vrf.srv, err = startRPCServer(vrf) @@ -270,6 +279,7 @@ func startRPCServer(vrf *Verifier) (*RPCServer, error) { notChecked: len(vrf.pools), } srv.cond = sync.NewCond(&srv.mu) + srv.rerunsAvailable = sync.NewCond(&srv.mu) s, err := rpctype.NewRPCServer(vrf.addr, "Verifier", srv) if err != nil { @@ -288,7 +298,7 @@ func (srv *RPCServer) Connect(a *rpctype.RunnerConnectArgs, r *rpctype.RunnerCon srv.mu.Lock() defer srv.mu.Unlock() pool, vm := a.Pool, a.VM - srv.pools[pool].vmRunners[vm] = 
make(runnerProgs) + srv.pools[pool].runners[vm] = make(runnerProgs) r.CheckUnsupportedCalls = !srv.pools[pool].checked return nil } @@ -362,16 +372,20 @@ func (vrf *Verifier) finalizeCallSet(w io.Writer) { func (srv *RPCServer) NextExchange(a *rpctype.NextExchangeArgs, r *rpctype.NextExchangeRes) error { srv.mu.Lock() defer srv.mu.Unlock() + + var res *Result + var prog *progInfo if a.Info.Calls != nil { - res := &Result{ + res = &Result{ Pool: a.Pool, Hanged: a.Hanged, Info: a.Info, + RunIdx: a.RunIdx, } - prog := srv.progs[a.ProgIdx] + prog = srv.progs[a.ProgIdx] if prog == nil { - // This case can happen if both of the below conditions are true: + // This case can happen if both conditions are true: // 1. a Runner calls Verifier.NextExchange, then crashes, // its corresponding Pool being the only one that hasn't // sent results for the program yet @@ -385,10 +399,11 @@ func (srv *RPCServer) NextExchange(a *rpctype.NextExchangeArgs, r *rpctype.NextE return nil } + delete(srv.pools[a.Pool].runners[a.VM], prog.idx) if srv.newResult(res, prog) { - srv.vrf.processResults(prog.res, prog.prog) - delete(srv.progs, a.ProgIdx) - delete(srv.pools[a.Pool].vmRunners[a.VM], a.ProgIdx) + if srv.vrf.processResults(prog) { + delete(srv.progs, prog.idx) + } } } @@ -397,8 +412,8 @@ func (srv *RPCServer) NextExchange(a *rpctype.NextExchangeArgs, r *rpctype.NextE srv.cond.Wait() } - prog, pi := srv.newProgram(a.Pool, a.VM) - r.RPCProg = rpctype.RPCProg{Prog: prog, ProgIdx: pi} + newProg, pi, ri := srv.newProgram(a.Pool, a.VM) + r.RPCProg = rpctype.RPCProg{Prog: newProg, ProgIdx: pi, RunIdx: ri} return nil } @@ -406,20 +421,56 @@ func (srv *RPCServer) NextExchange(a *rpctype.NextExchangeArgs, r *rpctype.NextE // Results from the corresponding programs have been received and they can be // sent for verification. Otherwise, it returns false. 
func (srv *RPCServer) newResult(res *Result, prog *progInfo) bool { - prog.res[res.Pool] = res + ri := prog.runIdx + if prog.res[ri][res.Pool] != nil { + return false + } + prog.res[ri][res.Pool] = res prog.received++ return prog.received == len(srv.pools) } // processResults will send a set of complete results for verification and, in -// case differences are found, it will store a result report highlighting those -// in th workdir/results directory. If writing the results fails, it returns an -// error. -func (vrf *Verifier) processResults(res []*Result, prog *prog.Prog) { - vrf.stats.Progs++ - rr := Verify(res, prog, vrf.stats) - if rr == nil { - return +// case differences are found, it will start the rerun process for the program +// (if reruns are enabled). If every rerun produces the same results, the result +// report will be written to persistent storage. Otherwise, the program is +// discarded as flaky. +func (vrf *Verifier) processResults(prog *progInfo) bool { + // TODO: Simplify this if clause. 
+ if prog.runIdx == 0 { + vrf.stats.TotalProgs++ + prog.report = Verify(prog.res[0], prog.prog, vrf.stats) + if prog.report == nil { + return true + } + } else { + if !VerifyRerun(prog.res[prog.runIdx], prog.report) { + vrf.stats.FlakyProgs++ + log.Printf("flaky results detected: %d", vrf.stats.FlakyProgs) + return true + } + } + + if prog.runIdx < vrf.reruns-1 { + vrf.srv.newRun(prog) + return false + } + + rr := prog.report + vrf.stats.MismatchingProgs++ + + for _, cr := range rr.Reports { + if !cr.Mismatch { + break + } + vrf.stats.Calls[cr.Call].Mismatches++ + vrf.stats.TotalMismatches++ + for _, state := range cr.States { + if state0 := cr.States[0]; state0 != state { + vrf.stats.Calls[cr.Call].States[state] = true + vrf.stats.Calls[cr.Call].States[state0] = true + } + } } oldest := 0 @@ -448,6 +499,16 @@ func (vrf *Verifier) processResults(res []*Result, prog *prog.Prog) { } log.Printf("result-%d written successfully", oldest) + return true +} + +func (srv *RPCServer) newRun(p *progInfo) { + p.runIdx++ + p.received = 0 + p.res[p.runIdx] = make([]*Result, len(srv.pools)) + for _, pool := range srv.pools { + pool.toRerun = append(pool.toRerun, p) + } } func createReport(rr *ResultReport, pools int) []byte { @@ -476,25 +537,34 @@ func createReport(rr *ResultReport, pools int) []byte { // newProgram returns a new program for the Runner identified by poolIdx and // vmIdx and the program's index. 
-func (srv *RPCServer) newProgram(poolIdx, vmIdx int) ([]byte, int) { +func (srv *RPCServer) newProgram(poolIdx, vmIdx int) ([]byte, int, int) { pool := srv.pools[poolIdx] + + if len(pool.toRerun) != 0 { + p := pool.toRerun[0] + pool.runners[vmIdx][p.idx] = p + pool.toRerun = pool.toRerun[1:] + return p.serialized, p.idx, p.runIdx + } + if len(pool.progs) == 0 { prog, progIdx := srv.vrf.generate() pi := &progInfo{ prog: prog, idx: progIdx, serialized: prog.Serialize(), - res: make([]*Result, len(srv.pools)), + res: make([][]*Result, srv.vrf.reruns), } + pi.res[0] = make([]*Result, len(srv.pools)) for _, pool := range srv.pools { pool.progs = append(pool.progs, pi) } srv.progs[progIdx] = pi } p := pool.progs[0] - pool.vmRunners[vmIdx][p.idx] = p + pool.runners[vmIdx][p.idx] = p pool.progs = pool.progs[1:] - return p.serialized, p.idx + return p.serialized, p.idx, p.runIdx } // generate will return a newly generated program and its index. @@ -507,13 +577,13 @@ func (vrf *Verifier) generate() (*prog.Prog, int) { func (srv *RPCServer) cleanup(poolIdx, vmIdx int) { srv.mu.Lock() defer srv.mu.Unlock() - progs := srv.pools[poolIdx].vmRunners[vmIdx] + progs := srv.pools[poolIdx].runners[vmIdx] for _, prog := range progs { if srv.newResult(&Result{Pool: poolIdx, Crashed: true}, prog) { - srv.vrf.processResults(prog.res, prog.prog) + srv.vrf.processResults(prog) delete(srv.progs, prog.idx) - delete(srv.pools[poolIdx].vmRunners[vmIdx], prog.idx) + delete(srv.pools[poolIdx].runners[vmIdx], prog.idx) continue } } -- cgit mrf-deployment