// Copyright 2024 syzkaller project authors. All rights reserved. // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. package manager import ( "bufio" "bytes" "errors" "fmt" "math/rand" "os" "path/filepath" "runtime" "strings" "sync" "time" "github.com/google/syzkaller/pkg/db" "github.com/google/syzkaller/pkg/fuzzer" "github.com/google/syzkaller/pkg/hash" "github.com/google/syzkaller/pkg/log" "github.com/google/syzkaller/pkg/mgrconfig" "github.com/google/syzkaller/pkg/osutil" "github.com/google/syzkaller/prog" ) type Seeds struct { CorpusDB *db.DB Fresh bool Candidates []fuzzer.Candidate } func LoadSeeds(cfg *mgrconfig.Config, immutable bool) (Seeds, error) { var info Seeds var err error info.CorpusDB, err = db.Open(filepath.Join(cfg.Workdir, "corpus.db"), !immutable) if err != nil { if info.CorpusDB == nil { return Seeds{}, fmt.Errorf("failed to open corpus database: %w", err) } log.Errorf("read %v inputs from corpus and got error: %v", len(info.CorpusDB.Records), err) } info.Fresh = len(info.CorpusDB.Records) == 0 corpusFlags := versionToFlags(info.CorpusDB.Version) outputs := make(chan *input, 32) chErr := make(chan error, 1) go func() { chErr <- readInputs(cfg, info.CorpusDB, outputs) close(outputs) }() brokenSeeds := 0 skippedSeeds := 0 var brokenCorpus []string var candidates []fuzzer.Candidate for inp := range outputs { if inp.Prog == nil { if inp.IsSeed { if errors.Is(inp.Err, ErrSkippedTest) { skippedSeeds++ log.Logf(2, "seed %s is skipped: %s", inp.Path, inp.Err) } else { brokenSeeds++ log.Logf(0, "seed %s is broken: %s", inp.Path, inp.Err) } } else { brokenCorpus = append(brokenCorpus, inp.Key) } continue } flags := corpusFlags if inp.IsSeed { if _, ok := info.CorpusDB.Records[hash.String(inp.Prog.Serialize())]; ok { continue } // Seeds are not considered "from corpus" (won't be rerun multiple times) // b/c they are tried on every start anyway. flags = fuzzer.ProgMinimized } candidates = append(candidates, fuzzer.Candidate{ Prog: inp.Prog, Flags: flags, }) } if err := <-chErr; err != nil { return Seeds{}, err } if len(brokenCorpus)+brokenSeeds != 0 { log.Logf(0, "broken programs in the corpus: %v, broken seeds: %v", len(brokenCorpus), brokenSeeds) } if skippedSeeds != 0 { log.Logf(0, "skipped %v seeds", skippedSeeds) } if !immutable { // This needs to be done outside of the loop above to not race with corpusDB reads. for _, sig := range brokenCorpus { info.CorpusDB.Delete(sig) } if err := info.CorpusDB.Flush(); err != nil { return Seeds{}, fmt.Errorf("failed to save corpus database: %w", err) } } // Switch database to the mode when it does not keep records in memory. // We don't need them anymore and they consume lots of memory. info.CorpusDB.DiscardData() info.Candidates = candidates return info, nil } type input struct { IsSeed bool Key string Path string Data []byte Prog *prog.Prog Err error } func readInputs(cfg *mgrconfig.Config, db *db.DB, output chan *input) error { procs := runtime.GOMAXPROCS(0) inputs := make(chan *input, procs) var wg sync.WaitGroup wg.Add(procs) defer wg.Wait() defer close(inputs) for p := 0; p < procs; p++ { go func() { defer wg.Done() for inp := range inputs { inp.Prog, inp.Err = ParseSeed(cfg.Target, inp.Data) output <- inp } }() } for key, rec := range db.Records { inputs <- &input{ Key: key, Data: rec.Val, } } seedPath := filepath.Join("sys", cfg.TargetOS, "test") seedDir := filepath.Join(cfg.Syzkaller, seedPath) if osutil.IsExist(seedDir) { seeds, err := os.ReadDir(seedDir) if err != nil { return fmt.Errorf("failed to read seeds dir: %w", err) } for _, seed := range seeds { data, err := os.ReadFile(filepath.Join(seedDir, seed.Name())) if err != nil { return fmt.Errorf("failed to read seed %v: %w", seed.Name(), err) } inputs <- &input{ IsSeed: true, Path: filepath.Join(seedPath, seed.Name()), Data: data, } } } return nil } const CurrentDBVersion = 5 func versionToFlags(version uint64) fuzzer.ProgFlags { // By default we don't re-minimize/re-smash programs from corpus, // it takes lots of time on start and is unnecessary. // However, on version bumps we can selectively re-minimize/re-smash. corpusFlags := fuzzer.ProgFromCorpus | fuzzer.ProgMinimized | fuzzer.ProgSmashed switch version { case 0: // Version 0 had broken minimization, so we need to re-minimize. corpusFlags &= ^fuzzer.ProgMinimized fallthrough case 1: // Version 1->2: memory is preallocated so lots of mmaps become unnecessary. corpusFlags &= ^fuzzer.ProgMinimized fallthrough case 2: // Version 2->3: big-endian hints. corpusFlags &= ^fuzzer.ProgSmashed fallthrough case 3: // Version 3->4: to shake things up. corpusFlags &= ^fuzzer.ProgMinimized fallthrough case 4: // Version 4->5: fix for comparison argument sign extension. // Introduced in 1ba0279d74a35e96e81de87073212d2b20256e8f. // Update (July 2024): // We used to reset the fuzzer.ProgSmashed flag here, but it has led to // perpetual corpus retriage on slow syzkaller instances. By now, all faster // instances must have already bumped their corpus versions, so let's just // increase the version to let all others go past the corpus triage stage. fallthrough case CurrentDBVersion: } return corpusFlags } func ParseSeed(target *prog.Target, data []byte) (*prog.Prog, error) { p, _, err := parseProg(target, data, prog.NonStrict, nil) return p, err } func ParseSeedWithRequirements(target *prog.Target, data []byte, reqs map[string]bool) ( *prog.Prog, map[string]bool, error) { return parseProg(target, data, prog.Strict, reqs) } func parseRequires(data []byte) map[string]bool { requires := make(map[string]bool) for s := bufio.NewScanner(bytes.NewReader(data)); s.Scan(); { const prefix = "# requires:" line := s.Text() if !strings.HasPrefix(line, prefix) { continue } for _, req := range strings.Fields(line[len(prefix):]) { positive := true if req[0] == '-' { positive = false req = req[1:] } requires[req] = positive } } return requires } func checkArch(requires map[string]bool, arch string) bool { for req, positive := range requires { const prefix = "arch=" if strings.HasPrefix(req, prefix) && arch != req[len(prefix):] == positive { return false } } return true } func MatchRequirements(props, requires map[string]bool) bool { for req, positive := range requires { if positive { if !props[req] { return false } continue } matched := true for _, req1 := range strings.Split(req, ",") { if !props[req1] { matched = false } } if matched { return false } } return true } var ErrSkippedTest = errors.New("skipped test based on constraints") func parseProg(target *prog.Target, data []byte, mode prog.DeserializeMode, reqs map[string]bool) ( *prog.Prog, map[string]bool, error) { properties := parseRequires(data) // Need to check requirements early, as some programs may fail to deserialize // on some arches due to missing syscalls. We also do not want to parse tests // that are marked as 'manual'. if !checkArch(properties, target.Arch) || !MatchRequirements(properties, reqs) { var pairs []string for k, v := range properties { pairs = append(pairs, fmt.Sprintf("%s=%t", k, v)) } return nil, properties, fmt.Errorf("%w: %s", ErrSkippedTest, strings.Join(pairs, ", ")) } p, err := target.Deserialize(data, mode) if err != nil { return nil, nil, err } if len(p.Calls) > prog.MaxCalls { return nil, nil, fmt.Errorf("longer than %d calls (%d)", prog.MaxCalls, len(p.Calls)) } // For some yet unknown reasons, programs with fail_nth > 0 may sneak in. Ignore them. for _, call := range p.Calls { if call.Props.FailNth > 0 { return nil, nil, fmt.Errorf("input has fail_nth > 0") } } return p, properties, nil } type FilteredCandidates struct { Candidates []fuzzer.Candidate ModifiedHashes []string SeedCount int } func FilterCandidates(candidates []fuzzer.Candidate, syscalls map[*prog.Syscall]bool, dropMinimize bool) FilteredCandidates { var ret FilteredCandidates for _, item := range candidates { if !item.Prog.OnlyContains(syscalls) { ret.ModifiedHashes = append(ret.ModifiedHashes, hash.String(item.Prog.Serialize())) // We cut out the disabled syscalls and retriage/minimize what remains from the prog. // The original prog will be deleted from the corpus. if dropMinimize { item.Flags &= ^fuzzer.ProgMinimized } item.Prog.FilterInplace(syscalls) if len(item.Prog.Calls) == 0 { continue } } if item.Flags&fuzzer.ProgFromCorpus == 0 { ret.SeedCount++ } ret.Candidates = append(ret.Candidates, item) } return ret } // Programs that do more than 15 system calls are to be treated with suspicion and re-minimized. const ReminimizeThreshold = 15 // ReminimizeSubset clears the fuzzer.ProgMinimized flag of a small subset of seeds. // The ultimate objective is to gradually clean up the poorly minimized corpus programs. // ReminimizeSubset assumes that candidates are sorted in the order of ascending len(Prog.Calls). func (fc *FilteredCandidates) ReminimizeSubset() int { if len(fc.Candidates) == 0 { return 0 } // Focus on the top 10% of the largest programs in the corpus. threshold := max(ReminimizeThreshold, len(fc.Candidates[len(fc.Candidates)*9/10].Prog.Calls)) var resetIndices []int for i, info := range fc.Candidates { if info.Flags&fuzzer.ProgMinimized == 0 { continue } if len(info.Prog.Calls) >= threshold { resetIndices = append(resetIndices, i) } } // Reset ProgMinimized for up to 1% of the seed programs. reset := min(50, len(resetIndices), max(1, len(fc.Candidates)/100)) rnd := rand.New(rand.NewSource(time.Now().UnixNano())) for _, i := range rnd.Perm(len(resetIndices))[:reset] { fc.Candidates[resetIndices[i]].Flags &= ^fuzzer.ProgMinimized } return reset } // resmashSubset clears fuzzer.ProgSmashes for a subset of seeds. // We smash the program only once after we add it to the corpus, but it can be that // either it did not finish before the instance was restarted, or the fuzzing algorithms // have become smarter over time, or just that kernel code changed over time. // It would be best to track it in pkg/db, but until it's capable of that, let's just // re-smash some corpus subset on each syz-manager restart. func (fc *FilteredCandidates) ResmashSubset() int { var indices []int for i, info := range fc.Candidates { if info.Flags&fuzzer.ProgSmashed == 0 { continue } indices = append(indices, i) } // Reset ProgSmashed for up to 0.5% of the seed programs. reset := min(25, len(indices), max(1, len(fc.Candidates)/200)) rnd := rand.New(rand.NewSource(time.Now().UnixNano())) for _, i := range rnd.Perm(len(indices))[:reset] { fc.Candidates[indices[i]].Flags &= ^fuzzer.ProgSmashed } return reset }