diff options
| author | Aleksandr Nogikh <nogikh@google.com> | 2021-10-28 17:49:25 +0000 |
|---|---|---|
| committer | Aleksandr Nogikh <wp32pw@gmail.com> | 2021-11-12 16:33:43 +0100 |
| commit | bba04c950dfc1bc6b709232bb32d552223997678 (patch) | |
| tree | e646235d8fa12248ea951a491a59259753e266b2 /tools/syz-testbed/testbed.go | |
| parent | e543635de79b44506c51fa953a8ad8eb870397f6 (diff) | |
tools/syz-testbed: produce results indefinitely
Creation of instances only at the start of the tool limits the scale of
the experiments.
Instead of setting up instances at the beginning, create, start and
stop them dynamically. This makes it possible to conduct much larger experiments
while keeping the maximal resource consumption under control. When an
instance has been executing for the specified time, it is stopped and
replaced by another one.
Introduce two new parameters:
- max_instances - the maximal number of simultaneously running instances.
- run_hours - the lifetime (in hours) of a single instance.
The longer the tool runs, the more test results will be accumulated for
each checkout specified in the configuration file.
Diffstat (limited to 'tools/syz-testbed/testbed.go')
| -rw-r--r-- | tools/syz-testbed/testbed.go | 371 |
1 files changed, 225 insertions, 146 deletions
diff --git a/tools/syz-testbed/testbed.go b/tools/syz-testbed/testbed.go index bad77c294..819a61b34 100644 --- a/tools/syz-testbed/testbed.go +++ b/tools/syz-testbed/testbed.go @@ -13,15 +13,12 @@ import ( "flag" "fmt" "log" - "net" - "os" "path/filepath" "regexp" - "strconv" + "sync" "time" "github.com/google/syzkaller/pkg/config" - syz_instance "github.com/google/syzkaller/pkg/instance" "github.com/google/syzkaller/pkg/mgrconfig" "github.com/google/syzkaller/pkg/osutil" "github.com/google/syzkaller/pkg/tool" @@ -29,44 +26,45 @@ import ( ) var ( - flagConfig = flag.String("config", "", "config file") - flagCleanup = flag.Bool("cleanup", false, "remove existing work directories") + flagConfig = flag.String("config", "", "config file") ) type TestbedConfig struct { - Corpus string `json:"corpus"` // path to the corpus file - Workdir string `json:"workdir"` // instances will be checked out there - ManagerConfig json.RawMessage `json:"manager_config"` // base manager config - Checkouts []TestbedCheckout `json:"checkouts"` + Name string `json:"name"` // name of the testbed + MaxInstances int `json:"max_instances"` // max # of simultaneously running instances + RunTime DurationConfig `json:"run_time"` // lifetime of an instance (default "24h") + Corpus string `json:"corpus"` // path to the corpus file + Workdir string `json:"workdir"` // instances will be checked out there + ManagerConfig json.RawMessage `json:"manager_config"` // base manager config + Checkouts []CheckoutConfig `json:"checkouts"` } -type TestbedCheckout struct { +type DurationConfig struct { + time.Duration +} + +type CheckoutConfig struct { Name string `json:"name"` Repo string `json:"repo"` Branch string `json:"branch"` - Count int `json:"count"` -} - -type CheckoutInfo struct { - Path string - Name string - Instances []InstanceInfo } -// The essential information about an already prepared instance. 
-type InstanceInfo struct { - Name string - Workdir string - BenchFile string - LogFile string - HTTP string - ExecCommand string - ExecCommandArgs []string +type TestbedContext struct { + Config *TestbedConfig + ManagerConfig *mgrconfig.Config + Checkouts []*Checkout + NextRestart time.Time + NextCheckoutID int + NextInstanceID int + statMutex sync.Mutex } func main() { flag.Parse() - cfg := &TestbedConfig{} + cfg := &TestbedConfig{ + Name: "testbed", + RunTime: DurationConfig{24 * time.Hour}, + } err := config.LoadFile(*flagConfig, &cfg) if err != nil { tool.Failf("failed to read config: %s", err) @@ -82,150 +80,238 @@ func main() { tool.Failf("failed to parse manager config: %s", err) } if managerCfg.HTTP == "" { - managerCfg.HTTP = ":50000" + managerCfg.HTTP = "0.0.0.0:0" } - checkouts := []*CheckoutInfo{} - for _, co := range cfg.Checkouts { - checkouts = append(checkouts, newCheckout(co, cfg, managerCfg)) + ctx := TestbedContext{ + Config: cfg, + ManagerConfig: managerCfg, } - - log.Printf("------------------") - for _, co := range checkouts { - for _, instance := range co.Instances { - go runInstance(instance) + for _, checkoutCfg := range cfg.Checkouts { + co, err := ctx.NewCheckout(&checkoutCfg) + if err != nil { + tool.Failf("checkout failed: %s", err) } + ctx.Checkouts = append(ctx.Checkouts, co) } - go collectStats(cfg, checkouts) - // Block the execution indefinitely. - // Either the process will be killed or it will exit itself if one of the instances fails. 
- select {} -} -func collectStats(cfg *TestbedConfig, checkouts []*CheckoutInfo) { - const period = 90 * time.Second - benchFolder := filepath.Join(cfg.Workdir, "benches") - err := osutil.MkdirAll(benchFolder) - if err != nil { - tool.Failf("failed to create bench folder: %s", err) - } - tableStats := map[string]func(checkouts []*CheckoutInfo) ([][]string, error){ - "bugs.csv": generateBugTable, - "checkout_stats.csv": checkoutStatsTable, - "instance_stats.csv": instanceStatsTable, - } - for { - time.Sleep(period) - for fileName, genFunc := range tableStats { - table, err := genFunc(checkouts) - if err == nil { - saveTableAsCsv(table, filepath.Join(cfg.Workdir, fileName)) + shutdown := make(chan struct{}) + osutil.HandleInterrupts(shutdown) + + go func() { + const period = 90 * time.Second + for { + time.Sleep(period) + err := ctx.SaveStats() + if err != nil { + log.Printf("stats saving error: %s", err) } } - for _, checkout := range checkouts { - fileName := fmt.Sprintf("avg_%v.txt", checkout.Name) - saveAvgBenchFile(checkout, filepath.Join(benchFolder, fileName)) + }() + + ctx.Loop(shutdown) +} + +func (ctx *TestbedContext) GetStatViews() ([]StatView, error) { + groupsCompleted := []RunResultGroup{} + groupsAll := []RunResultGroup{} + for _, checkout := range ctx.Checkouts { + running := []*RunResult{} + for _, instance := range checkout.Running { + result, err := instance.FetchResult() + if err != nil { + return nil, err + } + running = append(running, result) } + groupsCompleted = append(groupsCompleted, RunResultGroup{ + Name: checkout.Name, + Results: checkout.Completed, + }) + groupsAll = append(groupsAll, RunResultGroup{ + Name: checkout.Name, + Results: append(checkout.Completed, running...), + }) } + return []StatView{ + { + Name: "all", + Groups: groupsAll, + }, + { + Name: "completed", + Groups: groupsCompleted, + }, + }, nil } -func runInstance(info InstanceInfo) { - logfile, err := os.Create(info.LogFile) +func (ctx *TestbedContext) saveStatView(view 
StatView) error { + dir := filepath.Join(ctx.Config.Workdir, "stats_"+view.Name) + benchDir := filepath.Join(dir, "benches") + err := osutil.MkdirAll(benchDir) if err != nil { - tool.Failf("[%s] failed to create logfile: %s", info.Name, err) + return fmt.Errorf("failed to create %s: %s", benchDir, err) } - cmd := osutil.GraciousCommand(info.ExecCommand, info.ExecCommandArgs...) - cmd.Stdout = logfile - cmd.Stderr = logfile - err = cmd.Start() - if err != nil { - tool.Failf("[%s] failed to start instance: %s", info.Name, err) + + tableStats := map[string]func(view StatView) ([][]string, error){ + "bugs.csv": (StatView).GenerateBugTable, + "checkout_stats.csv": (StatView).StatsTable, + "instance_stats.csv": (StatView).InstanceStatsTable, } - log.Printf("[%s] Instance started. Listening on %s", info.Name, info.HTTP) - logfile.Close() - err = cmd.Wait() - tool.Failf("[%s] Instance exited: %s", info.Name, err) + + for fileName, genFunc := range tableStats { + table, err := genFunc(view) + if err == nil { + SaveTableAsCsv(table, filepath.Join(dir, fileName)) + } else { + log.Printf("some error: %s", err) + } + } + for _, group := range view.Groups { + fileName := fmt.Sprintf("avg_%v.txt", group.Name) + group.SaveAvgBenchFile(filepath.Join(benchDir, fileName)) + } + return nil } -func newCheckout(co TestbedCheckout, cfg *TestbedConfig, managerCfg *mgrconfig.Config) *CheckoutInfo { - log.Printf("[%s] Checking out", co.Name) - path := filepath.Join(cfg.Workdir, "checkouts", co.Name) - if osutil.IsExist(path) { - if !*flagCleanup { - tool.Failf("path %s already exists", path) +func (ctx *TestbedContext) saveTestbedStats(file string) error { + table := [][]string{ + {"Checkout", "Running", "Completed", "Until reset"}, + } + for _, checkout := range ctx.Checkouts { + until := "-" + if ctx.NextRestart.After(time.Now()) { + until = time.Until(ctx.NextRestart).Round(time.Second).String() } - osutil.RemoveAll(path) + table = append(table, []string{ + checkout.Name, + 
fmt.Sprintf("%d", len(checkout.Running)), + fmt.Sprintf("%d", len(checkout.Completed)), + until, + }) } - repo := vcs.NewSyzkallerRepo(path) - commit, err := repo.Poll(co.Repo, co.Branch) + return SaveTableAsCsv(table, file) +} + +func (ctx *TestbedContext) SaveStats() error { + // Preventing concurrent saving of the stats. + ctx.statMutex.Lock() + defer ctx.statMutex.Unlock() + views, err := ctx.GetStatViews() if err != nil { - tool.Failf("failed to checkout %s (%s): %s", co.Repo, co.Branch, err) + return err } - log.Printf("[%s] Done. Latest commit: %s", co.Name, commit) - log.Printf("[%s] Building", co.Name) - if _, err := osutil.RunCmd(time.Hour, path, syz_instance.MakeBin); err != nil { - tool.Failf("[%s] Make failed: %s", co.Name, err) + for _, view := range views { + err := ctx.saveStatView(view) + if err != nil { + return err + } } - checkoutInfo := CheckoutInfo{ - Name: co.Name, - Path: path, + return ctx.saveTestbedStats(filepath.Join(ctx.Config.Workdir, "testbed.csv")) +} + +func (ctx *TestbedContext) generateInstances(count int) ([]*Instance, error) { + // It seems that even gracefully finished syz-managers can leak GCE instances. + // To allow for that strange behavior, let's reuse syz-manager names, so that + // they will in turn reuse the names of the leaked GCE instances. 
+ instances := []*Instance{} + for idx := 1; idx <= count; idx++ { + checkout := ctx.Checkouts[ctx.NextCheckoutID] + ctx.NextCheckoutID = (ctx.NextCheckoutID + 1) % len(ctx.Checkouts) + instance, err := ctx.NewInstance(checkout, fmt.Sprintf("%s-%d", ctx.Config.Name, idx)) + if err != nil { + return nil, err + } + checkout.Running = append(checkout.Running, instance) + instances = append(instances, instance) } - for i := 1; i <= co.Count; i++ { - name := fmt.Sprintf("%v-%d", co.Name, i) - log.Printf("[%s] Generating workdir", name) - workdir := filepath.Join(path, fmt.Sprintf("workdir_%d", i)) - err = osutil.MkdirAll(workdir) + return instances, nil +} + +// Create instances, run them, stop them, archive them, and so on... +func (ctx *TestbedContext) Loop(stop chan struct{}) { + duration := ctx.Config.RunTime.Duration + mustStop := false + for !mustStop { + log.Printf("setting up instances") + instances, err := ctx.generateInstances(ctx.Config.MaxInstances) if err != nil { - tool.Failf("failed to create dir %s", workdir) + tool.Failf("failed to set up intances: %s", err) } - if cfg.Corpus != "" { - corpusPath := filepath.Join(workdir, "corpus.db") - err = osutil.CopyFile(cfg.Corpus, corpusPath) - if err != nil { - tool.Failf("failed to copy corpus from %s: %s", cfg.Corpus, err) - } + log.Printf("starting instances") + instanceStatuses := make(chan error, len(instances)) + var wg sync.WaitGroup + for _, inst := range instances { + wg.Add(1) + go func(instance *Instance) { + instanceStatuses <- instance.Run() + wg.Done() + }(inst) } - log.Printf("[%s] Generating syz-manager config", name) - managerCfg.Name = name - managerCfg.Workdir = workdir - managerCfg.Syzkaller = path - managerCfgPath := filepath.Join(path, fmt.Sprintf("syz_%d.cnf", i)) - err = config.SaveFile(managerCfgPath, managerCfg) - if err != nil { - tool.Failf("failed to save manager config to %s: %s", managerCfgPath, err) + + ctx.NextRestart = time.Now().Add(duration) + select { + case err := 
<-instanceStatuses: + // Syz-managers are not supposed to stop under normal circumstances. + // If one of them did stop, there must have been a very good reason to. + // For now, we just shut down the whole experiment in such a case. + log.Printf("an instance has failed (%s), stopping everything", err) + mustStop = true + case <-stop: + log.Printf("stopping the experiment") + mustStop = true + case <-time.After(duration): + log.Printf("run period has finished") } - bench := filepath.Join(path, fmt.Sprintf("bench_%d.txt", i)) - log := filepath.Join(path, fmt.Sprintf("log_%d.txt", i)) - checkoutInfo.Instances = append(checkoutInfo.Instances, InstanceInfo{ - Name: managerCfg.Name, - Workdir: workdir, - BenchFile: bench, - LogFile: log, - HTTP: managerCfg.HTTP, - ExecCommand: filepath.Join(path, "bin", "syz-manager"), - ExecCommandArgs: []string{"-config", managerCfgPath, "-bench", bench}, - }) - managerCfg.HTTP, err = increasePort(managerCfg.HTTP) + + // Wait for all instances to finish. + for _, instance := range instances { + instance.Stop() + } + wg.Wait() + + // Only mark instances completed if they've indeed been running the whole iteration. 
+ if !mustStop { + for _, checkout := range ctx.Checkouts { + err = checkout.ArchiveRunning() + if err != nil { + tool.Failf("ArchiveRunning error: %s", err) + } + } + } + + log.Printf("collecting statistics") + err = ctx.SaveStats() if err != nil { - tool.Failf("failed to inrease port number: %s", err) + log.Printf("stats saving error: %s", err) } } - return &checkoutInfo } -func increasePort(http string) (string, error) { - host, portStr, err := net.SplitHostPort(http) - if err != nil { - return "", fmt.Errorf("invalid http value: %s", http) +func (d *DurationConfig) UnmarshalJSON(data []byte) error { + var v interface{} + if err := json.Unmarshal(data, &v); err != nil { + return err } - port, err := strconv.Atoi(portStr) - if err != nil { - return "", err + str, ok := v.(string) + if !ok { + return fmt.Errorf("%s was expected to be a string", data) + } + parsed, err := time.ParseDuration(str) + if err == nil { + d.Duration = parsed } - return net.JoinHostPort(host, fmt.Sprintf("%d", port+1)), nil + return err +} +func (d *DurationConfig) MarshalJSON() ([]byte, error) { + return json.Marshal(d.String()) } func checkConfig(cfg *TestbedConfig) error { + testbedNameRe := regexp.MustCompile(`^[0-9a-z\-]{1,20}$`) + if !testbedNameRe.MatchString(cfg.Name) { + return fmt.Errorf("invalid testbed name: %v", cfg.Name) + } if cfg.Workdir == "" { return fmt.Errorf("workdir is empty") } @@ -237,8 +323,10 @@ func checkConfig(cfg *TestbedConfig) error { if cfg.Corpus != "" && !osutil.IsExist(cfg.Corpus) { return fmt.Errorf("corpus %v does not exist", cfg.Corpus) } + if cfg.MaxInstances < 1 { + return fmt.Errorf("max_instances cannot be less than 1") + } cfg.Corpus = osutil.Abs(cfg.Corpus) - instanceNameRe := regexp.MustCompile(`^[0-9a-z\-]{1,20}$`) names := make(map[string]bool) for idx := range cfg.Checkouts { co := &cfg.Checkouts[idx] @@ -250,17 +338,8 @@ func checkConfig(cfg *TestbedConfig) error { } else if !vcs.CheckBranch(co.Branch) { return fmt.Errorf("invalid branch: 
%s", co.Branch) } - if co.Count < 0 { - return fmt.Errorf("count cannot be negative") - } else if co.Count == 0 { - // The default value. - co.Count = 1 - } - if !instanceNameRe.MatchString(co.Name) { - return fmt.Errorf("invalid instance name: %v", co.Name) - } if names[co.Name] { - return fmt.Errorf("duplicate instance name: %v", co.Name) + return fmt.Errorf("duplicate checkout name: %v", co.Name) } names[co.Name] = true } |
