diff options
| author | Dmitry Vyukov <dvyukov@google.com> | 2017-11-30 17:14:18 +0100 |
|---|---|---|
| committer | Dmitry Vyukov <dvyukov@google.com> | 2017-12-01 13:58:11 +0100 |
| commit | 2fa91450df792689c42bd52f98ffdacee99ace91 (patch) | |
| tree | 6bac47c9c556725b596af31c0212d57fc6157575 | |
| parent | 5683420f11c9eb812a57f2c5786b38015a652fa0 (diff) | |
dashboard/app: add manager monitoring

Make it possible to monitor the health and operation
of all managers from the dashboard:

1. Notify the dashboard about internal syz-ci errors
   (currently we don't know when/if they happen).
2. Send statistics from managers to the dashboard.
| -rw-r--r-- | dashboard/app/api.go | 56 | ||||
| -rw-r--r-- | dashboard/app/entities.go | 87 | ||||
| -rw-r--r-- | dashboard/app/handler.go | 22 | ||||
| -rw-r--r-- | dashboard/app/main.go | 174 | ||||
| -rw-r--r-- | dashboard/app/main.html | 49 | ||||
| -rw-r--r-- | dashboard/app/reporting.go | 3 | ||||
| -rw-r--r-- | dashboard/dashapi/dashapi.go | 18 | ||||
| -rw-r--r-- | syz-ci/jobs.go | 41 | ||||
| -rw-r--r-- | syz-ci/manager.go | 33 | ||||
| -rw-r--r-- | syz-ci/managercmd.go | 19 | ||||
| -rw-r--r-- | syz-ci/testing.go | 9 | ||||
| -rw-r--r-- | syz-manager/manager.go | 39 | ||||
| -rw-r--r-- | vm/vm.go | 9 | ||||
| -rw-r--r-- | vm/vmimpl/vmimpl.go | 4 |
14 files changed, 477 insertions, 86 deletions
diff --git a/dashboard/app/api.go b/dashboard/app/api.go index db265f8b8..ef1f344fd 100644 --- a/dashboard/app/api.go +++ b/dashboard/app/api.go @@ -43,6 +43,7 @@ var apiNamespaceHandlers = map[string]APINamespaceHandler{ "report_crash": apiReportCrash, "report_failed_repro": apiReportFailedRepro, "need_repro": apiNeedRepro, + "manager_stats": apiManagerStats, } type JSONHandler func(c context.Context, r *http.Request) (interface{}, error) @@ -204,6 +205,12 @@ func apiUploadBuild(c context.Context, ns string, r *http.Request) (interface{}, return nil, err } } + if err := updateManager(c, ns, req.Manager, func(mgr *Manager, stats *ManagerStats) { + mgr.CurrentBuild = req.ID + mgr.FailedBuildBug = "" + }); err != nil { + return nil, err + } return nil, nil } @@ -374,7 +381,14 @@ func apiReportBuildError(c context.Context, ns string, r *http.Request) (interfa if err := uploadBuild(c, ns, &req.Build, BuildFailed); err != nil { return nil, err } - if _, err := reportCrash(c, ns, &req.Crash); err != nil { + req.Crash.BuildID = req.Build.ID + bug, err := reportCrash(c, ns, &req.Crash) + if err != nil { + return nil, err + } + if err := updateManager(c, ns, req.Build.Manager, func(mgr *Manager, stats *ManagerStats) { + mgr.FailedBuildBug = bugKeyHash(bug.Namespace, bug.Title, bug.Seq) + }); err != nil { return nil, err } return nil, nil @@ -387,10 +401,17 @@ func apiReportCrash(c context.Context, ns string, r *http.Request) (interface{}, if err := json.NewDecoder(r.Body).Decode(req); err != nil { return nil, fmt.Errorf("failed to unmarshal request: %v", err) } - return reportCrash(c, ns, req) + bug, err := reportCrash(c, ns, req) + if err != nil { + return nil, err + } + resp := &dashapi.ReportCrashResp{ + NeedRepro: needRepro(bug), + } + return resp, nil } -func reportCrash(c context.Context, ns string, req *dashapi.Crash) (interface{}, error) { +func reportCrash(c context.Context, ns string, req *dashapi.Crash) (*Bug, error) { req.Title = limitLength(req.Title, 
maxTextLen) req.Maintainers = email.MergeEmailLists(req.Maintainers) if req.Corrupted { @@ -488,10 +509,7 @@ func reportCrash(c context.Context, ns string, req *dashapi.Crash) (interface{}, if saveCrash { purgeOldCrashes(c, bug, bugKey) } - resp := &dashapi.ReportCrashResp{ - NeedRepro: needRepro(bug), - } - return resp, nil + return bug, nil } func purgeOldCrashes(c context.Context, bug *Bug, bugKey *datastore.Key) { @@ -603,6 +621,30 @@ func apiNeedRepro(c context.Context, ns string, r *http.Request) (interface{}, e return resp, nil } +func apiManagerStats(c context.Context, ns string, r *http.Request) (interface{}, error) { + req := new(dashapi.ManagerStatsReq) + if err := json.NewDecoder(r.Body).Decode(req); err != nil { + return nil, fmt.Errorf("failed to unmarshal request: %v", err) + } + now := timeNow(c) + if err := updateManager(c, ns, req.Name, func(mgr *Manager, stats *ManagerStats) { + mgr.LastAlive = now + mgr.CurrentUpTime = req.UpTime + if cur := int64(req.Corpus); cur > stats.MaxCorpus { + stats.MaxCorpus = cur + } + if cur := int64(req.Cover); cur > stats.MaxCover { + stats.MaxCover = cur + } + stats.TotalFuzzingTime += req.FuzzingTime + stats.TotalCrashes += int64(req.Crashes) + stats.TotalExecs += int64(req.Execs) + }); err != nil { + return nil, err + } + return nil, nil +} + func findBugForCrash(c context.Context, ns, title string) (*Bug, *datastore.Key, error) { var bugs []*Bug keys, err := datastore.NewQuery("Bug"). diff --git a/dashboard/app/entities.go b/dashboard/app/entities.go index 6661b84b7..492ca198e 100644 --- a/dashboard/app/entities.go +++ b/dashboard/app/entities.go @@ -24,6 +24,26 @@ const ( maxCrashes = 40 ) +type Manager struct { + Namespace string + Name string + CurrentBuild string + FailedBuildBug string + LastAlive time.Time + CurrentUpTime time.Duration +} + +// ManagerStats holds per-day manager runtime stats. +// Has Manager as parent entity. Keyed by Date. 
+type ManagerStats struct { + Date int // YYYYMMDD + MaxCorpus int64 + MaxCover int64 + TotalFuzzingTime time.Duration + TotalCrashes int64 + TotalExecs int64 +} + type Build struct { Namespace string Manager string @@ -93,7 +113,7 @@ type ReportingStateEntry struct { Name string // Current reporting quota consumption. Sent int - Date int + Date int // YYYYMMDD } // Job represent a single patch testing job for syz-ci. @@ -163,6 +183,65 @@ const ( BuildJob ) +// updateManager does transactional compare-and-swap on the manager and its current stats. +func updateManager(c context.Context, ns, name string, fn func(mgr *Manager, stats *ManagerStats)) error { + date := timeDate(timeNow(c)) + tx := func(c context.Context) error { + mgr := new(Manager) + mgrKey := datastore.NewKey(c, "Manager", fmt.Sprintf("%v-%v", ns, name), 0, nil) + if err := datastore.Get(c, mgrKey, mgr); err != nil { + if err != datastore.ErrNoSuchEntity { + return fmt.Errorf("failed to get manager %v/%v: %v", ns, name, err) + } + mgr = &Manager{ + Namespace: ns, + Name: name, + } + } + stats := new(ManagerStats) + statsKey := datastore.NewKey(c, "ManagerStats", "", int64(date), mgrKey) + if err := datastore.Get(c, statsKey, stats); err != nil { + if err != datastore.ErrNoSuchEntity { + return fmt.Errorf("failed to get stats %v/%v/%v: %v", ns, name, date, err) + } + stats = &ManagerStats{ + Date: date, + } + } + + fn(mgr, stats) + + if _, err := datastore.Put(c, mgrKey, mgr); err != nil { + return fmt.Errorf("failed to put manager: %v", err) + } + if _, err := datastore.Put(c, statsKey, stats); err != nil { + return fmt.Errorf("failed to put manager stats: %v", err) + } + return nil + } + return datastore.RunInTransaction(c, tx, &datastore.TransactionOptions{Attempts: 10}) +} + +func loadAllManagers(c context.Context) ([]*Manager, []*datastore.Key, error) { + var managers []*Manager + keys, err := datastore.NewQuery("Manager"). 
+ GetAll(c, &managers) + if err != nil { + return nil, nil, fmt.Errorf("failed to query managers: %v", err) + } + var result []*Manager + var resultKeys []*datastore.Key + + for i, mgr := range managers { + if _, ok := config.Namespaces[mgr.Namespace].DecommissionedManagers[mgr.Name]; ok { + continue + } + result = append(result, mgr) + resultKeys = append(resultKeys, keys[i]) + } + return result, resultKeys, nil +} + func buildKey(c context.Context, ns, id string) *datastore.Key { if ns == "" { panic("requesting build key outside of namespace") @@ -237,3 +316,9 @@ func textLink(tag string, id int64) string { } return fmt.Sprintf("/text?tag=%v&id=%v", tag, id) } + +// timeDate returns t's date as a single int YYYYMMDD. +func timeDate(t time.Time) int { + year, month, day := t.Date() + return year*10000 + int(month)*100 + day +} diff --git a/dashboard/app/handler.go b/dashboard/app/handler.go index 431261ff2..21fe67e36 100644 --- a/dashboard/app/handler.go +++ b/dashboard/app/handler.go @@ -68,6 +68,26 @@ func formatTime(t time.Time) string { return t.Format("Jan 02 15:04") } +func formatClock(t time.Time) string { + if t.IsZero() { + return "" + } + return t.Format("15:04") +} + +func formatDuration(d time.Duration) string { + if d == 0 { + return "" + } + days := int(d / (24 * time.Hour)) + hours := int(d / time.Hour % 24) + mins := int(d / time.Minute % 60) + if days != 0 { + return fmt.Sprintf("%vd%vh", days, hours) + } + return fmt.Sprintf("%vh%vm", hours, mins) +} + func formatReproLevel(l dashapi.ReproLevel) string { switch l { case ReproLevelSyz: @@ -84,6 +104,8 @@ var ( templateFuncs = template.FuncMap{ "formatTime": formatTime, + "formatClock": formatClock, + "formatDuration": formatDuration, "formatReproLevel": formatReproLevel, } ) diff --git a/dashboard/app/main.go b/dashboard/app/main.go index d24b9b6ee..99c31d699 100644 --- a/dashboard/app/main.go +++ b/dashboard/app/main.go @@ -29,10 +29,35 @@ func init() { type uiMain struct { Header *uiHeader Log 
[]byte + Managers []*uiManager Jobs []*uiJob BugGroups []*uiBugGroup } +type uiManager struct { + Namespace string + Name string + CurrentBuild *uiBuild + FailedBuildBugLink string + LastActive time.Time + LastActiveBad bool + CurrentUpTime time.Duration + MaxCorpus int64 + MaxCover int64 + TotalFuzzingTime time.Duration + TotalCrashes int64 + TotalExecs int64 +} + +type uiBuild struct { + Time time.Time + SyzkallerCommit string + KernelRepo string + KernelBranch string + KernelCommit string + KernelConfigLink string +} + type uiBugPage struct { Header *uiHeader Bug *uiBug @@ -46,7 +71,6 @@ type uiBugGroup struct { type uiBug struct { Namespace string - ID string Title string NumCrashes int64 FirstTime time.Time @@ -55,29 +79,27 @@ type uiBug struct { ReportingIndex int Status string Link string + ExternalLink string Commits string PatchedOn []string MissingOn []string } type uiCrash struct { - Manager string - Time time.Time - Maintainers string - LogLink string - ReportLink string - ReproSyzLink string - ReproCLink string - SyzkallerCommit string - KernelRepo string - KernelBranch string - KernelCommit string - KernelConfigLink string + Manager string + Time time.Time + Maintainers string + LogLink string + ReportLink string + ReproSyzLink string + ReproCLink string + *uiBuild } type uiJob struct { Created time.Time - Link string + BugLink string + ExternalLink string User string Reporting string Namespace string @@ -108,6 +130,10 @@ func handleMain(c context.Context, w http.ResponseWriter, r *http.Request) error if err != nil { return err } + managers, err := loadManagers(c) + if err != nil { + return err + } jobs, err := loadRecentJobs(c) if err != nil { return err @@ -119,6 +145,7 @@ func handleMain(c context.Context, w http.ResponseWriter, r *http.Request) error data := &uiMain{ Header: h, Log: errorLog, + Managers: managers, Jobs: jobs, BugGroups: groups, } @@ -227,9 +254,9 @@ func createUIBug(c context.Context, bug *Bug, state *ReportingState, managers [] 
if status == "" { status = "???" } + id := bugKeyHash(bug.Namespace, bug.Title, bug.Seq) uiBug := &uiBug{ Namespace: bug.Namespace, - ID: bugKeyHash(bug.Namespace, bug.Title, bug.Seq), Title: bug.displayTitle(), NumCrashes: bug.NumCrashes, FirstTime: bug.FirstTime, @@ -237,7 +264,8 @@ func createUIBug(c context.Context, bug *Bug, state *ReportingState, managers [] ReproLevel: bug.ReproLevel, ReportingIndex: reportingIdx, Status: status, - Link: link, + Link: bugLink(id), + ExternalLink: link, PatchedOn: bug.PatchedOn, } if len(bug.Commits) != 0 { @@ -278,24 +306,92 @@ func loadCrashesForBug(c context.Context, bug *Bug) ([]*uiCrash, error) { builds[crash.BuildID] = build } ui := &uiCrash{ - Manager: crash.Manager, - Time: crash.Time, - Maintainers: fmt.Sprintf("%q", crash.Maintainers), - LogLink: textLink("CrashLog", crash.Log), - ReportLink: textLink("CrashReport", crash.Report), - ReproSyzLink: textLink("ReproSyz", crash.ReproSyz), - ReproCLink: textLink("ReproC", crash.ReproC), - SyzkallerCommit: build.SyzkallerCommit, - KernelRepo: build.KernelRepo, - KernelBranch: build.KernelBranch, - KernelCommit: build.KernelCommit, - KernelConfigLink: textLink("KernelConfig", build.KernelConfig), + Manager: crash.Manager, + Time: crash.Time, + Maintainers: fmt.Sprintf("%q", crash.Maintainers), + LogLink: textLink("CrashLog", crash.Log), + ReportLink: textLink("CrashReport", crash.Report), + ReproSyzLink: textLink("ReproSyz", crash.ReproSyz), + ReproCLink: textLink("ReproC", crash.ReproC), + uiBuild: makeUIBuild(build), } results = append(results, ui) } return results, nil } +func makeUIBuild(build *Build) *uiBuild { + return &uiBuild{ + Time: build.Time, + SyzkallerCommit: build.SyzkallerCommit, + KernelRepo: build.KernelRepo, + KernelBranch: build.KernelBranch, + KernelCommit: build.KernelCommit, + KernelConfigLink: textLink("KernelConfig", build.KernelConfig), + } +} + +func loadManagers(c context.Context) ([]*uiManager, error) { + now := timeNow(c) + date := 
timeDate(now) + managers, managerKeys, err := loadAllManagers(c) + if err != nil { + return nil, err + } + var buildKeys []*datastore.Key + var statsKeys []*datastore.Key + for i, mgr := range managers { + if mgr.CurrentBuild != "" { + buildKeys = append(buildKeys, buildKey(c, mgr.Namespace, mgr.CurrentBuild)) + } + if timeDate(mgr.LastAlive) == date { + statsKeys = append(statsKeys, + datastore.NewKey(c, "ManagerStats", "", int64(date), managerKeys[i])) + } + } + builds := make([]*Build, len(buildKeys)) + if err := datastore.GetMulti(c, buildKeys, builds); err != nil { + return nil, err + } + uiBuilds := make(map[string]*uiBuild) + for _, build := range builds { + uiBuilds[build.Namespace+"|"+build.ID] = makeUIBuild(build) + } + stats := make([]*ManagerStats, len(statsKeys)) + if err := datastore.GetMulti(c, statsKeys, stats); err != nil { + return nil, err + } + var fullStats []*ManagerStats + for _, mgr := range managers { + if timeDate(mgr.LastAlive) != date { + fullStats = append(fullStats, &ManagerStats{}) + continue + } + fullStats = append(fullStats, stats[0]) + stats = stats[1:] + } + var results []*uiManager + for i, mgr := range managers { + stats := fullStats[i] + results = append(results, &uiManager{ + Namespace: mgr.Namespace, + Name: mgr.Name, + CurrentBuild: uiBuilds[mgr.Namespace+"|"+mgr.CurrentBuild], + FailedBuildBugLink: bugLink(mgr.FailedBuildBug), + LastActive: mgr.LastAlive, + LastActiveBad: now.Sub(mgr.LastAlive) > 12*time.Hour, + CurrentUpTime: mgr.CurrentUpTime, + MaxCorpus: stats.MaxCorpus, + MaxCover: stats.MaxCover, + TotalFuzzingTime: stats.TotalFuzzingTime, + TotalCrashes: stats.TotalCrashes, + TotalExecs: stats.TotalExecs, + }) + } + sort.Sort(uiManagerSorter(results)) + return results, nil +} + func loadRecentJobs(c context.Context) ([]*uiJob, error) { var jobs []*Job keys, err := datastore.NewQuery("Job"). 
@@ -309,13 +405,13 @@ func loadRecentJobs(c context.Context) ([]*uiJob, error) { for i, job := range jobs { ui := &uiJob{ Created: job.Created, - Link: job.Link, + BugLink: bugLink(keys[i].Parent().StringID()), + ExternalLink: job.Link, User: job.User, Reporting: job.Reporting, Namespace: job.Namespace, Manager: job.Manager, BugTitle: job.BugTitle, - BugID: keys[i].Parent().StringID(), KernelRepo: job.KernelRepo, KernelBranch: job.KernelBranch, PatchLink: textLink("Patch", job.Patch), @@ -376,6 +472,24 @@ func fetchErrorLogs(c context.Context) ([]byte, error) { return buf.Bytes(), nil } +func bugLink(id string) string { + if id == "" { + return "" + } + return "/bug?id=" + id +} + +type uiManagerSorter []*uiManager + +func (a uiManagerSorter) Len() int { return len(a) } +func (a uiManagerSorter) Swap(i, j int) { a[i], a[j] = a[j], a[i] } +func (a uiManagerSorter) Less(i, j int) bool { + if a[i].Namespace != a[j].Namespace { + return a[i].Namespace < a[j].Namespace + } + return a[i].Name < a[j].Name +} + type uiBugSorter []*uiBug func (a uiBugSorter) Len() int { return len(a) } diff --git a/dashboard/app/main.html b/dashboard/app/main.html index ea1eec7c9..a5bad8efa 100644 --- a/dashboard/app/main.html +++ b/dashboard/app/main.html @@ -11,11 +11,11 @@ </tr> {{range $b := $.Bugs}} <tr> - <td class="title"><a href="/bug?id={{$b.ID}}">{{$b.Title}}</a></td> + <td class="title"><a href="{{$b.Link}}">{{$b.Title}}</a></td> <td class="count">{{$b.NumCrashes}}</td> <td class="repro">{{formatReproLevel $b.ReproLevel}}</td> <td class="time">{{formatTime $b.LastTime}}</td> - <td class="status">{{if $b.Link}}<a href="{{$b.Link}}">{{$b.Status}}</a>{{else}}{{$b.Status}}{{end}}</td> + <td class="status">{{if $b.Link}}<a href="{{$b.ExternalLink}}">{{$b.Status}}</a>{{else}}{{$b.Status}}{{end}}</td> <td class="patched" title="{{$b.Commits}}">{{if $b.Commits}}{{len $b.PatchedOn}}/{{len $b.MissingOn}}{{end}}</td> </tr> {{end}} @@ -40,6 +40,49 @@ <br><br> <table class="list_table"> + 
<caption>Managers:</caption> + <tr> + <th>Name</th> + <th>Last Active</th> + <th>Current Build</th> + <th>Failed Build</th> + <th>Today: Uptime</th> + <th>Fuzzing Time</th> + <th>Corpus</th> + <th>Coverage</th> + <th>Crashes</th> + <th>Execs</th> + </tr> + {{range $mgr := $.Managers}} + <tr> + <td>{{$mgr.Namespace}}/{{$mgr.Name}}</td> + {{if $mgr.LastActiveBad}} + <td style="color:#f00">{{formatTime $mgr.LastActive}}</td> + {{else}} + <td>{{formatClock $mgr.LastActive}}</td> + {{end}} + {{if $mgr.CurrentBuild}} + <td title="{{$mgr.CurrentBuild.KernelRepo}}/{{$mgr.CurrentBuild.KernelBranch}}/{{$mgr.CurrentBuild.KernelCommit}} (syzkaller {{$mgr.CurrentBuild.SyzkallerCommit}})">{{formatTime $mgr.CurrentBuild.Time}}</td> + {{else}} + <td></td> + {{end}} + {{if $mgr.FailedBuildBugLink}} + <td><a href="{{$mgr.FailedBuildBugLink}}" style="color:#f00">failed</a></td> + {{else}} + <td></td> + {{end}} + <td>{{formatDuration $mgr.CurrentUpTime}}</td> + <td>{{formatDuration $mgr.TotalFuzzingTime}}</td> + <td>{{$mgr.MaxCorpus}}</td> + <td>{{$mgr.MaxCover}}</td> + <td>{{$mgr.TotalCrashes}}</td> + <td>{{$mgr.TotalExecs}}</td> + </tr> + {{end}} + </table> + <br><br> + + <table class="list_table"> <caption>Recent jobs:</caption> <tr> <th>Created</th> @@ -59,7 +102,7 @@ <td class="time">{{formatTime $job.Started}}{{if gt $job.Attempts 1}} ({{$job.Attempts}}){{end}}</td> <td class="time">{{formatTime $job.Finished}}</td> <td>{{$job.User}}</td> - <td class="title"><a href="/bug?id={{$job.BugID}}">{{$job.BugTitle}}</a></td> + <td class="title"><a href="{{$job.BugLink}}">{{$job.BugTitle}}</a></td> <td><a href="{{$job.PatchLink}}">patch</a></td> <td>{{$job.Namespace}}/{{$job.Reporting}}</td> <td>{{$job.Manager}}</td> diff --git a/dashboard/app/reporting.go b/dashboard/app/reporting.go index 9180c14ad..ad2ad0d1f 100644 --- a/dashboard/app/reporting.go +++ b/dashboard/app/reporting.go @@ -577,8 +577,7 @@ func (state *ReportingState) getEntry(now time.Time, namespace, name string) *Re 
panic(fmt.Sprintf("requesting reporting state for %v/%v", namespace, name)) } // Convert time to date of the form 20170125. - year, month, day := now.Date() - date := year*10000 + int(month)*100 + day + date := timeDate(now) for i := range state.Entries { ent := &state.Entries[i] if ent.Namespace == namespace && ent.Name == name { diff --git a/dashboard/dashapi/dashapi.go b/dashboard/dashapi/dashapi.go index 8ebea29ac..4ba797f32 100644 --- a/dashboard/dashapi/dashapi.go +++ b/dashboard/dashapi/dashapi.go @@ -16,6 +16,7 @@ import ( "net/url" "reflect" "strings" + "time" ) type Dashboard struct { @@ -249,6 +250,23 @@ type PollResponse struct { Reports []*BugReport } +type ManagerStatsReq struct { + Name string + // Current level: + UpTime time.Duration + Corpus uint64 + Cover uint64 + + // Delta since last sync: + FuzzingTime time.Duration + Crashes uint64 + Execs uint64 +} + +func (dash *Dashboard) UploadManagerStats(req *ManagerStatsReq) error { + return dash.query("manager_stats", req, nil) +} + type ( BugStatus int ReproLevel int diff --git a/syz-ci/jobs.go b/syz-ci/jobs.go index 604dd1895..5c5b0c5a8 100644 --- a/syz-ci/jobs.go +++ b/syz-ci/jobs.go @@ -24,12 +24,14 @@ import ( ) type JobProcessor struct { + name string managers []*Manager dash *dashapi.Dashboard } func newJobProcessor(cfg *Config, managers []*Manager) *JobProcessor { jp := &JobProcessor{ + name: fmt.Sprintf("%v-job", cfg.Name), managers: managers, } if cfg.Dashboard_Addr != "" && cfg.Dashboard_Client != "" { @@ -62,7 +64,7 @@ func (jp *JobProcessor) poll() { } req, err := jp.dash.JobPoll(names) if err != nil { - Logf(0, "failed to poll jobs: %v", err) + jp.Errorf("failed to poll jobs: %v", err) return } if req.ID == "" { @@ -76,7 +78,7 @@ func (jp *JobProcessor) poll() { } } if mgr == nil { - Logf(0, "got job for unknown manager: %v", req.Manager) + jp.Errorf("got job for unknown manager: %v", req.Manager) return } job := &Job{ @@ -85,11 +87,11 @@ func (jp *JobProcessor) poll() { } Logf(0, 
"starting job %v for manager %v on %v/%v", req.ID, req.Manager, req.KernelRepo, req.KernelBranch) - resp := job.process() + resp := jp.process(job) Logf(0, "done job %v: commit %v, crash %q, error: %s", resp.ID, resp.Build.KernelCommit, resp.CrashTitle, resp.Error) if err := jp.dash.JobDone(resp); err != nil { - Logf(0, "failed to mark job as done: %v", err) + jp.Errorf("failed to mark job as done: %v", err) return } } @@ -101,7 +103,7 @@ type Job struct { mgrcfg *mgrconfig.Config } -func (job *Job) process() *dashapi.JobDoneReq { +func (jp *JobProcessor) process(job *Job) *dashapi.JobDoneReq { req, mgr := job.req, job.mgr build := dashapi.Build{ Manager: mgr.name, @@ -134,6 +136,7 @@ func (job *Job) process() *dashapi.JobDoneReq { for _, req := range required { if !req.ok { job.resp.Error = []byte(req.name + " is empty") + jp.Errorf("%s", job.resp.Error) return job.resp } } @@ -145,20 +148,21 @@ func (job *Job) process() *dashapi.JobDoneReq { case "gce", "qemu": default: job.resp.Error = []byte(fmt.Sprintf("testing is not yet supported for %v machine type.", typ)) + jp.Errorf("%s", job.resp.Error) return job.resp } - if err := job.buildImage(); err != nil { + if err := jp.buildImage(job); err != nil { job.resp.Error = []byte(err.Error()) return job.resp } - if err := job.test(); err != nil { + if err := jp.test(job); err != nil { job.resp.Error = []byte(err.Error()) return job.resp } return job.resp } -func (job *Job) buildImage() error { +func (jp *JobProcessor) buildImage(job *Job) error { kernelBuildSem <- struct{}{} defer func() { <-kernelBuildSem }() req, resp, mgr := job.req, job.resp, job.mgr @@ -242,7 +246,7 @@ func (job *Job) buildImage() error { return nil } -func (job *Job) test() error { +func (jp *JobProcessor) test(job *Job) error { req, mgrcfg := job.req, job.mgrcfg Logf(0, "job: booting VM...") @@ -305,7 +309,7 @@ func (job *Job) test() error { " -fault_call=%v -fault_nth=%v -repeat=0 -cover=0 %v", execprogBin, executorBin, mgrcfg.TargetArch, 
mgrcfg.Procs, opts.Sandbox, opts.FaultCall, opts.FaultNth, vmProgFile) - crashed, err := job.testProgram(inst, cmdSyz, reporter, 7*time.Minute) + crashed, err := jp.testProgram(job, inst, cmdSyz, reporter, 7*time.Minute) if crashed || err != nil { return err } @@ -330,7 +334,7 @@ func (job *Job) test() error { } // We should test for longer (e.g. 5 mins), but the problem is that // reproducer does not print anything, so after 3 mins we detect "no output". - crashed, err := job.testProgram(inst, vmBin, reporter, time.Minute) + crashed, err := jp.testProgram(job, inst, vmBin, reporter, time.Minute) if crashed || err != nil { return err } @@ -338,8 +342,8 @@ func (job *Job) test() error { return nil } -func (job *Job) testProgram(inst *vm.Instance, command string, reporter report.Reporter, - testTime time.Duration) (bool, error) { +func (jp *JobProcessor) testProgram(job *Job, inst *vm.Instance, command string, + reporter report.Reporter, testTime time.Duration) (bool, error) { outc, errc, err := inst.Run(testTime, nil, command) if err != nil { return false, fmt.Errorf("failed to run binary in VM: %v", err) @@ -349,11 +353,18 @@ func (job *Job) testProgram(inst *vm.Instance, command string, reporter report.R return false, nil } if err := reporter.Symbolize(rep); err != nil { - // TODO(dvyukov): send such errors to dashboard. - Logf(0, "job: failed to symbolize report: %v", err) + jp.Errorf("failed to symbolize report: %v", err) } job.resp.CrashTitle = rep.Title job.resp.CrashReport = rep.Report job.resp.CrashLog = rep.Output return true, nil } + +// Errorf logs non-fatal error and sends it to dashboard. +func (jp *JobProcessor) Errorf(msg string, args ...interface{}) { + Logf(0, "job: "+msg, args...) + if jp.dash != nil { + jp.dash.LogError(jp.name, msg, args...) 
+ } +} diff --git a/syz-ci/manager.go b/syz-ci/manager.go index 4e4f03485..9ad2f8e53 100644 --- a/syz-ci/manager.go +++ b/syz-ci/manager.go @@ -147,7 +147,7 @@ loop: rebuildAfter := buildRetryPeriod commit, err := git.Poll(mgr.kernelDir, mgr.mgrcfg.Repo, mgr.mgrcfg.Branch) if err != nil { - Logf(0, "%v: failed to poll: %v", mgr.name, err) + mgr.Errorf("failed to poll: %v", err) } else { Logf(0, "%v: poll: %v", mgr.name, commit) if commit != lastCommit && @@ -166,7 +166,7 @@ loop: rebuildAfter = kernelRebuildPeriod latestInfo = mgr.checkLatest() if latestInfo == nil { - Logf(0, "%v: failed to read build info after build", mgr.name) + mgr.Errorf("failed to read build info after build") } } <-kernelBuildSem @@ -239,6 +239,7 @@ func (mgr *Manager) build() error { } var tagData []byte + tagData = append(tagData, mgr.name...) tagData = append(tagData, kernelCommit...) tagData = append(tagData, mgr.compilerID...) tagData = append(tagData, mgr.configTag...) @@ -274,7 +275,7 @@ func (mgr *Manager) build() error { Output: []byte(err.Error()), } if err := mgr.reportBuildError(rep, info, tmpDir); err != nil { - Logf(0, "%v: failed to report image error: %v", mgr.name, err) + mgr.Errorf("failed to report image error: %v", err) } return fmt.Errorf("kernel build failed: %v", err) } @@ -307,7 +308,7 @@ func (mgr *Manager) build() error { func (mgr *Manager) restartManager() { if !osutil.FilesExist(mgr.latestDir, imageFiles) { - Logf(0, "%v: can't start manager, image files missing", mgr.name) + mgr.Errorf("can't start manager, image files missing") return } if mgr.cmd != nil { @@ -315,26 +316,26 @@ func (mgr *Manager) restartManager() { mgr.cmd = nil } if err := osutil.LinkFiles(mgr.latestDir, mgr.currentDir, imageFiles); err != nil { - Logf(0, "%v: failed to create current image dir: %v", mgr.name, err) + mgr.Errorf("failed to create current image dir: %v", err) return } info, err := loadBuildInfo(mgr.currentDir) if err != nil { - Logf(0, "%v: failed to load build info: %v", 
mgr.name, err) + mgr.Errorf("failed to load build info: %v", err) return } cfgFile, err := mgr.writeConfig(info) if err != nil { - Logf(0, "%v: failed to create manager config: %v", mgr.name, err) + mgr.Errorf("failed to create manager config: %v", err) return } if err := mgr.uploadBuild(info, mgr.currentDir); err != nil { - Logf(0, "%v: failed to upload build: %v", mgr.name, err) + mgr.Errorf("failed to upload build: %v", err) return } bin := filepath.FromSlash("syzkaller/current/bin/syz-manager") logFile := filepath.Join(mgr.currentDir, "manager.log") - mgr.cmd = NewManagerCmd(mgr.name, logFile, bin, "-config", cfgFile) + mgr.cmd = NewManagerCmd(mgr.name, logFile, mgr.Errorf, bin, "-config", cfgFile) } func (mgr *Manager) testImage(imageDir string, info *BuildInfo) error { @@ -361,7 +362,7 @@ func (mgr *Manager) testImage(imageDir string, info *BuildInfo) error { if rep != nil { rep.Title = fmt.Sprintf("%v boot error: %v", mgr.mgrcfg.Repo_Alias, rep.Title) if err := mgr.reportBuildError(rep, info, imageDir); err != nil { - Logf(0, "%v: failed to report image error: %v", mgr.name, err) + mgr.Errorf("failed to report image error: %v", err) } return fmt.Errorf("VM boot failed with: %v", rep.Title) } @@ -373,7 +374,7 @@ func (mgr *Manager) testImage(imageDir string, info *BuildInfo) error { if rep != nil { rep.Title = fmt.Sprintf("%v test error: %v", mgr.mgrcfg.Repo_Alias, rep.Title) if err := mgr.reportBuildError(rep, info, imageDir); err != nil { - Logf(0, "%v: failed to report image error: %v", mgr.name, err) + mgr.Errorf("failed to report image error: %v", err) } return fmt.Errorf("VM testing failed with: %v", rep.Title) } @@ -482,7 +483,7 @@ func (mgr *Manager) uploadBuild(info *BuildInfo, imageDir string) error { commits, err := mgr.pollCommits(info.KernelCommit) if err != nil { // This is not critical for operation. 
- Logf(0, "%v: failed to poll commits: %v", mgr.name, err) + mgr.Errorf("failed to poll commits: %v", err) } build.Commits = commits return mgr.dash.UploadBuild(build) @@ -533,3 +534,11 @@ func (mgr *Manager) pollCommits(buildCommit string) ([]string, error) { } return present, nil } + +// Errorf logs non-fatal error and sends it to dashboard. +func (mgr *Manager) Errorf(msg string, args ...interface{}) { + Logf(0, mgr.name+": "+msg, args...) + if mgr.dash != nil { + mgr.dash.LogError(mgr.name, msg, args...) + } +} diff --git a/syz-ci/managercmd.go b/syz-ci/managercmd.go index 77339e5c0..143eb8011 100644 --- a/syz-ci/managercmd.go +++ b/syz-ci/managercmd.go @@ -19,19 +19,23 @@ import ( type ManagerCmd struct { name string log string + errorf Errorf bin string args []string closing chan bool } +type Errorf func(msg string, args ...interface{}) + // NewManagerCmd starts new syz-manager process. // name - name for logging. // log - manager log file with stdout/stderr. // bin/args - process binary/args. 
-func NewManagerCmd(name, log, bin string, args ...string) *ManagerCmd { +func NewManagerCmd(name, log string, errorf Errorf, bin string, args ...string) *ManagerCmd { mc := &ManagerCmd{ name: name, log: log, + errorf: errorf, bin: bin, args: args, closing: make(chan bool), @@ -48,8 +52,8 @@ func (mc *ManagerCmd) Close() { func (mc *ManagerCmd) loop() { const ( - restartPeriod = time.Minute // don't restart crashing manager more frequently than that - interruptTimeout = time.Minute // give manager that much time to react to SIGINT + restartPeriod = 10 * time.Minute // don't restart crashing manager more frequently than that + interruptTimeout = time.Minute // give manager that much time to react to SIGINT ) var ( cmd *exec.Cmd @@ -73,7 +77,7 @@ func (mc *ManagerCmd) loop() { os.Rename(mc.log, mc.log+".old") logfile, err := os.Create(mc.log) if err != nil { - Logf(0, "%v: failed to create manager log: %v", mc.name, err) + mc.errorf("failed to create manager log: %v", err) } else { cmd = osutil.Command(mc.bin, mc.args...) 
cmd.Stdout = logfile @@ -81,7 +85,7 @@ func (mc *ManagerCmd) loop() { err := cmd.Start() logfile.Close() if err != nil { - Logf(0, "%v: failed to start manager: %v", mc.name, err) + mc.errorf("failed to start manager: %v", err) cmd = nil } else { Logf(1, "%v: started manager", mc.name) @@ -110,7 +114,10 @@ func (mc *ManagerCmd) loop() { } case err := <-stopped: if cmd == nil { - panic("spurious stop signal") + mc.errorf("spurious stop signal: %v", err) + } + if closing != nil { + mc.errorf("manager exited unexpectedly: %v", err) } cmd = nil Logf(1, "%v: manager exited with %v", mc.name, err) diff --git a/syz-ci/testing.go b/syz-ci/testing.go index dcee197ac..adf223f04 100644 --- a/syz-ci/testing.go +++ b/syz-ci/testing.go @@ -31,12 +31,13 @@ func bootInstance(mgrcfg *mgrconfig.Config) (*vm.Instance, report.Reporter, *rep } inst, err := vmPool.Create(0) if err != nil { - if bootErr, ok := err.(vm.BootError); ok { - rep := reporter.Parse(bootErr.Output) + if bootErr, ok := err.(vm.BootErrorer); ok { + title, output := bootErr.BootError() + rep := reporter.Parse(output) if rep == nil { rep = &report.Report{ - Title: bootErr.Title, - Output: bootErr.Output, + Title: title, + Output: output, } } if err := reporter.Symbolize(rep); err != nil { diff --git a/syz-manager/manager.go b/syz-manager/manager.go index 49faa6e83..6be5761ba 100644 --- a/syz-manager/manager.go +++ b/syz-manager/manager.go @@ -307,6 +307,10 @@ func RunManager(cfg *mgrconfig.Config, target *prog.Target, syscalls map[int]boo }() } + if mgr.dash != nil { + go mgr.dashboardReporter() + } + if mgr.cfg.Hub_Client != "" { go func() { for { @@ -1167,3 +1171,38 @@ func (mgr *Manager) checkUsedFiles() { } } } + +func (mgr *Manager) dashboardReporter() { + var lastFuzzingTime time.Duration + var lastCrashes, lastExecs uint64 + for { + time.Sleep(time.Minute) + mgr.mu.Lock() + if mgr.firstConnect.IsZero() { + mgr.mu.Unlock() + continue + } + crashes := mgr.stats["crashes"] + execs := mgr.stats["exec total"] + 
req := &dashapi.ManagerStatsReq{ + Name: mgr.cfg.Name, + UpTime: time.Since(mgr.firstConnect), + Corpus: uint64(len(mgr.corpus)), + Cover: uint64(len(mgr.corpusSignal)), + FuzzingTime: mgr.fuzzingTime - lastFuzzingTime, + Crashes: crashes - lastCrashes, + Execs: execs - lastExecs, + } + mgr.mu.Unlock() + + if err := mgr.dash.UploadManagerStats(req); err != nil { + Logf(0, "faield to upload dashboard stats: %v", err) + continue + } + mgr.mu.Lock() + lastFuzzingTime += req.FuzzingTime + lastCrashes += req.Crashes + lastExecs += req.Execs + mgr.mu.Unlock() + } +} diff --git a/vm/vm.go b/vm/vm.go --- a/vm/vm.go +++ b/vm/vm.go @@ -37,18 +37,15 @@ type Instance struct { index int } -type ( - Env vmimpl.Env - BootError vmimpl.BootError -) +type Env vmimpl.Env var ( Shutdown = vmimpl.Shutdown TimeoutErr = vmimpl.TimeoutErr ) -func (err BootError) Error() string { - return fmt.Sprintf("%v\n%s", err.Title, err.Output) +type BootErrorer interface { + BootError() (string, []byte) } func Create(typ string, env *Env) (*Pool, error) { diff --git a/vm/vmimpl/vmimpl.go b/vm/vmimpl/vmimpl.go index 81f798d26..2e3833d89 100644 --- a/vm/vmimpl/vmimpl.go +++ b/vm/vmimpl/vmimpl.go @@ -66,6 +66,10 @@ func (err BootError) Error() string { return fmt.Sprintf("%v\n%s", err.Title, err.Output) } +func (err BootError) BootError() (string, []byte) { + return err.Title, err.Output +} + // Create creates a VM type that can be used to create individual VMs. func Create(typ string, env *Env) (Pool, error) { ctor := ctors[typ] |
