From b78f6ba060f2182be9259be23a8c07d2d16671fa Mon Sep 17 00:00:00 2001 From: Aleksandr Nogikh Date: Thu, 30 Mar 2023 16:47:18 +0200 Subject: pkg/email/lore: add Lore archives parsing routines We need to 1) Query raw messages from an LKML git archive. 2) Group raw messages into threads. --- pkg/email/lore/parse.go | 96 ++++++++++++++++++++++ pkg/email/lore/parse_test.go | 189 +++++++++++++++++++++++++++++++++++++++++++ pkg/email/lore/read.go | 32 ++++++++ 3 files changed, 317 insertions(+) create mode 100644 pkg/email/lore/parse.go create mode 100644 pkg/email/lore/parse_test.go create mode 100644 pkg/email/lore/read.go (limited to 'pkg/email/lore') diff --git a/pkg/email/lore/parse.go b/pkg/email/lore/parse.go new file mode 100644 index 000000000..6d90f9ee5 --- /dev/null +++ b/pkg/email/lore/parse.go @@ -0,0 +1,96 @@ +// Copyright 2023 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +package lore + +import ( + "github.com/google/syzkaller/pkg/email" +) + +type Thread struct { + Subject string + MessageID string + BugIDs []string + Messages []*email.Email +} + +// Threads groups emails into threads by In-Reply-To; emails that cannot be traced to a listed root are dropped. +func Threads(emails []*email.Email) []*Thread { + ctx := &parseCtx{ + messages: map[string]*email.Email{}, + } + for _, email := range emails { + ctx.record(email) + } + return ctx.threads() +} + +type parseCtx struct { + messages map[string]*email.Email +} + +func (c *parseCtx) record(msg *email.Email) { + c.messages[msg.MessageID] = msg +} + +func (c *parseCtx) threads() []*Thread { + threads := map[string]*Thread{} + threadsList := []*Thread{} + // Detect threads, i.e. messages without In-Reply-To. 
+ for _, msg := range c.messages { + if msg.InReplyTo == "" { + thread := &Thread{ + MessageID: msg.MessageID, + Subject: msg.Subject, + } + threads[msg.MessageID] = thread + threadsList = append(threadsList, thread) + } + } + // Assign messages to threads. + for _, msg := range c.messages { + base := c.first(msg) + if base == nil { + continue + } + thread := threads[base.MessageID] + thread.BugIDs = append(thread.BugIDs, msg.BugIDs...) + thread.Messages = append(threads[base.MessageID].Messages, msg) + } + // Deduplicate BugIDs lists. + for _, thread := range threads { + if len(thread.BugIDs) == 0 { + continue + } + unique := map[string]struct{}{} + newList := []string{} + for _, id := range thread.BugIDs { + if _, ok := unique[id]; !ok { + newList = append(newList, id) + } + unique[id] = struct{}{} + } + thread.BugIDs = newList + } + return threadsList +} + +// first follows In-Reply-To links to the first message of a thread; nil on a reply loop or a missing parent. +func (c *parseCtx) first(msg *email.Email) *email.Email { + visited := map[*email.Email]struct{}{} + for { + // There have been a few cases when we'd otherwise get an infinite loop. + if _, ok := visited[msg]; ok { + return nil + } + visited[msg] = struct{}{} + if msg.InReplyTo == "" { + return msg + } + msg = c.messages[msg.InReplyTo] + if msg == nil { + // Probably we just didn't load the message. + return nil + } + } +} diff --git a/pkg/email/lore/parse_test.go b/pkg/email/lore/parse_test.go new file mode 100644 index 000000000..90be05200 --- /dev/null +++ b/pkg/email/lore/parse_test.go @@ -0,0 +1,189 @@ +// Copyright 2023 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +package lore + +import ( + "sort" + "strings" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/syzkaller/pkg/email" +) + +func TestThreadsCollection(t *testing.T) { + messages := []string{ + // <-- <-- . 
+ `Date: Sun, 7 May 2017 19:54:00 -0700 +Subject: Thread A +Message-ID: +From: UserA +Content-Type: text/plain + + +Some text`, + `Date: Sun, 7 May 2017 19:55:00 -0700 +Subject: Re: Thread A +Message-ID: +From: UserB +To: UserA +Content-Type: text/plain +In-Reply-To: + + +Some reply`, + `Date: Sun, 7 May 2017 19:56:00 -0700 +Subject: Re: Re: Thread A +Message-ID: +From: UserC +To: UserA , UserB +Content-Type: text/plain +In-Reply-To: + + +Some reply (2)`, + // with two children: , . + `Date: Sun, 7 May 2017 19:57:00 -0700 +Subject: [syzbot] Some bug +Message-ID: +From: syzbot +Content-Type: text/plain + + +Bug report`, + `Date: Sun, 7 May 2017 19:58:00 -0700 +Subject: Re: [syzbot] Some bug +Message-ID: +From: UserC +To: syzbot +In-Reply-To: +Content-Type: text/plain + + +Bug report reply`, + `Date: Sun, 7 May 2017 19:58:01 -0700 +Subject: Re: [syzbot] Some bug +Message-ID: +From: UserD +To: syzbot +In-Reply-To: B +Content-Type: text/plain + + +Bug report reply 2`, + // And one PATCH without replies. 
+ `Date: Sun, 7 May 2017 19:58:01 -0700 +Subject: [PATCH] Some bug fixed +Message-ID: +From: UserE +Cc: syzbot +Content-Type: text/plain + + +Patch`, + } + + zone := time.FixedZone("", -7*60*60) + expected := map[string]*Thread{ + "": { + Subject: "Thread A", + MessageID: "", + Messages: []*email.Email{ + { + MessageID: "", + Subject: "Thread A", + Date: time.Date(2017, time.May, 7, 19, 54, 0, 0, zone), + Author: "a@user.com", + Cc: []string{"a@user.com"}, + Command: email.CmdNone, + }, + { + MessageID: "", + Subject: "Re: Thread A", + Date: time.Date(2017, time.May, 7, 19, 55, 0, 0, zone), + Author: "b@user.com", + Cc: []string{"a@user.com", "b@user.com"}, + InReplyTo: "", + Command: email.CmdNone, + }, + { + MessageID: "", + Subject: "Re: Re: Thread A", + Date: time.Date(2017, time.May, 7, 19, 56, 0, 0, zone), + Author: "c@user.com", + Cc: []string{"a@user.com", "b@user.com", "c@user.com"}, + InReplyTo: "", + Command: email.CmdNone, + }, + }, + }, + "": { + Subject: "[syzbot] Some bug", + MessageID: "", + BugIDs: []string{"4564456"}, + Messages: []*email.Email{ + { + MessageID: "", + BugIDs: []string{"4564456"}, + Subject: "[syzbot] Some bug", + Date: time.Date(2017, time.May, 7, 19, 57, 0, 0, zone), + Author: "syzbot@bar.com", + Command: email.CmdNone, + }, + { + MessageID: "", + BugIDs: []string{"4564456"}, + Subject: "Re: [syzbot] Some bug", + Date: time.Date(2017, time.May, 7, 19, 58, 0, 0, zone), + Author: "c@user.com", + Cc: []string{"c@user.com"}, + InReplyTo: "", + Command: email.CmdNone, + }, + }, + }, + "": { + Subject: "[PATCH] Some bug fixed", + MessageID: "", + BugIDs: []string{"12345"}, + Messages: []*email.Email{ + { + MessageID: "", + BugIDs: []string{"12345"}, + Subject: "[PATCH] Some bug fixed", + Date: time.Date(2017, time.May, 7, 19, 58, 1, 0, zone), + Author: "e@user.com", + Cc: []string{"e@user.com"}, + Command: email.CmdNone, + }, + }, + }, + } + + emails := []*email.Email{} + for _, m := range messages { + msg, err := 
+ email.Parse(strings.NewReader(m), []string{"syzbot@bar.com"}, + []string{}, []string{"bar.com"}) + if err != nil { + t.Fatal(err) + } + msg.Body = "" + emails = append(emails, msg) + } + + threads := Threads(emails) + for _, d := range threads { + sort.Slice(d.Messages, func(i, j int) bool { + return d.Messages[i].Date.Before(d.Messages[j].Date) + }) + if diff := cmp.Diff(expected[d.MessageID], d); diff != "" { + t.Fatalf("%s: %s", d.MessageID, diff) + } + } + + if len(threads) != len(expected) { + t.Fatalf("Expected %d threads, got %d", len(expected), len(threads)) + } +} diff --git a/pkg/email/lore/read.go b/pkg/email/lore/read.go new file mode 100644 index 000000000..1cfa680b6 --- /dev/null +++ b/pkg/email/lore/read.go @@ -0,0 +1,32 @@ +// Copyright 2023 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +package lore + +import ( + "fmt" + + "github.com/google/syzkaller/pkg/vcs" +) + +type EmailReader struct { + Extract func() ([]byte, error) +} + +// ReadArchive sends one EmailReader per commit hash listed for HEAD in dir; Extract lazily fetches its "m" object. +func ReadArchive(dir string, messages chan<- *EmailReader) error { + repo := vcs.NewLKMLRepo(dir) + commits, err := repo.ListCommitHashes("HEAD") + if err != nil { + return fmt.Errorf("failed to get recent commits: %w", err) + } + for _, iterCommit := range commits { + commit := iterCommit + messages <- &EmailReader{ + Extract: func() ([]byte, error) { + return repo.Object("m", commit) + }, + } + } + return nil +} -- cgit mrf-deployment