diff options
| author | Aleksandr Nogikh <nogikh@google.com> | 2023-03-30 19:36:54 +0200 |
|---|---|---|
| committer | Aleksandr Nogikh <wp32pw@gmail.com> | 2023-04-06 13:59:25 +0200 |
| commit | 7c8c334ec07d3333dacd150dc389ba3f3db649df (patch) | |
| tree | 839d2d0e6a7b65d115bce0bfcb772c0293462b06 /pkg/email | |
| parent | 139c4ef69fd5289588228c700717631d8f1731d0 (diff) | |
pkg/email: extract more information
Extract Date and InReplyTo.
Extract not just one, but multiple BugIDs from multiple sources:
1) Email addresses in From/To/Cc.
2) Email addresses in Body.
3) Dashboard links in Body.
Diffstat (limited to 'pkg/email')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | pkg/email/parser.go | 68 |
| -rw-r--r-- | pkg/email/parser_test.go | 123 |
2 files changed, 178 insertions, 13 deletions
diff --git a/pkg/email/parser.go b/pkg/email/parser.go index af044791d..795cc6fed 100644 --- a/pkg/email/parser.go +++ b/pkg/email/parser.go @@ -14,12 +14,15 @@ import ( "regexp" "sort" "strings" + "time" "unicode" ) type Email struct { - BugID string + BugIDs []string MessageID string + InReplyTo string + Date time.Time Link string Subject string MailingList string @@ -65,7 +68,7 @@ func prepareEmails(list []string) map[string]bool { return ret } -func Parse(r io.Reader, ownEmails, goodLists []string) (*Email, error) { +func Parse(r io.Reader, ownEmails, goodLists, domains []string) (*Email, error) { msg, err := mail.ReadMessage(r) if err != nil { return nil, fmt.Errorf("failed to read email: %v", err) @@ -81,7 +84,6 @@ func Parse(r io.Reader, ownEmails, goodLists []string) (*Email, error) { to, _ := msg.Header.AddressList("To") // AddressList fails if the header is not present. cc, _ := msg.Header.AddressList("Cc") - bugID := "" var ccList []string ownAddrs := prepareEmails(ownEmails) fromMe := false @@ -99,6 +101,7 @@ func Parse(r io.Reader, ownEmails, goodLists []string) (*Email, error) { originalFrom = originalFroms[0].String() } + bugIDs := []string{} rawCcList := append(append(append(cc, to...), from...), originalFroms...) for _, addr := range rawCcList { cleaned, context, _ := RemoveAddrContext(addr.Address) @@ -106,9 +109,7 @@ func Parse(r io.Reader, ownEmails, goodLists []string) (*Email, error) { cleaned = addr.Address } if ownAddrs[cleaned] { - if bugID == "" { - bugID = context - } + bugIDs = append(bugIDs, context) } else { ccList = append(ccList, CanonicalEmail(cleaned)) } @@ -142,6 +143,8 @@ func Parse(r io.Reader, ownEmails, goodLists []string) (*Email, error) { } cmd, cmdStr, cmdArgs = extractCommand(subject + "\n" + bodyStr) } + bugIDs = append(bugIDs, extractBodyBugIDs(bodyStr, ownAddrs, domains)...) 
+ link := "" if match := groupsLinkRe.FindStringSubmatchIndex(bodyStr); match != nil { link = bodyStr[match[2]:match[3]] @@ -162,10 +165,12 @@ func Parse(r io.Reader, ownEmails, goodLists []string) (*Email, error) { // In other cases, the mailing list would preserve From and just change Sender. mailingList = CanonicalEmail(sender) } - + date, _ := mail.ParseDate(msg.Header.Get("Date")) email := &Email{ - BugID: bugID, + BugIDs: dedupBugIDs(bugIDs), MessageID: msg.Header.Get("Message-ID"), + InReplyTo: msg.Header.Get("In-Reply-To"), + Date: date, Link: link, Author: author, MailingList: mailingList, @@ -412,6 +417,53 @@ func parseBody(r io.Reader, headers mail.Header) ([]byte, [][]byte, error) { } } +func extractBodyBugIDs(body string, ownEmailMap map[string]bool, domains []string) []string { + // Let's build a regular expression. + var rb strings.Builder + for email := range ownEmailMap { + escaped := regexp.QuoteMeta(email) + part := strings.ReplaceAll(escaped, `@`, `\+(\w+?)@`) + if rb.Len() > 0 { + rb.WriteString(`|`) + } + rb.WriteString(part) + } + for _, domain := range domains { + escaped := regexp.QuoteMeta(domain + "/bug?extid=") + if rb.Len() > 0 { + rb.WriteString(`|`) + } + rb.WriteString(escaped) + rb.WriteString(`([\w]+)`) + } + rg := regexp.MustCompile(rb.String()) + ids := []string{} + for _, match := range rg.FindAllStringSubmatch(body, -1) { + // Take all non-empty group matches. + for i := 1; i < len(match); i++ { + if match[i] == "" { + continue + } + ids = append(ids, match[i]) + } + } + return ids +} + +func dedupBugIDs(list []string) []string { + // We should preserve the original order of IDs. + var ret []string + dup := map[string]struct{}{} + for _, v := range list { + if _, ok := dup[v]; ok { + continue + } + dup[v] = struct{}{} + ret = append(ret, v) + } + return ret +} + // MergeEmailLists merges several email lists removing duplicates and invalid entries. 
func MergeEmailLists(lists ...[]string) []string { const ( diff --git a/pkg/email/parser_test.go b/pkg/email/parser_test.go index cde028a02..151d259b8 100644 --- a/pkg/email/parser_test.go +++ b/pkg/email/parser_test.go @@ -8,6 +8,7 @@ import ( "reflect" "strings" "testing" + "time" "github.com/google/go-cmp/cmp" ) @@ -121,7 +122,10 @@ func TestParse(t *testing.T) { for i, test := range parseTests { body := func(t *testing.T, test ParseTest) { email, err := Parse(strings.NewReader(test.email), - []string{"bot <foo@bar.com>"}, []string{"list@googlegroups.com"}) + []string{"bot <foo@bar.com>"}, + []string{"list@googlegroups.com"}, + []string{"bar.com"}, + ) if err != nil { t.Fatal(err) } @@ -342,6 +346,8 @@ type ParseTest struct { res Email } +var parseTestZone = time.FixedZone("", -7*60*60) + // nolint: lll var parseTests = []ParseTest{ {`Date: Sun, 7 May 2017 19:54:00 -0700 @@ -362,8 +368,9 @@ To post to this group, send email to syzkaller@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/syzkaller/abcdef@google.com. 
For more options, visit https://groups.google.com/d/optout.`, Email{ - BugID: "4564456", + BugIDs: []string{"4564456"}, MessageID: "<123>", + Date: time.Date(2017, time.May, 7, 19, 54, 0, 0, parseTestZone), Link: "https://groups.google.com/d/msgid/syzkaller/abcdef@google.com", Subject: "test subject", Author: "bob@example.com", @@ -394,8 +401,9 @@ Content-Type: text/plain; charset="UTF-8" text body last line`, Email{ - BugID: "4564456", + BugIDs: []string{"4564456"}, MessageID: "<123>", + Date: time.Date(2017, time.May, 7, 19, 54, 0, 0, parseTestZone), Subject: "test subject", Author: "foo@bar.com", Cc: []string{"bob@example.com"}, @@ -417,6 +425,7 @@ second line last line`, Email{ MessageID: "<123>", + Date: time.Date(2017, time.May, 7, 19, 54, 0, 0, parseTestZone), Subject: "test subject", Author: "bob@example.com", Cc: []string{"alice@example.com", "bob@example.com", "bot@example.com"}, @@ -443,6 +452,7 @@ last line #syz command`, Email{ MessageID: "<123>", + Date: time.Date(2017, time.May, 7, 19, 54, 0, 0, parseTestZone), Subject: "test subject", Author: "bob@example.com", Cc: []string{"alice@example.com", "bob@example.com", "bot@example.com"}, @@ -483,6 +493,7 @@ IHQpKSB7CiAJCXNwaW5fdW5sb2NrKCZrY292LT5sb2NrKTsKIAkJcmV0dXJuOwo= --001a114ce0b01684a6054f0d8b81--`, Email{ MessageID: "<123>", + Date: time.Date(2017, time.May, 7, 19, 54, 0, 0, parseTestZone), Subject: "test subject", Author: "bob@example.com", Cc: []string{"bob@example.com", "bot@example.com"}, @@ -571,6 +582,7 @@ or)</div></div></div> --f403043eee70018593054f0d9f1f--`, Email{ MessageID: "<123>", + Date: time.Date(2017, time.May, 7, 19, 54, 0, 0, parseTestZone), Subject: "test subject", Author: "bob@example.com", Cc: []string{"bob@example.com", "bot@example.com"}, @@ -648,6 +660,7 @@ On 2018/06/10 4:57, syzbot wrote: d `, Email{ MessageID: "<1250334f-7220-2bff-5d87-b87573758d81@bar.com>", + Date: time.Date(2018, time.June, 10, 10, 38, 20, 0, time.FixedZone("", 9*60*60)), Subject: "Re: BUG: unable to 
handle kernel NULL pointer dereference in sock_poll", Author: "bar@foo.com", Cc: []string{"bar@foo.com", "syzbot@syzkaller.appspotmail.com"}, @@ -709,8 +722,9 @@ To: syzbot <foo+4564456@bar.com> nothing to see here`, Email{ - BugID: "4564456", + BugIDs: []string{"4564456"}, MessageID: "<123>", + Date: time.Date(2017, time.May, 7, 19, 54, 0, 0, parseTestZone), Subject: "#syz test: git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git master", Author: "bob@example.com", Cc: []string{"bob@example.com"}, @@ -729,6 +743,7 @@ To: syzbot <list@googlegroups.com> nothing to see here`, Email{ MessageID: "<123>", + Date: time.Date(2017, time.May, 7, 19, 54, 0, 0, parseTestZone), Subject: "Subject", Author: "user@mail.com", MailingList: "list@googlegroups.com", @@ -746,6 +761,7 @@ To: <user2@mail.com> nothing to see here`, Email{ MessageID: "<123>", + Date: time.Date(2017, time.May, 7, 19, 54, 0, 0, parseTestZone), Subject: "Subject", Author: "user@mail.com", MailingList: "list@googlegroups.com", @@ -763,6 +779,7 @@ To: <user2@mail.com> nothing to see here`, Email{ MessageID: "<123>", + Date: time.Date(2017, time.May, 7, 19, 54, 0, 0, parseTestZone), Subject: "Subject", Author: "list@googlegroups.com", MailingList: "list@googlegroups.com", @@ -776,7 +793,7 @@ Subject: Re: BUG: unable to handle kernel NULL pointer dereference in To: syzbot <syzbot+344bb0f46d7719cd9483@syzkaller.appspotmail.com> From: bar <bar@foo.com> Message-ID: <1250334f-7220-2bff-5d87-b87573758d81@bar.com> -Date: Sun, 10 Jun 2018 10:38:20 +0900 +Date: Sun, 7 May 2017 19:54:00 -0700 MIME-Version: 1.0 Content-Type: text/plain; charset="UTF-8" Content-Language: en-US @@ -787,6 +804,7 @@ test: https://github.com/torvalds/linux.git 7b5bb460defa107dd2e82= f950fddb9ea6bdb5e39 `, Email{ MessageID: "<1250334f-7220-2bff-5d87-b87573758d81@bar.com>", + Date: time.Date(2017, time.May, 7, 19, 54, 0, 0, parseTestZone), Subject: "Re: BUG: unable to handle kernel NULL pointer dereference in sock_poll", Author: 
"bar@foo.com", Cc: []string{"bar@foo.com", "syzbot@syzkaller.appspotmail.com"}, @@ -797,4 +815,99 @@ test: https://github.com/torvalds/linux.git 7b5bb460defa107dd2e82f950fddb9ea6bdb CommandStr: "test:", CommandArgs: "https://github.com/torvalds/linux.git 7b5bb460defa107dd2e82f950fddb9ea6bdb5e39", }}, + {`Sender: syzkaller-bugs@googlegroups.com +Subject: [PATCH] Some patch +To: <someone@foo.com> +From: bar <bar@foo.com> +Message-ID: <1250334f-7220-2bff-5d87-b87573758d81@bar.com> +Date: Sun, 7 May 2017 19:54:00 -0700 +MIME-Version: 1.0 +Content-Type: text/plain; charset="UTF-8" +Content-Language: en-US +Content-Transfer-Encoding: quoted-printable + +Reported-by: syzbot <foo+223c7461c58c58a4cb10@bar.com> +`, Email{ + BugIDs: []string{"223c7461c58c58a4cb10"}, + MessageID: "<1250334f-7220-2bff-5d87-b87573758d81@bar.com>", + Date: time.Date(2017, time.May, 7, 19, 54, 0, 0, parseTestZone), + Subject: "[PATCH] Some patch", + Author: "bar@foo.com", + Cc: []string{"bar@foo.com", "someone@foo.com"}, + Body: `Reported-by: syzbot <foo+223c7461c58c58a4cb10@bar.com> +`, + Command: CmdNone, + }}, + {`Sender: syzkaller-bugs@googlegroups.com +Subject: [PATCH] Some patch +To: <someone@foo.com> +From: bar <bar@foo.com> +Message-ID: <1250334f-7220-2bff-5d87-b87573758d81@bar.com> +Date: Sun, 7 May 2017 19:54:00 -0700 +MIME-Version: 1.0 +Content-Type: text/plain; charset="UTF-8" +Content-Language: en-US + +Link: https://bar.com/bug?extid=223c7461c58c58a4cb10@bar.com +`, Email{ + BugIDs: []string{"223c7461c58c58a4cb10"}, + MessageID: "<1250334f-7220-2bff-5d87-b87573758d81@bar.com>", + Date: time.Date(2017, time.May, 7, 19, 54, 0, 0, parseTestZone), + Subject: "[PATCH] Some patch", + Author: "bar@foo.com", + Cc: []string{"bar@foo.com", "someone@foo.com"}, + Body: `Link: https://bar.com/bug?extid=223c7461c58c58a4cb10@bar.com +`, + Command: CmdNone, + }}, + + {`Sender: syzkaller-bugs@googlegroups.com +Subject: [PATCH] Some patch +To: <someone@foo.com> +From: bar <bar@foo.com> +Message-ID: 
<1250334f-7220-2bff-5d87-b87573758d81@bar.com> +Date: Sun, 7 May 2017 19:54:00 -0700 +MIME-Version: 1.0 +Content-Type: text/plain; charset="UTF-8" +Content-Language: en-US +Content-Transfer-Encoding: quoted-printable + +Reported-by: syzbot <foo+223c7461c58c58a4cb10@bar.com> +Reported-by: syzbot <foo+9909090909090909@bar.com> +`, Email{ + BugIDs: []string{"223c7461c58c58a4cb10", "9909090909090909"}, + MessageID: "<1250334f-7220-2bff-5d87-b87573758d81@bar.com>", + Date: time.Date(2017, time.May, 7, 19, 54, 0, 0, parseTestZone), + Subject: "[PATCH] Some patch", + Author: "bar@foo.com", + Cc: []string{"bar@foo.com", "someone@foo.com"}, + Body: `Reported-by: syzbot <foo+223c7461c58c58a4cb10@bar.com> +Reported-by: syzbot <foo+9909090909090909@bar.com> +`, + Command: CmdNone, + }}, + {`Sender: syzkaller-bugs@googlegroups.com +Subject: [PATCH] Some patch +To: <someone@foo.com>, <foo+9909090909090909@bar.com> +From: bar <bar@foo.com> +Message-ID: <1250334f-7220-2bff-5d87-b87573758d81@bar.com> +Date: Sun, 7 May 2017 19:54:00 -0700 +MIME-Version: 1.0 +Content-Type: text/plain; charset="UTF-8" +Content-Language: en-US +Content-Transfer-Encoding: quoted-printable + +Reported-by: syzbot <foo+223c7461c58c58a4cb10@bar.com> +`, Email{ + // First come BugIDs from header, then from the body. + BugIDs: []string{"9909090909090909", "223c7461c58c58a4cb10"}, + MessageID: "<1250334f-7220-2bff-5d87-b87573758d81@bar.com>", + Date: time.Date(2017, time.May, 7, 19, 54, 0, 0, parseTestZone), + Subject: "[PATCH] Some patch", + Author: "bar@foo.com", + Cc: []string{"bar@foo.com", "someone@foo.com"}, + Body: `Reported-by: syzbot <foo+223c7461c58c58a4cb10@bar.com> +`, + Command: CmdNone, + }}, } |
