diff options
| author | Aleksandr Nogikh <nogikh@google.com> | 2023-03-30 19:36:54 +0200 |
|---|---|---|
| committer | Aleksandr Nogikh <wp32pw@gmail.com> | 2023-04-06 13:59:25 +0200 |
| commit | 7c8c334ec07d3333dacd150dc389ba3f3db649df (patch) | |
| tree | 839d2d0e6a7b65d115bce0bfcb772c0293462b06 /pkg/email/parser.go | |
| parent | 139c4ef69fd5289588228c700717631d8f1731d0 (diff) | |
pkg/email: extract more information
Extract Date and InReplyTo.
Extract not just one, but multiple BugIDs from multiple sources:
1) Email addresses in From/To/Cc.
2) Email addresses in Body.
3) Dashboard links in Body.
Diffstat (limited to 'pkg/email/parser.go')
| -rw-r--r-- | pkg/email/parser.go | 68 |
1 file changed, 60 insertions, 8 deletions
diff --git a/pkg/email/parser.go b/pkg/email/parser.go index af044791d..795cc6fed 100644 --- a/pkg/email/parser.go +++ b/pkg/email/parser.go @@ -14,12 +14,15 @@ import ( "regexp" "sort" "strings" + "time" "unicode" ) type Email struct { - BugID string + BugIDs []string MessageID string + InReplyTo string + Date time.Time Link string Subject string MailingList string @@ -65,7 +68,7 @@ func prepareEmails(list []string) map[string]bool { return ret } -func Parse(r io.Reader, ownEmails, goodLists []string) (*Email, error) { +func Parse(r io.Reader, ownEmails, goodLists, domains []string) (*Email, error) { msg, err := mail.ReadMessage(r) if err != nil { return nil, fmt.Errorf("failed to read email: %v", err) @@ -81,7 +84,6 @@ func Parse(r io.Reader, ownEmails, goodLists []string) (*Email, error) { to, _ := msg.Header.AddressList("To") // AddressList fails if the header is not present. cc, _ := msg.Header.AddressList("Cc") - bugID := "" var ccList []string ownAddrs := prepareEmails(ownEmails) fromMe := false @@ -99,6 +101,7 @@ func Parse(r io.Reader, ownEmails, goodLists []string) (*Email, error) { originalFrom = originalFroms[0].String() } + bugIDs := []string{} rawCcList := append(append(append(cc, to...), from...), originalFroms...) for _, addr := range rawCcList { cleaned, context, _ := RemoveAddrContext(addr.Address) @@ -106,9 +109,7 @@ func Parse(r io.Reader, ownEmails, goodLists []string) (*Email, error) { cleaned = addr.Address } if ownAddrs[cleaned] { - if bugID == "" { - bugID = context - } + bugIDs = append(bugIDs, context) } else { ccList = append(ccList, CanonicalEmail(cleaned)) } @@ -142,6 +143,8 @@ func Parse(r io.Reader, ownEmails, goodLists []string) (*Email, error) { } cmd, cmdStr, cmdArgs = extractCommand(subject + "\n" + bodyStr) } + bugIDs = append(bugIDs, extractBodyBugIDs(bodyStr, ownAddrs, domains)...) 
+ link := "" if match := groupsLinkRe.FindStringSubmatchIndex(bodyStr); match != nil { link = bodyStr[match[2]:match[3]] @@ -162,10 +165,12 @@ func Parse(r io.Reader, ownEmails, goodLists []string) (*Email, error) { // In other cases, the mailing list would preserve From and just change Sender. mailingList = CanonicalEmail(sender) } - + date, _ := mail.ParseDate(msg.Header.Get("Date")) email := &Email{ - BugID: bugID, + BugIDs: dedupBugIDs(bugIDs), MessageID: msg.Header.Get("Message-ID"), + InReplyTo: msg.Header.Get("In-Reply-To"), + Date: date, Link: link, Author: author, MailingList: mailingList, @@ -412,6 +417,53 @@ func parseBody(r io.Reader, headers mail.Header) ([]byte, [][]byte, error) { } } +func extractBodyBugIDs(body string, ownEmailMap map[string]bool, domains []string) []string { + // Let's build a regular expression. + var rb strings.Builder + for email := range ownEmailMap { + escaped := regexp.QuoteMeta(email) + part := strings.ReplaceAll(escaped, `@`, `\+(\w+?)@`) + if rb.Len() > 0 { + rb.WriteString(`|`) + } + rb.WriteString(part) + } + for _, domain := range domains { + escaped := regexp.QuoteMeta(domain + "/bug?extid=") + if rb.Len() > 0 { + rb.WriteString(`|`) + } + rb.WriteString(escaped) + rb.WriteString(`([\w]+)`) + } + rg := regexp.MustCompile(rb.String()) + ids := []string{} + for _, match := range rg.FindAllStringSubmatch(body, -1) { + // Take all non-empty group matches. + for i := 1; i < len(match); i++ { + if match[i] == "" { + continue + } + ids = append(ids, match[i]) + } + } + return ids +} + +func dedupBugIDs(list []string) []string { + // We should preserve the original order of IDs. + var ret []string + dup := map[string]struct{}{} + for _, v := range list { + if _, ok := dup[v]; ok { + continue + } + dup[v] = struct{}{} + ret = append(ret, v) + } + return ret +} + // MergeEmailLists merges several email lists removing duplicates and invalid entries. func MergeEmailLists(lists ...[]string) []string { const ( |
