aboutsummaryrefslogtreecommitdiffstats
path: root/pkg/email/parser.go
diff options
context:
space:
mode:
authorAleksandr Nogikh <nogikh@google.com>2023-03-30 19:36:54 +0200
committerAleksandr Nogikh <wp32pw@gmail.com>2023-04-06 13:59:25 +0200
commit7c8c334ec07d3333dacd150dc389ba3f3db649df (patch)
tree839d2d0e6a7b65d115bce0bfcb772c0293462b06 /pkg/email/parser.go
parent139c4ef69fd5289588228c700717631d8f1731d0 (diff)
pkg/email: extract more information
Extract Date and InReplyTo. Extract not just one, but multiple BugIDs from multiple sources: 1) Email addresses in From/To/Cc. 2) Email addresses in Body. 3) Dashboard links in Body.
Diffstat (limited to 'pkg/email/parser.go')
-rw-r--r--pkg/email/parser.go68
1 files changed, 60 insertions, 8 deletions
diff --git a/pkg/email/parser.go b/pkg/email/parser.go
index af044791d..795cc6fed 100644
--- a/pkg/email/parser.go
+++ b/pkg/email/parser.go
@@ -14,12 +14,15 @@ import (
"regexp"
"sort"
"strings"
+ "time"
"unicode"
)
type Email struct {
- BugID string
+ BugIDs []string
MessageID string
+ InReplyTo string
+ Date time.Time
Link string
Subject string
MailingList string
@@ -65,7 +68,7 @@ func prepareEmails(list []string) map[string]bool {
return ret
}
-func Parse(r io.Reader, ownEmails, goodLists []string) (*Email, error) {
+func Parse(r io.Reader, ownEmails, goodLists, domains []string) (*Email, error) {
msg, err := mail.ReadMessage(r)
if err != nil {
return nil, fmt.Errorf("failed to read email: %v", err)
@@ -81,7 +84,6 @@ func Parse(r io.Reader, ownEmails, goodLists []string) (*Email, error) {
to, _ := msg.Header.AddressList("To")
// AddressList fails if the header is not present.
cc, _ := msg.Header.AddressList("Cc")
- bugID := ""
var ccList []string
ownAddrs := prepareEmails(ownEmails)
fromMe := false
@@ -99,6 +101,7 @@ func Parse(r io.Reader, ownEmails, goodLists []string) (*Email, error) {
originalFrom = originalFroms[0].String()
}
+ bugIDs := []string{}
rawCcList := append(append(append(cc, to...), from...), originalFroms...)
for _, addr := range rawCcList {
cleaned, context, _ := RemoveAddrContext(addr.Address)
@@ -106,9 +109,7 @@ func Parse(r io.Reader, ownEmails, goodLists []string) (*Email, error) {
cleaned = addr.Address
}
if ownAddrs[cleaned] {
- if bugID == "" {
- bugID = context
- }
+ bugIDs = append(bugIDs, context)
} else {
ccList = append(ccList, CanonicalEmail(cleaned))
}
@@ -142,6 +143,8 @@ func Parse(r io.Reader, ownEmails, goodLists []string) (*Email, error) {
}
cmd, cmdStr, cmdArgs = extractCommand(subject + "\n" + bodyStr)
}
+ bugIDs = append(bugIDs, extractBodyBugIDs(bodyStr, ownAddrs, domains)...)
+
link := ""
if match := groupsLinkRe.FindStringSubmatchIndex(bodyStr); match != nil {
link = bodyStr[match[2]:match[3]]
@@ -162,10 +165,12 @@ func Parse(r io.Reader, ownEmails, goodLists []string) (*Email, error) {
// In other cases, the mailing list would preserve From and just change Sender.
mailingList = CanonicalEmail(sender)
}
-
+ date, _ := mail.ParseDate(msg.Header.Get("Date"))
email := &Email{
- BugID: bugID,
+ BugIDs: dedupBugIDs(bugIDs),
MessageID: msg.Header.Get("Message-ID"),
+ InReplyTo: msg.Header.Get("In-Reply-To"),
+ Date: date,
Link: link,
Author: author,
MailingList: mailingList,
@@ -412,6 +417,53 @@ func parseBody(r io.Reader, headers mail.Header) ([]byte, [][]byte, error) {
}
}
+func extractBodyBugIDs(body string, ownEmailMap map[string]bool, domains []string) []string {
+ // Let's build a regular expression.
+ var rb strings.Builder
+ for email := range ownEmailMap {
+ escaped := regexp.QuoteMeta(email)
+ part := strings.ReplaceAll(escaped, `@`, `\+(\w+?)@`)
+ if rb.Len() > 0 {
+ rb.WriteString(`|`)
+ }
+ rb.WriteString(part)
+ }
+ for _, domain := range domains {
+ escaped := regexp.QuoteMeta(domain + "/bug?extid=")
+ if rb.Len() > 0 {
+ rb.WriteString(`|`)
+ }
+ rb.WriteString(escaped)
+ rb.WriteString(`([\w]+)`)
+ }
+ rg := regexp.MustCompile(rb.String())
+ ids := []string{}
+ for _, match := range rg.FindAllStringSubmatch(body, -1) {
+ // Take all non-empty group matches.
+ for i := 1; i < len(match); i++ {
+ if match[i] == "" {
+ continue
+ }
+ ids = append(ids, match[i])
+ }
+ }
+ return ids
+}
+
+func dedupBugIDs(list []string) []string {
+ // We should preserve the original order of IDs.
+ var ret []string
+ dup := map[string]struct{}{}
+ for _, v := range list {
+ if _, ok := dup[v]; ok {
+ continue
+ }
+ dup[v] = struct{}{}
+ ret = append(ret, v)
+ }
+ return ret
+}
+
// MergeEmailLists merges several email lists removing duplicates and invalid entries.
func MergeEmailLists(lists ...[]string) []string {
const (