package main

import (
	"bytes"
	"encoding/base64"
	"html"
	"io"
	"mime"
	"mime/quotedprintable"
	"strings"

	"github.com/emersion/go-message"
	_ "github.com/emersion/go-message/charset"
)

// decodeHTMLEntities converts HTML entities (such as "&amp;" or "&#39;") to
// their text equivalents.
func decodeHTMLEntities(s string) string {
	return html.UnescapeString(s)
}

// ParsedBody contains the text and HTML extracted from a MIME message.
type ParsedBody struct {
	Text string // first non-empty text/plain part, or text derived from HTML
	HTML string // first non-empty text/html part
}

// ParseMIMEBody extracts the text/plain and text/html parts from raw email
// bytes.
//
// If the message cannot be parsed at all, the raw input is returned as Text.
// If the message has an HTML part but no plain-text part, Text is derived from
// the HTML with stripHTML.
func ParseMIMEBody(raw []byte) ParsedBody {
	var result ParsedBody

	entity, err := message.Read(bytes.NewReader(raw))
	if entity == nil || (err != nil && !isDecodeWarning(err)) {
		// Fallback: return raw as text.
		result.Text = string(raw)
		return result
	}

	extractParts(entity, &result)

	// Derive the plain text only after every part has been visited, so that a
	// real text/plain part wins even when it follows the HTML part.
	if result.Text == "" && result.HTML != "" {
		result.Text = stripHTML(result.HTML)
	}
	return result
}

// isDecodeWarning reports whether err only says that go-message met an unknown
// charset or transfer encoding. In that case the entity returned next to the
// error is still readable, so the error does not have to abort parsing.
func isDecodeWarning(err error) bool {
	return message.IsUnknownCharset(err) || message.IsUnknownEncoding(err)
}

// extractParts walks entity recursively and stores the first non-empty
// text/plain body in result.Text and the first non-empty text/html body in
// result.HTML. Parts of any other type are skipped without being read.
func extractParts(entity *message.Entity, result *ParsedBody) {
	if entity == nil || result == nil {
		return
	}

	// The error is deliberately ignored: when only a parameter is malformed,
	// ParseMediaType still returns the media type; in every other failure it
	// returns "", which falls back to the text/plain default.
	mediaType, _, _ := mime.ParseMediaType(entity.Header.Get("Content-Type"))
	if mediaType == "" {
		mediaType = "text/plain"
	}

	if strings.HasPrefix(mediaType, "multipart/") {
		mr := entity.MultipartReader()
		if mr == nil {
			return
		}
		for {
			part, err := mr.NextPart()
			// Stop on io.EOF or a malformed part, but keep going when the
			// part merely uses an unknown charset or encoding.
			if part == nil || (err != nil && !isDecodeWarning(err)) {
				return
			}
			extractParts(part, result)
		}
	}

	// go-message has already undone the Content-Transfer-Encoding and
	// converted the charset of entity.Body, so the body must not be decoded a
	// second time here.
	switch mediaType {
	case "text/plain":
		if result.Text == "" {
			result.Text = readText(entity.Body)
		}
	case "text/html":
		if result.HTML == "" {
			result.HTML = readText(entity.Body)
		}
	}
}

// readText reads r to the end and returns its content as a string. A read
// error yields the empty string, which callers treat as "no content".
func readText(r io.Reader) string {
	body, err := io.ReadAll(r)
	if err != nil {
		return ""
	}
	return string(body)
}

// decodeBody decodes a raw body according to a Content-Transfer-Encoding
// value. It handles "quoted-printable" and "base64" (case-insensitively) and
// returns body unchanged for any other encoding or when decoding fails.
//
// It is meant for payloads that are still encoded; bodies obtained from a
// go-message Entity are already decoded and must not be passed through it.
func decodeBody(body []byte, encoding string) []byte {
	switch strings.ToLower(strings.TrimSpace(encoding)) {
	case "quoted-printable":
		decoded, err := io.ReadAll(quotedprintable.NewReader(bytes.NewReader(body)))
		if err != nil {
			return body
		}
		return decoded
	case "base64":
		// The decoder skips CR and LF on its own; spaces and tabs have to be
		// removed by hand or decoding fails.
		compact := bytes.Join(bytes.Fields(body), nil)
		decoded := make([]byte, base64.StdEncoding.DecodedLen(len(compact)))
		n, err := base64.StdEncoding.Decode(decoded, compact)
		if err != nil {
			return body
		}
		return decoded[:n]
	default:
		return body
	}
}

// stripHTML removes HTML markup and returns the remaining plain text.
//
// Tags, comments and declarations are dropped, the content of <style> and
// <script> elements is skipped, and <br>, </p> and </div> become line breaks.
// Entities are decoded after the markup is removed, so escaped text such as
// "&lt;b&gt;" survives as "<b>". Finally every line is trimmed and blank lines
// are dropped.
func stripHTML(html string) string {
	var out strings.Builder
	out.Grow(len(html))

	rest := html
	for len(rest) > 0 {
		lt := strings.IndexByte(rest, '<')
		if lt < 0 {
			out.WriteString(rest)
			break
		}
		out.WriteString(rest[:lt])
		rest = rest[lt:]

		if strings.HasPrefix(rest, "<!--") {
			end := strings.Index(rest[len("<!--"):], "-->")
			if end < 0 {
				break // unterminated comment: drop the remainder
			}
			rest = rest[len("<!--")+end+len("-->"):]
			continue
		}

		name, closing, ok := parseTagStart(rest)
		if !ok {
			// A '<' that does not start markup (as in "a < b") is text.
			out.WriteByte('<')
			rest = rest[1:]
			continue
		}

		end := tagEnd(rest)
		if end < 0 {
			break // unterminated tag: drop the remainder
		}
		rest = rest[end+1:]

		switch {
		case name == "br":
			out.WriteByte('\n')
		case closing:
			if name == "p" || name == "div" {
				out.WriteByte('\n')
			}
		case name == "style" || name == "script":
			rest = skipRawText(rest, name)
		}
	}

	// Clean up whitespace: trim each line and drop the empty ones.
	lines := strings.Split(decodeHTMLEntities(out.String()), "\n")
	cleaned := make([]string, 0, len(lines))
	for _, line := range lines {
		if line = strings.TrimSpace(line); line != "" {
			cleaned = append(cleaned, line)
		}
	}
	return strings.Join(cleaned, "\n")
}

// parseTagStart inspects s, which must begin with '<'. It returns the
// lowercased element name, whether the tag is a closing tag, and whether s
// starts markup at all. Declarations and processing instructions ("<!" and
// "<?") count as markup with an empty name.
func parseTagStart(s string) (name string, closing, ok bool) {
	if len(s) < 2 {
		return "", false, false
	}
	i := 1
	switch s[i] {
	case '!', '?':
		return "", false, true
	case '/':
		closing = true
		i++
	}
	start := i
	for i < len(s) && isTagNameByte(s[i], i == start) {
		i++
	}
	if i == start {
		return "", false, false
	}
	return strings.ToLower(s[start:i]), closing, true
}

// isTagNameByte reports whether c may appear in an element name; digits are
// not allowed in the first position.
func isTagNameByte(c byte, first bool) bool {
	switch {
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z':
		return true
	case '0' <= c && c <= '9':
		return !first
	}
	return false
}

// tagEnd returns the index of the '>' that closes the tag at the start of s,
// ignoring any '>' inside a quoted attribute value, or -1 if there is none.
func tagEnd(s string) int {
	var quote byte
	for i := 1; i < len(s); i++ {
		switch c := s[i]; {
		case quote != 0:
			if c == quote {
				quote = 0
			}
		case c == '"' || c == '\'':
			quote = c
		case c == '>':
			return i
		}
	}
	// An unbalanced quote must not swallow the rest of the document; fall
	// back to the first '>' if there is one.
	return strings.IndexByte(s, '>')
}

// skipRawText returns the part of s that follows the closing tag of the named
// element (matched case-insensitively), or "" if the element is never closed.
func skipRawText(s, name string) string {
	closeTag := "</" + name
	for i := 0; i < len(s); i++ {
		lt := strings.IndexByte(s[i:], '<')
		if lt < 0 {
			return ""
		}
		i += lt
		if len(s)-i >= len(closeTag) && strings.EqualFold(s[i:i+len(closeTag)], closeTag) {
			gt := strings.IndexByte(s[i:], '>')
			if gt < 0 {
				return ""
			}
			return s[i+gt+1:]
		}
	}
	return ""
}