package main

import (
	"bytes"
	"encoding/base64"
	"html"
	"io"
	"mime"
	"mime/quotedprintable"
	"strings"

	"github.com/emersion/go-message"
	_ "github.com/emersion/go-message/charset" // registers additional charsets with go-message
)

// decodeHTMLEntities converts HTML entities to their text equivalents.
func decodeHTMLEntities(s string) string {
	return html.UnescapeString(s)
}

// ParsedBody contains the text and HTML extracted from a MIME message.
type ParsedBody struct {
	Text string
	HTML string
}

// ParseMIMEBody extracts the first text/plain and text/html parts from raw
// email bytes. If the message cannot be parsed at all, the raw input is
// returned unchanged in Text.
func ParseMIMEBody(raw []byte) ParsedBody {
	entity, err := message.Read(bytes.NewReader(raw))
	if !readable(entity, err) {
		// Fallback: return raw as text.
		return ParsedBody{Text: string(raw)}
	}

	var result ParsedBody
	extractParts(entity, &result)
	return result
}

// readable reports whether an entity returned together with err can still be
// processed. go-message reports unknown charsets and unknown transfer
// encodings as errors but still returns an entity whose body can be read
// (undecoded), so those two cases are treated as best-effort successes.
func readable(entity *message.Entity, err error) bool {
	if entity == nil {
		return false
	}
	return err == nil || message.IsUnknownCharset(err) || message.IsUnknownEncoding(err)
}

// nextPart returns the next readable part of mr, or nil once the parts are
// exhausted or the multipart stream is broken.
func nextPart(mr message.MultipartReader) *message.Entity {
	part, err := mr.NextPart()
	if !readable(part, err) {
		return nil
	}
	return part
}

// extractParts walks entity recursively and stores the first text/plain and
// first text/html leaf it finds in result. When only HTML is available, Text
// is derived from it with stripHTML. A missing or malformed Content-Type is
// treated as text/plain.
func extractParts(entity *message.Entity, result *ParsedBody) {
	mediaType, _, err := mime.ParseMediaType(entity.Header.Get("Content-Type"))
	if err != nil {
		mediaType = "text/plain"
	}

	if strings.HasPrefix(mediaType, "multipart/") {
		mr := entity.MultipartReader()
		if mr == nil {
			return
		}
		for part := nextPart(mr); part != nil; part = nextPart(mr) {
			extractParts(part, result)
		}
		return
	}

	// Only text bodies are of interest here; skip reading anything else.
	if mediaType != "text/plain" && mediaType != "text/html" {
		return
	}

	// entity.Body is already transfer-decoded and charset-converted by
	// go-message. Decoding it a second time would corrupt ordinary text such
	// as "id=AB12" (read as a quoted-printable escape).
	body, err := io.ReadAll(entity.Body)
	if err != nil {
		return
	}
	text := string(body)

	switch mediaType {
	case "text/plain":
		if result.Text == "" {
			result.Text = text
		}
	case "text/html":
		if result.HTML == "" {
			result.HTML = text
			// Also derive text from the HTML if there is no plain text part.
			if result.Text == "" {
				result.Text = stripHTML(text)
			}
		}
	}
}

// decodeBody decodes body according to a Content-Transfer-Encoding value
// ("quoted-printable" or "base64", matched case-insensitively). Any other
// encoding, and any input that fails to decode, is returned unchanged.
//
// Bodies obtained from a go-message Entity are already decoded and must not be
// passed through this function again.
func decodeBody(body []byte, encoding string) []byte {
	switch strings.ToLower(strings.TrimSpace(encoding)) {
	case "quoted-printable":
		decoded, err := io.ReadAll(quotedprintable.NewReader(bytes.NewReader(body)))
		if err != nil {
			return body
		}
		return decoded
	case "base64":
		// encoding/base64 ignores CR and LF but rejects spaces and tabs, which
		// do occur in folded bodies, so strip all ASCII whitespace up front.
		compact := make([]byte, 0, len(body))
		for _, c := range body {
			switch c {
			case ' ', '\t', '\r', '\n':
			default:
				compact = append(compact, c)
			}
		}
		decoded := make([]byte, base64.StdEncoding.DecodedLen(len(compact)))
		n, err := base64.StdEncoding.Decode(decoded, compact)
		if err != nil {
			return body
		}
		return decoded[:n]
	default:
		return body
	}
}

// Attachment represents an email attachment with content.
type Attachment struct {
	Filename    string `json:"filename"`
	ContentType string `json:"content_type"`
	Size        int    `json:"size"`
	Content     string `json:"content"` // base64 encoded
}

// ExtractAttachments extracts all named attachments from raw email bytes. The
// result is never nil, so it encodes to a JSON array even when empty.
func ExtractAttachments(raw []byte) []Attachment {
	attachments := make([]Attachment, 0) // empty slice, not nil, for JSON output

	entity, err := message.Read(bytes.NewReader(raw))
	if !readable(entity, err) {
		return attachments
	}

	extractAttachmentParts(entity, &attachments)
	return attachments
}

// extractAttachmentParts walks entity recursively and appends every leaf part
// that has a Content-Disposition of "attachment" or "inline" and a filename
// (taken from the disposition, falling back to the Content-Type "name"
// parameter). Parts without a parseable Content-Type or without a filename are
// skipped.
func extractAttachmentParts(entity *message.Entity, attachments *[]Attachment) {
	mediaType, params, err := mime.ParseMediaType(entity.Header.Get("Content-Type"))
	if err != nil {
		return
	}

	if strings.HasPrefix(mediaType, "multipart/") {
		mr := entity.MultipartReader()
		if mr == nil {
			return
		}
		for part := nextPart(mr); part != nil; part = nextPart(mr) {
			extractAttachmentParts(part, attachments)
		}
		return
	}

	disp := entity.Header.Get("Content-Disposition")
	if disp == "" {
		return
	}
	dispType, dispParams, err := mime.ParseMediaType(disp)
	if err != nil {
		return
	}
	if !strings.EqualFold(dispType, "attachment") && !strings.EqualFold(dispType, "inline") {
		return
	}

	filename := dispParams["filename"]
	if filename == "" {
		filename = params["name"]
	}
	if filename == "" {
		// Skip attachments without a filename.
		return
	}

	// entity.Body is already transfer-decoded by go-message; decoding it again
	// would mangle any attachment whose bytes happen to look like base64 or
	// quoted-printable.
	body, err := io.ReadAll(entity.Body)
	if err != nil {
		return
	}

	*attachments = append(*attachments, Attachment{
		Filename:    filename,
		ContentType: mediaType,
		Size:        len(body),
		Content:     base64.StdEncoding.EncodeToString(body), // base64 for JSON transport
	})
}

// stripHTML removes HTML markup and returns plain text.
//
// <style>, <script> and <head> elements are dropped together with their
// content, <br> and a few closing block-level tags become line breaks, all
// other tags are removed, and entities are decoded afterwards so that escaped
// text such as "&lt;b&gt;" is kept as text rather than treated as markup.
// Finally each line is trimmed and empty lines are dropped.
func stripHTML(htmlContent string) string {
	rest := htmlContent
	for _, name := range []string{"style", "script", "head"} {
		rest = removeElement(rest, name)
	}

	var b strings.Builder
	b.Grow(len(rest))
	for len(rest) > 0 {
		lt := strings.IndexByte(rest, '<')
		if lt == -1 {
			b.WriteString(rest)
			break
		}
		b.WriteString(rest[:lt])
		rest = rest[lt:]

		if !startsTag(rest) {
			// A '<' that does not open a tag (e.g. "a < b") is literal text.
			b.WriteByte('<')
			rest = rest[1:]
			continue
		}
		gt := strings.IndexByte(rest, '>')
		if gt == -1 {
			// Unterminated tag: discard the remainder.
			break
		}
		if breaksLine(rest[1:gt]) {
			b.WriteByte('\n')
		}
		rest = rest[gt+1:]
	}

	text := decodeHTMLEntities(b.String())

	// Clean up whitespace: trim every line and drop the empty ones.
	lines := strings.Split(text, "\n")
	kept := lines[:0]
	for _, line := range lines {
		if line = strings.TrimSpace(line); line != "" {
			kept = append(kept, line)
		}
	}
	return strings.Join(kept, "\n")
}

// removeElement deletes every <name ...>...</name> element from s, matching
// the tag name case-insensitively. An opening tag without a matching closing
// tag is left in place.
func removeElement(s, name string) string {
	// Lower-case ASCII only: unlike strings.ToLower this never changes the
	// byte length, so offsets found in lower are valid offsets into s.
	lower := lowerASCII(s)
	openTag, closeTag := "<"+name, "</"+name+">"

	var b strings.Builder
	pos := 0
	for {
		start := indexOpenTag(lower, openTag, pos)
		if start == -1 {
			break
		}
		end := strings.Index(lower[start:], closeTag)
		if end == -1 {
			break
		}
		b.WriteString(s[pos:start])
		pos = start + end + len(closeTag)
	}
	if pos == 0 {
		return s
	}
	b.WriteString(s[pos:])
	return b.String()
}

// indexOpenTag returns the index of the first occurrence of openTag in lower
// at or after from that is not merely a prefix of a longer tag name (so
// "<head" does not match "<header"), or -1 if there is none.
func indexOpenTag(lower, openTag string, from int) int {
	for from < len(lower) {
		i := strings.Index(lower[from:], openTag)
		if i == -1 {
			return -1
		}
		i += from
		end := i + len(openTag)
		if end == len(lower) || !isTagNameByte(lower[end]) {
			return i
		}
		from = end
	}
	return -1
}

// isTagNameByte reports whether c can continue a lower-cased tag name.
func isTagNameByte(c byte) bool {
	return ('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-'
}

// lowerASCII returns s with ASCII upper-case letters lower-cased and every
// other byte untouched, so the result has exactly the same length as s.
func lowerASCII(s string) string {
	b := []byte(s)
	for i, c := range b {
		if 'A' <= c && c <= 'Z' {
			b[i] = c + ('a' - 'A')
		}
	}
	return string(b)
}

// startsTag reports whether s, which begins with '<', opens a tag, closing
// tag, comment/doctype or processing instruction rather than being a literal
// less-than sign.
func startsTag(s string) bool {
	if len(s) < 2 {
		return false
	}
	c := s[1]
	return c == '/' || c == '!' || c == '?' ||
		('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
}

// breaksLine reports whether the tag whose content (the text between '<' and
// '>') is tag should be rendered as a line break: any form of <br>, or the end
// of a common block element.
//
// NOTE(review): the set of block elements is a judgment call; extend it if
// other elements should separate lines.
func breaksLine(tag string) bool {
	closing := strings.HasPrefix(tag, "/")
	name := strings.TrimPrefix(tag, "/")
	if i := strings.IndexAny(name, " \t\r\n/"); i != -1 {
		name = name[:i]
	}
	switch strings.ToLower(name) {
	case "br":
		return true
	case "p", "div", "tr", "li":
		return closing
	}
	return false
}