message-center/mime.go

267 lines
5.9 KiB
Go

package main
import (
"bytes"
"html"
"io"
"mime"
"mime/quotedprintable"
"encoding/base64"
"strings"
"github.com/emersion/go-message"
_ "github.com/emersion/go-message/charset"
)
// decodeHTMLEntities returns s with HTML character references (for example
// "&amp;" or "&#39;") replaced by the characters they stand for.
func decodeHTMLEntities(s string) string {
	unescaped := html.UnescapeString(s)
	return unescaped
}
// ParsedBody holds the textual bodies pulled out of a MIME message.
// The zero value means that nothing was extracted.
type ParsedBody struct {
	// Text is the plain-text body.
	Text string
	// HTML is the HTML body.
	HTML string
}
// ParseMIMEBody extracts the text/plain and text/html bodies from the raw
// bytes of an email message.
//
// If the message cannot be parsed at all, the input is returned verbatim in
// Text and HTML is left empty. A message that only declares a charset or
// transfer encoding unknown to go-message is still processed: message.Read
// returns a readable entity together with that error, and using it keeps the
// header block out of Text.
func ParseMIMEBody(raw []byte) ParsedBody {
	var result ParsedBody

	entity, err := message.Read(bytes.NewReader(raw))
	if entity == nil || (err != nil && !message.IsUnknownCharset(err) && !message.IsUnknownEncoding(err)) {
		// Fallback: hand the input back as plain text.
		result.Text = string(raw)
		return result
	}

	extractParts(entity, &result)
	return result
}
// extractParts walks entity depth-first and records the first text/plain and
// the first text/html leaf body it meets in result. When an HTML part is
// stored and no plain-text body has been seen yet, Text is filled with a
// tag-stripped rendering of that HTML.
//
// go-message already undoes the Content-Transfer-Encoding (and converts the
// charset of text parts) while Entity.Body is read, so the bytes are used as
// they come. Decoding them a second time would corrupt bodies that contain
// sequences such as "=AB" or that happen to look like base64.
func extractParts(entity *message.Entity, result *ParsedBody) {
	mediaType, _, err := mime.ParseMediaType(entity.Header.Get("Content-Type"))
	if err != nil {
		// Missing or unparsable Content-Type: treat the part as plain text.
		mediaType = "text/plain"
	}

	if strings.HasPrefix(mediaType, "multipart/") {
		mr := entity.MultipartReader()
		if mr == nil {
			return
		}
		for {
			part, err := mr.NextPart()
			// An unknown charset or transfer encoding still yields a readable
			// part; only a missing part or any other error (including io.EOF)
			// ends the walk. This keeps one odd part from hiding its siblings.
			if part == nil || (err != nil && !message.IsUnknownCharset(err) && !message.IsUnknownEncoding(err)) {
				break
			}
			extractParts(part, result)
		}
		return
	}

	// Leaf part: read the (already decoded) body.
	body, err := io.ReadAll(entity.Body)
	if err != nil {
		return
	}
	text := string(body)

	switch mediaType {
	case "text/plain":
		if result.Text == "" {
			result.Text = text
		}
	case "text/html":
		if result.HTML == "" {
			result.HTML = text
			// No plain-text alternative so far: derive one from the HTML.
			if result.Text == "" {
				result.Text = stripHTML(text)
			}
		}
	}
}
// decodeBody reverses the MIME Content-Transfer-Encoding named by encoding
// on body and returns the decoded bytes.
//
// encoding is matched case-insensitively, ignoring surrounding whitespace.
// "quoted-printable" and "base64" are decoded; every other value (including
// "", "7bit", "8bit" and "binary") returns body unchanged. Decoding is
// best-effort: when the payload cannot be decoded, body is returned as is.
//
// The input must still be in its encoded form; passing data that has already
// been transfer-decoded can corrupt it.
func decodeBody(body []byte, encoding string) []byte {
	switch strings.ToLower(strings.TrimSpace(encoding)) {
	case "quoted-printable":
		decoded, err := io.ReadAll(quotedprintable.NewReader(bytes.NewReader(body)))
		if err != nil {
			return body
		}
		return decoded
	case "base64":
		// The standard decoder already skips \r and \n.
		if decoded, err := base64.StdEncoding.DecodeString(string(body)); err == nil {
			return decoded
		}
		// Lenient retry for sloppy producers: drop every kind of whitespace
		// (spaces and tabs are what the strict pass trips over) and accept
		// missing or surplus '=' padding.
		compact := strings.Join(strings.Fields(string(body)), "")
		compact = strings.TrimRight(compact, "=")
		decoded, err := base64.RawStdEncoding.DecodeString(compact)
		if err != nil {
			return body
		}
		return decoded
	default:
		return body
	}
}
// Attachment describes one file carried by an email, in a shape that can be
// serialised to JSON.
type Attachment struct {
	// Filename is the name declared for the part.
	Filename string `json:"filename"`
	// ContentType is the media type of the part, without parameters.
	ContentType string `json:"content_type"`
	// Size is the length in bytes of the payload before it is base64-encoded
	// into Content.
	Size int `json:"size"`
	// Content is the payload in standard base64.
	Content string `json:"content"` // base64 encoded
}
// ExtractAttachments returns every attachment found in the raw bytes of an
// email message.
//
// The result is never nil: a message without attachments, or one that cannot
// be parsed, yields an empty slice (presumably so that JSON encoding emits []
// rather than null). A message that only declares a charset or transfer
// encoding unknown to go-message is still scanned, because message.Read
// returns a readable entity together with that error.
func ExtractAttachments(raw []byte) []Attachment {
	attachments := make([]Attachment, 0) // deliberately empty, not nil

	entity, err := message.Read(bytes.NewReader(raw))
	if entity == nil || (err != nil && !message.IsUnknownCharset(err) && !message.IsUnknownEncoding(err)) {
		return attachments
	}

	extractAttachmentParts(entity, &attachments)
	return attachments
}
// extractAttachmentParts walks entity depth-first and appends one Attachment
// to *attachments for every leaf part that
//   - has a parsable Content-Type,
//   - carries a Content-Disposition of "attachment" or "inline", and
//   - declares a file name (Content-Disposition "filename", falling back to
//     the Content-Type "name" parameter).
//
// Parts that fail any of these checks, or whose body cannot be read, are
// skipped silently.
//
// go-message already undoes the Content-Transfer-Encoding while Entity.Body
// is read, so the bytes are stored as they come. Decoding them a second time
// would corrupt payloads that happen to look like base64 or quoted-printable.
func extractAttachmentParts(entity *message.Entity, attachments *[]Attachment) {
	mediaType, params, err := mime.ParseMediaType(entity.Header.Get("Content-Type"))
	if err != nil {
		return
	}

	if strings.HasPrefix(mediaType, "multipart/") {
		mr := entity.MultipartReader()
		if mr == nil {
			return
		}
		for {
			part, err := mr.NextPart()
			// An unknown charset or transfer encoding still yields a readable
			// part; only a missing part or any other error (including io.EOF)
			// ends the walk. This keeps one odd part from hiding its siblings.
			if part == nil || (err != nil && !message.IsUnknownCharset(err) && !message.IsUnknownEncoding(err)) {
				break
			}
			extractAttachmentParts(part, attachments)
		}
		return
	}

	// Leaf part: only parts with an explicit disposition count as attachments.
	disp := entity.Header.Get("Content-Disposition")
	if disp == "" {
		return
	}
	dispType, dispParams, err := mime.ParseMediaType(disp)
	if err != nil {
		return
	}
	if !strings.EqualFold(dispType, "attachment") && !strings.EqualFold(dispType, "inline") {
		return
	}

	filename := dispParams["filename"]
	if filename == "" {
		filename = params["name"]
	}
	if filename == "" {
		// Skip attachments without a file name.
		return
	}
	// Many mailers put RFC 2047 encoded-words ("=?UTF-8?B?...?=") into the
	// file name parameter. Decode them when possible and keep the raw value
	// when the word uses a charset the standard decoder does not know.
	if readable, err := new(mime.WordDecoder).DecodeHeader(filename); err == nil && readable != "" {
		filename = readable
	}

	body, err := io.ReadAll(entity.Body)
	if err != nil {
		return
	}

	*attachments = append(*attachments, Attachment{
		Filename:    filename,
		ContentType: mediaType,
		Size:        len(body),
		// Base64 keeps arbitrary bytes safe inside a JSON string.
		Content: base64.StdEncoding.EncodeToString(body),
	})
}
// stripHTML converts an HTML fragment to plain text.
//
// Tags and comments are removed, the contents of <style> and <script>
// elements are dropped, and <br> (in any spelling), </p> and </div> become
// line breaks. Character references are decoded only after the tags are
// gone, so escaped markup such as "&lt;b&gt;" survives as the literal text
// "<b>" instead of being deleted as a tag. A '<' that is not followed by a
// letter, '/', '!' or '?' is kept as ordinary text.
//
// Finally every line is trimmed and blank lines are removed. Tag and element
// names are matched case-insensitively. An unterminated tag or comment
// discards the rest of the input.
func stripHTML(markup string) string {
	var visible strings.Builder
	visible.Grow(len(markup))

	rest := markup
	for rest != "" {
		lt := strings.IndexByte(rest, '<')
		if lt < 0 {
			visible.WriteString(rest)
			break
		}
		visible.WriteString(rest[:lt])
		rest = rest[lt+1:]

		if !startsHTMLTag(rest) {
			// A stray '<', e.g. "1 < 2": keep it as text.
			visible.WriteByte('<')
			continue
		}

		if strings.HasPrefix(rest, "!--") {
			// Comments may contain '>', so look for the real terminator.
			end := strings.Index(rest[3:], "-->")
			if end < 0 {
				break
			}
			rest = rest[3+end+3:]
			continue
		}

		gt := strings.IndexByte(rest, '>')
		if gt < 0 {
			break
		}
		name, closing := htmlTagName(rest[:gt])
		rest = rest[gt+1:]

		switch {
		case name == "br", closing && (name == "p" || name == "div"):
			visible.WriteByte('\n')
		case !closing && (name == "style" || name == "script"):
			rest = skipPastClosingTag(rest, name)
		}
	}

	// Decode entities last so that escaped angle brackets are never parsed
	// as markup.
	text := html.UnescapeString(visible.String())

	// Trim every line and drop the empty ones.
	lines := strings.Split(text, "\n")
	kept := lines[:0]
	for _, line := range lines {
		if line = strings.TrimSpace(line); line != "" {
			kept = append(kept, line)
		}
	}
	return strings.Join(kept, "\n")
}

// startsHTMLTag reports whether s, the text right after a '<', can begin a
// tag, closing tag, comment/doctype or processing instruction.
func startsHTMLTag(s string) bool {
	if s == "" {
		return false
	}
	c := s[0]
	return c == '/' || c == '!' || c == '?' || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
}

// htmlTagName returns the lower-cased element name of tag (the text between
// '<' and '>') and whether it is a closing tag.
func htmlTagName(tag string) (name string, closing bool) {
	tag = strings.TrimSpace(tag)
	if strings.HasPrefix(tag, "/") {
		closing = true
		tag = strings.TrimSpace(tag[1:])
	}
	if end := strings.IndexAny(tag, " \t\r\n\f/"); end >= 0 {
		tag = tag[:end]
	}
	return strings.ToLower(tag), closing
}

// skipPastClosingTag returns what follows the first closing tag "</name...>"
// in s, matched case-insensitively, or "" when s has no such tag.
func skipPastClosingTag(s, name string) string {
	closer := "</" + name
	for {
		lt := strings.IndexByte(s, '<')
		if lt < 0 {
			return ""
		}
		s = s[lt:]
		if len(s) >= len(closer) && strings.EqualFold(s[:len(closer)], closer) {
			gt := strings.IndexByte(s, '>')
			if gt < 0 {
				return ""
			}
			return s[gt+1:]
		}
		s = s[1:]
	}
}