267 lines
5.9 KiB
Go
267 lines
5.9 KiB
Go
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"html"
|
|
"io"
|
|
"mime"
|
|
"mime/quotedprintable"
|
|
"encoding/base64"
|
|
"strings"
|
|
|
|
"github.com/emersion/go-message"
|
|
_ "github.com/emersion/go-message/charset"
|
|
)
|
|
|
|
// decodeHTMLEntities returns s with HTML character references (for example
// "&amp;", "&lt;" or "&#39;") replaced by the characters they stand for.
// It is a thin wrapper around html.UnescapeString.
func decodeHTMLEntities(s string) string {
	unescaped := html.UnescapeString(s)
	return unescaped
}
|
|
|
|
// ParsedBody holds the human-readable bodies pulled out of a MIME message.
type ParsedBody struct {
	// Text is the plain-text body.
	Text string
	// HTML is the HTML body.
	HTML string
}
|
|
|
|
// ParseMIMEBody extracts text/plain and text/html parts from raw email bytes
|
|
func ParseMIMEBody(raw []byte) ParsedBody {
|
|
result := ParsedBody{}
|
|
|
|
entity, err := message.Read(bytes.NewReader(raw))
|
|
if err != nil {
|
|
// Fallback: return raw as text
|
|
result.Text = string(raw)
|
|
return result
|
|
}
|
|
|
|
extractParts(entity, &result)
|
|
return result
|
|
}
|
|
|
|
func extractParts(entity *message.Entity, result *ParsedBody) {
|
|
mediaType, params, err := mime.ParseMediaType(entity.Header.Get("Content-Type"))
|
|
if err != nil {
|
|
mediaType = "text/plain"
|
|
}
|
|
|
|
if strings.HasPrefix(mediaType, "multipart/") {
|
|
// Multipart message - recurse into parts
|
|
mr := entity.MultipartReader()
|
|
if mr == nil {
|
|
return
|
|
}
|
|
|
|
for {
|
|
part, err := mr.NextPart()
|
|
if err != nil {
|
|
break
|
|
}
|
|
extractParts(part, result)
|
|
}
|
|
} else {
|
|
// Single part - extract content
|
|
body, err := io.ReadAll(entity.Body)
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
// Decode transfer encoding
|
|
encoding := strings.ToLower(entity.Header.Get("Content-Transfer-Encoding"))
|
|
decoded := decodeBody(body, encoding)
|
|
|
|
// Decode charset if needed
|
|
charset := params["charset"]
|
|
if charset != "" {
|
|
// go-message/charset handles this automatically via import
|
|
}
|
|
|
|
text := string(decoded)
|
|
|
|
switch mediaType {
|
|
case "text/plain":
|
|
if result.Text == "" {
|
|
result.Text = text
|
|
}
|
|
case "text/html":
|
|
if result.HTML == "" {
|
|
result.HTML = text
|
|
// Also extract text from HTML if we don't have plain text
|
|
if result.Text == "" {
|
|
result.Text = stripHTML(text)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// decodeBody reverses a MIME Content-Transfer-Encoding.
//
// encoding is matched case-insensitively after trimming surrounding
// whitespace. "quoted-printable" and "base64" are decoded; any other value
// (7bit, 8bit, binary, empty, unknown) returns body unchanged. If decoding
// fails, body is returned unchanged as a best-effort fallback.
//
// Base64 input may contain arbitrary ASCII whitespace (spaces, tabs, CR, LF)
// and may be missing its trailing "=" padding.
func decodeBody(body []byte, encoding string) []byte {
	switch strings.ToLower(strings.TrimSpace(encoding)) {
	case "quoted-printable":
		decoded, err := io.ReadAll(quotedprintable.NewReader(bytes.NewReader(body)))
		if err != nil {
			return body
		}
		return decoded

	case "base64":
		// The base64 decoder skips CR and LF on its own but rejects spaces
		// and tabs, so strip all ASCII whitespace up front.
		compact := make([]byte, 0, len(body))
		for _, c := range body {
			switch c {
			case ' ', '\t', '\r', '\n':
				// Whitespace carries no data in base64.
			default:
				compact = append(compact, c)
			}
		}

		// Drop trailing padding and decode with the unpadded alphabet so
		// both padded and unpadded input are accepted.
		compact = bytes.TrimRight(compact, "=")
		decoded := make([]byte, base64.RawStdEncoding.DecodedLen(len(compact)))
		n, err := base64.RawStdEncoding.Decode(decoded, compact)
		if err != nil {
			return body
		}
		return decoded[:n]

	default:
		return body
	}
}
|
|
|
|
// Attachment describes one file carried by an email, ready to be serialized
// as JSON.
type Attachment struct {
	// Filename is the name declared for the attachment.
	Filename string `json:"filename"`
	// ContentType is the media type of the attachment.
	ContentType string `json:"content_type"`
	// Size is the attachment length in bytes.
	Size int `json:"size"`
	// Content is the attachment payload, base64 encoded.
	Content string `json:"content"`
}
|
|
|
|
// ExtractAttachments extracts all attachments from raw email bytes
|
|
func ExtractAttachments(raw []byte) []Attachment {
|
|
attachments := make([]Attachment, 0) // Initialize to empty slice (not nil)
|
|
|
|
entity, err := message.Read(bytes.NewReader(raw))
|
|
if err != nil {
|
|
return attachments
|
|
}
|
|
|
|
extractAttachmentParts(entity, &attachments)
|
|
return attachments
|
|
}
|
|
|
|
func extractAttachmentParts(entity *message.Entity, attachments *[]Attachment) {
|
|
mediaType, params, err := mime.ParseMediaType(entity.Header.Get("Content-Type"))
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
if strings.HasPrefix(mediaType, "multipart/") {
|
|
// Multipart message - recurse into parts
|
|
mr := entity.MultipartReader()
|
|
if mr == nil {
|
|
return
|
|
}
|
|
|
|
for {
|
|
part, err := mr.NextPart()
|
|
if err != nil {
|
|
break
|
|
}
|
|
extractAttachmentParts(part, attachments)
|
|
}
|
|
} else {
|
|
// Check if this is an attachment
|
|
disp := entity.Header.Get("Content-Disposition")
|
|
if disp == "" {
|
|
return
|
|
}
|
|
|
|
dispType, dispParams, err := mime.ParseMediaType(disp)
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
if !strings.EqualFold(dispType, "attachment") && !strings.EqualFold(dispType, "inline") {
|
|
return
|
|
}
|
|
|
|
// Get filename
|
|
filename := dispParams["filename"]
|
|
if filename == "" {
|
|
filename = params["name"]
|
|
}
|
|
if filename == "" {
|
|
// Skip attachments without filename
|
|
return
|
|
}
|
|
|
|
// Read body
|
|
body, err := io.ReadAll(entity.Body)
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
// Decode transfer encoding
|
|
encoding := strings.ToLower(entity.Header.Get("Content-Transfer-Encoding"))
|
|
decoded := decodeBody(body, encoding)
|
|
|
|
// Encode as base64 for JSON transport
|
|
content := base64.StdEncoding.EncodeToString(decoded)
|
|
|
|
*attachments = append(*attachments, Attachment{
|
|
Filename: filename,
|
|
ContentType: mediaType,
|
|
Size: len(decoded),
|
|
Content: content,
|
|
})
|
|
}
|
|
}
|
|
|
|
// stripHTML converts an HTML fragment to plain text.
//
// It works in three steps:
//   - Tags and "<!-- ... -->" comments are removed. The contents of <style>
//     and <script> elements are dropped entirely. <br> in any form, </p> and
//     </div> become line breaks. Tag names are matched case-insensitively.
//   - HTML entities are decoded after tag removal, so escaped text such as
//     "&lt;b&gt;" is kept as the literal "<b>" instead of being stripped.
//   - Every line is trimmed and empty lines are removed.
//
// A '<' that does not start a tag (for example in "a < b") is kept as text.
// An unterminated tag, comment, style or script block discards the rest of
// the input. This is a lightweight scanner, not a full HTML parser: a '>'
// inside an attribute value ends the tag early.
func stripHTML(doc string) string {
	// ASCII-only lowering keeps byte offsets identical to doc, so indexes
	// found in lower can be used on doc directly.
	lower := lowerASCII(doc)

	var text strings.Builder
	text.Grow(len(doc))

	for i := 0; i < len(doc); {
		if doc[i] != '<' {
			// Copy the run of text up to the next '<' in one go.
			next := strings.IndexByte(doc[i:], '<')
			if next < 0 {
				text.WriteString(doc[i:])
				break
			}
			text.WriteString(doc[i : i+next])
			i += next
			continue
		}

		if strings.HasPrefix(doc[i:], "<!--") {
			end := strings.Index(doc[i+4:], "-->")
			if end < 0 {
				break // unterminated comment: nothing after it is text
			}
			i += 4 + end + len("-->")
			continue
		}

		if i+1 >= len(doc) || !isTagStartByte(doc[i+1]) {
			// A bare '<' is ordinary text.
			text.WriteByte('<')
			i++
			continue
		}

		end := strings.IndexByte(doc[i:], '>')
		if end < 0 {
			break // unterminated tag: drop the remainder
		}
		name, closing := parseTagName(lower[i+1 : i+end])
		i += end + 1

		switch {
		case !closing && (name == "style" || name == "script"):
			// Skip the raw CSS/JS up to the matching closing tag, which is
			// then consumed as a normal tag on the next iteration.
			closeAt := strings.Index(lower[i:], "</"+name)
			if closeAt < 0 {
				i = len(doc)
			} else {
				i += closeAt
			}
		case name == "br", closing && (name == "p" || name == "div"):
			text.WriteByte('\n')
		}
	}

	// Decode entities only now, so decoded '<' and '>' are never mistaken
	// for markup.
	plain := html.UnescapeString(text.String())

	// Clean up whitespace: trim each line and drop the empty ones.
	var cleaned []string
	for _, line := range strings.Split(plain, "\n") {
		if line = strings.TrimSpace(line); line != "" {
			cleaned = append(cleaned, line)
		}
	}
	return strings.Join(cleaned, "\n")
}

// lowerASCII lowercases only the ASCII letters of s, leaving every other
// byte untouched so the result has the same length and offsets as s.
func lowerASCII(s string) string {
	b := []byte(s)
	for i, c := range b {
		if 'A' <= c && c <= 'Z' {
			b[i] = c + ('a' - 'A')
		}
	}
	return string(b)
}

// isTagStartByte reports whether c may directly follow '<' at the start of a
// tag, closing tag, declaration or processing instruction.
func isTagStartByte(c byte) bool {
	return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '/' || c == '!' || c == '?'
}

// parseTagName extracts the element name from the already-lowercased text
// between '<' and '>', and reports whether it is a closing tag.
func parseTagName(tag string) (name string, closing bool) {
	if strings.HasPrefix(tag, "/") {
		closing = true
		tag = tag[1:]
	}
	end := 0
	for end < len(tag) && (('a' <= tag[end] && tag[end] <= 'z') || ('0' <= tag[end] && tag[end] <= '9')) {
		end++
	}
	return tag[:end], closing
}
|