Improve HTML body parsing - strip style/script/head blocks

This commit is contained in:
Johan Jongsma 2026-02-02 22:21:23 +00:00
parent b7a83f2ab5
commit 30c4d15b87
1 changed file with 61 additions and 33 deletions

94
mime.go
View File

@ -210,49 +210,77 @@ func extractAttachmentParts(entity *message.Entity, attachments *[]Attachment) {
}
// stripHTML removes HTML tags and returns plain text
func stripHTML(html string) string {
var result strings.Builder
func stripHTML(htmlContent string) string {
// Remove style and script blocks first
result := htmlContent
// Remove <style>...</style> blocks
for {
start := strings.Index(strings.ToLower(result), "<style")
if start == -1 {
break
}
end := strings.Index(strings.ToLower(result[start:]), "</style>")
if end == -1 {
break
}
result = result[:start] + result[start+end+8:]
}
// Remove <script>...</script> blocks
for {
start := strings.Index(strings.ToLower(result), "<script")
if start == -1 {
break
}
end := strings.Index(strings.ToLower(result[start:]), "</script>")
if end == -1 {
break
}
result = result[:start] + result[start+end+9:]
}
// Remove <head>...</head> blocks
for {
start := strings.Index(strings.ToLower(result), "<head")
if start == -1 {
break
}
end := strings.Index(strings.ToLower(result[start:]), "</head>")
if end == -1 {
break
}
result = result[:start] + result[start+end+7:]
}
// Replace common block elements with newlines
result = strings.ReplaceAll(result, "<br>", "\n")
result = strings.ReplaceAll(result, "<br/>", "\n")
result = strings.ReplaceAll(result, "<br />", "\n")
result = strings.ReplaceAll(result, "</p>", "\n")
result = strings.ReplaceAll(result, "</div>", "\n")
result = strings.ReplaceAll(result, "</tr>", "\n")
result = strings.ReplaceAll(result, "</li>", "\n")
// Decode HTML entities
result = decodeHTMLEntities(result)
// Remove all remaining tags
var builder strings.Builder
inTag := false
inStyle := false
inScript := false
html = strings.ReplaceAll(html, "<br>", "\n")
html = strings.ReplaceAll(html, "<br/>", "\n")
html = strings.ReplaceAll(html, "<br />", "\n")
html = strings.ReplaceAll(html, "</p>", "\n")
html = strings.ReplaceAll(html, "</div>", "\n")
html = decodeHTMLEntities(html)
for _, r := range html {
for _, r := range result {
switch {
case r == '<':
inTag = true
case r == '>':
inTag = false
case !inTag && !inStyle && !inScript:
result.WriteRune(r)
}
// Track style/script blocks (simplified)
if inTag {
lower := strings.ToLower(string(r))
if strings.Contains(lower, "style") {
inStyle = true
}
if strings.Contains(lower, "/style") {
inStyle = false
}
if strings.Contains(lower, "script") {
inScript = true
}
if strings.Contains(lower, "/script") {
inScript = false
}
case !inTag:
builder.WriteRune(r)
}
}
// Clean up whitespace
text := result.String()
text := builder.String()
lines := strings.Split(text, "\n")
var cleaned []string
for _, line := range lines {