Improve HTML body parsing - strip style/script/head blocks

2026-02-02 22:21:23 +00:00 · 2026-02-02 22:21:23 +00:00 · 30c4d15b87
parent b7a83f2ab5
commit 30c4d15b87
1 changed files with 61 additions and 33 deletions
--- a/mime.go
+++ b/mime.go
@@ -210,49 +210,77 @@ func extractAttachmentParts(entity *message.Entity, attachments *[]Attachment) {
 }

 // stripHTML removes HTML tags and returns plain text
-func stripHTML(html string) string {
-	var result strings.Builder
+func stripHTML(htmlContent string) string {
+	// Remove style and script blocks first
+	result := htmlContent
+	
+	// Remove <style>...</style> blocks
+	for {
+		start := strings.Index(strings.ToLower(result), "<style")
+		if start == -1 {
+			break
+		}
+		end := strings.Index(strings.ToLower(result[start:]), "</style>")
+		if end == -1 {
+			break
+		}
+		result = result[:start] + result[start+end+8:]
+	}
+	
+	// Remove <script>...</script> blocks
+	for {
+		start := strings.Index(strings.ToLower(result), "<script")
+		if start == -1 {
+			break
+		}
+		end := strings.Index(strings.ToLower(result[start:]), "</script>")
+		if end == -1 {
+			break
+		}
+		result = result[:start] + result[start+end+9:]
+	}
+	
+	// Remove <head>...</head> blocks
+	for {
+		start := strings.Index(strings.ToLower(result), "<head")
+		if start == -1 {
+			break
+		}
+		end := strings.Index(strings.ToLower(result[start:]), "</head>")
+		if end == -1 {
+			break
+		}
+		result = result[:start] + result[start+end+7:]
+	}
+	
+	// Replace common block elements with newlines
+	result = strings.ReplaceAll(result, "<br>", "\n")
+	result = strings.ReplaceAll(result, "<br/>", "\n")
+	result = strings.ReplaceAll(result, "<br />", "\n")
+	result = strings.ReplaceAll(result, "</p>", "\n")
+	result = strings.ReplaceAll(result, "</div>", "\n")
+	result = strings.ReplaceAll(result, "</tr>", "\n")
+	result = strings.ReplaceAll(result, "</li>", "\n")
+	
+	// Decode HTML entities
+	result = decodeHTMLEntities(result)
+	
+	// Remove all remaining tags
+	var builder strings.Builder
 	inTag := false
-	inStyle := false
-	inScript := false
-	
-	html = strings.ReplaceAll(html, "<br>", "\n")
-	html = strings.ReplaceAll(html, "<br/>", "\n")
-	html = strings.ReplaceAll(html, "<br />", "\n")
-	html = strings.ReplaceAll(html, "</p>", "\n")
-	html = strings.ReplaceAll(html, "</div>", "\n")
-	html = decodeHTMLEntities(html)
-	
-	for _, r := range html {
+	for _, r := range result {
 		switch {
 		case r == '<':
 			inTag = true
 		case r == '>':
 			inTag = false
-		case !inTag && !inStyle && !inScript:
-			result.WriteRune(r)
-		}
-		
-		// Track style/script blocks (simplified)
-		if inTag {
-			lower := strings.ToLower(string(r))
-			if strings.Contains(lower, "style") {
-				inStyle = true
-			}
-			if strings.Contains(lower, "/style") {
-				inStyle = false
-			}
-			if strings.Contains(lower, "script") {
-				inScript = true
-			}
-			if strings.Contains(lower, "/script") {
-				inScript = false
-			}
+		case !inTag:
+			builder.WriteRune(r)
 		}
 	}
 	
 	// Clean up whitespace
-	text := result.String()
+	text := builder.String()
 	lines := strings.Split(text, "\n")
 	var cleaned []string
 	for _, line := range lines {