Improve HTML body parsing - strip style/script/head blocks
This commit is contained in:
parent
b7a83f2ab5
commit
30c4d15b87
94
mime.go
94
mime.go
|
|
@ -210,49 +210,77 @@ func extractAttachmentParts(entity *message.Entity, attachments *[]Attachment) {
|
|||
}
|
||||
|
||||
// stripHTML removes HTML tags and returns plain text
|
||||
func stripHTML(html string) string {
|
||||
var result strings.Builder
|
||||
func stripHTML(htmlContent string) string {
|
||||
// Remove style and script blocks first
|
||||
result := htmlContent
|
||||
|
||||
// Remove <style>...</style> blocks
|
||||
for {
|
||||
start := strings.Index(strings.ToLower(result), "<style")
|
||||
if start == -1 {
|
||||
break
|
||||
}
|
||||
end := strings.Index(strings.ToLower(result[start:]), "</style>")
|
||||
if end == -1 {
|
||||
break
|
||||
}
|
||||
result = result[:start] + result[start+end+8:]
|
||||
}
|
||||
|
||||
// Remove <script>...</script> blocks
|
||||
for {
|
||||
start := strings.Index(strings.ToLower(result), "<script")
|
||||
if start == -1 {
|
||||
break
|
||||
}
|
||||
end := strings.Index(strings.ToLower(result[start:]), "</script>")
|
||||
if end == -1 {
|
||||
break
|
||||
}
|
||||
result = result[:start] + result[start+end+9:]
|
||||
}
|
||||
|
||||
// Remove <head>...</head> blocks
|
||||
for {
|
||||
start := strings.Index(strings.ToLower(result), "<head")
|
||||
if start == -1 {
|
||||
break
|
||||
}
|
||||
end := strings.Index(strings.ToLower(result[start:]), "</head>")
|
||||
if end == -1 {
|
||||
break
|
||||
}
|
||||
result = result[:start] + result[start+end+7:]
|
||||
}
|
||||
|
||||
// Replace common block elements with newlines
|
||||
result = strings.ReplaceAll(result, "<br>", "\n")
|
||||
result = strings.ReplaceAll(result, "<br/>", "\n")
|
||||
result = strings.ReplaceAll(result, "<br />", "\n")
|
||||
result = strings.ReplaceAll(result, "</p>", "\n")
|
||||
result = strings.ReplaceAll(result, "</div>", "\n")
|
||||
result = strings.ReplaceAll(result, "</tr>", "\n")
|
||||
result = strings.ReplaceAll(result, "</li>", "\n")
|
||||
|
||||
// Decode HTML entities
|
||||
result = decodeHTMLEntities(result)
|
||||
|
||||
// Remove all remaining tags
|
||||
var builder strings.Builder
|
||||
inTag := false
|
||||
inStyle := false
|
||||
inScript := false
|
||||
|
||||
html = strings.ReplaceAll(html, "<br>", "\n")
|
||||
html = strings.ReplaceAll(html, "<br/>", "\n")
|
||||
html = strings.ReplaceAll(html, "<br />", "\n")
|
||||
html = strings.ReplaceAll(html, "</p>", "\n")
|
||||
html = strings.ReplaceAll(html, "</div>", "\n")
|
||||
html = decodeHTMLEntities(html)
|
||||
|
||||
for _, r := range html {
|
||||
for _, r := range result {
|
||||
switch {
|
||||
case r == '<':
|
||||
inTag = true
|
||||
case r == '>':
|
||||
inTag = false
|
||||
case !inTag && !inStyle && !inScript:
|
||||
result.WriteRune(r)
|
||||
}
|
||||
|
||||
// Track style/script blocks (simplified)
|
||||
if inTag {
|
||||
lower := strings.ToLower(string(r))
|
||||
if strings.Contains(lower, "style") {
|
||||
inStyle = true
|
||||
}
|
||||
if strings.Contains(lower, "/style") {
|
||||
inStyle = false
|
||||
}
|
||||
if strings.Contains(lower, "script") {
|
||||
inScript = true
|
||||
}
|
||||
if strings.Contains(lower, "/script") {
|
||||
inScript = false
|
||||
}
|
||||
case !inTag:
|
||||
builder.WriteRune(r)
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up whitespace
|
||||
text := result.String()
|
||||
text := builder.String()
|
||||
lines := strings.Split(text, "\n")
|
||||
var cleaned []string
|
||||
for _, line := range lines {
|
||||
|
|
|
|||
Loading…
Reference in New Issue