Improve HTML body parsing - strip style/script/head blocks
This commit is contained in:
parent
b7a83f2ab5
commit
30c4d15b87
94
mime.go
94
mime.go
|
|
@ -210,49 +210,77 @@ func extractAttachmentParts(entity *message.Entity, attachments *[]Attachment) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// stripHTML removes HTML tags and returns plain text
|
// stripHTML removes HTML tags and returns plain text
|
||||||
func stripHTML(html string) string {
|
func stripHTML(htmlContent string) string {
|
||||||
var result strings.Builder
|
// Remove style, script, and head blocks first
|
||||||
|
result := htmlContent
|
||||||
|
|
||||||
|
// Remove <style>...</style> blocks
|
||||||
|
for {
|
||||||
|
start := strings.Index(strings.ToLower(result), "<style")
|
||||||
|
if start == -1 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
end := strings.Index(strings.ToLower(result[start:]), "</style>")
|
||||||
|
if end == -1 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
result = result[:start] + result[start+end+8:]
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove <script>...</script> blocks
|
||||||
|
for {
|
||||||
|
start := strings.Index(strings.ToLower(result), "<script")
|
||||||
|
if start == -1 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
end := strings.Index(strings.ToLower(result[start:]), "</script>")
|
||||||
|
if end == -1 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
result = result[:start] + result[start+end+9:]
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove <head>...</head> blocks
|
||||||
|
for {
|
||||||
|
start := strings.Index(strings.ToLower(result), "<head")
|
||||||
|
if start == -1 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
end := strings.Index(strings.ToLower(result[start:]), "</head>")
|
||||||
|
if end == -1 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
result = result[:start] + result[start+end+7:]
|
||||||
|
}
|
||||||
|
|
||||||
|
// Replace common block elements with newlines
|
||||||
|
result = strings.ReplaceAll(result, "<br>", "\n")
|
||||||
|
result = strings.ReplaceAll(result, "<br/>", "\n")
|
||||||
|
result = strings.ReplaceAll(result, "<br />", "\n")
|
||||||
|
result = strings.ReplaceAll(result, "</p>", "\n")
|
||||||
|
result = strings.ReplaceAll(result, "</div>", "\n")
|
||||||
|
result = strings.ReplaceAll(result, "</tr>", "\n")
|
||||||
|
result = strings.ReplaceAll(result, "</li>", "\n")
|
||||||
|
|
||||||
|
// Decode HTML entities
|
||||||
|
result = decodeHTMLEntities(result)
|
||||||
|
|
||||||
|
// Remove all remaining tags
|
||||||
|
var builder strings.Builder
|
||||||
inTag := false
|
inTag := false
|
||||||
inStyle := false
|
for _, r := range result {
|
||||||
inScript := false
|
|
||||||
|
|
||||||
html = strings.ReplaceAll(html, "<br>", "\n")
|
|
||||||
html = strings.ReplaceAll(html, "<br/>", "\n")
|
|
||||||
html = strings.ReplaceAll(html, "<br />", "\n")
|
|
||||||
html = strings.ReplaceAll(html, "</p>", "\n")
|
|
||||||
html = strings.ReplaceAll(html, "</div>", "\n")
|
|
||||||
html = decodeHTMLEntities(html)
|
|
||||||
|
|
||||||
for _, r := range html {
|
|
||||||
switch {
|
switch {
|
||||||
case r == '<':
|
case r == '<':
|
||||||
inTag = true
|
inTag = true
|
||||||
case r == '>':
|
case r == '>':
|
||||||
inTag = false
|
inTag = false
|
||||||
case !inTag && !inStyle && !inScript:
|
case !inTag:
|
||||||
result.WriteRune(r)
|
builder.WriteRune(r)
|
||||||
}
|
|
||||||
|
|
||||||
// Track style/script blocks (simplified)
|
|
||||||
if inTag {
|
|
||||||
lower := strings.ToLower(string(r))
|
|
||||||
if strings.Contains(lower, "style") {
|
|
||||||
inStyle = true
|
|
||||||
}
|
|
||||||
if strings.Contains(lower, "/style") {
|
|
||||||
inStyle = false
|
|
||||||
}
|
|
||||||
if strings.Contains(lower, "script") {
|
|
||||||
inScript = true
|
|
||||||
}
|
|
||||||
if strings.Contains(lower, "/script") {
|
|
||||||
inScript = false
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Clean up whitespace
|
// Clean up whitespace
|
||||||
text := result.String()
|
text := builder.String()
|
||||||
lines := strings.Split(text, "\n")
|
lines := strings.Split(text, "\n")
|
||||||
var cleaned []string
|
var cleaned []string
|
||||||
for _, line := range lines {
|
for _, line := range lines {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue