whatsviewer/parser.go

160 lines
3.8 KiB
Go

package main
import (
"regexp"
"strings"
"time"
)
// messageStartRe recognizes the first line of a message in an exported chat:
// "<date>, <time> - <rest>", for example "1/2/24, 3:04 PM - Alice: hello".
//
// Capture group 1 is the timestamp: two 1-2 digit date fields and a 2-digit
// year separated by slashes, a comma and space, a 1-2 digit hour, a 2-digit
// minute, then AM/PM in any letter case. Capture group 2 is everything after
// the " - " separator (sender and text, or a system notice).
//
// The narrow no-break space (\u202f) appears before AM/PM in WhatsApp exports,
// so any run of whitespace and/or U+202F characters, including an empty run,
// is accepted between the minutes and the AM/PM marker.
var messageStartRe = regexp.MustCompile(`^(\d{1,2}/\d{1,2}/\d{2}, \d{1,2}:\d{2}[\s\x{202f}]*[APap][Mm]) - (.*)`)
// timeLayouts lists the time.Parse layouts for message timestamps, covering
// both a narrow no-break space (\u202f) and a regular space before the
// AM/PM marker, in upper and lower case.
//
// The lowercase entries are spelled "pm" on purpose: Go's layout language
// only treats "PM" and "pm" as the AM/PM element. A layout ending in "am" is
// matched as literal text, so it rejects every "pm" timestamp and skips the
// 12-hour conversion (leaving "12:04 am" at hour 12 instead of 0).
var timeLayouts = []string{
	"1/2/06, 3:04\u202fPM",
	"1/2/06, 3:04 PM",
	"1/2/06, 3:04\u202fpm",
	"1/2/06, 3:04 pm",
}
// parseTimestamp parses a message timestamp such as "1/2/24, 3:04 PM".
//
// It returns the parsed time (in UTC, as produced by time.Parse) and true on
// success, or the zero time and false when s is not a valid timestamp.
//
// The separator between the minutes and the AM/PM marker is normalized
// before parsing, so that every spelling accepted by the message-start
// pattern also parses here: a narrow no-break space (\u202f), a regular
// space, a tab, several of these in a row, or no separator at all. The
// AM/PM marker itself may be in any letter case.
func parseTimestamp(s string) (time.Time, bool) {
	const layout = "1/2/06, 3:04 PM"

	// Fold U+202F into a plain space and upper-case the marker so a single
	// layout covers "PM", "pm", "Pm" and "pM".
	normalized := strings.ToUpper(strings.ReplaceAll(s, "\u202f", " "))

	// time.Parse requires a space in the value wherever the layout has one,
	// and does not treat a tab as a space. Rebuild the tail as exactly
	// "<clock> <AM|PM>" regardless of what separated them originally.
	if n := len(normalized); n >= 2 {
		if meridiem := normalized[n-2:]; meridiem == "AM" || meridiem == "PM" {
			clock := strings.TrimRight(normalized[:n-2], " \t\n\f\r")
			normalized = clock + " " + meridiem
		}
	}

	t, err := time.Parse(layout, normalized)
	if err != nil {
		return time.Time{}, false
	}
	return t, true
}
// classifyAttachment maps an attachment filename to a coarse media category:
// "sticker", "audio", "image", "video" or "document".
//
// Matching is case-insensitive and is evaluated in this order:
//  1. "stk-" prefix together with a ".webp" suffix -> "sticker"
//  2. "ptt-" prefix (any extension)                -> "audio"
//  3. image, video, then audio file extensions
//  4. anything else                                -> "document"
func classifyAttachment(filename string) string {
	name := strings.ToLower(filename)

	// endsWithAny reports whether name ends with one of the given extensions.
	endsWithAny := func(exts ...string) bool {
		for _, ext := range exts {
			if strings.HasSuffix(name, ext) {
				return true
			}
		}
		return false
	}

	switch {
	case strings.HasPrefix(name, "stk-") && strings.HasSuffix(name, ".webp"):
		return "sticker"
	case strings.HasPrefix(name, "ptt-"):
		return "audio"
	case endsWithAny(".jpg", ".jpeg", ".png", ".webp", ".gif"):
		return "image"
	case endsWithAny(".mp4", ".3gp", ".mov"):
		return "video"
	case endsWithAny(".opus", ".ogg", ".m4a", ".mp3", ".aac"):
		return "audio"
	default:
		return "document"
	}
}
// parseChat converts the raw text of an exported chat into messages, in the
// order they appear in content.
//
// A leading UTF-8 byte-order mark is dropped and CRLF / bare CR line endings
// are folded to LF. Empty lines are skipped. Each remaining line is handled
// as follows:
//   - A line matching messageStartRe whose timestamp parses begins a new
//     message. If the remainder contains ": ", the part before the first
//     occurrence is the sender and the part after it is the text; otherwise
//     the message is flagged as a system message and the whole remainder is
//     its text.
//   - Any other line is appended to the text of the most recent message
//     (newline-separated), or dropped when no message exists yet.
//
// Attachment markers are detected on new messages and on lines that do not
// match messageStartRe; a line that matches but carries an unparseable
// timestamp is appended verbatim without that check.
func parseChat(content string) []Message {
	content = strings.TrimPrefix(content, "\xef\xbb\xbf")
	content = strings.ReplaceAll(content, "\r\n", "\n")
	content = strings.ReplaceAll(content, "\r", "\n")

	var parsed []Message

	// extend appends raw to the latest message's text and returns that
	// message, or nil when nothing has been parsed yet.
	extend := func(raw string) *Message {
		if len(parsed) == 0 {
			return nil
		}
		last := &parsed[len(parsed)-1]
		if last.Text != "" {
			last.Text += "\n"
		}
		last.Text += raw
		return last
	}

	for _, raw := range strings.Split(content, "\n") {
		if raw == "" {
			continue
		}

		groups := messageStartRe.FindStringSubmatch(raw)
		if groups == nil {
			// Not a message header: it belongs to the previous message.
			if last := extend(raw); last != nil {
				checkAttachment(last, raw)
			}
			continue
		}

		when, ok := parseTimestamp(groups[1])
		if !ok {
			// Looks like a header but the timestamp is invalid; keep the
			// line as plain continuation text.
			extend(raw)
			continue
		}

		body := groups[2]
		entry := Message{Timestamp: when}
		if sep := strings.Index(body, ": "); sep >= 0 {
			entry.Sender = body[:sep]
			entry.Text = body[sep+2:]
		} else {
			// No "sender: " prefix means a system notice.
			entry.IsSystem = true
			entry.Text = body
		}

		checkAttachment(&entry, entry.Text)
		parsed = append(parsed, entry)
	}

	return parsed
}
// checkAttachment scans text for attachment marker lines of the form
// "<filename> (file attached)" and records each one on msg.
//
// For every line of text that, after trimming surrounding whitespace, ends
// with " (file attached)":
//   - the filename (the trimmed line minus that suffix) is appended to
//     msg.Attachments with the type reported by classifyAttachment, unless an
//     attachment with the same filename is already present;
//   - the marker is stripped from msg.Text so only the filename remains.
//
// The marker is stripped from the first line of msg.Text that, once trimmed,
// equals the marker line. Matching whole lines rather than the first
// substring occurrence keeps an earlier line that merely contains the same
// words mid-sentence from being rewritten in place of the real marker line.
func checkAttachment(msg *Message, text string) {
	const suffix = " (file attached)"

	for _, raw := range strings.Split(text, "\n") {
		line := strings.TrimSpace(raw)
		if !strings.HasSuffix(line, suffix) {
			continue
		}
		filename := strings.TrimSuffix(line, suffix)

		// Record the attachment once per filename.
		known := false
		for _, a := range msg.Attachments {
			if a.Filename == filename {
				known = true
				break
			}
		}
		if !known {
			msg.Attachments = append(msg.Attachments, Attachment{
				Filename: filename,
				Type:     classifyAttachment(filename),
			})
		}

		// Drop the "(file attached)" marker from the displayed text,
		// preserving any whitespace that surrounded the marker line.
		textLines := strings.Split(msg.Text, "\n")
		for i, candidate := range textLines {
			if strings.TrimSpace(candidate) != line {
				continue
			}
			textLines[i] = strings.Replace(candidate, line, filename, 1)
			msg.Text = strings.Join(textLines, "\n")
			break
		}
	}
}