diff --git a/ai.go b/ai.go index 55a5fe2..0dd420e 100644 --- a/ai.go +++ b/ai.go @@ -1,6 +1,7 @@ package main import ( + "archive/zip" "bytes" "crypto/sha256" "encoding/base64" @@ -617,6 +618,22 @@ func ProcessDocument(filePath string) (*Document, error) { UpdateJob(hash, "error", err.Error()) return nil, fmt.Errorf("text analysis failed: %w", err) } + } else if IsOfficeFile(ext) { + // Office formats — extract text natively from ZIP/XML, no LibreOffice needed + UpdateJob(hash, "converting", "Extracting text...") + log.Printf(" Extracting text from %s...", ext) + text, err := ExtractOfficeText(filePath) + if err != nil { + UpdateJob(hash, "error", err.Error()) + return nil, fmt.Errorf("office text extraction failed: %w", err) + } + log.Printf(" Extracted %d chars, classifying...", len(text)) + UpdateJob(hash, "classifying", "Classifying...") + analysis, err = AnalyzeText(text, filepath.Base(filePath)) + if err != nil { + UpdateJob(hash, "error", err.Error()) + return nil, fmt.Errorf("text analysis failed: %w", err) + } } else { // Vision — convert to image and analyze UpdateJob(hash, "converting", "Converting to image...") @@ -751,3 +768,124 @@ func copyFile(src, dst string) error { _, err = io.Copy(out, in) return err } + +// ExtractOfficeText extracts plain text from DOCX/XLSX/PPTX natively. +// These are ZIP archives containing XML — no LibreOffice needed. 
+func ExtractOfficeText(filePath string) (string, error) { + ext := strings.ToLower(filepath.Ext(filePath)) + + r, err := zip.OpenReader(filePath) + if err != nil { + return "", fmt.Errorf("not a valid Office file: %w", err) + } + defer r.Close() + + // Which XML paths to extract per format + var targets []string + switch ext { + case ".docx", ".doc", ".odt", ".rtf": + targets = []string{"word/document.xml", "word/body.xml"} + case ".xlsx", ".xls": + // All sheets + for _, f := range r.File { + if strings.HasPrefix(f.Name, "xl/worksheets/sheet") && strings.HasSuffix(f.Name, ".xml") { + targets = append(targets, f.Name) + } + } + // Shared strings (cell values are stored here for xlsx) + targets = append(targets, "xl/sharedStrings.xml") + case ".pptx", ".ppt": + for _, f := range r.File { + if strings.HasPrefix(f.Name, "ppt/slides/slide") && strings.HasSuffix(f.Name, ".xml") { + targets = append(targets, f.Name) + } + } + default: + return "", fmt.Errorf("unsupported office format: %s", ext) + } + + var sb strings.Builder + for _, f := range r.File { + found := false + for _, t := range targets { + if f.Name == t { + found = true + break + } + } + if !found { + continue + } + + rc, err := f.Open() + if err != nil { + continue + } + data, err := io.ReadAll(rc) + rc.Close() + if err != nil { + continue + } + + sb.WriteString(xmlToText(data)) + sb.WriteString("\n") + } + + text := strings.TrimSpace(sb.String()) + if text == "" { + return "", fmt.Errorf("no text extracted from %s", filepath.Base(filePath)) + } + return text, nil +} + +// xmlToText strips XML tags and decodes entities, returning plain text. 
// Every tag boundary becomes a single space so adjacent words do not run
// together, and runs of whitespace collapse to one space. Entities are decoded
// in a single pass after stripping, so an escaped ampersand such as "&amp;lt;"
// yields the literal text "&lt;" rather than being decoded twice.
func xmlToText(data []byte) string {
	var sb strings.Builder
	sb.Grow(len(data))
	inTag := false
	lastWasSpace := false

	// Byte-wise scanning is safe for UTF-8: all delimiters are ASCII.
	for _, c := range data {
		switch {
		case c == '<':
			inTag = true
			// Emit a space between elements so words don't run together.
			if !lastWasSpace {
				sb.WriteByte(' ')
				lastWasSpace = true
			}
		case c == '>' && inTag:
			inTag = false
		case inTag:
			// Tag names and attributes are not text.
		case c == ' ' || c == '\t' || c == '\n' || c == '\r':
			if !lastWasSpace {
				sb.WriteByte(' ')
				lastWasSpace = true
			}
		default:
			// Includes a bare '>' in character data, which is legal XML.
			sb.WriteByte(c)
			lastWasSpace = false
		}
	}

	return strings.TrimSpace(xmlEntityDecoder.Replace(sb.String()))
}

// xmlEntityDecoder decodes the five predefined XML entities plus the numeric
// references for newline and tab. A Replacer scans the input once, so the
// output of one replacement is never re-examined by another.
var xmlEntityDecoder = strings.NewReplacer(
	"&amp;", "&",
	"&lt;", "<",
	"&gt;", ">",
	"&quot;", "\"",
	"&apos;", "'",
	"&#10;", "\n",
	"&#9;", "\t",
)

// IsOfficeFile reports whether ext (with its leading dot) names a ZIP/XML
// office format whose text can be extracted natively. The comparison is
// case-insensitive, matching how ExtractOfficeText normalises extensions.
func IsOfficeFile(ext string) bool {
	switch strings.ToLower(ext) {
	case ".docx", ".xlsx", ".pptx", ".odt", ".ods", ".odp":
		return true
	}
	return false
}