chore: auto-commit uncommitted changes
This commit is contained in:
parent
bbc029196a
commit
2c91d5649e
138
ai.go
138
ai.go
|
|
@ -1,6 +1,7 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"bytes"
|
||||
"crypto/sha256"
|
||||
"encoding/base64"
|
||||
|
|
@ -617,6 +618,22 @@ func ProcessDocument(filePath string) (*Document, error) {
|
|||
UpdateJob(hash, "error", err.Error())
|
||||
return nil, fmt.Errorf("text analysis failed: %w", err)
|
||||
}
|
||||
} else if IsOfficeFile(ext) {
|
||||
// Office formats — extract text natively from ZIP/XML, no LibreOffice needed
|
||||
UpdateJob(hash, "converting", "Extracting text...")
|
||||
log.Printf(" Extracting text from %s...", ext)
|
||||
text, err := ExtractOfficeText(filePath)
|
||||
if err != nil {
|
||||
UpdateJob(hash, "error", err.Error())
|
||||
return nil, fmt.Errorf("office text extraction failed: %w", err)
|
||||
}
|
||||
log.Printf(" Extracted %d chars, classifying...", len(text))
|
||||
UpdateJob(hash, "classifying", "Classifying...")
|
||||
analysis, err = AnalyzeText(text, filepath.Base(filePath))
|
||||
if err != nil {
|
||||
UpdateJob(hash, "error", err.Error())
|
||||
return nil, fmt.Errorf("text analysis failed: %w", err)
|
||||
}
|
||||
} else {
|
||||
// Vision — convert to image and analyze
|
||||
UpdateJob(hash, "converting", "Converting to image...")
|
||||
|
|
@ -751,3 +768,124 @@ func copyFile(src, dst string) error {
|
|||
_, err = io.Copy(out, in)
|
||||
return err
|
||||
}
|
||||
|
||||
// ExtractOfficeText extracts plain text from DOCX/XLSX/PPTX natively.
|
||||
// These are ZIP archives containing XML — no LibreOffice needed.
|
||||
func ExtractOfficeText(filePath string) (string, error) {
|
||||
ext := strings.ToLower(filepath.Ext(filePath))
|
||||
|
||||
r, err := zip.OpenReader(filePath)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("not a valid Office file: %w", err)
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
// Which XML paths to extract per format
|
||||
var targets []string
|
||||
switch ext {
|
||||
case ".docx", ".doc", ".odt", ".rtf":
|
||||
targets = []string{"word/document.xml", "word/body.xml"}
|
||||
case ".xlsx", ".xls":
|
||||
// All sheets
|
||||
for _, f := range r.File {
|
||||
if strings.HasPrefix(f.Name, "xl/worksheets/sheet") && strings.HasSuffix(f.Name, ".xml") {
|
||||
targets = append(targets, f.Name)
|
||||
}
|
||||
}
|
||||
// Shared strings (cell values are stored here for xlsx)
|
||||
targets = append(targets, "xl/sharedStrings.xml")
|
||||
case ".pptx", ".ppt":
|
||||
for _, f := range r.File {
|
||||
if strings.HasPrefix(f.Name, "ppt/slides/slide") && strings.HasSuffix(f.Name, ".xml") {
|
||||
targets = append(targets, f.Name)
|
||||
}
|
||||
}
|
||||
default:
|
||||
return "", fmt.Errorf("unsupported office format: %s", ext)
|
||||
}
|
||||
|
||||
var sb strings.Builder
|
||||
for _, f := range r.File {
|
||||
found := false
|
||||
for _, t := range targets {
|
||||
if f.Name == t {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
continue
|
||||
}
|
||||
|
||||
rc, err := f.Open()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
data, err := io.ReadAll(rc)
|
||||
rc.Close()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
sb.WriteString(xmlToText(data))
|
||||
sb.WriteString("\n")
|
||||
}
|
||||
|
||||
text := strings.TrimSpace(sb.String())
|
||||
if text == "" {
|
||||
return "", fmt.Errorf("no text extracted from %s", filepath.Base(filePath))
|
||||
}
|
||||
return text, nil
|
||||
}
|
||||
|
||||
// xmlEntityDecoder decodes the five predefined XML entities and the numeric
// character references for newline and tab. A Replacer scans the input once
// and never re-examines its own output, so "&amp;lt;" correctly yields the
// literal text "&lt;" rather than being decoded a second time into "<".
var xmlEntityDecoder = strings.NewReplacer(
	"&amp;", "&",
	"&lt;", "<",
	"&gt;", ">",
	"&quot;", "\"",
	"&apos;", "'",
	"&#10;", "\n",
	"&#xA;", "\n",
	"&#9;", "\t",
	"&#x9;", "\t",
)

// xmlToText strips XML tags and decodes entities, returning plain text.
//
// Everything between '<' and '>' is dropped. A single space is emitted at
// each tag boundary so that text from adjacent elements does not run
// together, and runs of whitespace (space, tab, CR, LF) in the character
// data are collapsed to one space. Entities are decoded after stripping, and
// the result is trimmed of leading and trailing whitespace.
func xmlToText(data []byte) string {
	var sb strings.Builder
	inTag := false
	lastWasSpace := false

	for _, c := range data {
		switch {
		case c == '<':
			inTag = true
			// Emit space between elements so words don't run together.
			if !lastWasSpace {
				sb.WriteByte(' ')
				lastWasSpace = true
			}
		case c == '>':
			inTag = false
		case inTag:
			// Tag names and attributes are not part of the text.
		case c == ' ' || c == '\t' || c == '\n' || c == '\r':
			if !lastWasSpace {
				sb.WriteByte(' ')
				lastWasSpace = true
			}
		default:
			sb.WriteByte(c)
			lastWasSpace = false
		}
	}

	return strings.TrimSpace(xmlEntityDecoder.Replace(sb.String()))
}
|
||||
|
||||
// IsOfficeFile reports whether ext (a file extension including the leading
// dot, e.g. ".docx") names a format with extractable XML text: the OOXML
// formats .docx/.xlsx/.pptx and the OpenDocument formats .odt/.ods/.odp.
//
// The comparison is case-insensitive, matching ExtractOfficeText, which
// lower-cases the extension before dispatching on it.
func IsOfficeFile(ext string) bool {
	switch strings.ToLower(ext) {
	case ".docx", ".xlsx", ".pptx",
		".odt", ".ods", ".odp":
		return true
	default:
		return false
	}
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue