chore: auto-commit uncommitted changes
This commit is contained in:
parent
bbc029196a
commit
2c91d5649e
138
ai.go
138
ai.go
|
|
@ -1,6 +1,7 @@
|
||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"archive/zip"
|
||||||
"bytes"
|
"bytes"
|
||||||
"crypto/sha256"
|
"crypto/sha256"
|
||||||
"encoding/base64"
|
"encoding/base64"
|
||||||
|
|
@ -617,6 +618,22 @@ func ProcessDocument(filePath string) (*Document, error) {
|
||||||
UpdateJob(hash, "error", err.Error())
|
UpdateJob(hash, "error", err.Error())
|
||||||
return nil, fmt.Errorf("text analysis failed: %w", err)
|
return nil, fmt.Errorf("text analysis failed: %w", err)
|
||||||
}
|
}
|
||||||
|
} else if IsOfficeFile(ext) {
|
||||||
|
// Office formats — extract text natively from ZIP/XML, no LibreOffice needed
|
||||||
|
UpdateJob(hash, "converting", "Extracting text...")
|
||||||
|
log.Printf(" Extracting text from %s...", ext)
|
||||||
|
text, err := ExtractOfficeText(filePath)
|
||||||
|
if err != nil {
|
||||||
|
UpdateJob(hash, "error", err.Error())
|
||||||
|
return nil, fmt.Errorf("office text extraction failed: %w", err)
|
||||||
|
}
|
||||||
|
log.Printf(" Extracted %d chars, classifying...", len(text))
|
||||||
|
UpdateJob(hash, "classifying", "Classifying...")
|
||||||
|
analysis, err = AnalyzeText(text, filepath.Base(filePath))
|
||||||
|
if err != nil {
|
||||||
|
UpdateJob(hash, "error", err.Error())
|
||||||
|
return nil, fmt.Errorf("text analysis failed: %w", err)
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// Vision — convert to image and analyze
|
// Vision — convert to image and analyze
|
||||||
UpdateJob(hash, "converting", "Converting to image...")
|
UpdateJob(hash, "converting", "Converting to image...")
|
||||||
|
|
@ -751,3 +768,124 @@ func copyFile(src, dst string) error {
|
||||||
_, err = io.Copy(out, in)
|
_, err = io.Copy(out, in)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ExtractOfficeText extracts plain text from DOCX/XLSX/PPTX natively.
|
||||||
|
// These are ZIP archives containing XML — no LibreOffice needed.
|
||||||
|
func ExtractOfficeText(filePath string) (string, error) {
|
||||||
|
ext := strings.ToLower(filepath.Ext(filePath))
|
||||||
|
|
||||||
|
r, err := zip.OpenReader(filePath)
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("not a valid Office file: %w", err)
|
||||||
|
}
|
||||||
|
defer r.Close()
|
||||||
|
|
||||||
|
// Which XML paths to extract per format
|
||||||
|
var targets []string
|
||||||
|
switch ext {
|
||||||
|
case ".docx", ".doc", ".odt", ".rtf":
|
||||||
|
targets = []string{"word/document.xml", "word/body.xml"}
|
||||||
|
case ".xlsx", ".xls":
|
||||||
|
// All sheets
|
||||||
|
for _, f := range r.File {
|
||||||
|
if strings.HasPrefix(f.Name, "xl/worksheets/sheet") && strings.HasSuffix(f.Name, ".xml") {
|
||||||
|
targets = append(targets, f.Name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Shared strings (cell values are stored here for xlsx)
|
||||||
|
targets = append(targets, "xl/sharedStrings.xml")
|
||||||
|
case ".pptx", ".ppt":
|
||||||
|
for _, f := range r.File {
|
||||||
|
if strings.HasPrefix(f.Name, "ppt/slides/slide") && strings.HasSuffix(f.Name, ".xml") {
|
||||||
|
targets = append(targets, f.Name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
return "", fmt.Errorf("unsupported office format: %s", ext)
|
||||||
|
}
|
||||||
|
|
||||||
|
var sb strings.Builder
|
||||||
|
for _, f := range r.File {
|
||||||
|
found := false
|
||||||
|
for _, t := range targets {
|
||||||
|
if f.Name == t {
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
rc, err := f.Open()
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
data, err := io.ReadAll(rc)
|
||||||
|
rc.Close()
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
sb.WriteString(xmlToText(data))
|
||||||
|
sb.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
text := strings.TrimSpace(sb.String())
|
||||||
|
if text == "" {
|
||||||
|
return "", fmt.Errorf("no text extracted from %s", filepath.Base(filePath))
|
||||||
|
}
|
||||||
|
return text, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// xmlEntityReplacer decodes the predefined XML entities and the newline/tab
// character references in a single pass. Because replaced output is never
// rescanned, escaped text such as "&amp;lt;" decodes to the literal "&lt;"
// instead of being unescaped twice into "<".
var xmlEntityReplacer = strings.NewReplacer(
	"&amp;", "&",
	"&lt;", "<",
	"&gt;", ">",
	"&quot;", "\"",
	"&apos;", "'",
	"&#10;", "\n",
	"&#xA;", "\n",
	"&#xa;", "\n",
	"&#9;", "\t",
	"&#x9;", "\t",
)

// xmlToText strips XML tags and decodes entities, returning plain text.
//
// Everything between '<' and '>' is dropped. Each tag boundary and each run
// of whitespace (space, tab, CR, LF) in the character data collapses to a
// single space so that words from adjacent elements do not run together.
// Entities are decoded after tag stripping, and the result is trimmed of
// leading and trailing whitespace. Bytes outside tags are copied verbatim,
// so multi-byte UTF-8 sequences pass through unchanged.
func xmlToText(data []byte) string {
	var sb strings.Builder
	inTag := false
	lastWasSpace := false

	// writeSpace emits one separator unless the previous output already was one.
	writeSpace := func() {
		if !lastWasSpace {
			sb.WriteByte(' ')
			lastWasSpace = true
		}
	}

	for _, c := range data {
		switch {
		case c == '<':
			inTag = true
			// Separate the text on either side of an element boundary.
			writeSpace()
		case c == '>':
			inTag = false
		case inTag:
			// Tag names and attributes are not part of the text.
		case c == ' ' || c == '\t' || c == '\n' || c == '\r':
			writeSpace()
		default:
			sb.WriteByte(c)
			lastWasSpace = false
		}
	}

	return strings.TrimSpace(xmlEntityReplacer.Replace(sb.String()))
}
|
||||||
|
|
||||||
|
// IsOfficeFile reports whether ext names a format with extractable XML text.
// The comparison is exact and case-sensitive, and ext must include the
// leading dot (for example ".docx").
func IsOfficeFile(ext string) bool {
	switch ext {
	case ".docx", ".xlsx", ".pptx":
		return true
	case ".odt", ".ods", ".odp":
		return true
	default:
		return false
	}
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue