1010 lines
30 KiB
Go
1010 lines
30 KiB
Go
package main
|
|
|
|
import (
|
|
"archive/zip"
|
|
"bytes"
|
|
"crypto/sha256"
|
|
"encoding/base64"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
var (
	// fireworksAPIKey is the bearer token sent in the Authorization header of
	// every Fireworks request. It starts empty and is filled in by init.
	fireworksAPIKey string
	// fireworksBaseURL is the root of the Fireworks inference API; endpoint
	// paths such as "/chat/completions" and "/embeddings" are appended to it.
	fireworksBaseURL = "https://api.fireworks.ai/inference/v1"
)
|
|
|
|
func init() {
|
|
fireworksAPIKey = os.Getenv("FIREWORKS_API_KEY")
|
|
if fireworksAPIKey == "" {
|
|
// Try .env file in docsys directory
|
|
envPath := filepath.Join(os.Getenv("HOME"), "dev/docsys/.env")
|
|
if data, err := os.ReadFile(envPath); err == nil {
|
|
for _, line := range strings.Split(string(data), "\n") {
|
|
if strings.HasPrefix(line, "FIREWORKS_API_KEY=") {
|
|
fireworksAPIKey = strings.TrimSpace(strings.TrimPrefix(line, "FIREWORKS_API_KEY="))
|
|
fireworksAPIKey = strings.Trim(fireworksAPIKey, `"'`)
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// DocumentAnalysis is the structured result decoded from the model's JSON
// reply. Field names map one-to-one onto the JSON keys requested in the
// prompts.
type DocumentAnalysis struct {
	Category string      `json:"category"`
	DocType  string      `json:"doc_type"`
	Date     string      `json:"date"`
	Vendor   string      `json:"vendor"`
	Amount   interface{} `json:"amount"` // Can be string or number
	Title    string      `json:"title"`
	Summary  string      `json:"summary"`
	FullText string      `json:"full_text"`
}

// AmountString renders Amount as display text.
//
// A string amount is returned unchanged, a float64 (what encoding/json
// produces for a JSON number) is formatted as "$" followed by two decimals,
// and every other dynamic type — including nil — yields "".
func (d *DocumentAnalysis) AmountString() string {
	if text, isString := d.Amount.(string); isString {
		return text
	}
	if number, isFloat := d.Amount.(float64); isFloat {
		return fmt.Sprintf("$%.2f", number)
	}
	return ""
}
|
|
|
|
// FileHash streams the file at the given path through SHA-256 and returns the
// first 16 hexadecimal characters (the leading 8 bytes) of the digest.
// It returns an error if the file cannot be opened or read.
func FileHash(filepath string) (string, error) {
	src, err := os.Open(filepath)
	if err != nil {
		return "", err
	}
	defer src.Close()

	hasher := sha256.New()
	if _, err = io.Copy(hasher, src); err != nil {
		return "", err
	}

	// 8 bytes of digest print as exactly 16 lowercase hex characters.
	digest := hasher.Sum(nil)
	return fmt.Sprintf("%x", digest[:8]), nil
}
|
|
|
|
// ConvertToImage produces the image bytes that are sent to the vision API.
//
// The extension of filePath (compared case-insensitively) selects the route:
//   - Office documents are first converted to PDF by running "soffice"
//     (LibreOffice) in headless mode, then handled as a PDF.
//   - PDFs are rasterised with "pdftoppm" to PNG at 150 DPI, first page only;
//     the remaining pages are processed elsewhere.
//   - Anything else is read from disk and returned unchanged.
//
// All intermediate files live in temporary directories that are removed
// before the function returns. An error is returned when a temp directory
// cannot be created, an external tool fails, the expected PNG is missing, or
// the file cannot be read.
func ConvertToImage(filePath string) ([]byte, error) {
	origExt := filepath.Ext(filePath)
	ext := strings.ToLower(origExt)

	// Office documents → PDF first
	switch ext {
	case ".doc", ".docx", ".odt", ".rtf", ".xls", ".xlsx", ".ppt", ".pptx":
		tmpDir, err := os.MkdirTemp("", "docsys")
		if err != nil {
			return nil, err
		}
		defer os.RemoveAll(tmpDir)

		cmd := exec.Command("soffice", "--headless", "--convert-to", "pdf", "--outdir", tmpDir, filePath)
		if err := cmd.Run(); err != nil {
			return nil, fmt.Errorf("libreoffice conversion failed: %w", err)
		}

		// The converted file is named after the input's base name. Trim the
		// extension exactly as it is spelled on disk: trimming the
		// lower-cased form would leave "Report.DOCX" untouched and point at
		// a "Report.DOCX.pdf" that does not exist.
		base := strings.TrimSuffix(filepath.Base(filePath), origExt)
		filePath = filepath.Join(tmpDir, base+".pdf")
		ext = ".pdf"
	}

	// Image files — read directly
	if ext != ".pdf" {
		return os.ReadFile(filePath)
	}

	// PDF → PNG (first page only for preview, full processing done separately)
	tmpDir, err := os.MkdirTemp("", "docsys")
	if err != nil {
		return nil, err
	}
	defer os.RemoveAll(tmpDir)

	outPrefix := filepath.Join(tmpDir, "page")
	cmd := exec.Command("pdftoppm", "-png", "-f", "1", "-l", "1", "-r", "150", filePath, outPrefix)
	if err := cmd.Run(); err != nil {
		return nil, fmt.Errorf("pdftoppm failed: %w", err)
	}

	// pdftoppm uses variable-width zero-padding depending on page count
	// (e.g. page-01.png for <100 pages, page-001.png for <1000 pages).
	// Glob for the first match instead of hardcoding "page-1.png".
	matches, err := filepath.Glob(filepath.Join(tmpDir, "page-*.png"))
	if err != nil || len(matches) == 0 {
		return nil, fmt.Errorf("pdftoppm output not found in %s", tmpDir)
	}
	return os.ReadFile(matches[0])
}
|
|
|
|
|
|
// IsTextFile reports whether ext names a plain-text format that can be read
// and analysed directly. ext must include the leading dot and is matched
// exactly, so callers are expected to lower-case it first.
func IsTextFile(ext string) bool {
	switch ext {
	case ".txt", ".md", ".markdown", ".text", ".log",
		".json", ".xml", ".csv", ".yaml", ".yml":
		return true
	default:
		return false
	}
}
|
|
|
|
// AnalyzeWithVision uses K2.5 vision model
|
|
func AnalyzeWithVision(imageData []byte) (*DocumentAnalysis, error) {
|
|
if fireworksAPIKey == "" {
|
|
return nil, fmt.Errorf("FIREWORKS_API_KEY not set")
|
|
}
|
|
|
|
b64 := base64.StdEncoding.EncodeToString(imageData)
|
|
|
|
prompt := `Analyze this document image and extract:
|
|
|
|
1. **Full Text**: Transcribe ALL visible text EXACTLY as it appears — in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Transcribe word-for-word in the source language (Russian, Dutch, German, French, etc.) formatted as clean Markdown:
|
|
- Use headers (##) for sections
|
|
- Use **bold** for labels/field names
|
|
- Use tables for tabular data (items, prices, etc.)
|
|
- Use bullet lists where appropriate
|
|
- Preserve ALL numbers, dates, amounts, and codes exactly as shown
|
|
|
|
2. **Classification**: Categorize into exactly ONE of:
|
|
taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized
|
|
|
|
3. **Document Type**: Specific type (e.g., "utility_bill", "receipt", "tax_form_w2")
|
|
|
|
4. **Key Fields** (these may be in English for searchability):
|
|
- date: Document date (YYYY-MM-DD if possible)
|
|
- vendor: Company/organization name
|
|
- amount: Dollar/currency amount if present (e.g., "$123.45" or "809,400 BYN")
|
|
|
|
5. **Title**: Specific English title (max 8 words) that identifies THIS document uniquely. Ask yourself: "If I had to find this document again, what would I search for?" Prioritize the most unique/recoverable detail: an account number, confirmation code, ID, or specific value that can't be inferred from context. If the document's main value is a number or ID, put it in the title. Bad: "FedEx Receipt Jan 2026" (generic). Good: "FedEx Account 203634010" or "Duke Energy Electric Bill Oct 2024" or "IRS Form 1099-INT Chase Bank 2024". Never use generic words like "Document", "Letter", "Report" alone — always qualify them.
|
|
|
|
6. **Summary**: 1-2 sentence English description with key details.
|
|
|
|
IMPORTANT: The full_text field MUST contain the verbatim transcription in the document's original language. This is non-negotiable.
|
|
|
|
**Known proper nouns** (use these exact spellings when you see similar handwriting):
|
|
- Jongsma (surname — may look like "Jongoma", "Jongsoma", "Jongma")
|
|
- Johan (first name)
|
|
- Tatyana / Tanya (first name)
|
|
- St. Petersburg, Florida (city — not Russia)
|
|
|
|
Respond in JSON ONLY:
|
|
{"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "title": "...", "summary": "...", "full_text": "..."}`
|
|
|
|
reqBody := map[string]interface{}{
|
|
"model": "accounts/fireworks/models/qwen3-vl-30b-a3b-instruct",
|
|
"max_tokens": 4096,
|
|
"messages": []map[string]interface{}{
|
|
{"role": "system", "content": "You are a document analysis API. Output ONLY raw JSON. No thinking, no commentary, no code fences. First character must be {, last character must be }."},
|
|
{
|
|
"role": "user",
|
|
"content": []map[string]interface{}{
|
|
{"type": "image_url", "image_url": map[string]string{"url": "data:image/png;base64," + b64}},
|
|
{"type": "text", "text": prompt},
|
|
},
|
|
},
|
|
},
|
|
}
|
|
|
|
analysis, err := callFireworks(reqBody)
|
|
if err != nil {
|
|
// Retry once with minimal prompt to avoid triggering extended reasoning
|
|
log.Printf(" [AI] First attempt failed, retrying with simplified prompt...")
|
|
retryBody := map[string]interface{}{
|
|
"model": "accounts/fireworks/models/qwen3-vl-30b-a3b-instruct",
|
|
"max_tokens": 4096,
|
|
"messages": []map[string]interface{}{
|
|
{"role": "system", "content": "Output valid JSON only. No other text."},
|
|
{
|
|
"role": "user",
|
|
"content": []map[string]interface{}{
|
|
{"type": "image_url", "image_url": map[string]string{"url": "data:image/png;base64," + b64}},
|
|
{"type": "text", "text": `Look at this document image. Transcribe ALL text verbatim (original language, do NOT translate). Return ONLY this JSON with no placeholders:
|
|
{"category":"","doc_type":"","date":"","vendor":"","amount":"","title":"","summary":"","full_text":""}`},
|
|
},
|
|
},
|
|
},
|
|
}
|
|
return callFireworks(retryBody)
|
|
}
|
|
return analysis, nil
|
|
}
|
|
|
|
// AnalyzeText uses K2 text model for plain text files
|
|
func AnalyzeText(text, filename string) (*DocumentAnalysis, error) {
|
|
if fireworksAPIKey == "" {
|
|
return nil, fmt.Errorf("FIREWORKS_API_KEY not set")
|
|
}
|
|
|
|
// Truncate long text
|
|
if len(text) > 50000 {
|
|
text = text[:50000]
|
|
}
|
|
|
|
prompt := fmt.Sprintf(`Analyze this document:
|
|
|
|
**Filename:** %s
|
|
|
|
**Content:**
|
|
%s
|
|
|
|
Categorize into ONE of: taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized
|
|
|
|
For the title: specific and concise (max 8 words). Ask yourself: "What would I search for to find this again?" Prioritize unique/recoverable details — account numbers, IDs, confirmation codes — over generic document type names. Examples:
|
|
- "FedEx Account 203634010" (if the key value is an account number)
|
|
- "Allegiant Confirmation IB2EJA AVL-PIE Feb 2026"
|
|
- "IRS Form 1099-INT Chase Bank 2024"
|
|
Bad: "FedEx Receipt Jan 2026" or "Flight Confirmation" — too generic.
|
|
|
|
Also produce a "full_text" field: reformat the raw content as clean Markdown. Use ## headers for sections, | tables | for tabular data, **bold** for field labels, and bullet lists where appropriate. Preserve all values exactly. Do not summarize — include everything.
|
|
|
|
Respond in JSON ONLY:
|
|
{"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "title": "...", "summary": "...", "full_text": "..."}`, filename, text)
|
|
|
|
reqBody := map[string]interface{}{
|
|
"model": "accounts/fireworks/models/kimi-k2-instruct-0905",
|
|
"max_tokens": 2048,
|
|
"messages": []map[string]interface{}{
|
|
{"role": "user", "content": prompt},
|
|
},
|
|
}
|
|
|
|
analysis, err := callFireworks(reqBody)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
// If model didn't produce full_text, fall back to raw extracted text
|
|
if analysis.FullText == "" {
|
|
analysis.FullText = text
|
|
}
|
|
return analysis, nil
|
|
}
|
|
|
|
func callFireworks(reqBody map[string]interface{}) (*DocumentAnalysis, error) {
|
|
jsonBody, _ := json.Marshal(reqBody)
|
|
|
|
req, _ := http.NewRequest("POST", fireworksBaseURL+"/chat/completions", bytes.NewReader(jsonBody))
|
|
req.Header.Set("Authorization", "Bearer "+fireworksAPIKey)
|
|
req.Header.Set("Content-Type", "application/json")
|
|
|
|
client := &http.Client{Timeout: 120 * time.Second}
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != 200 {
|
|
body, _ := io.ReadAll(resp.Body)
|
|
return nil, fmt.Errorf("API error %d: %s", resp.StatusCode, string(body))
|
|
}
|
|
|
|
respBody, _ := io.ReadAll(resp.Body)
|
|
|
|
var result struct {
|
|
Choices []struct {
|
|
Message struct {
|
|
Content string `json:"content"`
|
|
ReasoningContent string `json:"reasoning_content"`
|
|
} `json:"message"`
|
|
} `json:"choices"`
|
|
}
|
|
if err := json.Unmarshal(respBody, &result); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if len(result.Choices) == 0 {
|
|
return nil, fmt.Errorf("no response from API")
|
|
}
|
|
|
|
content := result.Choices[0].Message.Content
|
|
reasoning := result.Choices[0].Message.ReasoningContent
|
|
|
|
// K2.5 reasoning mode: actual JSON may be in content or reasoning_content
|
|
// Try content first, if it doesn't look like JSON, try reasoning_content
|
|
if !strings.Contains(content, "{") && reasoning != "" && strings.Contains(reasoning, "{") {
|
|
log.Printf(" [AI] Using reasoning_content (content had no JSON)")
|
|
content = reasoning
|
|
}
|
|
|
|
// Strip markdown code fences (```json ... ``` or ``` ... ```)
|
|
content = strings.TrimSpace(content)
|
|
if strings.HasPrefix(content, "```") {
|
|
// Remove opening fence (```json or ```)
|
|
if idx := strings.Index(content, "\n"); idx >= 0 {
|
|
content = content[idx+1:]
|
|
}
|
|
// Remove closing fence
|
|
if idx := strings.LastIndex(content, "```"); idx >= 0 {
|
|
content = content[:idx]
|
|
}
|
|
content = strings.TrimSpace(content)
|
|
}
|
|
|
|
// Extract JSON from response
|
|
if idx := strings.Index(content, "{"); idx >= 0 {
|
|
if end := strings.LastIndex(content, "}"); end > idx {
|
|
content = content[idx : end+1]
|
|
}
|
|
}
|
|
|
|
var analysis DocumentAnalysis
|
|
if err := json.Unmarshal([]byte(content), &analysis); err != nil {
|
|
// Last resort: try to find a JSON object with braces matching
|
|
cleaned := extractJSONObject(content)
|
|
if cleaned != "" {
|
|
if err2 := json.Unmarshal([]byte(cleaned), &analysis); err2 != nil {
|
|
log.Printf(" [AI debug] Failed to parse even after cleanup. Content starts: %.200s", content)
|
|
return nil, fmt.Errorf("failed to parse response: %w", err)
|
|
}
|
|
} else {
|
|
log.Printf(" [AI debug] No JSON object found in response. Content starts: %.200s", content)
|
|
return nil, fmt.Errorf("failed to parse response: %w", err)
|
|
}
|
|
}
|
|
|
|
// Validate category
|
|
validCats := map[string]bool{"taxes": true, "bills": true, "medical": true, "insurance": true, "legal": true, "financial": true, "expenses": true, "vehicles": true, "home": true, "personal": true, "contacts": true, "uncategorized": true}
|
|
if !validCats[analysis.Category] {
|
|
analysis.Category = "uncategorized"
|
|
}
|
|
|
|
return &analysis, nil
|
|
}
|
|
|
|
// extractJSONObject returns the first brace-balanced {...} span in s, or ""
// when s has no "{" or the braces opened at the first "{" never close.
//
// Scanning starts at the first "{". Braces inside double-quoted strings are
// ignored, and a backslash inside a string protects the byte that follows
// it, so an escaped quote does not end the string.
func extractJSONObject(s string) string {
	start := strings.Index(s, "{")
	if start < 0 {
		return ""
	}

	var (
		depth      int
		inString   bool
		skipNext   bool
	)
	for pos := start; pos < len(s); pos++ {
		ch := s[pos]
		switch {
		case skipNext:
			// Byte after a backslash: consumed without interpretation.
			skipNext = false
		case inString && ch == '\\':
			skipNext = true
		case ch == '"':
			inString = !inString
		case inString:
			// Ordinary string content.
		case ch == '{':
			depth++
		case ch == '}':
			depth--
			if depth == 0 {
				return s[start : pos+1]
			}
		}
	}
	return ""
}
|
|
|
|
// GenerateEmbedding creates a vector embedding using Fireworks
|
|
func GenerateEmbedding(text string) ([]float32, error) {
|
|
if fireworksAPIKey == "" {
|
|
return nil, fmt.Errorf("FIREWORKS_API_KEY not set")
|
|
}
|
|
|
|
// Truncate
|
|
if len(text) > 32000 {
|
|
text = text[:32000]
|
|
}
|
|
|
|
reqBody := map[string]interface{}{
|
|
"model": "fireworks/qwen3-embedding-8b",
|
|
"input": text,
|
|
}
|
|
jsonBody, _ := json.Marshal(reqBody)
|
|
|
|
req, _ := http.NewRequest("POST", fireworksBaseURL+"/embeddings", bytes.NewReader(jsonBody))
|
|
req.Header.Set("Authorization", "Bearer "+fireworksAPIKey)
|
|
req.Header.Set("Content-Type", "application/json")
|
|
|
|
client := &http.Client{Timeout: 30 * time.Second}
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != 200 {
|
|
body, _ := io.ReadAll(resp.Body)
|
|
return nil, fmt.Errorf("embedding API error %d: %s", resp.StatusCode, string(body))
|
|
}
|
|
|
|
var result struct {
|
|
Data []struct {
|
|
Embedding []float32 `json:"embedding"`
|
|
} `json:"data"`
|
|
}
|
|
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if len(result.Data) == 0 {
|
|
return nil, fmt.Errorf("no embedding returned")
|
|
}
|
|
|
|
return result.Data[0].Embedding, nil
|
|
}
|
|
|
|
// GetPDFPageCount returns the number of pages in the PDF at filePath, as
// reported on the "Pages:" line of the external "pdfinfo" tool.
//
// The function never fails: it returns 1 when pdfinfo cannot be run or exits
// with an error, when its output has no "Pages:" line, and when that line
// does not contain a positive integer.
func GetPDFPageCount(filePath string) int {
	out, err := exec.Command("pdfinfo", filePath).Output()
	if err != nil {
		return 1
	}

	for _, line := range strings.Split(string(out), "\n") {
		if !strings.HasPrefix(line, "Pages:") {
			continue
		}
		var count int
		// An unparsable or non-positive value must not leak out as 0: the
		// caller's page loop would then silently process nothing.
		if _, err := fmt.Sscanf(line, "Pages: %d", &count); err != nil || count < 1 {
			return 1
		}
		return count
	}
	return 1
}
|
|
|
|
// ProcessPDFPageByPage extracts text from each page separately
|
|
func ProcessPDFPageByPage(filePath string, jobID string) (string, error) {
|
|
pageCount := GetPDFPageCount(filePath)
|
|
log.Printf(" Processing %d pages separately...", pageCount)
|
|
|
|
var allText strings.Builder
|
|
|
|
for page := 1; page <= pageCount; page++ {
|
|
UpdateJob(jobID, "ocr", fmt.Sprintf("Page %d/%d", page, pageCount))
|
|
tmpDir, err := os.MkdirTemp("", "docsys-page")
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
// Convert single page to PNG
|
|
outPrefix := filepath.Join(tmpDir, "page")
|
|
cmd := exec.Command("pdftoppm", "-png", "-f", fmt.Sprintf("%d", page), "-l", fmt.Sprintf("%d", page), "-r", "150", filePath, outPrefix)
|
|
if err := cmd.Run(); err != nil {
|
|
os.RemoveAll(tmpDir)
|
|
continue
|
|
}
|
|
|
|
// Glob for the output — pdftoppm zero-pads based on total page count
|
|
pageMatches, _ := filepath.Glob(filepath.Join(tmpDir, "page-*.png"))
|
|
if len(pageMatches) == 0 {
|
|
os.RemoveAll(tmpDir)
|
|
continue
|
|
}
|
|
imageData, err := os.ReadFile(pageMatches[0])
|
|
os.RemoveAll(tmpDir)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
// OCR this page
|
|
log.Printf(" Page %d/%d...", page, pageCount)
|
|
pageAnalysis, err := AnalyzePageOnly(imageData, page)
|
|
if err != nil {
|
|
log.Printf(" Page %d failed: %v", page, err)
|
|
continue
|
|
}
|
|
|
|
if pageAnalysis != "" {
|
|
allText.WriteString(fmt.Sprintf("\n\n---\n## Page %d\n\n", page))
|
|
allText.WriteString(pageAnalysis)
|
|
}
|
|
}
|
|
|
|
return allText.String(), nil
|
|
}
|
|
|
|
// AnalyzePageOnly extracts just the text from a single page image
|
|
func AnalyzePageOnly(imageData []byte, pageNum int) (string, error) {
|
|
if fireworksAPIKey == "" {
|
|
return "", fmt.Errorf("FIREWORKS_API_KEY not set")
|
|
}
|
|
|
|
b64 := base64.StdEncoding.EncodeToString(imageData)
|
|
|
|
prompt := `Transcribe ALL visible text on this page EXACTLY as it appears, in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Output ONLY the transcribed text — no commentary, no analysis, no preamble, no "The document is..." sentences. Start directly with the content.
|
|
|
|
FORMAT: Use ### for sections, **bold** for labels, markdown tables for tabular data, - bullets for lists. Preserve ALL numbers, dates, amounts, and values exactly as shown. If the document is in Russian, Dutch, German, French, or any other language — keep it in that language.`
|
|
|
|
reqBody := map[string]interface{}{
|
|
"model": "accounts/fireworks/models/qwen3-vl-30b-a3b-instruct",
|
|
"max_tokens": 4096,
|
|
"messages": []map[string]interface{}{
|
|
{
|
|
"role": "user",
|
|
"content": []map[string]interface{}{
|
|
{"type": "image_url", "image_url": map[string]string{"url": "data:image/png;base64," + b64}},
|
|
{"type": "text", "text": prompt},
|
|
},
|
|
},
|
|
},
|
|
}
|
|
|
|
jsonBody, _ := json.Marshal(reqBody)
|
|
req, _ := http.NewRequest("POST", fireworksBaseURL+"/chat/completions", bytes.NewReader(jsonBody))
|
|
req.Header.Set("Authorization", "Bearer "+fireworksAPIKey)
|
|
req.Header.Set("Content-Type", "application/json")
|
|
|
|
client := &http.Client{Timeout: 120 * time.Second}
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != 200 {
|
|
body, _ := io.ReadAll(resp.Body)
|
|
return "", fmt.Errorf("API error %d: %s", resp.StatusCode, string(body))
|
|
}
|
|
|
|
// Read raw response to debug content vs reasoning_content
|
|
rawBody, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
var result struct {
|
|
Choices []struct {
|
|
Message struct {
|
|
Content string `json:"content"`
|
|
ReasoningContent string `json:"reasoning_content"`
|
|
} `json:"message"`
|
|
} `json:"choices"`
|
|
}
|
|
if err := json.Unmarshal(rawBody, &result); err != nil {
|
|
return "", err
|
|
}
|
|
|
|
if len(result.Choices) == 0 {
|
|
return "", fmt.Errorf("no response")
|
|
}
|
|
|
|
content := result.Choices[0].Message.Content
|
|
reasoning := result.Choices[0].Message.ReasoningContent
|
|
|
|
if reasoning != "" {
|
|
log.Printf(" [OCR debug] reasoning_content length: %d, content length: %d", len(reasoning), len(content))
|
|
if len(content) > 100 {
|
|
log.Printf(" [OCR debug] content starts: %.100s", content)
|
|
}
|
|
}
|
|
|
|
// If content is empty but reasoning has text, model put everything in wrong field
|
|
if strings.TrimSpace(content) == "" && reasoning != "" {
|
|
log.Printf(" [OCR debug] WARNING: content empty, using reasoning_content")
|
|
content = reasoning
|
|
}
|
|
|
|
return strings.TrimSpace(content), nil
|
|
}
|
|
|
|
// ProcessDocument handles the full document processing pipeline for the file
// at filePath and returns the stored Document record.
//
// Steps, in order:
//  1. Hash the file; the hash is used as both the job ID and the document ID.
//  2. If a document with that hash already exists with Status "ready", delete
//     the input file and return the existing record.
//  3. Analyse the content by (lower-cased) extension: plain-text files go
//     straight to AnalyzeText; formats accepted by IsOfficeFile have their
//     text extracted with ExtractOfficeText and then go to AnalyzeText;
//     everything else is converted with ConvertToImage and sent to
//     AnalyzeWithVision.
//  4. Copy the file into storeDir as <hash><ext> unless that path exists.
//  5. Insert the Document, then generate and store an embedding of the full
//     text (an embedding failure is logged but does not fail the call).
//  6. Remove the input file.
//
// FinishJob is deferred right after StartJob, so it runs on every return
// after that point.
//
// NOTE(review): several error returns (reading a text file, the store copy,
// the database insert) do not call UpdateJob(hash, "error", ...) the way the
// analysis failures do — confirm whether that is intentional.
func ProcessDocument(filePath string) (*Document, error) {
	log.Printf("Processing: %s", filepath.Base(filePath))

	ext := strings.ToLower(filepath.Ext(filePath))

	// Get file hash
	hash, err := FileHash(filePath)
	if err != nil {
		return nil, fmt.Errorf("hash failed: %w", err)
	}
	log.Printf(" Hash: %s", hash)

	// Start progress tracking
	StartJob(hash, filepath.Base(filePath))
	defer FinishJob(hash)

	// Check if already fully processed (not pending).
	// The lookup error is discarded: any failure is treated as "not found".
	if existing, _ := GetDocument(hash); existing != nil && existing.Status == "ready" {
		log.Printf(" Already exists, skipping")
		os.Remove(filePath)
		return existing, nil
	}

	var analysis *DocumentAnalysis

	if IsTextFile(ext) {
		// Plain text — read and analyze
		data, err := os.ReadFile(filePath)
		if err != nil {
			return nil, err
		}
		UpdateJob(hash, "classifying", "Analyzing text...")
		log.Printf(" Analyzing text with K2...")
		analysis, err = AnalyzeText(string(data), filepath.Base(filePath))
		if err != nil {
			UpdateJob(hash, "error", err.Error())
			return nil, fmt.Errorf("text analysis failed: %w", err)
		}
	} else if IsOfficeFile(ext) {
		// Office formats — extract text natively from ZIP/XML, no LibreOffice needed
		UpdateJob(hash, "converting", "Extracting text...")
		log.Printf(" Extracting text from %s...", ext)
		text, err := ExtractOfficeText(filePath)
		if err != nil {
			UpdateJob(hash, "error", err.Error())
			return nil, fmt.Errorf("office text extraction failed: %w", err)
		}
		log.Printf(" Extracted %d chars, classifying...", len(text))
		UpdateJob(hash, "classifying", "Classifying...")
		analysis, err = AnalyzeText(text, filepath.Base(filePath))
		if err != nil {
			UpdateJob(hash, "error", err.Error())
			return nil, fmt.Errorf("text analysis failed: %w", err)
		}
	} else {
		// Vision — convert to image and analyze
		UpdateJob(hash, "converting", "Converting to image...")
		log.Printf(" Converting to image...")
		imageData, err := ConvertToImage(filePath)
		if err != nil {
			UpdateJob(hash, "error", err.Error())
			return nil, fmt.Errorf("image conversion failed: %w", err)
		}
		UpdateJob(hash, "ocr", "Analyzing first page...")
		log.Printf(" Analyzing with K2.5 vision...")
		analysis, err = AnalyzeWithVision(imageData)
		if err != nil {
			// Vision JSON extraction failed — fall back to two-step: plain-text OCR + text classifier
			log.Printf(" AnalyzeWithVision failed (%v), falling back to OCR+classify...", err)
			UpdateJob(hash, "ocr", "Falling back to OCR + classify...")
			pageText, ocrErr := AnalyzePageOnly(imageData, 1)
			if ocrErr != nil {
				UpdateJob(hash, "error", ocrErr.Error())
				return nil, fmt.Errorf("vision analysis failed (primary: %v, fallback: %w)", err, ocrErr)
			}
			// Classify the extracted text
			log.Printf(" OCR succeeded (%d chars), classifying...", len(pageText))
			UpdateJob(hash, "classifying", "Classifying extracted text...")
			analysis, err = AnalyzeText(pageText, filepath.Base(filePath))
			if err != nil {
				// Use minimal stub so at least the doc is stored with its text
				log.Printf(" Classification failed too: %v — storing with minimal metadata", err)
				analysis = &DocumentAnalysis{
					Category: "uncategorized",
					DocType:  "unknown",
					Title:    strings.TrimSuffix(filepath.Base(filePath), filepath.Ext(filePath)),
					Summary:  "Extraction failed — stored raw text only",
					FullText: pageText,
				}
			} else {
				// Keep the OCR transcription rather than the classifier's
				// reformatted full_text.
				analysis.FullText = pageText
			}
		}

		// For PDFs, process pages for accurate OCR
		if ext == ".pdf" {
			pageCount := GetPDFPageCount(filePath)
			if pageCount > 1 {
				log.Printf(" Multi-page PDF detected (%d pages)", pageCount)
				UpdateJob(hash, "ocr", fmt.Sprintf("Multi-page PDF: %d pages", pageCount))
				fullText, err := ProcessPDFPageByPage(filePath, hash)
				// Only replace the first-page text when the per-page pass
				// actually produced something.
				if err == nil && fullText != "" {
					analysis.FullText = fullText
				}
			} else if analysis.FullText == "" || len(analysis.FullText) < 50 || strings.HasPrefix(analysis.FullText, "[") {
				// Single-page but full_text is empty/placeholder — retry with AnalyzePageOnly
				log.Printf(" Single-page PDF with bad full_text (%q) — retrying with AnalyzePageOnly...", analysis.FullText)
				UpdateJob(hash, "ocr", "Retrying text extraction...")
				if pageText, err := AnalyzePageOnly(imageData, 1); err == nil && pageText != "" {
					analysis.FullText = pageText
				} else if err != nil {
					log.Printf(" AnalyzePageOnly fallback failed: %v", err)
				}
			}
		}
	}

	log.Printf(" Category: %s, Type: %s", analysis.Category, analysis.DocType)

	// Copy to store (skip if already there — reprocessing from store-backed upload)
	storePath := filepath.Join(storeDir, hash+ext)
	if _, statErr := os.Stat(storePath); os.IsNotExist(statErr) {
		if err := copyFile(filePath, storePath); err != nil {
			return nil, fmt.Errorf("store copy failed: %w", err)
		}
	}

	// Only set PDFPath for actual PDFs — office/text files have no previewable PDF
	pdfStorePath := ""
	if ext == ".pdf" {
		pdfStorePath = storePath
	}

	// Create document record
	// Use title if provided, fall back to summary
	title := analysis.Title
	if title == "" {
		title = analysis.Summary
	}

	doc := &Document{
		ID:           hash,
		Title:        title,
		Category:     analysis.Category,
		Type:         analysis.DocType,
		Date:         analysis.Date,
		Amount:       analysis.AmountString(),
		Vendor:       analysis.Vendor,
		Summary:      analysis.Summary,
		FullText:     analysis.FullText,
		PDFPath:      pdfStorePath,
		OriginalFile: filepath.Base(filePath),
		ProcessedAt:  time.Now().Format(time.RFC3339),
		Status:       "ready",
	}

	// Save to database
	if err := InsertDocument(doc); err != nil {
		return nil, fmt.Errorf("db insert failed: %w", err)
	}

	// Generate embedding (best effort: a failure is logged, not returned)
	if analysis.FullText != "" {
		UpdateJob(hash, "embedding", "Generating search index...")
		log.Printf(" Generating embedding...")
		if emb, err := GenerateEmbedding(analysis.FullText); err == nil {
			log.Printf(" Embedding: %d dimensions", len(emb))
			StoreEmbedding(hash, emb)
		} else {
			log.Printf(" Embedding failed: %v", err)
		}
	}

	// Remove from inbox
	os.Remove(filePath)

	log.Printf(" ✓ Done: %s/%s", analysis.Category, hash)
	return doc, nil
}
|
|
|
|
// copyFile copies the contents of src to dst, creating dst or truncating it
// if it already exists.
//
// The error from closing dst is reported, because buffered data may only be
// written out at close time. If the copy or the close fails, the incomplete
// dst is removed so that a later existence check cannot mistake a partial
// file for a finished copy.
//
// An error is returned when src cannot be opened, dst cannot be created, or
// copying/closing fails.
func copyFile(src, dst string) (err error) {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()

	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	defer func() {
		if closeErr := out.Close(); err == nil {
			err = closeErr
		}
		if err != nil {
			// Best effort cleanup; the original error is what matters.
			os.Remove(dst)
		}
	}()

	_, err = io.Copy(out, in)
	return err
}
|
|
|
|
// ExtractOfficeText extracts plain text from DOCX/XLSX/PPTX natively.
|
|
// These are ZIP archives containing XML — no LibreOffice needed.
|
|
func ExtractOfficeText(filePath string) (string, error) {
|
|
ext := strings.ToLower(filepath.Ext(filePath))
|
|
|
|
r, err := zip.OpenReader(filePath)
|
|
if err != nil {
|
|
return "", fmt.Errorf("not a valid Office file: %w", err)
|
|
}
|
|
defer r.Close()
|
|
|
|
// Which XML paths to extract per format
|
|
var targets []string
|
|
switch ext {
|
|
case ".docx", ".doc", ".odt", ".rtf":
|
|
targets = []string{"word/document.xml", "word/body.xml"}
|
|
case ".xlsx", ".xls":
|
|
// All sheets
|
|
for _, f := range r.File {
|
|
if strings.HasPrefix(f.Name, "xl/worksheets/sheet") && strings.HasSuffix(f.Name, ".xml") {
|
|
targets = append(targets, f.Name)
|
|
}
|
|
}
|
|
// Shared strings (cell values are stored here for xlsx)
|
|
targets = append(targets, "xl/sharedStrings.xml")
|
|
case ".pptx", ".ppt":
|
|
for _, f := range r.File {
|
|
if strings.HasPrefix(f.Name, "ppt/slides/slide") && strings.HasSuffix(f.Name, ".xml") {
|
|
targets = append(targets, f.Name)
|
|
}
|
|
}
|
|
default:
|
|
return "", fmt.Errorf("unsupported office format: %s", ext)
|
|
}
|
|
|
|
var sb strings.Builder
|
|
for _, f := range r.File {
|
|
found := false
|
|
for _, t := range targets {
|
|
if f.Name == t {
|
|
found = true
|
|
break
|
|
}
|
|
}
|
|
if !found {
|
|
continue
|
|
}
|
|
|
|
rc, err := f.Open()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
data, err := io.ReadAll(rc)
|
|
rc.Close()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
sb.WriteString(xmlToText(data))
|
|
sb.WriteString("\n")
|
|
}
|
|
|
|
text := strings.TrimSpace(sb.String())
|
|
if text == "" {
|
|
return "", fmt.Errorf("no text extracted from %s", filepath.Base(filePath))
|
|
}
|
|
return text, nil
|
|
}
|
|
|
|
// xmlToText converts office XML into plain text with basic structure.
//
// The input is scanned byte by byte into a stream of tags and text nodes;
// it is not validated as XML. Tag names are reduced to their lower-cased
// local name (namespace prefix and attributes dropped), and closing tags are
// ignored. Rendering rules:
//   - "p", "br" and "tr" start a new line before the next text;
//   - "tc" writes a tab before every table cell except the first in its row;
//   - whitespace-only text nodes are dropped;
//   - the entities &amp; &lt; &gt; &quot; &apos; (and &#39;) plus the
//     newline and tab character references are decoded.
//
// Entities are decoded in a single pass, so "&amp;lt;" yields the literal
// text "&lt;" rather than being decoded twice into "<". The result has
// leading and trailing whitespace trimmed.
func xmlToText(data []byte) string {
	s := string(data)

	// Pass 1: build a token stream of tags and text nodes.
	type token struct {
		tag  string // lowercased local name, or "" for text
		text string
	}
	var (
		tokens  []token
		textBuf strings.Builder
		tagBuf  strings.Builder
		inTag   bool
	)

	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case c == '<':
			// flush text buffer
			if textBuf.Len() > 0 {
				tokens = append(tokens, token{text: textBuf.String()})
				textBuf.Reset()
			}
			inTag = true
			tagBuf.Reset()
		case c == '>':
			inTag = false
			raw := strings.TrimSpace(tagBuf.String())
			tagBuf.Reset()
			// Closing tags carry no layout information here.
			if strings.HasPrefix(raw, "/") {
				continue
			}
			// Extract local name (strip attributes, then namespace)
			name := raw
			if idx := strings.IndexAny(raw, " \t\n\r/"); idx >= 0 {
				name = raw[:idx]
			}
			if idx := strings.Index(name, ":"); idx >= 0 {
				name = name[idx+1:]
			}
			if name == "" {
				// A stray '>' in text or an empty tag: nothing to record.
				continue
			}
			tokens = append(tokens, token{tag: strings.ToLower(name)})
		case inTag:
			tagBuf.WriteByte(c)
		default:
			textBuf.WriteByte(c)
		}
	}
	if textBuf.Len() > 0 {
		tokens = append(tokens, token{text: textBuf.String()})
	}

	// Single-pass entity decoder; see the function comment for why the
	// replacements must not be applied one after another.
	entities := strings.NewReplacer(
		"&amp;", "&",
		"&lt;", "<",
		"&gt;", ">",
		"&quot;", "\"",
		"&apos;", "'",
		"&#39;", "'",
		"&#xA;", "\n",
		"&#10;", "\n",
		"&#x9;", "\t",
		"&#9;", "\t",
	)

	// Pass 2: render tokens into structured text.
	var out strings.Builder
	pendingNewline := false // a line break owed before the next text
	inCell := false         // a cell has already been seen in this row

	for _, tok := range tokens {
		if tok.tag == "" {
			txt := entities.Replace(tok.text)
			if strings.TrimSpace(txt) != "" {
				if pendingNewline {
					out.WriteByte('\n')
					pendingNewline = false
				}
				out.WriteString(txt)
			}
			continue
		}
		switch tok.tag {
		case "p", "br": // paragraph, line break
			pendingNewline = true
		case "tr": // table row start
			inCell = false
			pendingNewline = true
		case "tc": // table cell
			if inCell {
				out.WriteString("\t")
			}
			inCell = true
		}
	}

	return strings.TrimSpace(out.String())
}
|
|
|
|
// IsOfficeFile reports whether ext belongs to one of the ZIP/XML office
// formats (Office Open XML or OpenDocument) whose text is extracted natively.
// ext must include the leading dot and is matched exactly, so callers are
// expected to lower-case it first.
func IsOfficeFile(ext string) bool {
	switch ext {
	case ".docx", ".xlsx", ".pptx":
		return true
	case ".odt", ".ods", ".odp":
		return true
	}
	return false
}
|