docsys/ai.go

748 lines
22 KiB
Go

package main
import (
"bytes"
"crypto/sha256"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
)
// Fireworks AI connection settings shared by the API calls in this file.
var (
// fireworksAPIKey is the bearer token sent in the Authorization header.
// init fills it from the FIREWORKS_API_KEY environment variable or, failing
// that, from $HOME/dev/docsys/.env; it stays empty when neither provides it.
fireworksAPIKey string
// fireworksBaseURL is the API root; callers append "/chat/completions" or
// "/embeddings" to it.
fireworksBaseURL = "https://api.fireworks.ai/inference/v1"
)
// init resolves the Fireworks API key once at program start.
//
// The FIREWORKS_API_KEY environment variable wins. When it is empty, the
// first line of $HOME/dev/docsys/.env that starts with "FIREWORKS_API_KEY="
// is used instead, with surrounding whitespace and quote characters removed.
// If neither source yields a key, fireworksAPIKey is left empty.
func init() {
	const keyPrefix = "FIREWORKS_API_KEY="

	fireworksAPIKey = os.Getenv("FIREWORKS_API_KEY")
	if fireworksAPIKey != "" {
		return
	}

	// Fall back to the .env file in the docsys directory.
	contents, readErr := os.ReadFile(filepath.Join(os.Getenv("HOME"), "dev/docsys/.env"))
	if readErr != nil {
		return
	}
	for _, entry := range strings.Split(string(contents), "\n") {
		if !strings.HasPrefix(entry, keyPrefix) {
			continue
		}
		value := strings.TrimSpace(strings.TrimPrefix(entry, keyPrefix))
		fireworksAPIKey = strings.Trim(value, `"'`)
		return
	}
}
// DocumentAnalysis holds the fields the AI model extracts from a document.
// It is decoded directly from the JSON object the model returns.
type DocumentAnalysis struct {
	// Category is the filing bucket, e.g. "bills" or "uncategorized".
	Category string `json:"category"`
	// DocType is a finer-grained type such as "utility_bill" or "receipt".
	DocType string `json:"doc_type"`
	// Date is the document date, requested as YYYY-MM-DD where possible.
	Date string `json:"date"`
	// Vendor is the company or organization name.
	Vendor string `json:"vendor"`
	// Amount arrives either as a JSON string ("$123.45") or a JSON number;
	// AmountString turns it into display text.
	Amount interface{} `json:"amount"`
	// Title is a short English title.
	Title string `json:"title"`
	// Summary is a brief English description.
	Summary string `json:"summary"`
	// FullText is the document's text content.
	FullText string `json:"full_text"`
}

// AmountString renders Amount for display: a string value is returned
// unchanged, a float64 is formatted as "$" plus two decimals, and anything
// else (including a missing amount) yields the empty string.
func (d *DocumentAnalysis) AmountString() string {
	if text, isString := d.Amount.(string); isString {
		return text
	}
	if number, isFloat := d.Amount.(float64); isFloat {
		return fmt.Sprintf("$%.2f", number)
	}
	return ""
}
// FileHash streams the named file through SHA-256 and returns the first 16
// hexadecimal characters (the leading 8 bytes) of the digest. It returns an
// error if the file cannot be opened or read.
func FileHash(filepath string) (string, error) {
	file, openErr := os.Open(filepath)
	if openErr != nil {
		return "", openErr
	}
	defer file.Close()

	hasher := sha256.New()
	if _, copyErr := io.Copy(hasher, file); copyErr != nil {
		return "", copyErr
	}

	// Eight digest bytes print as exactly sixteen lowercase hex characters.
	digest := hasher.Sum(nil)
	return fmt.Sprintf("%x", digest[:8]), nil
}
// ConvertToImage renders a document as image bytes for the vision API.
//
// Office documents are first converted to PDF with headless LibreOffice. PDFs
// (including the one just produced) have their first page rasterised to PNG
// at 150 DPI with pdftoppm. Any other file is treated as an image already and
// its bytes are returned unmodified.
//
// An error is returned when a temporary directory cannot be created, when an
// external tool fails or produces no PNG, or when the result cannot be read.
func ConvertToImage(filePath string) ([]byte, error) {
	origExt := filepath.Ext(filePath)
	ext := strings.ToLower(origExt)

	// Office documents → PDF first.
	switch ext {
	case ".doc", ".docx", ".odt", ".rtf", ".xls", ".xlsx", ".ppt", ".pptx":
		officeDir, err := os.MkdirTemp("", "docsys")
		if err != nil {
			return nil, err
		}
		defer os.RemoveAll(officeDir)

		cmd := exec.Command("libreoffice", "--headless", "--convert-to", "pdf", "--outdir", officeDir, filePath)
		if err := cmd.Run(); err != nil {
			return nil, fmt.Errorf("libreoffice conversion failed: %w", err)
		}
		// LibreOffice names its output after the input's base name. Strip the
		// extension as spelled on disk, not the lower-cased copy, so that
		// "Report.DOCX" resolves to "Report.pdf" rather than "Report.DOCX.pdf".
		base := strings.TrimSuffix(filepath.Base(filePath), origExt)
		filePath = filepath.Join(officeDir, base+".pdf")
		ext = ".pdf"
	}

	// Image files — read directly.
	if ext != ".pdf" {
		return os.ReadFile(filePath)
	}

	// PDF → PNG (first page only; full processing is done separately).
	pageDir, err := os.MkdirTemp("", "docsys")
	if err != nil {
		return nil, err
	}
	defer os.RemoveAll(pageDir)

	cmd := exec.Command("pdftoppm", "-png", "-f", "1", "-l", "1", "-r", "150", filePath, filepath.Join(pageDir, "page"))
	if err := cmd.Run(); err != nil {
		return nil, fmt.Errorf("pdftoppm failed: %w", err)
	}

	// pdftoppm zero-pads the page number according to the document's page
	// count ("page-1.png", "page-01.png", "page-001.png"), so look for the
	// single PNG it wrote instead of guessing the exact name.
	entries, err := os.ReadDir(pageDir)
	if err != nil {
		return nil, err
	}
	for _, entry := range entries {
		name := entry.Name()
		if strings.HasPrefix(name, "page") && strings.HasSuffix(name, ".png") {
			return os.ReadFile(filepath.Join(pageDir, name))
		}
	}
	return nil, fmt.Errorf("pdftoppm produced no PNG for %s", filePath)
}
// IsTextFile reports whether ext names a plain-text format that can be read
// directly rather than rendered to an image. The comparison is exact: ext
// must include the leading dot and already be lower-case.
func IsTextFile(ext string) bool {
	switch ext {
	case ".txt", ".md", ".markdown", ".text", ".log",
		".json", ".xml", ".csv", ".yaml", ".yml":
		return true
	default:
		return false
	}
}
// AnalyzeWithVision sends a document image to the K2.5 vision model and
// returns the structured analysis (category, type, key fields, title,
// summary and full text).
//
// The image is embedded as a base64 data URL whose media type is sniffed from
// the bytes, so JPEG/GIF/WebP inputs are labelled correctly; unrecognised data
// keeps the historical "image/png" label. If the first request fails for any
// reason, the call is retried once with a much shorter prompt.
//
// It returns an error when FIREWORKS_API_KEY is not configured or when both
// attempts fail (the error is the one from the retry).
func AnalyzeWithVision(imageData []byte) (*DocumentAnalysis, error) {
	if fireworksAPIKey == "" {
		return nil, fmt.Errorf("FIREWORKS_API_KEY not set")
	}

	mimeType := http.DetectContentType(imageData)
	if !strings.HasPrefix(mimeType, "image/") {
		mimeType = "image/png"
	}
	dataURL := "data:" + mimeType + ";base64," + base64.StdEncoding.EncodeToString(imageData)

	// Both attempts share the same request shape; only the prompts differ.
	buildRequest := func(systemPrompt, userPrompt string) map[string]interface{} {
		return map[string]interface{}{
			"model":      "accounts/fireworks/models/kimi-k2p5",
			"max_tokens": 4096,
			"messages": []map[string]interface{}{
				{"role": "system", "content": systemPrompt},
				{
					"role": "user",
					"content": []map[string]interface{}{
						{"type": "image_url", "image_url": map[string]string{"url": dataURL}},
						{"type": "text", "text": userPrompt},
					},
				},
			},
		}
	}

	prompt := `Analyze this document image and extract:
1. **Full Text**: Transcribe ALL visible text EXACTLY as it appears — in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Transcribe word-for-word in the source language (Russian, Dutch, German, French, etc.) formatted as clean Markdown:
- Use headers (##) for sections
- Use **bold** for labels/field names
- Use tables for tabular data (items, prices, etc.)
- Use bullet lists where appropriate
- Preserve ALL numbers, dates, amounts, and codes exactly as shown
2. **Classification**: Categorize into exactly ONE of:
taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized
3. **Document Type**: Specific type (e.g., "utility_bill", "receipt", "tax_form_w2")
4. **Key Fields** (these may be in English for searchability):
- date: Document date (YYYY-MM-DD if possible)
- vendor: Company/organization name
- amount: Dollar/currency amount if present (e.g., "$123.45" or "809,400 BYN")
5. **Title**: SHORT English title (max 6-8 words), e.g. "Apple Store Mac Mini Receipt" or "Electric Bill March 2025"
6. **Summary**: 1-2 sentence English description with key details.
IMPORTANT: The full_text field MUST contain the verbatim transcription in the document's original language. This is non-negotiable.
Respond in JSON ONLY:
{"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "title": "...", "summary": "...", "full_text": "..."}`

	analysis, err := callFireworks(buildRequest(
		"You are a document analysis API. Output ONLY raw JSON. No thinking, no commentary, no code fences. First character must be {, last character must be }.",
		prompt,
	))
	if err == nil {
		return analysis, nil
	}

	// Retry once with a minimal prompt to avoid triggering extended reasoning.
	log.Printf(" [AI] First attempt failed (%v), retrying with simplified prompt...", err)
	retryPrompt := `Look at this document image. Transcribe ALL text verbatim (original language, do NOT translate). Return ONLY this JSON with no placeholders:
{"category":"","doc_type":"","date":"","vendor":"","amount":"","title":"","summary":"","full_text":""}`
	return callFireworks(buildRequest("Output valid JSON only. No other text.", retryPrompt))
}
// AnalyzeText classifies a plain-text document with the K2 text model.
//
// text is the document content and filename is passed to the model as a hint.
// Content longer than 50000 bytes is truncated; the cut is moved back to the
// nearest UTF-8 character boundary so a multi-byte character is never split.
// The returned analysis carries the (possibly truncated) text in FullText.
//
// It returns an error when FIREWORKS_API_KEY is not configured or when the
// API call fails.
func AnalyzeText(text, filename string) (*DocumentAnalysis, error) {
	if fireworksAPIKey == "" {
		return nil, fmt.Errorf("FIREWORKS_API_KEY not set")
	}

	// Truncate long text.
	const maxTextBytes = 50000
	if len(text) > maxTextBytes {
		cut := maxTextBytes
		// A byte of the form 10xxxxxx continues a character that started
		// earlier; a UTF-8 character has at most three of them, so back up
		// at most three bytes to land on a character start.
		for back := 0; back < 3 && cut > 0 && text[cut]&0xC0 == 0x80; back++ {
			cut--
		}
		text = text[:cut]
	}

	prompt := fmt.Sprintf(`Analyze this document:
**Filename:** %s
**Content:**
%s
Categorize into ONE of: taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized
Respond in JSON ONLY:
{"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "summary": "..."}`, filename, text)

	reqBody := map[string]interface{}{
		"model":      "accounts/fireworks/models/kimi-k2-instruct-0905",
		"max_tokens": 1024,
		"messages": []map[string]interface{}{
			{"role": "user", "content": prompt},
		},
	}

	analysis, err := callFireworks(reqBody)
	if err != nil {
		return nil, err
	}
	// The model is not asked to echo the text back; attach what was sent.
	analysis.FullText = text
	return analysis, nil
}
// callFireworks posts reqBody to the chat-completions endpoint and decodes
// the model's reply into a DocumentAnalysis.
//
// The reply text is taken from the first choice's content, or from its
// reasoning_content when content contains no "{" but the reasoning does.
// Markdown code fences are stripped, the span from the first "{" to the last
// "}" is parsed as JSON, and extractJSONObject is tried as a last resort. A
// category outside the known list is replaced by "uncategorized".
//
// It returns an error when the request cannot be built or sent, when the API
// answers with a non-200 status, when the body cannot be read or decoded, when
// there are no choices, or when no parsable JSON object is found.
func callFireworks(reqBody map[string]interface{}) (*DocumentAnalysis, error) {
	jsonBody, err := json.Marshal(reqBody)
	if err != nil {
		return nil, fmt.Errorf("encoding request: %w", err)
	}
	req, err := http.NewRequest("POST", fireworksBaseURL+"/chat/completions", bytes.NewReader(jsonBody))
	if err != nil {
		return nil, fmt.Errorf("building request: %w", err)
	}
	req.Header.Set("Authorization", "Bearer "+fireworksAPIKey)
	req.Header.Set("Content-Type", "application/json")

	client := &http.Client{Timeout: 120 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body only enriches the error message.
		body, _ := io.ReadAll(resp.Body)
		return nil, fmt.Errorf("API error %d: %s", resp.StatusCode, string(body))
	}

	respBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("reading response: %w", err)
	}

	var result struct {
		Choices []struct {
			Message struct {
				Content          string `json:"content"`
				ReasoningContent string `json:"reasoning_content"`
			} `json:"message"`
		} `json:"choices"`
	}
	if err := json.Unmarshal(respBody, &result); err != nil {
		return nil, err
	}
	if len(result.Choices) == 0 {
		return nil, fmt.Errorf("no response from API")
	}

	content := result.Choices[0].Message.Content
	reasoning := result.Choices[0].Message.ReasoningContent

	// K2.5 reasoning mode: the actual JSON may be in content or in
	// reasoning_content. Prefer content; fall back only when it has no JSON.
	if !strings.Contains(content, "{") && reasoning != "" && strings.Contains(reasoning, "{") {
		log.Printf(" [AI] Using reasoning_content (content had no JSON)")
		content = reasoning
	}

	// Strip markdown code fences (```json ... ``` or ``` ... ```).
	content = strings.TrimSpace(content)
	if strings.HasPrefix(content, "```") {
		// Remove the opening fence line (```json or ```).
		if idx := strings.Index(content, "\n"); idx >= 0 {
			content = content[idx+1:]
		}
		// Remove the closing fence.
		if idx := strings.LastIndex(content, "```"); idx >= 0 {
			content = content[:idx]
		}
		content = strings.TrimSpace(content)
	}

	// Keep only the outermost {...} span, dropping any surrounding chatter.
	if idx := strings.Index(content, "{"); idx >= 0 {
		if end := strings.LastIndex(content, "}"); end > idx {
			content = content[idx : end+1]
		}
	}

	var analysis DocumentAnalysis
	if err := json.Unmarshal([]byte(content), &analysis); err != nil {
		// Last resort: look for the first balanced JSON object.
		cleaned := extractJSONObject(content)
		if cleaned == "" {
			log.Printf(" [AI debug] No JSON object found in response. Content starts: %.200s", content)
			return nil, fmt.Errorf("failed to parse response: %w", err)
		}
		if err2 := json.Unmarshal([]byte(cleaned), &analysis); err2 != nil {
			log.Printf(" [AI debug] Failed to parse even after cleanup. Content starts: %.200s", content)
			return nil, fmt.Errorf("failed to parse response: %w", err)
		}
	}

	// Validate category.
	validCats := map[string]bool{"taxes": true, "bills": true, "medical": true, "insurance": true, "legal": true, "financial": true, "expenses": true, "vehicles": true, "home": true, "personal": true, "contacts": true, "uncategorized": true}
	if !validCats[analysis.Category] {
		analysis.Category = "uncategorized"
	}
	return &analysis, nil
}
// extractJSONObject returns the first brace-balanced {...} span in s,
// starting at the first "{". Braces inside double-quoted strings are ignored
// and backslash escapes inside strings are honoured. It returns "" when s has
// no "{" or the object opened there is never closed.
func extractJSONObject(s string) string {
	open := strings.IndexByte(s, '{')
	if open == -1 {
		return ""
	}

	var (
		nesting  int  // current brace depth outside of strings
		quoted   bool // inside a double-quoted string
		skipNext bool // previous byte was a backslash inside a string
	)
	for pos := open; pos < len(s); pos++ {
		ch := s[pos]
		switch {
		case skipNext:
			skipNext = false
		case quoted && ch == '\\':
			skipNext = true
		case ch == '"':
			quoted = !quoted
		case quoted:
			// Ordinary string content; braces here do not count.
		case ch == '{':
			nesting++
		case ch == '}':
			nesting--
			if nesting == 0 {
				return s[open : pos+1]
			}
		}
	}
	return ""
}
// GenerateEmbedding requests a vector embedding for text from the Fireworks
// embeddings endpoint and returns the first embedding in the response.
//
// Input longer than 32000 bytes is truncated; the cut is moved back to the
// nearest UTF-8 character boundary so a multi-byte character is never split.
//
// It returns an error when FIREWORKS_API_KEY is not configured, when the
// request cannot be built or sent, when the API answers with a non-200
// status, or when the response cannot be decoded or holds no embedding.
func GenerateEmbedding(text string) ([]float32, error) {
	if fireworksAPIKey == "" {
		return nil, fmt.Errorf("FIREWORKS_API_KEY not set")
	}

	// Truncate.
	const maxInputBytes = 32000
	if len(text) > maxInputBytes {
		cut := maxInputBytes
		// A byte of the form 10xxxxxx continues a character that started
		// earlier; a UTF-8 character has at most three of them, so back up
		// at most three bytes to land on a character start.
		for back := 0; back < 3 && cut > 0 && text[cut]&0xC0 == 0x80; back++ {
			cut--
		}
		text = text[:cut]
	}

	reqBody := map[string]interface{}{
		"model": "fireworks/qwen3-embedding-8b",
		"input": text,
	}
	jsonBody, err := json.Marshal(reqBody)
	if err != nil {
		return nil, fmt.Errorf("encoding request: %w", err)
	}
	req, err := http.NewRequest("POST", fireworksBaseURL+"/embeddings", bytes.NewReader(jsonBody))
	if err != nil {
		return nil, fmt.Errorf("building request: %w", err)
	}
	req.Header.Set("Authorization", "Bearer "+fireworksAPIKey)
	req.Header.Set("Content-Type", "application/json")

	client := &http.Client{Timeout: 30 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body only enriches the error message.
		body, _ := io.ReadAll(resp.Body)
		return nil, fmt.Errorf("embedding API error %d: %s", resp.StatusCode, string(body))
	}

	var result struct {
		Data []struct {
			Embedding []float32 `json:"embedding"`
		} `json:"data"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return nil, err
	}
	if len(result.Data) == 0 {
		return nil, fmt.Errorf("no embedding returned")
	}
	return result.Data[0].Embedding, nil
}
// GetPDFPageCount returns the number of pages pdfinfo reports for the PDF at
// filePath. It never fails: when pdfinfo cannot be run, prints no "Pages:"
// line, or prints one that does not hold a positive integer, the result is 1.
func GetPDFPageCount(filePath string) int {
	out, err := exec.Command("pdfinfo", filePath).Output()
	if err != nil {
		return 1
	}
	for _, line := range strings.Split(string(out), "\n") {
		if !strings.HasPrefix(line, "Pages:") {
			continue
		}
		var count int
		// An unparsable or non-positive value must fall back to 1 like every
		// other failure; returning 0 would make callers process no pages.
		if _, err := fmt.Sscanf(line, "Pages: %d", &count); err != nil || count < 1 {
			return 1
		}
		return count
	}
	return 1
}
// ProcessPDFPageByPage transcribes every page of the PDF at filePath
// separately and returns the concatenated text, each page introduced by a
// "---" rule and a "## Page N" heading.
//
// jobID identifies the progress job that is updated before each page. Work is
// best effort: a page that cannot be rendered or transcribed is logged and
// skipped, and the returned error is always nil.
func ProcessPDFPageByPage(filePath string, jobID string) (string, error) {
	pageCount := GetPDFPageCount(filePath)
	log.Printf(" Processing %d pages separately...", pageCount)

	var allText strings.Builder
	for page := 1; page <= pageCount; page++ {
		UpdateJob(jobID, "ocr", fmt.Sprintf("Page %d/%d", page, pageCount))

		imageData, err := renderPDFPage(filePath, page)
		if err != nil {
			log.Printf(" Page %d render failed: %v", page, err)
			continue
		}

		// OCR this page.
		log.Printf(" Page %d/%d...", page, pageCount)
		pageAnalysis, err := AnalyzePageOnly(imageData, page)
		if err != nil {
			log.Printf(" Page %d failed: %v", page, err)
			continue
		}
		if pageAnalysis != "" {
			fmt.Fprintf(&allText, "\n\n---\n## Page %d\n\n", page)
			allText.WriteString(pageAnalysis)
		}
	}
	return allText.String(), nil
}

// renderPDFPage rasterises one page of a PDF to PNG at 150 DPI with pdftoppm
// and returns the image bytes. Its temporary directory is removed on return.
func renderPDFPage(filePath string, page int) ([]byte, error) {
	tmpDir, err := os.MkdirTemp("", "docsys-page")
	if err != nil {
		return nil, err
	}
	defer os.RemoveAll(tmpDir)

	pageArg := fmt.Sprint(page)
	cmd := exec.Command("pdftoppm", "-png", "-f", pageArg, "-l", pageArg, "-r", "150", filePath, filepath.Join(tmpDir, "page"))
	if err := cmd.Run(); err != nil {
		return nil, fmt.Errorf("pdftoppm failed: %w", err)
	}

	// pdftoppm zero-pads the page number according to the document's page
	// count ("page-3.png", "page-03.png", "page-003.png"), so pick up the
	// single PNG it wrote instead of guessing the exact name.
	entries, err := os.ReadDir(tmpDir)
	if err != nil {
		return nil, err
	}
	for _, entry := range entries {
		name := entry.Name()
		if strings.HasPrefix(name, "page") && strings.HasSuffix(name, ".png") {
			return os.ReadFile(filepath.Join(tmpDir, name))
		}
	}
	return nil, fmt.Errorf("pdftoppm produced no PNG for page %d", page)
}
// AnalyzePageOnly asks the vision model for a verbatim transcription of a
// single page image and returns the trimmed text.
//
// imageData is embedded as a base64 data URL whose media type is sniffed from
// the bytes; unrecognised data keeps the historical "image/png" label.
// pageNum is accepted for callers' convenience but is not used in the request.
// When the model leaves content empty and puts its output in
// reasoning_content, that field is returned instead.
//
// It returns an error when FIREWORKS_API_KEY is not configured, when the
// request cannot be built or sent, when the API answers with a non-200
// status, or when the response cannot be read, decoded, or has no choices.
func AnalyzePageOnly(imageData []byte, pageNum int) (string, error) {
	if fireworksAPIKey == "" {
		return "", fmt.Errorf("FIREWORKS_API_KEY not set")
	}

	mimeType := http.DetectContentType(imageData)
	if !strings.HasPrefix(mimeType, "image/") {
		mimeType = "image/png"
	}
	dataURL := "data:" + mimeType + ";base64," + base64.StdEncoding.EncodeToString(imageData)

	prompt := `Transcribe ALL visible text on this page EXACTLY as it appears, in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Output ONLY the transcribed text — no commentary, no analysis, no preamble, no "The document is..." sentences. Start directly with the content.
FORMAT: Use ### for sections, **bold** for labels, markdown tables for tabular data, - bullets for lists. Preserve ALL numbers, dates, amounts, and values exactly as shown. If the document is in Russian, Dutch, German, French, or any other language — keep it in that language.`

	reqBody := map[string]interface{}{
		"model":      "accounts/fireworks/models/kimi-k2p5",
		"max_tokens": 4096,
		"messages": []map[string]interface{}{
			{
				"role": "user",
				"content": []map[string]interface{}{
					{"type": "image_url", "image_url": map[string]string{"url": dataURL}},
					{"type": "text", "text": prompt},
				},
			},
		},
	}
	jsonBody, err := json.Marshal(reqBody)
	if err != nil {
		return "", fmt.Errorf("encoding request: %w", err)
	}
	req, err := http.NewRequest("POST", fireworksBaseURL+"/chat/completions", bytes.NewReader(jsonBody))
	if err != nil {
		return "", fmt.Errorf("building request: %w", err)
	}
	req.Header.Set("Authorization", "Bearer "+fireworksAPIKey)
	req.Header.Set("Content-Type", "application/json")

	client := &http.Client{Timeout: 120 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body only enriches the error message.
		body, _ := io.ReadAll(resp.Body)
		return "", fmt.Errorf("API error %d: %s", resp.StatusCode, string(body))
	}

	// Read the raw response so content and reasoning_content can be compared.
	rawBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	var result struct {
		Choices []struct {
			Message struct {
				Content          string `json:"content"`
				ReasoningContent string `json:"reasoning_content"`
			} `json:"message"`
		} `json:"choices"`
	}
	if err := json.Unmarshal(rawBody, &result); err != nil {
		return "", err
	}
	if len(result.Choices) == 0 {
		return "", fmt.Errorf("no response")
	}

	content := result.Choices[0].Message.Content
	reasoning := result.Choices[0].Message.ReasoningContent
	if reasoning != "" {
		log.Printf(" [OCR debug] reasoning_content length: %d, content length: %d", len(reasoning), len(content))
		if len(content) > 100 {
			log.Printf(" [OCR debug] content starts: %.100s", content)
		}
	}

	// If content is empty but reasoning has text, the model put everything in
	// the wrong field.
	if strings.TrimSpace(content) == "" && reasoning != "" {
		log.Printf(" [OCR debug] WARNING: content empty, using reasoning_content")
		content = reasoning
	}
	return strings.TrimSpace(content), nil
}
// ProcessDocument runs the full ingestion pipeline for one inbox file:
// hash it, skip it if a "ready" document with the same hash already exists,
// analyze it (text model for plain-text files, vision model otherwise), copy
// it into the store, insert the database record, generate an embedding, and
// finally remove the original file.
//
// It returns the stored (or already existing) Document. An error is returned
// when hashing, reading, conversion, every analysis fallback, the store copy,
// or the database insert fails; an embedding failure is only logged.
func ProcessDocument(filePath string) (*Document, error) {
log.Printf("Processing: %s", filepath.Base(filePath))
ext := strings.ToLower(filepath.Ext(filePath))
// Get file hash
hash, err := FileHash(filePath)
if err != nil {
return nil, fmt.Errorf("hash failed: %w", err)
}
log.Printf(" Hash: %s", hash)
// Start progress tracking
// The hash doubles as the job ID; FinishJob runs on every return path below.
StartJob(hash, filepath.Base(filePath))
defer FinishJob(hash)
// Check if already fully processed (not pending)
// A duplicate is dropped from the inbox and the existing record is returned.
if existing, _ := GetDocument(hash); existing != nil && existing.Status == "ready" {
log.Printf(" Already exists, skipping")
os.Remove(filePath)
return existing, nil
}
var analysis *DocumentAnalysis
if IsTextFile(ext) {
// Plain text — read and analyze
data, err := os.ReadFile(filePath)
if err != nil {
return nil, err
}
UpdateJob(hash, "classifying", "Analyzing text...")
log.Printf(" Analyzing text with K2...")
analysis, err = AnalyzeText(string(data), filepath.Base(filePath))
if err != nil {
UpdateJob(hash, "error", err.Error())
return nil, fmt.Errorf("text analysis failed: %w", err)
}
} else {
// Vision — convert to image and analyze
UpdateJob(hash, "converting", "Converting to image...")
log.Printf(" Converting to image...")
imageData, err := ConvertToImage(filePath)
if err != nil {
UpdateJob(hash, "error", err.Error())
return nil, fmt.Errorf("image conversion failed: %w", err)
}
UpdateJob(hash, "ocr", "Analyzing first page...")
log.Printf(" Analyzing with K2.5 vision...")
analysis, err = AnalyzeWithVision(imageData)
if err != nil {
// Vision JSON extraction failed — fall back to two-step: plain-text OCR + text classifier
log.Printf(" AnalyzeWithVision failed (%v), falling back to OCR+classify...", err)
UpdateJob(hash, "ocr", "Falling back to OCR + classify...")
pageText, ocrErr := AnalyzePageOnly(imageData, 1)
if ocrErr != nil {
// Both paths failed: the OCR error is wrapped, the primary one is quoted.
UpdateJob(hash, "error", ocrErr.Error())
return nil, fmt.Errorf("vision analysis failed (primary: %v, fallback: %w)", err, ocrErr)
}
// Classify the extracted text
log.Printf(" OCR succeeded (%d chars), classifying...", len(pageText))
UpdateJob(hash, "classifying", "Classifying extracted text...")
analysis, err = AnalyzeText(pageText, filepath.Base(filePath))
if err != nil {
// Use minimal stub so at least the doc is stored with its text
log.Printf(" Classification failed too: %v — storing with minimal metadata", err)
analysis = &DocumentAnalysis{
Category: "uncategorized",
DocType: "unknown",
Title: strings.TrimSuffix(filepath.Base(filePath), filepath.Ext(filePath)),
Summary: "Extraction failed — stored raw text only",
FullText: pageText,
}
} else {
// AnalyzeText stores its own (possibly truncated) input as FullText;
// put the complete OCR text back.
analysis.FullText = pageText
}
}
// For PDFs, process pages for accurate OCR
// ext comes from the original file name, so only files that are PDFs on disk
// take this branch.
if ext == ".pdf" {
pageCount := GetPDFPageCount(filePath)
if pageCount > 1 {
log.Printf(" Multi-page PDF detected (%d pages)", pageCount)
UpdateJob(hash, "ocr", fmt.Sprintf("Multi-page PDF: %d pages", pageCount))
// Replace the first-page text with the page-by-page transcription, but
// only when that produced something.
fullText, err := ProcessPDFPageByPage(filePath, hash)
if err == nil && fullText != "" {
analysis.FullText = fullText
}
} else if analysis.FullText == "" || len(analysis.FullText) < 50 || strings.HasPrefix(analysis.FullText, "[") {
// Single-page but full_text is empty/placeholder — retry with AnalyzePageOnly
// (text shorter than 50 bytes or starting with "[" is treated as a placeholder).
log.Printf(" Single-page PDF with bad full_text (%q) — retrying with AnalyzePageOnly...", analysis.FullText)
UpdateJob(hash, "ocr", "Retrying text extraction...")
if pageText, err := AnalyzePageOnly(imageData, 1); err == nil && pageText != "" {
analysis.FullText = pageText
} else if err != nil {
log.Printf(" AnalyzePageOnly fallback failed: %v", err)
}
}
}
}
log.Printf(" Category: %s, Type: %s", analysis.Category, analysis.DocType)
// Copy to store
// The stored file is named after the content hash plus the lower-cased extension.
storePath := filepath.Join(storeDir, hash+ext)
if err := copyFile(filePath, storePath); err != nil {
return nil, fmt.Errorf("store copy failed: %w", err)
}
// Create document record
// Use title if provided, fall back to summary
title := analysis.Title
if title == "" {
title = analysis.Summary
}
doc := &Document{
ID: hash,
Title: title,
Category: analysis.Category,
Type: analysis.DocType,
Date: analysis.Date,
Amount: analysis.AmountString(),
Vendor: analysis.Vendor,
Summary: analysis.Summary,
FullText: analysis.FullText,
PDFPath: storePath,
OriginalFile: filepath.Base(filePath),
ProcessedAt: time.Now().Format(time.RFC3339),
Status: "ready",
}
// Save to database
if err := InsertDocument(doc); err != nil {
return nil, fmt.Errorf("db insert failed: %w", err)
}
// Generate embedding
// Best effort: a failure here is logged and the document is still returned.
if analysis.FullText != "" {
UpdateJob(hash, "embedding", "Generating search index...")
log.Printf(" Generating embedding...")
if emb, err := GenerateEmbedding(analysis.FullText); err == nil {
log.Printf(" Embedding: %d dimensions", len(emb))
StoreEmbedding(hash, emb)
} else {
log.Printf(" Embedding failed: %v", err)
}
}
// Remove from inbox
// The removal error is ignored; the stored copy already exists at this point.
os.Remove(filePath)
log.Printf(" ✓ Done: %s/%s", analysis.Category, hash)
return doc, nil
}
// copyFile copies the contents of src to dst, creating dst or truncating it
// if it already exists. It returns the first error met while opening,
// creating, copying, or closing; the close error on dst is reported because
// buffered data may only be written out at that point.
func copyFile(src, dst string) (err error) {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()

	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	defer func() {
		// Always close, but keep an earlier copy error if there is one.
		if closeErr := out.Close(); err == nil {
			err = closeErr
		}
	}()

	_, err = io.Copy(out, in)
	return err
}