package main import ( "bytes" "crypto/sha256" "encoding/base64" "encoding/json" "fmt" "io" "log" "net/http" "os" "os/exec" "path/filepath" "strings" "time" ) var ( fireworksAPIKey string fireworksBaseURL = "https://api.fireworks.ai/inference/v1" ) func init() { fireworksAPIKey = os.Getenv("FIREWORKS_API_KEY") if fireworksAPIKey == "" { // Try .env file in docsys directory envPath := filepath.Join(os.Getenv("HOME"), "dev/docsys/.env") if data, err := os.ReadFile(envPath); err == nil { for _, line := range strings.Split(string(data), "\n") { if strings.HasPrefix(line, "FIREWORKS_API_KEY=") { fireworksAPIKey = strings.TrimSpace(strings.TrimPrefix(line, "FIREWORKS_API_KEY=")) fireworksAPIKey = strings.Trim(fireworksAPIKey, `"'`) break } } } } } // DocumentAnalysis contains the AI-extracted information type DocumentAnalysis struct { Category string `json:"category"` DocType string `json:"doc_type"` Date string `json:"date"` Vendor string `json:"vendor"` Amount interface{} `json:"amount"` // Can be string or number Title string `json:"title"` Summary string `json:"summary"` FullText string `json:"full_text"` } func (d *DocumentAnalysis) AmountString() string { switch v := d.Amount.(type) { case string: return v case float64: return fmt.Sprintf("$%.2f", v) default: return "" } } // FileHash returns first 16 chars of SHA256 hash func FileHash(filepath string) (string, error) { f, err := os.Open(filepath) if err != nil { return "", err } defer f.Close() h := sha256.New() if _, err := io.Copy(h, f); err != nil { return "", err } return fmt.Sprintf("%x", h.Sum(nil))[:16], nil } // ConvertToImage converts PDF/Office docs to PNG for vision API func ConvertToImage(filePath string) ([]byte, error) { ext := strings.ToLower(filepath.Ext(filePath)) // Office documents → PDF first officeExts := map[string]bool{".doc": true, ".docx": true, ".odt": true, ".rtf": true, ".xls": true, ".xlsx": true, ".ppt": true, ".pptx": true} if officeExts[ext] { tmpDir, err := os.MkdirTemp("", "docsys") if err != nil { return nil, err } defer os.RemoveAll(tmpDir) cmd := exec.Command("libreoffice", "--headless", "--convert-to", "pdf", "--outdir", tmpDir, filePath) if err := cmd.Run(); err != nil { return nil, fmt.Errorf("libreoffice conversion failed: %w", err) } base := strings.TrimSuffix(filepath.Base(filePath), ext) pdfPath := filepath.Join(tmpDir, base+".pdf") filePath = pdfPath ext = ".pdf" } // PDF → PNG (first page only for preview, full processing done separately) if ext == ".pdf" { tmpDir, err := os.MkdirTemp("", "docsys") if err != nil { return nil, err } defer os.RemoveAll(tmpDir) // Convert first page for initial analysis outPrefix := filepath.Join(tmpDir, "page") cmd := exec.Command("pdftoppm", "-png", "-f", "1", "-l", "1", "-r", "150", filePath, outPrefix) if err := cmd.Run(); err != nil { return nil, fmt.Errorf("pdftoppm failed: %w", err) } pngPath := filepath.Join(tmpDir, "page-1.png") return os.ReadFile(pngPath) } // Image files — read directly return os.ReadFile(filePath) } // IsTextFile returns true for plain text files func IsTextFile(ext string) bool { textExts := map[string]bool{ ".txt": true, ".md": true, ".markdown": true, ".text": true, ".log": true, ".json": true, ".xml": true, ".csv": true, ".yaml": true, ".yml": true, } return textExts[ext] } // AnalyzeWithVision uses K2.5 vision model func AnalyzeWithVision(imageData []byte) (*DocumentAnalysis, error) { if fireworksAPIKey == "" { return nil, fmt.Errorf("FIREWORKS_API_KEY not set") } b64 := base64.StdEncoding.EncodeToString(imageData) prompt := `Analyze this document image and extract: 1. **Full Text**: Transcribe ALL visible text EXACTLY as it appears — in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Transcribe word-for-word in the source language (Russian, Dutch, German, French, etc.) formatted as clean Markdown: - Use headers (##) for sections - Use **bold** for labels/field names - Use tables for tabular data (items, prices, etc.) - Use bullet lists where appropriate - Preserve ALL numbers, dates, amounts, and codes exactly as shown 2. **Classification**: Categorize into exactly ONE of: taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized 3. **Document Type**: Specific type (e.g., "utility_bill", "receipt", "tax_form_w2") 4. **Key Fields** (these may be in English for searchability): - date: Document date (YYYY-MM-DD if possible) - vendor: Company/organization name - amount: Dollar/currency amount if present (e.g., "$123.45" or "809,400 BYN") 5. **Title**: SHORT English title (max 6-8 words), e.g. "Apple Store Mac Mini Receipt" or "Electric Bill March 2025" 6. **Summary**: 1-2 sentence English description with key details. IMPORTANT: The full_text field MUST contain the verbatim transcription in the document's original language. This is non-negotiable. Respond in JSON ONLY: {"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "title": "...", "summary": "...", "full_text": "..."}` reqBody := map[string]interface{}{ "model": "accounts/fireworks/models/kimi-k2p5", "max_tokens": 4096, "messages": []map[string]interface{}{ {"role": "system", "content": "You are a document analysis API. Output ONLY raw JSON. No thinking, no commentary, no code fences. First character must be {, last character must be }."}, { "role": "user", "content": []map[string]interface{}{ {"type": "image_url", "image_url": map[string]string{"url": "data:image/png;base64," + b64}}, {"type": "text", "text": prompt}, }, }, }, } analysis, err := callFireworks(reqBody) if err != nil { // Retry once with minimal prompt to avoid triggering extended reasoning log.Printf(" [AI] First attempt failed, retrying with simplified prompt...") retryBody := map[string]interface{}{ "model": "accounts/fireworks/models/kimi-k2p5", "max_tokens": 4096, "messages": []map[string]interface{}{ {"role": "system", "content": "Output valid JSON only. No other text."}, { "role": "user", "content": []map[string]interface{}{ {"type": "image_url", "image_url": map[string]string{"url": "data:image/png;base64," + b64}}, {"type": "text", "text": `Look at this document image. Transcribe ALL text verbatim (original language, do NOT translate). Return ONLY this JSON with no placeholders: {"category":"","doc_type":"","date":"","vendor":"","amount":"","title":"","summary":"","full_text":""}`}, }, }, }, } return callFireworks(retryBody) } return analysis, nil } // AnalyzeText uses K2 text model for plain text files func AnalyzeText(text, filename string) (*DocumentAnalysis, error) { if fireworksAPIKey == "" { return nil, fmt.Errorf("FIREWORKS_API_KEY not set") } // Truncate long text if len(text) > 50000 { text = text[:50000] } prompt := fmt.Sprintf(`Analyze this document: **Filename:** %s **Content:** %s Categorize into ONE of: taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized Respond in JSON ONLY: {"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "summary": "..."}`, filename, text) reqBody := map[string]interface{}{ "model": "accounts/fireworks/models/kimi-k2-instruct-0905", "max_tokens": 1024, "messages": []map[string]interface{}{ {"role": "user", "content": prompt}, }, } analysis, err := callFireworks(reqBody) if err != nil { return nil, err } analysis.FullText = text return analysis, nil } func callFireworks(reqBody map[string]interface{}) (*DocumentAnalysis, error) { jsonBody, _ := json.Marshal(reqBody) req, _ := http.NewRequest("POST", fireworksBaseURL+"/chat/completions", bytes.NewReader(jsonBody)) req.Header.Set("Authorization", "Bearer "+fireworksAPIKey) req.Header.Set("Content-Type", "application/json") client := &http.Client{Timeout: 120 * time.Second} resp, err := client.Do(req) if err != nil { return nil, err } defer resp.Body.Close() if resp.StatusCode != 200 { body, _ := io.ReadAll(resp.Body) return nil, fmt.Errorf("API error %d: %s", resp.StatusCode, string(body)) } respBody, _ := io.ReadAll(resp.Body) var result struct { Choices []struct { Message struct { Content string `json:"content"` ReasoningContent string `json:"reasoning_content"` } `json:"message"` } `json:"choices"` } if err := json.Unmarshal(respBody, &result); err != nil { return nil, err } if len(result.Choices) == 0 { return nil, fmt.Errorf("no response from API") } content := result.Choices[0].Message.Content reasoning := result.Choices[0].Message.ReasoningContent // K2.5 reasoning mode: actual JSON may be in content or reasoning_content // Try content first, if it doesn't look like JSON, try reasoning_content if !strings.Contains(content, "{") && reasoning != "" && strings.Contains(reasoning, "{") { log.Printf(" [AI] Using reasoning_content (content had no JSON)") content = reasoning } // Strip markdown code fences (```json ... ``` or ``` ... ```) content = strings.TrimSpace(content) if strings.HasPrefix(content, "```") { // Remove opening fence (```json or ```) if idx := strings.Index(content, "\n"); idx >= 0 { content = content[idx+1:] } // Remove closing fence if idx := strings.LastIndex(content, "```"); idx >= 0 { content = content[:idx] } content = strings.TrimSpace(content) } // Extract JSON from response if idx := strings.Index(content, "{"); idx >= 0 { if end := strings.LastIndex(content, "}"); end > idx { content = content[idx : end+1] } } var analysis DocumentAnalysis if err := json.Unmarshal([]byte(content), &analysis); err != nil { // Last resort: try to find a JSON object with braces matching cleaned := extractJSONObject(content) if cleaned != "" { if err2 := json.Unmarshal([]byte(cleaned), &analysis); err2 != nil { log.Printf(" [AI debug] Failed to parse even after cleanup. Content starts: %.200s", content) return nil, fmt.Errorf("failed to parse response: %w", err) } } else { log.Printf(" [AI debug] No JSON object found in response. Content starts: %.200s", content) return nil, fmt.Errorf("failed to parse response: %w", err) } } // Validate category validCats := map[string]bool{"taxes": true, "bills": true, "medical": true, "insurance": true, "legal": true, "financial": true, "expenses": true, "vehicles": true, "home": true, "personal": true, "contacts": true, "uncategorized": true} if !validCats[analysis.Category] { analysis.Category = "uncategorized" } return &analysis, nil } // extractJSONObject tries to find a balanced JSON object in a string func extractJSONObject(s string) string { start := strings.Index(s, "{") if start < 0 { return "" } depth := 0 inString := false escaped := false for i := start; i < len(s); i++ { c := s[i] if escaped { escaped = false continue } if c == '\\' && inString { escaped = true continue } if c == '"' { inString = !inString continue } if inString { continue } if c == '{' { depth++ } else if c == '}' { depth-- if depth == 0 { return s[start : i+1] } } } return "" } // GenerateEmbedding creates a vector embedding using Fireworks func GenerateEmbedding(text string) ([]float32, error) { if fireworksAPIKey == "" { return nil, fmt.Errorf("FIREWORKS_API_KEY not set") } // Truncate if len(text) > 32000 { text = text[:32000] } reqBody := map[string]interface{}{ "model": "fireworks/qwen3-embedding-8b", "input": text, } jsonBody, _ := json.Marshal(reqBody) req, _ := http.NewRequest("POST", fireworksBaseURL+"/embeddings", bytes.NewReader(jsonBody)) req.Header.Set("Authorization", "Bearer "+fireworksAPIKey) req.Header.Set("Content-Type", "application/json") client := &http.Client{Timeout: 30 * time.Second} resp, err := client.Do(req) if err != nil { return nil, err } defer resp.Body.Close() if resp.StatusCode != 200 { body, _ := io.ReadAll(resp.Body) return nil, fmt.Errorf("embedding API error %d: %s", resp.StatusCode, string(body)) } var result struct { Data []struct { Embedding []float32 `json:"embedding"` } `json:"data"` } if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { return nil, err } if len(result.Data) == 0 { return nil, fmt.Errorf("no embedding returned") } return result.Data[0].Embedding, nil } // GetPDFPageCount returns the number of pages in a PDF func GetPDFPageCount(filePath string) int { cmd := exec.Command("pdfinfo", filePath) out, err := cmd.Output() if err != nil { return 1 } for _, line := range strings.Split(string(out), "\n") { if strings.HasPrefix(line, "Pages:") { var count int fmt.Sscanf(line, "Pages: %d", &count) return count } } return 1 } // ProcessPDFPageByPage extracts text from each page separately func ProcessPDFPageByPage(filePath string, jobID string) (string, error) { pageCount := GetPDFPageCount(filePath) log.Printf(" Processing %d pages separately...", pageCount) var allText strings.Builder for page := 1; page <= pageCount; page++ { UpdateJob(jobID, "ocr", fmt.Sprintf("Page %d/%d", page, pageCount)) tmpDir, err := os.MkdirTemp("", "docsys-page") if err != nil { continue } // Convert single page to PNG outPrefix := filepath.Join(tmpDir, "page") cmd := exec.Command("pdftoppm", "-png", "-f", fmt.Sprintf("%d", page), "-l", fmt.Sprintf("%d", page), "-r", "150", filePath, outPrefix) if err := cmd.Run(); err != nil { os.RemoveAll(tmpDir) continue } pngPath := filepath.Join(tmpDir, fmt.Sprintf("page-%d.png", page)) imageData, err := os.ReadFile(pngPath) os.RemoveAll(tmpDir) if err != nil { continue } // OCR this page log.Printf(" Page %d/%d...", page, pageCount) pageAnalysis, err := AnalyzePageOnly(imageData, page) if err != nil { log.Printf(" Page %d failed: %v", page, err) continue } if pageAnalysis != "" { allText.WriteString(fmt.Sprintf("\n\n---\n## Page %d\n\n", page)) allText.WriteString(pageAnalysis) } } return allText.String(), nil } // AnalyzePageOnly extracts just the text from a single page image func AnalyzePageOnly(imageData []byte, pageNum int) (string, error) { if fireworksAPIKey == "" { return "", fmt.Errorf("FIREWORKS_API_KEY not set") } b64 := base64.StdEncoding.EncodeToString(imageData) prompt := `Transcribe ALL visible text on this page EXACTLY as it appears, in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Output ONLY the transcribed text — no commentary, no analysis, no preamble, no "The document is..." sentences. Start directly with the content. FORMAT: Use ### for sections, **bold** for labels, markdown tables for tabular data, - bullets for lists. Preserve ALL numbers, dates, amounts, and values exactly as shown. If the document is in Russian, Dutch, German, French, or any other language — keep it in that language.` reqBody := map[string]interface{}{ "model": "accounts/fireworks/models/kimi-k2p5", "max_tokens": 4096, "messages": []map[string]interface{}{ { "role": "user", "content": []map[string]interface{}{ {"type": "image_url", "image_url": map[string]string{"url": "data:image/png;base64," + b64}}, {"type": "text", "text": prompt}, }, }, }, } jsonBody, _ := json.Marshal(reqBody) req, _ := http.NewRequest("POST", fireworksBaseURL+"/chat/completions", bytes.NewReader(jsonBody)) req.Header.Set("Authorization", "Bearer "+fireworksAPIKey) req.Header.Set("Content-Type", "application/json") client := &http.Client{Timeout: 120 * time.Second} resp, err := client.Do(req) if err != nil { return "", err } defer resp.Body.Close() if resp.StatusCode != 200 { body, _ := io.ReadAll(resp.Body) return "", fmt.Errorf("API error %d: %s", resp.StatusCode, string(body)) } // Read raw response to debug content vs reasoning_content rawBody, err := io.ReadAll(resp.Body) if err != nil { return "", err } var result struct { Choices []struct { Message struct { Content string `json:"content"` ReasoningContent string `json:"reasoning_content"` } `json:"message"` } `json:"choices"` } if err := json.Unmarshal(rawBody, &result); err != nil { return "", err } if len(result.Choices) == 0 { return "", fmt.Errorf("no response") } content := result.Choices[0].Message.Content reasoning := result.Choices[0].Message.ReasoningContent if reasoning != "" { log.Printf(" [OCR debug] reasoning_content length: %d, content length: %d", len(reasoning), len(content)) if len(content) > 100 { log.Printf(" [OCR debug] content starts: %.100s", content) } } // If content is empty but reasoning has text, model put everything in wrong field if strings.TrimSpace(content) == "" && reasoning != "" { log.Printf(" [OCR debug] WARNING: content empty, using reasoning_content") content = reasoning } return strings.TrimSpace(content), nil } // ProcessDocument handles the full document processing pipeline func ProcessDocument(filePath string) (*Document, error) { log.Printf("Processing: %s", filepath.Base(filePath)) ext := strings.ToLower(filepath.Ext(filePath)) // Get file hash hash, err := FileHash(filePath) if err != nil { return nil, fmt.Errorf("hash failed: %w", err) } log.Printf(" Hash: %s", hash) // Start progress tracking StartJob(hash, filepath.Base(filePath)) defer FinishJob(hash) // Check if already fully processed (not pending) if existing, _ := GetDocument(hash); existing != nil && existing.Status == "ready" { log.Printf(" Already exists, skipping") os.Remove(filePath) return existing, nil } var analysis *DocumentAnalysis if IsTextFile(ext) { // Plain text — read and analyze data, err := os.ReadFile(filePath) if err != nil { return nil, err } UpdateJob(hash, "classifying", "Analyzing text...") log.Printf(" Analyzing text with K2...") analysis, err = AnalyzeText(string(data), filepath.Base(filePath)) if err != nil { UpdateJob(hash, "error", err.Error()) return nil, fmt.Errorf("text analysis failed: %w", err) } } else { // Vision — convert to image and analyze UpdateJob(hash, "converting", "Converting to image...") log.Printf(" Converting to image...") imageData, err := ConvertToImage(filePath) if err != nil { UpdateJob(hash, "error", err.Error()) return nil, fmt.Errorf("image conversion failed: %w", err) } UpdateJob(hash, "ocr", "Analyzing first page...") log.Printf(" Analyzing with K2.5 vision...") analysis, err = AnalyzeWithVision(imageData) if err != nil { // Vision JSON extraction failed — fall back to two-step: plain-text OCR + text classifier log.Printf(" AnalyzeWithVision failed (%v), falling back to OCR+classify...", err) UpdateJob(hash, "ocr", "Falling back to OCR + classify...") pageText, ocrErr := AnalyzePageOnly(imageData, 1) if ocrErr != nil { UpdateJob(hash, "error", ocrErr.Error()) return nil, fmt.Errorf("vision analysis failed (primary: %v, fallback: %w)", err, ocrErr) } // Classify the extracted text log.Printf(" OCR succeeded (%d chars), classifying...", len(pageText)) UpdateJob(hash, "classifying", "Classifying extracted text...") analysis, err = AnalyzeText(pageText, filepath.Base(filePath)) if err != nil { // Use minimal stub so at least the doc is stored with its text log.Printf(" Classification failed too: %v — storing with minimal metadata", err) analysis = &DocumentAnalysis{ Category: "uncategorized", DocType: "unknown", Title: strings.TrimSuffix(filepath.Base(filePath), filepath.Ext(filePath)), Summary: "Extraction failed — stored raw text only", FullText: pageText, } } else { analysis.FullText = pageText } } // For PDFs, process pages for accurate OCR if ext == ".pdf" { pageCount := GetPDFPageCount(filePath) if pageCount > 1 { log.Printf(" Multi-page PDF detected (%d pages)", pageCount) UpdateJob(hash, "ocr", fmt.Sprintf("Multi-page PDF: %d pages", pageCount)) fullText, err := ProcessPDFPageByPage(filePath, hash) if err == nil && fullText != "" { analysis.FullText = fullText } } else if analysis.FullText == "" || len(analysis.FullText) < 50 || strings.HasPrefix(analysis.FullText, "[") { // Single-page but full_text is empty/placeholder — retry with AnalyzePageOnly log.Printf(" Single-page PDF with bad full_text (%q) — retrying with AnalyzePageOnly...", analysis.FullText) UpdateJob(hash, "ocr", "Retrying text extraction...") if pageText, err := AnalyzePageOnly(imageData, 1); err == nil && pageText != "" { analysis.FullText = pageText } else if err != nil { log.Printf(" AnalyzePageOnly fallback failed: %v", err) } } } } log.Printf(" Category: %s, Type: %s", analysis.Category, analysis.DocType) // Copy to store storePath := filepath.Join(storeDir, hash+ext) if err := copyFile(filePath, storePath); err != nil { return nil, fmt.Errorf("store copy failed: %w", err) } // Create document record // Use title if provided, fall back to summary title := analysis.Title if title == "" { title = analysis.Summary } doc := &Document{ ID: hash, Title: title, Category: analysis.Category, Type: analysis.DocType, Date: analysis.Date, Amount: analysis.AmountString(), Vendor: analysis.Vendor, Summary: analysis.Summary, FullText: analysis.FullText, PDFPath: storePath, OriginalFile: filepath.Base(filePath), ProcessedAt: time.Now().Format(time.RFC3339), Status: "ready", } // Save to database if err := InsertDocument(doc); err != nil { return nil, fmt.Errorf("db insert failed: %w", err) } // Generate embedding if analysis.FullText != "" { UpdateJob(hash, "embedding", "Generating search index...") log.Printf(" Generating embedding...") if emb, err := GenerateEmbedding(analysis.FullText); err == nil { log.Printf(" Embedding: %d dimensions", len(emb)) StoreEmbedding(hash, emb) } else { log.Printf(" Embedding failed: %v", err) } } // Remove from inbox os.Remove(filePath) log.Printf(" ✓ Done: %s/%s", analysis.Category, hash) return doc, nil } func copyFile(src, dst string) error { in, err := os.Open(src) if err != nil { return err } defer in.Close() out, err := os.Create(dst) if err != nil { return err } defer out.Close() _, err = io.Copy(out, in) return err }