From 63d4e5e5cab39fe472601493c9afef9c7561164b Mon Sep 17 00:00:00 2001 From: James Date: Sat, 28 Feb 2026 06:01:28 -0500 Subject: [PATCH] chore: auto-commit uncommitted changes --- ai.go | 183 ++++++-- ai.go.bak-20260228-005328 | 891 ++++++++++++++++++++++++++++++++++++++ main.go | 101 ++++- templates/dashboard.html | 92 +++- templates/document.html | 17 +- 5 files changed, 1215 insertions(+), 69 deletions(-) create mode 100644 ai.go.bak-20260228-005328 diff --git a/ai.go b/ai.go index 0dd420e..5e93877 100644 --- a/ai.go +++ b/ai.go @@ -90,7 +90,7 @@ func ConvertToImage(filePath string) ([]byte, error) { } defer os.RemoveAll(tmpDir) - cmd := exec.Command("libreoffice", "--headless", "--convert-to", "pdf", "--outdir", tmpDir, filePath) + cmd := exec.Command("soffice", "--headless", "--convert-to", "pdf", "--outdir", tmpDir, filePath) if err := cmd.Run(); err != nil { return nil, fmt.Errorf("libreoffice conversion failed: %w", err) } @@ -124,6 +124,7 @@ func ConvertToImage(filePath string) ([]byte, error) { return os.ReadFile(filePath) } + // IsTextFile returns true for plain text files func IsTextFile(ext string) bool { textExts := map[string]bool{ @@ -160,7 +161,7 @@ func AnalyzeWithVision(imageData []byte) (*DocumentAnalysis, error) { - vendor: Company/organization name - amount: Dollar/currency amount if present (e.g., "$123.45" or "809,400 BYN") -5. **Title**: Specific English title (max 8 words) that identifies THIS document uniquely. Include the key distinguishing details: who sent it, what it's about, and when. Bad: "Financial Report" or "Invoice". Good: "N-able Technology Exchange Rate Loss Explanation Feb 2025" or "Duke Energy Electric Bill Oct 2024" or "IRS Form 1099-INT Chase Bank 2024" or "BayCare HomeCare Invoice $340 Nov 2025". Never use generic words like "Document", "Letter", "Report" alone — always qualify them. +5. **Title**: Specific English title (max 8 words) that identifies THIS document uniquely. 
Ask yourself: "If I had to find this document again, what would I search for?" Prioritize the most unique/recoverable detail: an account number, confirmation code, ID, or specific value that can't be inferred from context. If the document's main value is a number or ID, put it in the title. Bad: "FedEx Receipt Jan 2026" (generic). Good: "FedEx Account 203634010" or "Duke Energy Electric Bill Oct 2024" or "IRS Form 1099-INT Chase Bank 2024". Never use generic words like "Document", "Letter", "Report" alone — always qualify them. 6. **Summary**: 1-2 sentence English description with key details. @@ -234,12 +235,20 @@ func AnalyzeText(text, filename string) (*DocumentAnalysis, error) { Categorize into ONE of: taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized +For the title: specific and concise (max 8 words). Ask yourself: "What would I search for to find this again?" Prioritize unique/recoverable details — account numbers, IDs, confirmation codes — over generic document type names. Examples: +- "FedEx Account 203634010" (if the key value is an account number) +- "Allegiant Confirmation IB2EJA AVL-PIE Feb 2026" +- "IRS Form 1099-INT Chase Bank 2024" +Bad: "FedEx Receipt Jan 2026" or "Flight Confirmation" — too generic. + +Also produce a "full_text" field: reformat the raw content as clean Markdown. Use ## headers for sections, | tables | for tabular data, **bold** for field labels, and bullet lists where appropriate. Preserve all values exactly. Do not summarize — include everything. 
+ Respond in JSON ONLY: -{"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "summary": "..."}`, filename, text) +{"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "title": "...", "summary": "...", "full_text": "..."}`, filename, text) reqBody := map[string]interface{}{ "model": "accounts/fireworks/models/kimi-k2-instruct-0905", - "max_tokens": 1024, + "max_tokens": 2048, "messages": []map[string]interface{}{ {"role": "user", "content": prompt}, }, @@ -249,7 +258,10 @@ Respond in JSON ONLY: if err != nil { return nil, err } - analysis.FullText = text + // If model didn't produce full_text, fall back to raw extracted text + if analysis.FullText == "" { + analysis.FullText = text + } return analysis, nil } @@ -705,13 +717,19 @@ func ProcessDocument(filePath string) (*Document, error) { return nil, fmt.Errorf("store copy failed: %w", err) } + // Only set PDFPath for actual PDFs — office/text files have no previewable PDF + pdfStorePath := "" + if ext == ".pdf" { + pdfStorePath = storePath + } + // Create document record // Use title if provided, fall back to summary title := analysis.Title if title == "" { title = analysis.Summary } - + doc := &Document{ ID: hash, Title: title, @@ -722,7 +740,7 @@ func ProcessDocument(filePath string) (*Document, error) { Vendor: analysis.Vendor, Summary: analysis.Summary, FullText: analysis.FullText, - PDFPath: storePath, + PDFPath: pdfStorePath, OriginalFile: filepath.Base(filePath), ProcessedAt: time.Now().Format(time.RFC3339), Status: "ready", @@ -838,48 +856,135 @@ func ExtractOfficeText(filePath string) (string, error) { return text, nil } -// xmlToText strips XML tags and decodes entities, returning plain text. +// xmlToText parses DOCX/Office XML into structured plain text. +// Understands paragraph breaks, table rows/cells, and text runs. 
func xmlToText(data []byte) string { + s := string(data) + + // Normalise tag names: strip namespace prefix (w:p → p, a:p → p, etc.) + // We work on the raw string using simple tag scanning. var sb strings.Builder inTag := false - lastWasSpace := false + var tagBuf strings.Builder + lastWasNewline := false - for i := 0; i < len(data); i++ { - c := data[i] - switch { - case c == '<': + flushLine := func() { + line := strings.TrimSpace(sb.String()) + if line != "" { + sb.Reset() + // return the line — caller collects via out builder + // (we reuse sb for the whole doc; collect separately) + } + } + _ = flushLine + + // Two-pass: build a token stream of (tag, text) pairs + type token struct { + tag string // lowercased local name, or "" for text + text string + } + var tokens []token + + for i := 0; i < len(s); i++ { + c := s[i] + if c == '<' { + // flush text buffer + if sb.Len() > 0 { + tokens = append(tokens, token{text: sb.String()}) + sb.Reset() + } inTag = true - // Emit space between elements so words don't run together - if !lastWasSpace { - sb.WriteByte(' ') - lastWasSpace = true - } - case c == '>': + tagBuf.Reset() + continue + } + if c == '>' { inTag = false - case !inTag: - if c == ' ' || c == '\t' || c == '\n' || c == '\r' { - if !lastWasSpace { - sb.WriteByte(' ') - lastWasSpace = true - } - } else { - sb.WriteByte(c) - lastWasSpace = false + raw := strings.TrimSpace(tagBuf.String()) + // Extract local name (strip namespace and attributes) + name := raw + if idx := strings.IndexAny(raw, " \t\n\r/"); idx >= 0 { + name = raw[:idx] } + if idx := strings.Index(name, ":"); idx >= 0 { + name = name[idx+1:] + } + name = strings.ToLower(name) + closing := strings.HasPrefix(raw, "/") + if closing { + name = strings.TrimPrefix(name, "/") + } + tokens = append(tokens, token{tag: name}) + tagBuf.Reset() + continue + } + if inTag { + tagBuf.WriteByte(c) + } else { + sb.WriteByte(c) + } + } + if sb.Len() > 0 { + tokens = append(tokens, token{text: sb.String()}) + } + 
+ // Now render tokens into structured text + var out strings.Builder + pendingNewlines := 0 + + emitNewlines := func(n int) { + if n > pendingNewlines { + pendingNewlines = n + } + } + flushNewlines := func() { + for i := 0; i < pendingNewlines; i++ { + out.WriteByte('\n') + } + pendingNewlines = 0 + lastWasNewline = pendingNewlines == 0 + } + _ = lastWasNewline + + inCell := false + cellCount := 0 + + for _, tok := range tokens { + if tok.tag == "" { + // text node + txt := tok.text + // decode entities + txt = strings.ReplaceAll(txt, "&", "&") + txt = strings.ReplaceAll(txt, "<", "<") + txt = strings.ReplaceAll(txt, ">", ">") + txt = strings.ReplaceAll(txt, """, "\"") + txt = strings.ReplaceAll(txt, "'", "'") + txt = strings.ReplaceAll(txt, " ", "\n") + txt = strings.ReplaceAll(txt, " ", "\t") + if strings.TrimSpace(txt) != "" { + flushNewlines() + out.WriteString(txt) + } + continue + } + switch tok.tag { + case "p": // paragraph + emitNewlines(1) + case "tr": // table row start + inCell = false + cellCount = 0 + emitNewlines(1) + case "tc": // table cell + if inCell { + out.WriteString("\t") + } + inCell = true + cellCount++ + case "br": // line break + emitNewlines(1) } } - // Decode common XML entities - result := sb.String() - result = strings.ReplaceAll(result, "&", "&") - result = strings.ReplaceAll(result, "<", "<") - result = strings.ReplaceAll(result, ">", ">") - result = strings.ReplaceAll(result, """, "\"") - result = strings.ReplaceAll(result, "'", "'") - result = strings.ReplaceAll(result, " ", "\n") - result = strings.ReplaceAll(result, " ", "\t") - - return strings.TrimSpace(result) + return strings.TrimSpace(out.String()) } // IsOfficeFile returns true for formats with extractable XML text. 
diff --git a/ai.go.bak-20260228-005328 b/ai.go.bak-20260228-005328 new file mode 100644 index 0000000..0dd420e --- /dev/null +++ b/ai.go.bak-20260228-005328 @@ -0,0 +1,891 @@ +package main + +import ( + "archive/zip" + "bytes" + "crypto/sha256" + "encoding/base64" + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "os" + "os/exec" + "path/filepath" + "strings" + "time" +) + +var ( + fireworksAPIKey string + fireworksBaseURL = "https://api.fireworks.ai/inference/v1" +) + +func init() { + fireworksAPIKey = os.Getenv("FIREWORKS_API_KEY") + if fireworksAPIKey == "" { + // Try .env file in docsys directory + envPath := filepath.Join(os.Getenv("HOME"), "dev/docsys/.env") + if data, err := os.ReadFile(envPath); err == nil { + for _, line := range strings.Split(string(data), "\n") { + if strings.HasPrefix(line, "FIREWORKS_API_KEY=") { + fireworksAPIKey = strings.TrimSpace(strings.TrimPrefix(line, "FIREWORKS_API_KEY=")) + fireworksAPIKey = strings.Trim(fireworksAPIKey, `"'`) + break + } + } + } + } +} + +// DocumentAnalysis contains the AI-extracted information +type DocumentAnalysis struct { + Category string `json:"category"` + DocType string `json:"doc_type"` + Date string `json:"date"` + Vendor string `json:"vendor"` + Amount interface{} `json:"amount"` // Can be string or number + Title string `json:"title"` + Summary string `json:"summary"` + FullText string `json:"full_text"` +} + +func (d *DocumentAnalysis) AmountString() string { + switch v := d.Amount.(type) { + case string: + return v + case float64: + return fmt.Sprintf("$%.2f", v) + default: + return "" + } +} + +// FileHash returns first 16 chars of SHA256 hash +func FileHash(filepath string) (string, error) { + f, err := os.Open(filepath) + if err != nil { + return "", err + } + defer f.Close() + + h := sha256.New() + if _, err := io.Copy(h, f); err != nil { + return "", err + } + return fmt.Sprintf("%x", h.Sum(nil))[:16], nil +} + +// ConvertToImage converts PDF/Office docs to PNG for vision API +func 
ConvertToImage(filePath string) ([]byte, error) { + ext := strings.ToLower(filepath.Ext(filePath)) + + // Office documents → PDF first + officeExts := map[string]bool{".doc": true, ".docx": true, ".odt": true, ".rtf": true, ".xls": true, ".xlsx": true, ".ppt": true, ".pptx": true} + if officeExts[ext] { + tmpDir, err := os.MkdirTemp("", "docsys") + if err != nil { + return nil, err + } + defer os.RemoveAll(tmpDir) + + cmd := exec.Command("libreoffice", "--headless", "--convert-to", "pdf", "--outdir", tmpDir, filePath) + if err := cmd.Run(); err != nil { + return nil, fmt.Errorf("libreoffice conversion failed: %w", err) + } + + base := strings.TrimSuffix(filepath.Base(filePath), ext) + pdfPath := filepath.Join(tmpDir, base+".pdf") + filePath = pdfPath + ext = ".pdf" + } + + // PDF → PNG (first page only for preview, full processing done separately) + if ext == ".pdf" { + tmpDir, err := os.MkdirTemp("", "docsys") + if err != nil { + return nil, err + } + defer os.RemoveAll(tmpDir) + + // Convert first page for initial analysis + outPrefix := filepath.Join(tmpDir, "page") + cmd := exec.Command("pdftoppm", "-png", "-f", "1", "-l", "1", "-r", "150", filePath, outPrefix) + if err := cmd.Run(); err != nil { + return nil, fmt.Errorf("pdftoppm failed: %w", err) + } + + pngPath := filepath.Join(tmpDir, "page-1.png") + return os.ReadFile(pngPath) + } + + // Image files — read directly + return os.ReadFile(filePath) +} + +// IsTextFile returns true for plain text files +func IsTextFile(ext string) bool { + textExts := map[string]bool{ + ".txt": true, ".md": true, ".markdown": true, ".text": true, ".log": true, + ".json": true, ".xml": true, ".csv": true, ".yaml": true, ".yml": true, + } + return textExts[ext] +} + +// AnalyzeWithVision uses K2.5 vision model +func AnalyzeWithVision(imageData []byte) (*DocumentAnalysis, error) { + if fireworksAPIKey == "" { + return nil, fmt.Errorf("FIREWORKS_API_KEY not set") + } + + b64 := base64.StdEncoding.EncodeToString(imageData) + + 
prompt := `Analyze this document image and extract: + +1. **Full Text**: Transcribe ALL visible text EXACTLY as it appears — in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Transcribe word-for-word in the source language (Russian, Dutch, German, French, etc.) formatted as clean Markdown: + - Use headers (##) for sections + - Use **bold** for labels/field names + - Use tables for tabular data (items, prices, etc.) + - Use bullet lists where appropriate + - Preserve ALL numbers, dates, amounts, and codes exactly as shown + +2. **Classification**: Categorize into exactly ONE of: + taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized + +3. **Document Type**: Specific type (e.g., "utility_bill", "receipt", "tax_form_w2") + +4. **Key Fields** (these may be in English for searchability): + - date: Document date (YYYY-MM-DD if possible) + - vendor: Company/organization name + - amount: Dollar/currency amount if present (e.g., "$123.45" or "809,400 BYN") + +5. **Title**: Specific English title (max 8 words) that identifies THIS document uniquely. Include the key distinguishing details: who sent it, what it's about, and when. Bad: "Financial Report" or "Invoice". Good: "N-able Technology Exchange Rate Loss Explanation Feb 2025" or "Duke Energy Electric Bill Oct 2024" or "IRS Form 1099-INT Chase Bank 2024" or "BayCare HomeCare Invoice $340 Nov 2025". Never use generic words like "Document", "Letter", "Report" alone — always qualify them. + +6. **Summary**: 1-2 sentence English description with key details. + +IMPORTANT: The full_text field MUST contain the verbatim transcription in the document's original language. This is non-negotiable. + +**Known proper nouns** (use these exact spellings when you see similar handwriting): +- Jongsma (surname — may look like "Jongoma", "Jongsoma", "Jongma") +- Johan (first name) +- Tatyana / Tanya (first name) +- St. 
Petersburg, Florida (city — not Russia) + +Respond in JSON ONLY: +{"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "title": "...", "summary": "...", "full_text": "..."}` + + reqBody := map[string]interface{}{ + "model": "accounts/fireworks/models/qwen3-vl-30b-a3b-instruct", + "max_tokens": 4096, + "messages": []map[string]interface{}{ + {"role": "system", "content": "You are a document analysis API. Output ONLY raw JSON. No thinking, no commentary, no code fences. First character must be {, last character must be }."}, + { + "role": "user", + "content": []map[string]interface{}{ + {"type": "image_url", "image_url": map[string]string{"url": "data:image/png;base64," + b64}}, + {"type": "text", "text": prompt}, + }, + }, + }, + } + + analysis, err := callFireworks(reqBody) + if err != nil { + // Retry once with minimal prompt to avoid triggering extended reasoning + log.Printf(" [AI] First attempt failed, retrying with simplified prompt...") + retryBody := map[string]interface{}{ + "model": "accounts/fireworks/models/qwen3-vl-30b-a3b-instruct", + "max_tokens": 4096, + "messages": []map[string]interface{}{ + {"role": "system", "content": "Output valid JSON only. No other text."}, + { + "role": "user", + "content": []map[string]interface{}{ + {"type": "image_url", "image_url": map[string]string{"url": "data:image/png;base64," + b64}}, + {"type": "text", "text": `Look at this document image. Transcribe ALL text verbatim (original language, do NOT translate). 
Return ONLY this JSON with no placeholders: +{"category":"","doc_type":"","date":"","vendor":"","amount":"","title":"","summary":"","full_text":""}`}, + }, + }, + }, + } + return callFireworks(retryBody) + } + return analysis, nil +} + +// AnalyzeText uses K2 text model for plain text files +func AnalyzeText(text, filename string) (*DocumentAnalysis, error) { + if fireworksAPIKey == "" { + return nil, fmt.Errorf("FIREWORKS_API_KEY not set") + } + + // Truncate long text + if len(text) > 50000 { + text = text[:50000] + } + + prompt := fmt.Sprintf(`Analyze this document: + +**Filename:** %s + +**Content:** +%s + +Categorize into ONE of: taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized + +Respond in JSON ONLY: +{"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "summary": "..."}`, filename, text) + + reqBody := map[string]interface{}{ + "model": "accounts/fireworks/models/kimi-k2-instruct-0905", + "max_tokens": 1024, + "messages": []map[string]interface{}{ + {"role": "user", "content": prompt}, + }, + } + + analysis, err := callFireworks(reqBody) + if err != nil { + return nil, err + } + analysis.FullText = text + return analysis, nil +} + +func callFireworks(reqBody map[string]interface{}) (*DocumentAnalysis, error) { + jsonBody, _ := json.Marshal(reqBody) + + req, _ := http.NewRequest("POST", fireworksBaseURL+"/chat/completions", bytes.NewReader(jsonBody)) + req.Header.Set("Authorization", "Bearer "+fireworksAPIKey) + req.Header.Set("Content-Type", "application/json") + + client := &http.Client{Timeout: 120 * time.Second} + resp, err := client.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + if resp.StatusCode != 200 { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("API error %d: %s", resp.StatusCode, string(body)) + } + + respBody, _ := io.ReadAll(resp.Body) + + var result struct { + Choices []struct { + Message struct { + 
Content string `json:"content"` + ReasoningContent string `json:"reasoning_content"` + } `json:"message"` + } `json:"choices"` + } + if err := json.Unmarshal(respBody, &result); err != nil { + return nil, err + } + + if len(result.Choices) == 0 { + return nil, fmt.Errorf("no response from API") + } + + content := result.Choices[0].Message.Content + reasoning := result.Choices[0].Message.ReasoningContent + + // K2.5 reasoning mode: actual JSON may be in content or reasoning_content + // Try content first, if it doesn't look like JSON, try reasoning_content + if !strings.Contains(content, "{") && reasoning != "" && strings.Contains(reasoning, "{") { + log.Printf(" [AI] Using reasoning_content (content had no JSON)") + content = reasoning + } + + // Strip markdown code fences (```json ... ``` or ``` ... ```) + content = strings.TrimSpace(content) + if strings.HasPrefix(content, "```") { + // Remove opening fence (```json or ```) + if idx := strings.Index(content, "\n"); idx >= 0 { + content = content[idx+1:] + } + // Remove closing fence + if idx := strings.LastIndex(content, "```"); idx >= 0 { + content = content[:idx] + } + content = strings.TrimSpace(content) + } + + // Extract JSON from response + if idx := strings.Index(content, "{"); idx >= 0 { + if end := strings.LastIndex(content, "}"); end > idx { + content = content[idx : end+1] + } + } + + var analysis DocumentAnalysis + if err := json.Unmarshal([]byte(content), &analysis); err != nil { + // Last resort: try to find a JSON object with braces matching + cleaned := extractJSONObject(content) + if cleaned != "" { + if err2 := json.Unmarshal([]byte(cleaned), &analysis); err2 != nil { + log.Printf(" [AI debug] Failed to parse even after cleanup. Content starts: %.200s", content) + return nil, fmt.Errorf("failed to parse response: %w", err) + } + } else { + log.Printf(" [AI debug] No JSON object found in response. 
Content starts: %.200s", content) + return nil, fmt.Errorf("failed to parse response: %w", err) + } + } + + // Validate category + validCats := map[string]bool{"taxes": true, "bills": true, "medical": true, "insurance": true, "legal": true, "financial": true, "expenses": true, "vehicles": true, "home": true, "personal": true, "contacts": true, "uncategorized": true} + if !validCats[analysis.Category] { + analysis.Category = "uncategorized" + } + + return &analysis, nil +} + +// extractJSONObject tries to find a balanced JSON object in a string +func extractJSONObject(s string) string { + start := strings.Index(s, "{") + if start < 0 { + return "" + } + depth := 0 + inString := false + escaped := false + for i := start; i < len(s); i++ { + c := s[i] + if escaped { + escaped = false + continue + } + if c == '\\' && inString { + escaped = true + continue + } + if c == '"' { + inString = !inString + continue + } + if inString { + continue + } + if c == '{' { + depth++ + } else if c == '}' { + depth-- + if depth == 0 { + return s[start : i+1] + } + } + } + return "" +} + +// GenerateEmbedding creates a vector embedding using Fireworks +func GenerateEmbedding(text string) ([]float32, error) { + if fireworksAPIKey == "" { + return nil, fmt.Errorf("FIREWORKS_API_KEY not set") + } + + // Truncate + if len(text) > 32000 { + text = text[:32000] + } + + reqBody := map[string]interface{}{ + "model": "fireworks/qwen3-embedding-8b", + "input": text, + } + jsonBody, _ := json.Marshal(reqBody) + + req, _ := http.NewRequest("POST", fireworksBaseURL+"/embeddings", bytes.NewReader(jsonBody)) + req.Header.Set("Authorization", "Bearer "+fireworksAPIKey) + req.Header.Set("Content-Type", "application/json") + + client := &http.Client{Timeout: 30 * time.Second} + resp, err := client.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + if resp.StatusCode != 200 { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("embedding API error %d: %s", 
resp.StatusCode, string(body)) + } + + var result struct { + Data []struct { + Embedding []float32 `json:"embedding"` + } `json:"data"` + } + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return nil, err + } + + if len(result.Data) == 0 { + return nil, fmt.Errorf("no embedding returned") + } + + return result.Data[0].Embedding, nil +} + +// GetPDFPageCount returns the number of pages in a PDF +func GetPDFPageCount(filePath string) int { + cmd := exec.Command("pdfinfo", filePath) + out, err := cmd.Output() + if err != nil { + return 1 + } + for _, line := range strings.Split(string(out), "\n") { + if strings.HasPrefix(line, "Pages:") { + var count int + fmt.Sscanf(line, "Pages: %d", &count) + return count + } + } + return 1 +} + +// ProcessPDFPageByPage extracts text from each page separately +func ProcessPDFPageByPage(filePath string, jobID string) (string, error) { + pageCount := GetPDFPageCount(filePath) + log.Printf(" Processing %d pages separately...", pageCount) + + var allText strings.Builder + + for page := 1; page <= pageCount; page++ { + UpdateJob(jobID, "ocr", fmt.Sprintf("Page %d/%d", page, pageCount)) + tmpDir, err := os.MkdirTemp("", "docsys-page") + if err != nil { + continue + } + + // Convert single page to PNG + outPrefix := filepath.Join(tmpDir, "page") + cmd := exec.Command("pdftoppm", "-png", "-f", fmt.Sprintf("%d", page), "-l", fmt.Sprintf("%d", page), "-r", "150", filePath, outPrefix) + if err := cmd.Run(); err != nil { + os.RemoveAll(tmpDir) + continue + } + + pngPath := filepath.Join(tmpDir, fmt.Sprintf("page-%d.png", page)) + imageData, err := os.ReadFile(pngPath) + os.RemoveAll(tmpDir) + if err != nil { + continue + } + + // OCR this page + log.Printf(" Page %d/%d...", page, pageCount) + pageAnalysis, err := AnalyzePageOnly(imageData, page) + if err != nil { + log.Printf(" Page %d failed: %v", page, err) + continue + } + + if pageAnalysis != "" { + allText.WriteString(fmt.Sprintf("\n\n---\n## Page %d\n\n", page)) + 
allText.WriteString(pageAnalysis) + } + } + + return allText.String(), nil +} + +// AnalyzePageOnly extracts just the text from a single page image +func AnalyzePageOnly(imageData []byte, pageNum int) (string, error) { + if fireworksAPIKey == "" { + return "", fmt.Errorf("FIREWORKS_API_KEY not set") + } + + b64 := base64.StdEncoding.EncodeToString(imageData) + + prompt := `Transcribe ALL visible text on this page EXACTLY as it appears, in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Output ONLY the transcribed text — no commentary, no analysis, no preamble, no "The document is..." sentences. Start directly with the content. + +FORMAT: Use ### for sections, **bold** for labels, markdown tables for tabular data, - bullets for lists. Preserve ALL numbers, dates, amounts, and values exactly as shown. If the document is in Russian, Dutch, German, French, or any other language — keep it in that language.` + + reqBody := map[string]interface{}{ + "model": "accounts/fireworks/models/qwen3-vl-30b-a3b-instruct", + "max_tokens": 4096, + "messages": []map[string]interface{}{ + { + "role": "user", + "content": []map[string]interface{}{ + {"type": "image_url", "image_url": map[string]string{"url": "data:image/png;base64," + b64}}, + {"type": "text", "text": prompt}, + }, + }, + }, + } + + jsonBody, _ := json.Marshal(reqBody) + req, _ := http.NewRequest("POST", fireworksBaseURL+"/chat/completions", bytes.NewReader(jsonBody)) + req.Header.Set("Authorization", "Bearer "+fireworksAPIKey) + req.Header.Set("Content-Type", "application/json") + + client := &http.Client{Timeout: 120 * time.Second} + resp, err := client.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + + if resp.StatusCode != 200 { + body, _ := io.ReadAll(resp.Body) + return "", fmt.Errorf("API error %d: %s", resp.StatusCode, string(body)) + } + + // Read raw response to debug content vs reasoning_content + rawBody, err := io.ReadAll(resp.Body) + if err != nil { + 
return "", err + } + + var result struct { + Choices []struct { + Message struct { + Content string `json:"content"` + ReasoningContent string `json:"reasoning_content"` + } `json:"message"` + } `json:"choices"` + } + if err := json.Unmarshal(rawBody, &result); err != nil { + return "", err + } + + if len(result.Choices) == 0 { + return "", fmt.Errorf("no response") + } + + content := result.Choices[0].Message.Content + reasoning := result.Choices[0].Message.ReasoningContent + + if reasoning != "" { + log.Printf(" [OCR debug] reasoning_content length: %d, content length: %d", len(reasoning), len(content)) + if len(content) > 100 { + log.Printf(" [OCR debug] content starts: %.100s", content) + } + } + + // If content is empty but reasoning has text, model put everything in wrong field + if strings.TrimSpace(content) == "" && reasoning != "" { + log.Printf(" [OCR debug] WARNING: content empty, using reasoning_content") + content = reasoning + } + + return strings.TrimSpace(content), nil +} + +// ProcessDocument handles the full document processing pipeline +func ProcessDocument(filePath string) (*Document, error) { + log.Printf("Processing: %s", filepath.Base(filePath)) + + ext := strings.ToLower(filepath.Ext(filePath)) + + // Get file hash + hash, err := FileHash(filePath) + if err != nil { + return nil, fmt.Errorf("hash failed: %w", err) + } + log.Printf(" Hash: %s", hash) + + // Start progress tracking + StartJob(hash, filepath.Base(filePath)) + defer FinishJob(hash) + + // Check if already fully processed (not pending) + if existing, _ := GetDocument(hash); existing != nil && existing.Status == "ready" { + log.Printf(" Already exists, skipping") + os.Remove(filePath) + return existing, nil + } + + var analysis *DocumentAnalysis + + if IsTextFile(ext) { + // Plain text — read and analyze + data, err := os.ReadFile(filePath) + if err != nil { + return nil, err + } + UpdateJob(hash, "classifying", "Analyzing text...") + log.Printf(" Analyzing text with K2...") + 
analysis, err = AnalyzeText(string(data), filepath.Base(filePath)) + if err != nil { + UpdateJob(hash, "error", err.Error()) + return nil, fmt.Errorf("text analysis failed: %w", err) + } + } else if IsOfficeFile(ext) { + // Office formats — extract text natively from ZIP/XML, no LibreOffice needed + UpdateJob(hash, "converting", "Extracting text...") + log.Printf(" Extracting text from %s...", ext) + text, err := ExtractOfficeText(filePath) + if err != nil { + UpdateJob(hash, "error", err.Error()) + return nil, fmt.Errorf("office text extraction failed: %w", err) + } + log.Printf(" Extracted %d chars, classifying...", len(text)) + UpdateJob(hash, "classifying", "Classifying...") + analysis, err = AnalyzeText(text, filepath.Base(filePath)) + if err != nil { + UpdateJob(hash, "error", err.Error()) + return nil, fmt.Errorf("text analysis failed: %w", err) + } + } else { + // Vision — convert to image and analyze + UpdateJob(hash, "converting", "Converting to image...") + log.Printf(" Converting to image...") + imageData, err := ConvertToImage(filePath) + if err != nil { + UpdateJob(hash, "error", err.Error()) + return nil, fmt.Errorf("image conversion failed: %w", err) + } + UpdateJob(hash, "ocr", "Analyzing first page...") + log.Printf(" Analyzing with K2.5 vision...") + analysis, err = AnalyzeWithVision(imageData) + if err != nil { + // Vision JSON extraction failed — fall back to two-step: plain-text OCR + text classifier + log.Printf(" AnalyzeWithVision failed (%v), falling back to OCR+classify...", err) + UpdateJob(hash, "ocr", "Falling back to OCR + classify...") + pageText, ocrErr := AnalyzePageOnly(imageData, 1) + if ocrErr != nil { + UpdateJob(hash, "error", ocrErr.Error()) + return nil, fmt.Errorf("vision analysis failed (primary: %v, fallback: %w)", err, ocrErr) + } + // Classify the extracted text + log.Printf(" OCR succeeded (%d chars), classifying...", len(pageText)) + UpdateJob(hash, "classifying", "Classifying extracted text...") + analysis, err = 
AnalyzeText(pageText, filepath.Base(filePath)) + if err != nil { + // Use minimal stub so at least the doc is stored with its text + log.Printf(" Classification failed too: %v — storing with minimal metadata", err) + analysis = &DocumentAnalysis{ + Category: "uncategorized", + DocType: "unknown", + Title: strings.TrimSuffix(filepath.Base(filePath), filepath.Ext(filePath)), + Summary: "Extraction failed — stored raw text only", + FullText: pageText, + } + } else { + analysis.FullText = pageText + } + } + + // For PDFs, process pages for accurate OCR + if ext == ".pdf" { + pageCount := GetPDFPageCount(filePath) + if pageCount > 1 { + log.Printf(" Multi-page PDF detected (%d pages)", pageCount) + UpdateJob(hash, "ocr", fmt.Sprintf("Multi-page PDF: %d pages", pageCount)) + fullText, err := ProcessPDFPageByPage(filePath, hash) + if err == nil && fullText != "" { + analysis.FullText = fullText + } + } else if analysis.FullText == "" || len(analysis.FullText) < 50 || strings.HasPrefix(analysis.FullText, "[") { + // Single-page but full_text is empty/placeholder — retry with AnalyzePageOnly + log.Printf(" Single-page PDF with bad full_text (%q) — retrying with AnalyzePageOnly...", analysis.FullText) + UpdateJob(hash, "ocr", "Retrying text extraction...") + if pageText, err := AnalyzePageOnly(imageData, 1); err == nil && pageText != "" { + analysis.FullText = pageText + } else if err != nil { + log.Printf(" AnalyzePageOnly fallback failed: %v", err) + } + } + } + } + + log.Printf(" Category: %s, Type: %s", analysis.Category, analysis.DocType) + + // Copy to store + storePath := filepath.Join(storeDir, hash+ext) + if err := copyFile(filePath, storePath); err != nil { + return nil, fmt.Errorf("store copy failed: %w", err) + } + + // Create document record + // Use title if provided, fall back to summary + title := analysis.Title + if title == "" { + title = analysis.Summary + } + + doc := &Document{ + ID: hash, + Title: title, + Category: analysis.Category, + Type: 
analysis.DocType, + Date: analysis.Date, + Amount: analysis.AmountString(), + Vendor: analysis.Vendor, + Summary: analysis.Summary, + FullText: analysis.FullText, + PDFPath: storePath, + OriginalFile: filepath.Base(filePath), + ProcessedAt: time.Now().Format(time.RFC3339), + Status: "ready", + } + + // Save to database + if err := InsertDocument(doc); err != nil { + return nil, fmt.Errorf("db insert failed: %w", err) + } + + // Generate embedding + if analysis.FullText != "" { + UpdateJob(hash, "embedding", "Generating search index...") + log.Printf(" Generating embedding...") + if emb, err := GenerateEmbedding(analysis.FullText); err == nil { + log.Printf(" Embedding: %d dimensions", len(emb)) + StoreEmbedding(hash, emb) + } else { + log.Printf(" Embedding failed: %v", err) + } + } + + // Remove from inbox + os.Remove(filePath) + + log.Printf(" ✓ Done: %s/%s", analysis.Category, hash) + return doc, nil +} + +func copyFile(src, dst string) error { + in, err := os.Open(src) + if err != nil { + return err + } + defer in.Close() + + out, err := os.Create(dst) + if err != nil { + return err + } + defer out.Close() + + _, err = io.Copy(out, in) + return err +} + +// ExtractOfficeText extracts plain text from DOCX/XLSX/PPTX natively. +// These are ZIP archives containing XML — no LibreOffice needed. 
+func ExtractOfficeText(filePath string) (string, error) { + ext := strings.ToLower(filepath.Ext(filePath)) + + r, err := zip.OpenReader(filePath) + if err != nil { + return "", fmt.Errorf("not a valid Office file: %w", err) + } + defer r.Close() + + // Which XML paths to extract per format + var targets []string + switch ext { + case ".docx", ".doc", ".odt", ".rtf": + targets = []string{"word/document.xml", "word/body.xml"} + case ".xlsx", ".xls": + // All sheets + for _, f := range r.File { + if strings.HasPrefix(f.Name, "xl/worksheets/sheet") && strings.HasSuffix(f.Name, ".xml") { + targets = append(targets, f.Name) + } + } + // Shared strings (cell values are stored here for xlsx) + targets = append(targets, "xl/sharedStrings.xml") + case ".pptx", ".ppt": + for _, f := range r.File { + if strings.HasPrefix(f.Name, "ppt/slides/slide") && strings.HasSuffix(f.Name, ".xml") { + targets = append(targets, f.Name) + } + } + default: + return "", fmt.Errorf("unsupported office format: %s", ext) + } + + var sb strings.Builder + for _, f := range r.File { + found := false + for _, t := range targets { + if f.Name == t { + found = true + break + } + } + if !found { + continue + } + + rc, err := f.Open() + if err != nil { + continue + } + data, err := io.ReadAll(rc) + rc.Close() + if err != nil { + continue + } + + sb.WriteString(xmlToText(data)) + sb.WriteString("\n") + } + + text := strings.TrimSpace(sb.String()) + if text == "" { + return "", fmt.Errorf("no text extracted from %s", filepath.Base(filePath)) + } + return text, nil +} + +// xmlToText strips XML tags and decodes entities, returning plain text. 
// xmlToText strips XML tags from data and decodes the predefined XML
// entities, returning the remaining character data as plain text with
// runs of whitespace collapsed to single spaces.
func xmlToText(data []byte) string {
	var sb strings.Builder
	inTag := false
	lastWasSpace := false

	for i := 0; i < len(data); i++ {
		c := data[i]
		switch {
		case c == '<':
			inTag = true
			// Emit a separator between elements so adjacent text runs
			// don't fuse into one word.
			if !lastWasSpace {
				sb.WriteByte(' ')
				lastWasSpace = true
			}
		case c == '>':
			inTag = false
		case !inTag:
			if c == ' ' || c == '\t' || c == '\n' || c == '\r' {
				if !lastWasSpace {
					sb.WriteByte(' ')
					lastWasSpace = true
				}
			} else {
				sb.WriteByte(c)
				lastWasSpace = false
			}
		}
	}

	// Decode the five predefined XML entities plus the numeric
	// newline/tab escapes. The previous code was corrupted: each
	// ReplaceAll mapped a character to itself (no-op), and the last two
	// calls replaced literal spaces with "\n" and "\t", mangling every
	// extracted document. "&amp;" must be decoded LAST so a
	// double-escaped sequence like "&amp;lt;" yields "&lt;", not "<".
	result := sb.String()
	result = strings.ReplaceAll(result, "&lt;", "<")
	result = strings.ReplaceAll(result, "&gt;", ">")
	result = strings.ReplaceAll(result, "&quot;", "\"")
	result = strings.ReplaceAll(result, "&apos;", "'")
	result = strings.ReplaceAll(result, "&#10;", "\n")
	result = strings.ReplaceAll(result, "&#9;", "\t")
	result = strings.ReplaceAll(result, "&amp;", "&")

	return strings.TrimSpace(result)
}
+func IsOfficeFile(ext string) bool { + return map[string]bool{ + ".docx": true, ".xlsx": true, ".pptx": true, + ".odt": true, ".ods": true, ".odp": true, + }[ext] +} diff --git a/main.go b/main.go index 4f3cf2d..540dbe9 100644 --- a/main.go +++ b/main.go @@ -72,6 +72,10 @@ func main() { "title": strings.Title, "safe": func(s string) template.HTML { return template.HTML(s) }, "multiply": func(a float64, b float64) float64 { return a * b }, + "isImage": func(filename string) bool { + ext := strings.ToLower(filepath.Ext(filename)) + return ext == ".jpg" || ext == ".jpeg" || ext == ".png" || ext == ".gif" || ext == ".webp" || ext == ".tiff" + }, } r := chi.NewRouter() @@ -84,6 +88,7 @@ func main() { // PDF serving r.Get("/pdf/{hash}", servePDF) + r.Get("/img/{hash}", serveImage) // Pages r.Get("/", dashboardHandler) @@ -323,6 +328,26 @@ func servePDF(w http.ResponseWriter, r *http.Request) { http.Error(w, "File not found", http.StatusNotFound) } +func serveImage(w http.ResponseWriter, r *http.Request) { + hash := chi.URLParam(r, "hash") + mimeTypes := map[string]string{ + ".jpg": "image/jpeg", ".jpeg": "image/jpeg", + ".png": "image/png", ".gif": "image/gif", + ".webp": "image/webp", ".tiff": "image/tiff", + } + for ext, mime := range mimeTypes { + path := filepath.Join(storeDir, hash+ext) + if _, err := os.Stat(path); err == nil { + w.Header().Set("Content-Type", mime) + w.Header().Set("Cache-Control", "private, max-age=3600") + http.ServeFile(w, r, path) + return + } + } + http.NotFound(w, r) +} + + // sanitizeFilename removes characters unsafe for use in Content-Disposition filenames. 
func sanitizeFilename(name string) string { replacer := strings.NewReplacer(`"`, "'", "/", "-", "\\", "-", "\n", " ", "\r", "") @@ -444,6 +469,7 @@ func ingestHandler(w http.ResponseWriter, r *http.Request) { var req struct { Filename string `json:"filename"` Content string `json:"content"` + URL string `json:"url"` Source string `json:"source"` Subject string `json:"subject"` From string `json:"from"` @@ -454,16 +480,73 @@ func ingestHandler(w http.ResponseWriter, r *http.Request) { return } - if req.Filename == "" || req.Content == "" { - http.Error(w, "filename and content are required", http.StatusBadRequest) - return - } + var data []byte - // Decode base64 content - data, err := base64.StdEncoding.DecodeString(req.Content) - if err != nil { - http.Error(w, "Invalid base64 content", http.StatusBadRequest) - return + if req.URL != "" { + // Fetch from URL + resp, err := http.Get(req.URL) + if err != nil { + http.Error(w, "Failed to fetch URL: "+err.Error(), http.StatusBadGateway) + return + } + defer resp.Body.Close() + if resp.StatusCode >= 400 { + http.Error(w, fmt.Sprintf("URL returned %d", resp.StatusCode), http.StatusBadGateway) + return + } + data, err = io.ReadAll(resp.Body) + if err != nil { + http.Error(w, "Failed to read URL content", http.StatusInternalServerError) + return + } + // Derive filename from URL or content-type if not provided + if req.Filename == "" { + ext := "" + ct := resp.Header.Get("Content-Type") + switch { + case strings.Contains(ct, "jpeg") || strings.Contains(ct, "jpg"): + ext = ".jpg" + case strings.Contains(ct, "png"): + ext = ".png" + case strings.Contains(ct, "gif"): + ext = ".gif" + case strings.Contains(ct, "webp"): + ext = ".webp" + case strings.Contains(ct, "pdf"): + ext = ".pdf" + case strings.Contains(ct, "tiff"): + ext = ".tiff" + default: + // Try to get extension from URL path + urlPath := resp.Request.URL.Path + if e := filepath.Ext(urlPath); e != "" { + ext = e + } else { + ext = ".bin" + } + } + // Use last path 
segment of URL as base name + base := filepath.Base(resp.Request.URL.Path) + if base == "." || base == "/" { + base = "url-import" + } + if filepath.Ext(base) == "" { + base += ext + } + req.Filename = base + } + } else { + if req.Filename == "" || req.Content == "" { + http.Error(w, "filename and content required, or provide url", http.StatusBadRequest) + return + } + // Decode base64 content + var err error + data, err = base64.StdEncoding.DecodeString(req.Content) + if err != nil { + http.Error(w, "Invalid base64 content", http.StatusBadRequest) + return + } } // Sanitize filename diff --git a/templates/dashboard.html b/templates/dashboard.html index c91d5cc..cfb8738 100644 --- a/templates/dashboard.html +++ b/templates/dashboard.html @@ -120,7 +120,7 @@ {{if .Stats.RecentUploads}}
{{range .Stats.RecentUploads}} - + {{else}} @@ -292,6 +299,51 @@ } }); + // Clipboard paste — handles Ctrl+V of images/screenshots + document.addEventListener('paste', async (e) => { + const items = Array.from(e.clipboardData.items).filter(i => i.type.startsWith('image/')); + if (!items.length) return; + e.preventDefault(); + progress.classList.remove('hidden'); + const pendingIds = []; + for (const item of items) { + const file = item.getAsFile(); + if (!file) continue; + const ext = item.type.split('/')[1] || 'png'; + const name = `paste-${Date.now()}.${ext}`; + try { + const formData = new FormData(); + formData.append('file', new File([file], name, { type: item.type })); + const res = await fetch('/api/upload', { method: 'POST', body: formData }); + const data = await res.json(); + if (data.status === 'success') { + showToast('✓ Pasted image', 'success'); + pendingIds.push(data.id); + } else { + showToast('Paste failed', 'error'); + } + } catch (err) { + showToast('Paste failed: ' + err.message, 'error'); + } + } + if (pendingIds.length > 0) { + sessionStorage.setItem('pendingDocs', JSON.stringify(pendingIds)); + window.location.reload(); + } else { + progress.classList.add('hidden'); + } + }); + + async function deleteDoc(id, title) { + if (!confirm('Delete "' + title + '"?')) return; + const res = await fetch('/api/document/' + id, { method: 'DELETE' }); + if (res.ok) { + window.location.reload(); + } else { + alert('Failed to delete document'); + } + } + // Check for pending docs on page load (from web upload) const pendingDocsJson = sessionStorage.getItem('pendingDocs'); if (pendingDocsJson) { diff --git a/templates/document.html b/templates/document.html index d547d60..90411dc 100644 --- a/templates/document.html +++ b/templates/document.html @@ -26,6 +26,12 @@
+