From d962c9839dfe5ed42bef3accd4504823ab743d2b Mon Sep 17 00:00:00 2001
From: James <james@jongsma.me>
Date: Wed, 25 Feb 2026 14:01:59 -0500
Subject: [PATCH] Fix extraction: don't translate, fallback OCR+classify path
 for non-JSON responses

- Add 'DO NOT translate, preserve original language' to vision prompts
- Shorter/tighter JSON prompt to reduce K2.5 reasoning verbosity
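- Fallback: when AnalyzeWithVision fails (e.g. returns no JSON), run AnalyzePageOnly (plain-text OCR) then AnalyzeText (classify); if classification also fails, store the document with minimal metadata and the raw text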
- Fall back to AnalyzePageOnly for single-page PDFs whose full_text is empty, shorter than 50 characters, or a placeholder (starts with "[")
- Switch model back to kimi-k2p5 (only vision model on this Fireworks account)
- Build with CGO_ENABLED=1 -tags fts5 (required for SQLite FTS5)
---
 ai.go | 64 +++++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 49 insertions(+), 15 deletions(-)

diff --git a/ai.go b/ai.go
index c20d32f..f88f8ea 100644
--- a/ai.go
+++ b/ai.go
@@ -142,26 +142,28 @@ func AnalyzeWithVision(imageData []byte) (*DocumentAnalysis, error) {
 
 	prompt := `Analyze this document image and extract:
 
-1. **Full Text**: Transcribe ALL visible text, formatted as clean Markdown:
+1. **Full Text**: Transcribe ALL visible text EXACTLY as it appears — in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Transcribe word-for-word in the source language (Russian, Dutch, German, French, etc.) formatted as clean Markdown:
    - Use headers (##) for sections
    - Use **bold** for labels/field names
    - Use tables for tabular data (items, prices, etc.)
    - Use bullet lists where appropriate
-   - Preserve important structure but make it readable
+   - Preserve ALL numbers, dates, amounts, and codes exactly as shown
 
 2. **Classification**: Categorize into exactly ONE of:
    taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized
 
 3. **Document Type**: Specific type (e.g., "utility_bill", "receipt", "tax_form_w2")
 
-4. **Key Fields**:
+4. **Key Fields** (these may be in English for searchability):
    - date: Document date (YYYY-MM-DD if possible)
    - vendor: Company/organization name
-   - amount: Dollar amount if present (e.g., "$123.45")
+   - amount: Dollar/currency amount if present (e.g., "$123.45" or "809,400 BYN")
 
-5. **Title**: SHORT title (max 6-8 words), e.g. "Apple Store Mac Mini Receipt" or "Electric Bill March 2025"
+5. **Title**: SHORT English title (max 6-8 words), e.g. "Apple Store Mac Mini Receipt" or "Electric Bill March 2025"
 
-6. **Summary**: 1-2 sentence description with key details.
+6. **Summary**: 1-2 sentence English description with key details.
+
+IMPORTANT: The full_text field MUST contain the verbatim transcription in the document's original language. This is non-negotiable.
 
 Respond in JSON ONLY:
 {"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "title": "...", "summary": "...", "full_text": "..."}`
@@ -170,7 +172,7 @@ Respond in JSON ONLY:
 		"model":      "accounts/fireworks/models/kimi-k2p5",
 		"max_tokens": 4096,
 		"messages": []map[string]interface{}{
-			{"role": "system", "content": "You are a document analysis API. You MUST respond with raw JSON only. No markdown, no code fences, no explanation text. Start your response with { and end with }."},
+			{"role": "system", "content": "You are a document analysis API. Output ONLY raw JSON. No thinking, no commentary, no code fences. First character must be {, last character must be }."},
 			{
 				"role": "user",
 				"content": []map[string]interface{}{
@@ -183,7 +185,7 @@ Respond in JSON ONLY:
 
 	analysis, err := callFireworks(reqBody)
 	if err != nil {
-		// Retry once with a simpler prompt that's harder for the model to misinterpret
+		// Retry once with minimal prompt to avoid triggering extended reasoning
 		log.Printf("  [AI] First attempt failed, retrying with simplified prompt...")
 		retryBody := map[string]interface{}{
 			"model":      "accounts/fireworks/models/kimi-k2p5",
@@ -194,8 +196,8 @@ Respond in JSON ONLY:
 					"role": "user",
 					"content": []map[string]interface{}{
 						{"type": "image_url", "image_url": map[string]string{"url": "data:image/png;base64," + b64}},
-						{"type": "text", "text": `Look at this document. Return ONLY this JSON (fill in values):
-{"category":"uncategorized","doc_type":"unknown","date":"","vendor":"","amount":"","title":"Short Title Here","summary":"One sentence.","full_text":"All visible text here"}`},
+						{"type": "text", "text": `Look at this document image. Transcribe ALL text verbatim (original language, do NOT translate). Return ONLY this JSON with no placeholders:
+{"category":"","doc_type":"","date":"","vendor":"","amount":"","title":"","summary":"","full_text":""}`},
 					},
 				},
 			},
@@ -494,9 +496,9 @@ func AnalyzePageOnly(imageData []byte, pageNum int) (string, error) {
 
 	b64 := base64.StdEncoding.EncodeToString(imageData)
 
-	prompt := `Transcribe ALL visible text on this page as clean markdown. Output ONLY the transcribed text — no commentary, no analysis, no preamble, no "The document is..." sentences. Start directly with the content.
+	prompt := `Transcribe ALL visible text on this page EXACTLY as it appears, in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Output ONLY the transcribed text — no commentary, no analysis, no preamble, no "The document is..." sentences. Start directly with the content.
 
-FORMAT: Use ### for sections, **bold** for labels, markdown tables for tabular data, - bullets for lists. Preserve all numbers, dates, and values exactly as shown.`
+FORMAT: Use ### for sections, **bold** for labels, markdown tables for tabular data, - bullets for lists. Preserve ALL numbers, dates, amounts, and values exactly as shown. If the document is in Russian, Dutch, German, French, or any other language — keep it in that language.`
 
 	reqBody := map[string]interface{}{
 		"model":      "accounts/fireworks/models/kimi-k2p5",
@@ -622,11 +624,34 @@ func ProcessDocument(filePath string) (*Document, error) {
 		log.Printf("  Analyzing with K2.5 vision...")
 		analysis, err = AnalyzeWithVision(imageData)
 		if err != nil {
-			UpdateJob(hash, "error", err.Error())
-			return nil, fmt.Errorf("vision analysis failed: %w", err)
+			// Vision JSON extraction failed — fall back to two-step: plain-text OCR + text classifier
+			log.Printf("  AnalyzeWithVision failed (%v), falling back to OCR+classify...", err)
+			UpdateJob(hash, "ocr", "Falling back to OCR + classify...")
+			pageText, ocrErr := AnalyzePageOnly(imageData, 1)
+			if ocrErr != nil {
+				UpdateJob(hash, "error", ocrErr.Error())
+				return nil, fmt.Errorf("vision analysis failed (primary: %v, fallback: %w)", err, ocrErr)
+			}
+			// Classify the extracted text
+			log.Printf("  OCR succeeded (%d chars), classifying...", len(pageText))
+			UpdateJob(hash, "classifying", "Classifying extracted text...")
+			analysis, err = AnalyzeText(pageText, filepath.Base(filePath))
+			if err != nil {
+				// Use minimal stub so at least the doc is stored with its text
+				log.Printf("  Classification failed too: %v — storing with minimal metadata", err)
+				analysis = &DocumentAnalysis{
+					Category: "uncategorized",
+					DocType:  "unknown",
+					Title:    strings.TrimSuffix(filepath.Base(filePath), filepath.Ext(filePath)),
+					Summary:  "Extraction failed — stored raw text only",
+					FullText: pageText,
+				}
+			} else {
+				analysis.FullText = pageText
+			}
 		}
 
-		// For multi-page PDFs, process each page separately for accurate OCR
+		// For PDFs, process pages for accurate OCR
 		if ext == ".pdf" {
 			pageCount := GetPDFPageCount(filePath)
 			if pageCount > 1 {
@@ -636,6 +661,15 @@ func ProcessDocument(filePath string) (*Document, error) {
 				if err == nil && fullText != "" {
 					analysis.FullText = fullText
 				}
+			} else if analysis.FullText == "" || len(analysis.FullText) < 50 || strings.HasPrefix(analysis.FullText, "[") {
+				// Single-page but full_text is empty/placeholder — retry with AnalyzePageOnly
+				log.Printf("  Single-page PDF with bad full_text (%q) — retrying with AnalyzePageOnly...", analysis.FullText)
+				UpdateJob(hash, "ocr", "Retrying text extraction...")
+				if pageText, err := AnalyzePageOnly(imageData, 1); err == nil && pageText != "" {
+					analysis.FullText = pageText
+				} else if err != nil {
+					log.Printf("  AnalyzePageOnly fallback failed: %v", err)
+				}
 			}
 		}
 	}