Fix extraction: don't translate, fallback OCR+classify path for non-JSON responses

- Add 'DO NOT translate, preserve original language' to vision prompts
- Shorter/tighter JSON prompt to reduce K2.5 reasoning verbosity
- Fallback: when AnalyzeWithVision returns no JSON, run AnalyzePageOnly (plain text), then AnalyzeText (classify)
- Fall back to AnalyzePageOnly for single-page PDFs with empty/placeholder full_text
- Switch model back to kimi-k2p5 (only vision model on this Fireworks account)
- Build with CGO_ENABLED=1 -tags fts5 (required for SQLite FTS5)
2026-02-25 14:01:59 -05:00 · 2026-02-25 14:01:59 -05:00 · d962c9839d
parent 00d8f7c94a
commit d962c9839d
1 changed files with 49 additions and 15 deletions
--- a/ai.go
+++ b/ai.go
@ -142,26 +142,28 @@ func AnalyzeWithVision(imageData []byte) (*DocumentAnalysis, error) {
 	prompt := `Analyze this document image and extract:
-1. **Full Text**: Transcribe ALL visible text, formatted as clean Markdown:
+1. **Full Text**: Transcribe ALL visible text EXACTLY as it appears — in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Transcribe word-for-word in the source language (Russian, Dutch, German, French, etc.) formatted as clean Markdown:
   - Use headers (##) for sections
   - Use **bold** for labels/field names
   - Use tables for tabular data (items, prices, etc.)
   - Use bullet lists where appropriate
-   - Preserve important structure but make it readable
+   - Preserve ALL numbers, dates, amounts, and codes exactly as shown
 2. **Classification**: Categorize into exactly ONE of:
   taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized
 3. **Document Type**: Specific type (e.g., "utility_bill", "receipt", "tax_form_w2")
-4. **Key Fields**:
+4. **Key Fields** (these may be in English for searchability):
   - date: Document date (YYYY-MM-DD if possible)
   - vendor: Company/organization name
-   - amount: Dollar amount if present (e.g., "$123.45")
+   - amount: Dollar/currency amount if present (e.g., "$123.45" or "809,400 BYN")
-5. **Title**: SHORT title (max 6-8 words), e.g. "Apple Store Mac Mini Receipt" or "Electric Bill March 2025"
+5. **Title**: SHORT English title (max 6-8 words), e.g. "Apple Store Mac Mini Receipt" or "Electric Bill March 2025"
-6. **Summary**: 1-2 sentence description with key details.
+6. **Summary**: 1-2 sentence English description with key details.
 IMPORTANT: The full_text field MUST contain the verbatim transcription in the document's original language. This is non-negotiable.
 Respond in JSON ONLY:
 {"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "title": "...", "summary": "...", "full_text": "..."}`
@ -170,7 +172,7 @@ Respond in JSON ONLY:
 		"model":      "accounts/fireworks/models/kimi-k2p5",
 		"max_tokens": 4096,
 		"messages": []map[string]interface{}{
-			{"role": "system", "content": "You are a document analysis API. You MUST respond with raw JSON only. No markdown, no code fences, no explanation text. Start your response with { and end with }."},
+			{"role": "system", "content": "You are a document analysis API. Output ONLY raw JSON. No thinking, no commentary, no code fences. First character must be {, last character must be }."},
 			{
 				"role": "user",
 				"content": []map[string]interface{}{
@ -183,7 +185,7 @@ Respond in JSON ONLY:
 	analysis, err := callFireworks(reqBody)
 	if err != nil {
-		// Retry once with a simpler prompt that's harder for the model to misinterpret
+		// Retry once with minimal prompt to avoid triggering extended reasoning
 		log.Printf("  [AI] First attempt failed, retrying with simplified prompt...")
 		retryBody := map[string]interface{}{
 			"model":      "accounts/fireworks/models/kimi-k2p5",
@ -194,8 +196,8 @@ Respond in JSON ONLY:
 					"role": "user",
 					"content": []map[string]interface{}{
 						{"type": "image_url", "image_url": map[string]string{"url": "data:image/png;base64," + b64}},
-						{"type": "text", "text": `Look at this document. Return ONLY this JSON (fill in values):
+						{"type": "text", "text": `Look at this document image. Transcribe ALL text verbatim (original language, do NOT translate). Return ONLY this JSON with no placeholders:
-{"category":"uncategorized","doc_type":"unknown","date":"","vendor":"","amount":"","title":"Short Title Here","summary":"One sentence.","full_text":"All visible text here"}`},
+{"category":"","doc_type":"","date":"","vendor":"","amount":"","title":"","summary":"","full_text":""}`},
 					},
 				},
 			},
@ -494,9 +496,9 @@ func AnalyzePageOnly(imageData []byte, pageNum int) (string, error) {
 	b64 := base64.StdEncoding.EncodeToString(imageData)
-	prompt := `Transcribe ALL visible text on this page as clean markdown. Output ONLY the transcribed text — no commentary, no analysis, no preamble, no "The document is..." sentences. Start directly with the content.
+	prompt := `Transcribe ALL visible text on this page EXACTLY as it appears, in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Output ONLY the transcribed text — no commentary, no analysis, no preamble, no "The document is..." sentences. Start directly with the content.
-FORMAT: Use ### for sections, **bold** for labels, markdown tables for tabular data, - bullets for lists. Preserve all numbers, dates, and values exactly as shown.`
+FORMAT: Use ### for sections, **bold** for labels, markdown tables for tabular data, - bullets for lists. Preserve ALL numbers, dates, amounts, and values exactly as shown. If the document is in Russian, Dutch, German, French, or any other language — keep it in that language.`
 	reqBody := map[string]interface{}{
 		"model":      "accounts/fireworks/models/kimi-k2p5",
@ -622,11 +624,34 @@ func ProcessDocument(filePath string) (*Document, error) {
 		log.Printf("  Analyzing with K2.5 vision...")
 		analysis, err = AnalyzeWithVision(imageData)
 		if err != nil {
-			UpdateJob(hash, "error", err.Error())
+			// Vision JSON extraction failed — fall back to two-step: plain-text OCR + text classifier
-			return nil, fmt.Errorf("vision analysis failed: %w", err)
+			log.Printf("  AnalyzeWithVision failed (%v), falling back to OCR+classify...", err)
 			UpdateJob(hash, "ocr", "Falling back to OCR + classify...")
 			pageText, ocrErr := AnalyzePageOnly(imageData, 1)
 			if ocrErr != nil {
 				UpdateJob(hash, "error", ocrErr.Error())
 				return nil, fmt.Errorf("vision analysis failed (primary: %v, fallback: %w)", err, ocrErr)
 			}
 			// Classify the extracted text
 			log.Printf("  OCR succeeded (%d chars), classifying...", len(pageText))
 			UpdateJob(hash, "classifying", "Classifying extracted text...")
 			analysis, err = AnalyzeText(pageText, filepath.Base(filePath))
 			if err != nil {
 				// Use minimal stub so at least the doc is stored with its text
 				log.Printf("  Classification failed too: %v — storing with minimal metadata", err)
 				analysis = &DocumentAnalysis{
 					Category: "uncategorized",
 					DocType:  "unknown",
 					Title:    strings.TrimSuffix(filepath.Base(filePath), filepath.Ext(filePath)),
 					Summary:  "Extraction failed — stored raw text only",
 					FullText: pageText,
 				}
 			} else {
 				analysis.FullText = pageText
 			}
 		}
-		// For multi-page PDFs, process each page separately for accurate OCR
+		// For PDFs, process pages for accurate OCR
 		if ext == ".pdf" {
 			pageCount := GetPDFPageCount(filePath)
 			if pageCount > 1 {
@ -636,6 +661,15 @@ func ProcessDocument(filePath string) (*Document, error) {
 				if err == nil && fullText != "" {
 					analysis.FullText = fullText
 				}
 			} else if analysis.FullText == "" || len(analysis.FullText) < 50 || strings.HasPrefix(analysis.FullText, "[") {
 				// Single-page but full_text is empty/placeholder — retry with AnalyzePageOnly
 				log.Printf("  Single-page PDF with bad full_text (%q) — retrying with AnalyzePageOnly...", analysis.FullText)
 				UpdateJob(hash, "ocr", "Retrying text extraction...")
 				if pageText, err := AnalyzePageOnly(imageData, 1); err == nil && pageText != "" {
 					analysis.FullText = pageText
 				} else if err != nil {
 					log.Printf("  AnalyzePageOnly fallback failed: %v", err)
 				}
 			}
 		}
 	}