From d962c9839dfe5ed42bef3accd4504823ab743d2b Mon Sep 17 00:00:00 2001 From: James Date: Wed, 25 Feb 2026 14:01:59 -0500 Subject: [PATCH] Fix extraction: don't translate, fallback OCR+classify path for non-JSON responses - Add 'DO NOT translate, preserve original language' to vision prompts - Shorter/tighter JSON prompt to reduce K2.5 reasoning verbosity - Fallback: when AnalyzeWithVision returns no JSON, do AnalyzePageOnly (plain text) then AnalyzeText (classify) - Fallback to AnalyzePageOnly for single-page PDFs with empty/placeholder full_text - Switch model back to kimi-k2p5 (only vision model on this Fireworks account) - Build with CGO_ENABLED=1 -tags fts5 (required for SQLite FTS5) --- ai.go | 64 +++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/ai.go b/ai.go index c20d32f..f88f8ea 100644 --- a/ai.go +++ b/ai.go @@ -142,26 +142,28 @@ func AnalyzeWithVision(imageData []byte) (*DocumentAnalysis, error) { prompt := `Analyze this document image and extract: -1. **Full Text**: Transcribe ALL visible text, formatted as clean Markdown: +1. **Full Text**: Transcribe ALL visible text EXACTLY as it appears — in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Transcribe word-for-word in the source language (Russian, Dutch, German, French, etc.) formatted as clean Markdown: - Use headers (##) for sections - Use **bold** for labels/field names - Use tables for tabular data (items, prices, etc.) - Use bullet lists where appropriate - - Preserve important structure but make it readable + - Preserve ALL numbers, dates, amounts, and codes exactly as shown 2. **Classification**: Categorize into exactly ONE of: taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized 3. **Document Type**: Specific type (e.g., "utility_bill", "receipt", "tax_form_w2") -4. **Key Fields**: +4. **Key Fields** (these may be in English for searchability): - date: Document date (YYYY-MM-DD if possible) - vendor: Company/organization name - - amount: Dollar amount if present (e.g., "$123.45") + - amount: Dollar/currency amount if present (e.g., "$123.45" or "809,400 BYN") -5. **Title**: SHORT title (max 6-8 words), e.g. "Apple Store Mac Mini Receipt" or "Electric Bill March 2025" +5. **Title**: SHORT English title (max 6-8 words), e.g. "Apple Store Mac Mini Receipt" or "Electric Bill March 2025" -6. **Summary**: 1-2 sentence description with key details. +6. **Summary**: 1-2 sentence English description with key details. + +IMPORTANT: The full_text field MUST contain the verbatim transcription in the document's original language. This is non-negotiable. Respond in JSON ONLY: {"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "title": "...", "summary": "...", "full_text": "..."}` @@ -170,7 +172,7 @@ Respond in JSON ONLY: "model": "accounts/fireworks/models/kimi-k2p5", "max_tokens": 4096, "messages": []map[string]interface{}{ - {"role": "system", "content": "You are a document analysis API. You MUST respond with raw JSON only. No markdown, no code fences, no explanation text. Start your response with { and end with }."}, + {"role": "system", "content": "You are a document analysis API. Output ONLY raw JSON. No thinking, no commentary, no code fences. First character must be {, last character must be }."}, { "role": "user", "content": []map[string]interface{}{ @@ -183,7 +185,7 @@ Respond in JSON ONLY: analysis, err := callFireworks(reqBody) if err != nil { - // Retry once with a simpler prompt that's harder for the model to misinterpret + // Retry once with minimal prompt to avoid triggering extended reasoning log.Printf(" [AI] First attempt failed, retrying with simplified prompt...") retryBody := map[string]interface{}{ "model": "accounts/fireworks/models/kimi-k2p5", @@ -194,8 +196,8 @@ Respond in JSON ONLY: "role": "user", "content": []map[string]interface{}{ {"type": "image_url", "image_url": map[string]string{"url": "data:image/png;base64," + b64}}, - {"type": "text", "text": `Look at this document. Return ONLY this JSON (fill in values): -{"category":"uncategorized","doc_type":"unknown","date":"","vendor":"","amount":"","title":"Short Title Here","summary":"One sentence.","full_text":"All visible text here"}`}, + {"type": "text", "text": `Look at this document image. Transcribe ALL text verbatim (original language, do NOT translate). Return ONLY this JSON with no placeholders: +{"category":"","doc_type":"","date":"","vendor":"","amount":"","title":"","summary":"","full_text":""}`}, }, }, }, @@ -494,9 +496,9 @@ func AnalyzePageOnly(imageData []byte, pageNum int) (string, error) { b64 := base64.StdEncoding.EncodeToString(imageData) - prompt := `Transcribe ALL visible text on this page as clean markdown. Output ONLY the transcribed text — no commentary, no analysis, no preamble, no "The document is..." sentences. Start directly with the content. + prompt := `Transcribe ALL visible text on this page EXACTLY as it appears, in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Output ONLY the transcribed text — no commentary, no analysis, no preamble, no "The document is..." sentences. Start directly with the content. -FORMAT: Use ### for sections, **bold** for labels, markdown tables for tabular data, - bullets for lists. Preserve all numbers, dates, and values exactly as shown.` +FORMAT: Use ### for sections, **bold** for labels, markdown tables for tabular data, - bullets for lists. Preserve ALL numbers, dates, amounts, and values exactly as shown. If the document is in Russian, Dutch, German, French, or any other language — keep it in that language.` reqBody := map[string]interface{}{ "model": "accounts/fireworks/models/kimi-k2p5", @@ -622,11 +624,34 @@ func ProcessDocument(filePath string) (*Document, error) { log.Printf(" Analyzing with K2.5 vision...") analysis, err = AnalyzeWithVision(imageData) if err != nil { - UpdateJob(hash, "error", err.Error()) - return nil, fmt.Errorf("vision analysis failed: %w", err) + // Vision JSON extraction failed — fall back to two-step: plain-text OCR + text classifier + log.Printf(" AnalyzeWithVision failed (%v), falling back to OCR+classify...", err) + UpdateJob(hash, "ocr", "Falling back to OCR + classify...") + pageText, ocrErr := AnalyzePageOnly(imageData, 1) + if ocrErr != nil { + UpdateJob(hash, "error", ocrErr.Error()) + return nil, fmt.Errorf("vision analysis failed (primary: %v, fallback: %w)", err, ocrErr) + } + // Classify the extracted text + log.Printf(" OCR succeeded (%d chars), classifying...", len(pageText)) + UpdateJob(hash, "classifying", "Classifying extracted text...") + analysis, err = AnalyzeText(pageText, filepath.Base(filePath)) + if err != nil { + // Use minimal stub so at least the doc is stored with its text + log.Printf(" Classification failed too: %v — storing with minimal metadata", err) + analysis = &DocumentAnalysis{ + Category: "uncategorized", + DocType: "unknown", + Title: strings.TrimSuffix(filepath.Base(filePath), filepath.Ext(filePath)), + Summary: "Extraction failed — stored raw text only", + FullText: pageText, + } + } else { + analysis.FullText = pageText + } } - // For multi-page PDFs, process each page separately for accurate OCR + // For PDFs, process pages for accurate OCR if ext == ".pdf" { pageCount := GetPDFPageCount(filePath) if pageCount > 1 { @@ -636,6 +661,15 @@ func ProcessDocument(filePath string) (*Document, error) { if err == nil && fullText != "" { analysis.FullText = fullText } + } else if analysis.FullText == "" || len(analysis.FullText) < 50 || strings.HasPrefix(analysis.FullText, "[") { + // Single-page but full_text is empty/placeholder — retry with AnalyzePageOnly + log.Printf(" Single-page PDF with bad full_text (%q) — retrying with AnalyzePageOnly...", analysis.FullText) + UpdateJob(hash, "ocr", "Retrying text extraction...") + if pageText, err := AnalyzePageOnly(imageData, 1); err == nil && pageText != "" { + analysis.FullText = pageText + } else if err != nil { + log.Printf(" AnalyzePageOnly fallback failed: %v", err) + } } } }