Fix extraction: don't translate; add OCR+classify fallback path for non-JSON responses

- Add 'DO NOT translate, preserve original language' to vision prompts
- Shorter/tighter JSON prompt to reduce K2.5 reasoning verbosity
- Fallback: when AnalyzeWithVision fails to return JSON, run AnalyzePageOnly (plain-text OCR), then AnalyzeText (classification); if classification also fails, store the raw text with minimal metadata
- Fall back to AnalyzePageOnly for single-page PDFs whose full_text is empty, very short (under 50 bytes), or a bracketed placeholder
- Switch model back to kimi-k2p5 (only vision model on this Fireworks account)
- Build with CGO_ENABLED=1 -tags fts5 (required for SQLite FTS5)
This commit is contained in:
James 2026-02-25 14:01:59 -05:00
parent 00d8f7c94a
commit d962c9839d
1 changed file with 49 additions and 15 deletions

64
ai.go
View File

@ -142,26 +142,28 @@ func AnalyzeWithVision(imageData []byte) (*DocumentAnalysis, error) {
prompt := `Analyze this document image and extract:
1. **Full Text**: Transcribe ALL visible text, formatted as clean Markdown:
1. **Full Text**: Transcribe ALL visible text EXACTLY as it appears in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Transcribe word-for-word in the source language (Russian, Dutch, German, French, etc.) formatted as clean Markdown:
- Use headers (##) for sections
- Use **bold** for labels/field names
- Use tables for tabular data (items, prices, etc.)
- Use bullet lists where appropriate
- Preserve important structure but make it readable
- Preserve ALL numbers, dates, amounts, and codes exactly as shown
2. **Classification**: Categorize into exactly ONE of:
taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized
3. **Document Type**: Specific type (e.g., "utility_bill", "receipt", "tax_form_w2")
4. **Key Fields**:
4. **Key Fields** (these may be in English for searchability):
- date: Document date (YYYY-MM-DD if possible)
- vendor: Company/organization name
- amount: Dollar amount if present (e.g., "$123.45")
- amount: Dollar/currency amount if present (e.g., "$123.45" or "809,400 BYN")
5. **Title**: SHORT title (max 6-8 words), e.g. "Apple Store Mac Mini Receipt" or "Electric Bill March 2025"
5. **Title**: SHORT English title (max 6-8 words), e.g. "Apple Store Mac Mini Receipt" or "Electric Bill March 2025"
6. **Summary**: 1-2 sentence description with key details.
6. **Summary**: 1-2 sentence English description with key details.
IMPORTANT: The full_text field MUST contain the verbatim transcription in the document's original language. This is non-negotiable.
Respond in JSON ONLY:
{"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "title": "...", "summary": "...", "full_text": "..."}`
@ -170,7 +172,7 @@ Respond in JSON ONLY:
"model": "accounts/fireworks/models/kimi-k2p5",
"max_tokens": 4096,
"messages": []map[string]interface{}{
{"role": "system", "content": "You are a document analysis API. You MUST respond with raw JSON only. No markdown, no code fences, no explanation text. Start your response with { and end with }."},
{"role": "system", "content": "You are a document analysis API. Output ONLY raw JSON. No thinking, no commentary, no code fences. First character must be {, last character must be }."},
{
"role": "user",
"content": []map[string]interface{}{
@ -183,7 +185,7 @@ Respond in JSON ONLY:
analysis, err := callFireworks(reqBody)
if err != nil {
// Retry once with a simpler prompt that's harder for the model to misinterpret
// Retry once with minimal prompt to avoid triggering extended reasoning
log.Printf(" [AI] First attempt failed, retrying with simplified prompt...")
retryBody := map[string]interface{}{
"model": "accounts/fireworks/models/kimi-k2p5",
@ -194,8 +196,8 @@ Respond in JSON ONLY:
"role": "user",
"content": []map[string]interface{}{
{"type": "image_url", "image_url": map[string]string{"url": "data:image/png;base64," + b64}},
{"type": "text", "text": `Look at this document. Return ONLY this JSON (fill in values):
{"category":"uncategorized","doc_type":"unknown","date":"","vendor":"","amount":"","title":"Short Title Here","summary":"One sentence.","full_text":"All visible text here"}`},
{"type": "text", "text": `Look at this document image. Transcribe ALL text verbatim (original language, do NOT translate). Return ONLY this JSON with no placeholders:
{"category":"","doc_type":"","date":"","vendor":"","amount":"","title":"","summary":"","full_text":""}`},
},
},
},
@ -494,9 +496,9 @@ func AnalyzePageOnly(imageData []byte, pageNum int) (string, error) {
b64 := base64.StdEncoding.EncodeToString(imageData)
prompt := `Transcribe ALL visible text on this page as clean markdown. Output ONLY the transcribed text no commentary, no analysis, no preamble, no "The document is..." sentences. Start directly with the content.
prompt := `Transcribe ALL visible text on this page EXACTLY as it appears, in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Output ONLY the transcribed text no commentary, no analysis, no preamble, no "The document is..." sentences. Start directly with the content.
FORMAT: Use ### for sections, **bold** for labels, markdown tables for tabular data, - bullets for lists. Preserve all numbers, dates, and values exactly as shown.`
FORMAT: Use ### for sections, **bold** for labels, markdown tables for tabular data, - bullets for lists. Preserve ALL numbers, dates, amounts, and values exactly as shown. If the document is in Russian, Dutch, German, French, or any other language keep it in that language.`
reqBody := map[string]interface{}{
"model": "accounts/fireworks/models/kimi-k2p5",
@ -622,11 +624,34 @@ func ProcessDocument(filePath string) (*Document, error) {
log.Printf(" Analyzing with K2.5 vision...")
analysis, err = AnalyzeWithVision(imageData)
if err != nil {
UpdateJob(hash, "error", err.Error())
return nil, fmt.Errorf("vision analysis failed: %w", err)
// Vision JSON extraction failed — fall back to two-step: plain-text OCR + text classifier
log.Printf(" AnalyzeWithVision failed (%v), falling back to OCR+classify...", err)
UpdateJob(hash, "ocr", "Falling back to OCR + classify...")
pageText, ocrErr := AnalyzePageOnly(imageData, 1)
if ocrErr != nil {
UpdateJob(hash, "error", ocrErr.Error())
return nil, fmt.Errorf("vision analysis failed (primary: %v, fallback: %w)", err, ocrErr)
}
// Classify the extracted text
log.Printf(" OCR succeeded (%d chars), classifying...", len(pageText))
UpdateJob(hash, "classifying", "Classifying extracted text...")
analysis, err = AnalyzeText(pageText, filepath.Base(filePath))
if err != nil {
// Use minimal stub so at least the doc is stored with its text
log.Printf(" Classification failed too: %v — storing with minimal metadata", err)
analysis = &DocumentAnalysis{
Category: "uncategorized",
DocType: "unknown",
Title: strings.TrimSuffix(filepath.Base(filePath), filepath.Ext(filePath)),
Summary: "Extraction failed — stored raw text only",
FullText: pageText,
}
} else {
analysis.FullText = pageText
}
}
// For multi-page PDFs, process each page separately for accurate OCR
// For PDFs, process pages for accurate OCR
if ext == ".pdf" {
pageCount := GetPDFPageCount(filePath)
if pageCount > 1 {
@ -636,6 +661,15 @@ func ProcessDocument(filePath string) (*Document, error) {
if err == nil && fullText != "" {
analysis.FullText = fullText
}
} else if analysis.FullText == "" || len(analysis.FullText) < 50 || strings.HasPrefix(analysis.FullText, "[") {
// Single-page but full_text is empty/placeholder — retry with AnalyzePageOnly
log.Printf(" Single-page PDF with bad full_text (%q) — retrying with AnalyzePageOnly...", analysis.FullText)
UpdateJob(hash, "ocr", "Retrying text extraction...")
if pageText, err := AnalyzePageOnly(imageData, 1); err == nil && pageText != "" {
analysis.FullText = pageText
} else if err != nil {
log.Printf(" AnalyzePageOnly fallback failed: %v", err)
}
}
}
}