Fix extraction: don't translate; add fallback OCR+classify path for non-JSON responses
- Add 'DO NOT translate, preserve original language' to vision prompts
- Shorten/tighten the JSON prompt to reduce K2.5 reasoning verbosity
- Fallback: when AnalyzeWithVision returns no JSON, run AnalyzePageOnly (plain-text OCR), then AnalyzeText (classify)
- Fall back to AnalyzePageOnly for single-page PDFs with empty/placeholder full_text
- Switch model back to kimi-k2p5 (only vision model on this Fireworks account)
- Build with CGO_ENABLED=1 -tags fts5 (required for SQLite FTS5)
This commit is contained in:
parent
00d8f7c94a
commit
d962c9839d
64
ai.go
64
ai.go
|
|
@ -142,26 +142,28 @@ func AnalyzeWithVision(imageData []byte) (*DocumentAnalysis, error) {
|
||||||
|
|
||||||
prompt := `Analyze this document image and extract:
|
prompt := `Analyze this document image and extract:
|
||||||
|
|
||||||
1. **Full Text**: Transcribe ALL visible text, formatted as clean Markdown:
|
1. **Full Text**: Transcribe ALL visible text EXACTLY as it appears — in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Transcribe word-for-word in the source language (Russian, Dutch, German, French, etc.) formatted as clean Markdown:
|
||||||
- Use headers (##) for sections
|
- Use headers (##) for sections
|
||||||
- Use **bold** for labels/field names
|
- Use **bold** for labels/field names
|
||||||
- Use tables for tabular data (items, prices, etc.)
|
- Use tables for tabular data (items, prices, etc.)
|
||||||
- Use bullet lists where appropriate
|
- Use bullet lists where appropriate
|
||||||
- Preserve important structure but make it readable
|
- Preserve ALL numbers, dates, amounts, and codes exactly as shown
|
||||||
|
|
||||||
2. **Classification**: Categorize into exactly ONE of:
|
2. **Classification**: Categorize into exactly ONE of:
|
||||||
taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized
|
taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized
|
||||||
|
|
||||||
3. **Document Type**: Specific type (e.g., "utility_bill", "receipt", "tax_form_w2")
|
3. **Document Type**: Specific type (e.g., "utility_bill", "receipt", "tax_form_w2")
|
||||||
|
|
||||||
4. **Key Fields**:
|
4. **Key Fields** (these may be in English for searchability):
|
||||||
- date: Document date (YYYY-MM-DD if possible)
|
- date: Document date (YYYY-MM-DD if possible)
|
||||||
- vendor: Company/organization name
|
- vendor: Company/organization name
|
||||||
- amount: Dollar amount if present (e.g., "$123.45")
|
- amount: Dollar/currency amount if present (e.g., "$123.45" or "809,400 BYN")
|
||||||
|
|
||||||
5. **Title**: SHORT title (max 6-8 words), e.g. "Apple Store Mac Mini Receipt" or "Electric Bill March 2025"
|
5. **Title**: SHORT English title (max 6-8 words), e.g. "Apple Store Mac Mini Receipt" or "Electric Bill March 2025"
|
||||||
|
|
||||||
6. **Summary**: 1-2 sentence description with key details.
|
6. **Summary**: 1-2 sentence English description with key details.
|
||||||
|
|
||||||
|
IMPORTANT: The full_text field MUST contain the verbatim transcription in the document's original language. This is non-negotiable.
|
||||||
|
|
||||||
Respond in JSON ONLY:
|
Respond in JSON ONLY:
|
||||||
{"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "title": "...", "summary": "...", "full_text": "..."}`
|
{"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "title": "...", "summary": "...", "full_text": "..."}`
|
||||||
|
|
@ -170,7 +172,7 @@ Respond in JSON ONLY:
|
||||||
"model": "accounts/fireworks/models/kimi-k2p5",
|
"model": "accounts/fireworks/models/kimi-k2p5",
|
||||||
"max_tokens": 4096,
|
"max_tokens": 4096,
|
||||||
"messages": []map[string]interface{}{
|
"messages": []map[string]interface{}{
|
||||||
{"role": "system", "content": "You are a document analysis API. You MUST respond with raw JSON only. No markdown, no code fences, no explanation text. Start your response with { and end with }."},
|
{"role": "system", "content": "You are a document analysis API. Output ONLY raw JSON. No thinking, no commentary, no code fences. First character must be {, last character must be }."},
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": []map[string]interface{}{
|
"content": []map[string]interface{}{
|
||||||
|
|
@ -183,7 +185,7 @@ Respond in JSON ONLY:
|
||||||
|
|
||||||
analysis, err := callFireworks(reqBody)
|
analysis, err := callFireworks(reqBody)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// Retry once with a simpler prompt that's harder for the model to misinterpret
|
// Retry once with minimal prompt to avoid triggering extended reasoning
|
||||||
log.Printf(" [AI] First attempt failed, retrying with simplified prompt...")
|
log.Printf(" [AI] First attempt failed, retrying with simplified prompt...")
|
||||||
retryBody := map[string]interface{}{
|
retryBody := map[string]interface{}{
|
||||||
"model": "accounts/fireworks/models/kimi-k2p5",
|
"model": "accounts/fireworks/models/kimi-k2p5",
|
||||||
|
|
@ -194,8 +196,8 @@ Respond in JSON ONLY:
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": []map[string]interface{}{
|
"content": []map[string]interface{}{
|
||||||
{"type": "image_url", "image_url": map[string]string{"url": "data:image/png;base64," + b64}},
|
{"type": "image_url", "image_url": map[string]string{"url": "data:image/png;base64," + b64}},
|
||||||
{"type": "text", "text": `Look at this document. Return ONLY this JSON (fill in values):
|
{"type": "text", "text": `Look at this document image. Transcribe ALL text verbatim (original language, do NOT translate). Return ONLY this JSON with no placeholders:
|
||||||
{"category":"uncategorized","doc_type":"unknown","date":"","vendor":"","amount":"","title":"Short Title Here","summary":"One sentence.","full_text":"All visible text here"}`},
|
{"category":"","doc_type":"","date":"","vendor":"","amount":"","title":"","summary":"","full_text":""}`},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|
@ -494,9 +496,9 @@ func AnalyzePageOnly(imageData []byte, pageNum int) (string, error) {
|
||||||
|
|
||||||
b64 := base64.StdEncoding.EncodeToString(imageData)
|
b64 := base64.StdEncoding.EncodeToString(imageData)
|
||||||
|
|
||||||
prompt := `Transcribe ALL visible text on this page as clean markdown. Output ONLY the transcribed text — no commentary, no analysis, no preamble, no "The document is..." sentences. Start directly with the content.
|
prompt := `Transcribe ALL visible text on this page EXACTLY as it appears, in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Output ONLY the transcribed text — no commentary, no analysis, no preamble, no "The document is..." sentences. Start directly with the content.
|
||||||
|
|
||||||
FORMAT: Use ### for sections, **bold** for labels, markdown tables for tabular data, - bullets for lists. Preserve all numbers, dates, and values exactly as shown.`
|
FORMAT: Use ### for sections, **bold** for labels, markdown tables for tabular data, - bullets for lists. Preserve ALL numbers, dates, amounts, and values exactly as shown. If the document is in Russian, Dutch, German, French, or any other language — keep it in that language.`
|
||||||
|
|
||||||
reqBody := map[string]interface{}{
|
reqBody := map[string]interface{}{
|
||||||
"model": "accounts/fireworks/models/kimi-k2p5",
|
"model": "accounts/fireworks/models/kimi-k2p5",
|
||||||
|
|
@ -622,11 +624,34 @@ func ProcessDocument(filePath string) (*Document, error) {
|
||||||
log.Printf(" Analyzing with K2.5 vision...")
|
log.Printf(" Analyzing with K2.5 vision...")
|
||||||
analysis, err = AnalyzeWithVision(imageData)
|
analysis, err = AnalyzeWithVision(imageData)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
UpdateJob(hash, "error", err.Error())
|
// Vision JSON extraction failed — fall back to two-step: plain-text OCR + text classifier
|
||||||
return nil, fmt.Errorf("vision analysis failed: %w", err)
|
log.Printf(" AnalyzeWithVision failed (%v), falling back to OCR+classify...", err)
|
||||||
|
UpdateJob(hash, "ocr", "Falling back to OCR + classify...")
|
||||||
|
pageText, ocrErr := AnalyzePageOnly(imageData, 1)
|
||||||
|
if ocrErr != nil {
|
||||||
|
UpdateJob(hash, "error", ocrErr.Error())
|
||||||
|
return nil, fmt.Errorf("vision analysis failed (primary: %v, fallback: %w)", err, ocrErr)
|
||||||
|
}
|
||||||
|
// Classify the extracted text
|
||||||
|
log.Printf(" OCR succeeded (%d chars), classifying...", len(pageText))
|
||||||
|
UpdateJob(hash, "classifying", "Classifying extracted text...")
|
||||||
|
analysis, err = AnalyzeText(pageText, filepath.Base(filePath))
|
||||||
|
if err != nil {
|
||||||
|
// Use minimal stub so at least the doc is stored with its text
|
||||||
|
log.Printf(" Classification failed too: %v — storing with minimal metadata", err)
|
||||||
|
analysis = &DocumentAnalysis{
|
||||||
|
Category: "uncategorized",
|
||||||
|
DocType: "unknown",
|
||||||
|
Title: strings.TrimSuffix(filepath.Base(filePath), filepath.Ext(filePath)),
|
||||||
|
Summary: "Extraction failed — stored raw text only",
|
||||||
|
FullText: pageText,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
analysis.FullText = pageText
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// For multi-page PDFs, process each page separately for accurate OCR
|
// For PDFs, process pages for accurate OCR
|
||||||
if ext == ".pdf" {
|
if ext == ".pdf" {
|
||||||
pageCount := GetPDFPageCount(filePath)
|
pageCount := GetPDFPageCount(filePath)
|
||||||
if pageCount > 1 {
|
if pageCount > 1 {
|
||||||
|
|
@ -636,6 +661,15 @@ func ProcessDocument(filePath string) (*Document, error) {
|
||||||
if err == nil && fullText != "" {
|
if err == nil && fullText != "" {
|
||||||
analysis.FullText = fullText
|
analysis.FullText = fullText
|
||||||
}
|
}
|
||||||
|
} else if analysis.FullText == "" || len(analysis.FullText) < 50 || strings.HasPrefix(analysis.FullText, "[") {
|
||||||
|
// Single-page but full_text is empty/placeholder — retry with AnalyzePageOnly
|
||||||
|
log.Printf(" Single-page PDF with bad full_text (%q) — retrying with AnalyzePageOnly...", analysis.FullText)
|
||||||
|
UpdateJob(hash, "ocr", "Retrying text extraction...")
|
||||||
|
if pageText, err := AnalyzePageOnly(imageData, 1); err == nil && pageText != "" {
|
||||||
|
analysis.FullText = pageText
|
||||||
|
} else if err != nil {
|
||||||
|
log.Printf(" AnalyzePageOnly fallback failed: %v", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue