// Command test-doc-import runs the document-import pipeline end to end:
// PDF → page images → OCR transcription → per-category LLM extraction →
// entry creation under a document record.
package main

import (
	"encoding/base64"
	"encoding/json"
	"fmt"
	"inou/lib"
	"log"
	"os"
	"os/exec"
	"path/filepath"
	"sort"
	"strings"
	"sync"
	"time"
)

// extractedEntry is one fact the extraction model returns for a category.
// The JSON tags mirror the schema the prompts (extractionPreamble plus the
// per-category extract_*.md templates) instruct the model to emit.
type extractedEntry struct {
	Type    string `json:"type"`
	Value   string `json:"value"`
	Summary string `json:"summary"`
	// SummaryTranslated is optional; when present it is stored alongside
	// the original-language summary in the entry's data payload.
	SummaryTranslated string `json:"summary_translated,omitempty"`
	// SearchKey is a normalized English dedup key, e.g.
	// "surgery:vp-shunt:2020-07" (format defined in extractionPreamble).
	SearchKey string `json:"search_key,omitempty"`
	// Timestamp is a date string in one of several layouts; parsed later,
	// falling back to the import time when absent or unparseable.
	Timestamp string                 `json:"timestamp,omitempty"`
	Data      map[string]interface{} `json:"data"`
	// SourceSpans point back at the passages in the OCR markdown that
	// support this entry, for source-text highlighting.
	SourceSpans []sourceSpan `json:"source_spans,omitempty"`
}

// sourceSpan identifies a passage in the source markdown by its verbatim
// first (Start) and last (End) few words, as requested from the model.
type sourceSpan struct {
	Start string `json:"start"`
	End   string `json:"end"`
}

// extractionPreamble is prepended verbatim to every category extraction
// prompt. It is model-facing text — do not reword casually.
var extractionPreamble = `IMPORTANT RULES (apply to all entries you return):
- Do NOT translate. Keep ALL text values (summary, value, data fields) in the ORIGINAL language of the document.
- For each entry, include "source_spans": an array of {"start": "...", "end": "..."} where start/end are the VERBATIM first and last 5-8 words of the relevant passage(s) in the source markdown. This is used to highlight the source text. Multiple spans are allowed.
- For each entry, include "search_key": a short normalized deduplication key in English lowercase. Format: "thing:qualifier:YYYY-MM" or "thing:qualifier" for undated facts. Examples: "surgery:vp-shunt:2020-07", "device:ommaya-reservoir:2020-04", "diagnosis:hydrocephalus", "provider:peraud:ulm". Same real-world fact across different documents MUST produce the same key.
`

// loadExtractionPrompts discovers all extract_*.md files and returns {categoryID: prompt content}.
func loadExtractionPrompts() map[int]string { pattern := filepath.Join(lib.TrackerPromptsDir(), "extract_*.md") files, _ := filepath.Glob(pattern) prompts := make(map[int]string) for _, f := range files { base := filepath.Base(f) name := strings.TrimPrefix(base, "extract_") name = strings.TrimSuffix(name, ".md") catID, ok := lib.CategoryFromString[name] if !ok { fmt.Printf("Unknown category in prompt file: %s\n", base) continue } data, err := os.ReadFile(f) if err != nil { continue } prompts[catID] = string(data) } return prompts } const ( visionModel = "accounts/fireworks/models/qwen3-vl-30b-a3b-instruct" textModel = "accounts/fireworks/models/qwen3-vl-30b-a3b-instruct" ) var ocrPrompt = `You are a medical document OCR system. Produce a faithful markdown transcription of this document. The images are sequential pages of the same document. Process them in order: page 1 first, then page 2, etc. Rules: - Read each page top-to-bottom, left-to-right - Preserve ALL text, dates, values, names, addresses, and structure - Translate nothing — keep the original language - Use markdown headers, lists, and formatting to reflect the document structure - For tables, use markdown tables. Preserve numeric values exactly. - Be complete — do not skip or summarize anything - Do not describe visual elements (logos, signatures) — only transcribe text - For handwritten text, transcribe as accurately as possible. Mark uncertain readings with [?]` func main() { if len(os.Args) < 3 { fmt.Fprintf(os.Stderr, "Usage: test-doc-import \n") os.Exit(1) } dossierID := os.Args[1] pdfPath := os.Args[2] fileName := filepath.Base(pdfPath) if err := lib.Init(); err != nil { log.Fatalf("lib.Init: %v", err) } lib.ConfigInit() lib.InitPrompts("tracker_prompts") fmt.Printf("Prompts dir: %s\n", lib.TrackerPromptsDir()) // 1. 
Convert PDF to PNG pages tempDir, _ := os.MkdirTemp("", "doc-import-*") defer os.RemoveAll(tempDir) prefix := filepath.Join(tempDir, "page") cmd := exec.Command("pdftoppm", "-png", "-r", "200", pdfPath, prefix) if out, err := cmd.CombinedOutput(); err != nil { log.Fatalf("pdftoppm: %v: %s", err, out) } pageFiles, _ := filepath.Glob(prefix + "*.png") sort.Strings(pageFiles) fmt.Printf("%d pages converted\n", len(pageFiles)) // 2. OCR content := []interface{}{ map[string]string{"type": "text", "text": ocrPrompt}, } for _, pf := range pageFiles { imgBytes, _ := os.ReadFile(pf) b64 := base64.StdEncoding.EncodeToString(imgBytes) content = append(content, map[string]interface{}{ "type": "image_url", "image_url": map[string]string{ "url": "data:image/png;base64," + b64, }, }) } fmt.Printf("Calling OCR...\n") start := time.Now() markdown, err := lib.CallFireworks(visionModel, []map[string]interface{}{ {"role": "user", "content": content}, }, 16384) if err != nil { log.Fatalf("OCR: %v", err) } fmt.Printf("OCR done: %d chars in %.1fs\n", len(markdown), time.Since(start).Seconds()) // 3. Create document entry now := time.Now().Unix() docData := map[string]interface{}{ "markdown": markdown, "pages": len(pageFiles), } docDataJSON, _ := json.Marshal(docData) docEntry := &lib.Entry{ DossierID: dossierID, Category: lib.CategoryDocument, Type: "pdf", Value: fileName, Timestamp: now, Data: string(docDataJSON), } lib.EntryWrite("", docEntry) docID := docEntry.EntryID fmt.Printf("Document entry: %s\n", docID) // 4. 
Fan out extraction type catResult struct { Category int Entries []extractedEntry } var mu sync.Mutex var results []catResult var wg sync.WaitGroup prompts := loadExtractionPrompts() fmt.Printf("Starting %d extraction calls...\n", len(prompts)) extractStart := time.Now() for catID, promptTmpl := range prompts { wg.Add(1) go func(catID int, promptTmpl string) { defer wg.Done() catName := lib.CategoryName(catID) prompt := extractionPreamble + "\n" + strings.ReplaceAll(promptTmpl, "{{MARKDOWN}}", markdown) msgs := []map[string]interface{}{ {"role": "user", "content": prompt}, } resp, err := lib.CallFireworks(textModel, msgs, 4096) if err != nil { fmt.Printf(" [%s] API error: %v\n", catName, err) return } resp = strings.TrimSpace(resp) if resp == "null" || resp == "" { fmt.Printf(" [%s] → null\n", catName) return } var entries []extractedEntry if err := json.Unmarshal([]byte(resp), &entries); err != nil { var single extractedEntry if err2 := json.Unmarshal([]byte(resp), &single); err2 == nil && single.Summary != "" { entries = []extractedEntry{single} } else { fmt.Printf(" [%s] → parse error: %v\n Response: %s\n", catName, err, resp[:min(200, len(resp))]) return } } if len(entries) == 0 { fmt.Printf(" [%s] → empty array\n", catName) return } fmt.Printf(" [%s] → %d entries\n", catName, len(entries)) mu.Lock() results = append(results, catResult{Category: catID, Entries: entries}) mu.Unlock() }(catID, promptTmpl) } wg.Wait() fmt.Printf("Extraction done in %.1fs: %d categories\n", time.Since(extractStart).Seconds(), len(results)) // 5. 
Create entries var totalEntries int for _, r := range results { for _, e := range r.Entries { dataMap := map[string]interface{}{"source_doc_id": docID} for k, v := range e.Data { dataMap[k] = v } if len(e.SourceSpans) > 0 { dataMap["source_spans"] = e.SourceSpans } if e.SummaryTranslated != "" { dataMap["summary_translated"] = e.SummaryTranslated } dataJSON, _ := json.Marshal(dataMap) ts := now if e.Timestamp != "" { for _, layout := range []string{"2006-01-02", "02.01.2006", "01/02/2006"} { if t, err := time.Parse(layout, e.Timestamp); err == nil { ts = t.Unix() break } } } entry := &lib.Entry{ DossierID: dossierID, ParentID: docID, Category: r.Category, Type: e.Type, Value: e.Value, Summary: e.Summary, SearchKey: e.SearchKey, Timestamp: ts, Data: string(dataJSON), } lib.EntryWrite("", entry) totalEntries++ } } fmt.Printf("Created %d entries under doc %s\n", totalEntries, docID) // 6. Show results fmt.Println("\n=== Results ===") for _, r := range results { catName := lib.CategoryName(r.Category) for _, e := range r.Entries { spans := "" if len(e.SourceSpans) > 0 { spans = fmt.Sprintf(" spans=%d", len(e.SourceSpans)) } trans := "" if e.SummaryTranslated != "" { trans = fmt.Sprintf(" [%s]", e.SummaryTranslated) } fmt.Printf(" [%s] Type=%s Summary=%s%s%s\n", catName, e.Type, e.Summary, trans, spans) } } } func min(a, b int) int { if a < b { return a } return b }