// Command main imports PDF medical documents into a dossier. Each PDF is sent
// to Gemini together with importPrompt, the JSON reply is decoded into an
// ExtractionResult, and one document entry is written per file. Running it
// with --delete as the first argument removes the entries tagged with batchTag
// instead.
package main

import (
	"encoding/base64"
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"time"

	"inou/lib"
)

// importPrompt is the instruction text sent to the model ahead of the PDF.
//
// The pieces below concatenate to exactly the text this file carried before it
// was reformatted (single spaces between clauses and one newline inside the
// guidelines); they are split only to keep the source readable.
const importPrompt = `Analyze this medical document and extract structured data. ` +
	`Output language: ` +
	`- Field names and types: always English ` +
	`- Descriptions, summaries, content: Russian (document language) ` +
	`- Medical terms: keep original ` +
	`Dates: ` +
	`- Determine format from document context (language, institution, country) ` +
	`- Output as YYYY-MM-DD (ISO) regardless of input format ` +
	`- Partial dates OK: "2020-05" or "at 3 months old" ` +
	`Respond with JSON only: ` +
	`{ ` +
	`"document": { ` +
	`"type": "consultation | lab_report | imaging_report | discharge_summary | prescription | other", ` +
	`"specialty": "neurology | cardiology | ophthalmology | etc", ` +
	`"date": "YYYY-MM-DD", ` +
	`"patient_age_at_doc": "9 months | 3 years | etc", ` +
	`"institution": "hospital or clinic name", ` +
	`"provider": "doctor name", ` +
	`"topics": ["searchable keywords"], ` +
	`"summary": "1-2 sentences: what information is in this document (not conclusions)" ` +
	`}, ` +
	`"events": [ ` +
	`{ ` +
	`"type": "procedure | medication | hospitalization | lab | vital | symptom | therapy | vaccination | milestone | device", ` +
	`"what": "description", ` +
	`"when": "YYYY-MM-DD or YYYY-MM or approximate", ` +
	`"where": "location if applicable", ` +
	`"details": "additional info" ` +
	`} ` +
	`], ` +
	`"assessments": [ ` +
	`{"by": "provider name", "states": "their clinical impression or opinion"} ` +
	`], ` +
	`"attributes": { ` +
	`"birth": {}, ` +
	`"allergies": [], ` +
	`"conditions": [], ` +
	`"family_history": {} ` +
	`}, ` +
	`"content": "Full document text as clean markdown with headers" ` +
	`} ` +
	`Guidelines: ` +
	`- Events: things that happened, with dates. Include type field. ` +
	"\n" +
	`- Assessments: opinions attributed to provider, never stated as fact ` +
	`- Attributes: characteristics of the person, flexible structure ` +
	`- Content: full text as markdown for future reprocessing`

const (
	// dossierID is the dossier every imported entry is written to (Anastasiia).
	dossierID = "63d5df76904b1ec5"

	// docsDir is the directory the PDF files are read from.
	docsDir = "/tank/inou/anastasiia-restored/documents"

	// batchTag is attached to every imported entry so the whole batch can be
	// found and deleted again with --delete.
	batchTag = "import-2025-01-24"
)

// ExtractionResult mirrors the JSON object that importPrompt asks the model to
// return.
type ExtractionResult struct {
	Document struct {
		Type        string   `json:"type"`
		Specialty   string   `json:"specialty"`
		Date        string   `json:"date"`
		PatientAge  string   `json:"patient_age_at_doc"`
		Institution string   `json:"institution"`
		Provider    string   `json:"provider"`
		Topics      []string `json:"topics"`
		Summary     string   `json:"summary"`
	} `json:"document"`
	Events      []map[string]interface{} `json:"events"`
	Assessments []map[string]interface{} `json:"assessments"`
	Attributes  map[string]interface{}   `json:"attributes"`
	Content     string                   `json:"content"`
}

// main initializes crypto, database and config, then either deletes the
// previously imported batch (--delete) or processes the files named on the
// command line (relative to docsDir), falling back to every *.pdf in docsDir.
// It exits with status 1 if initialization fails or any document fails.
func main() {
	if err := lib.CryptoInit("/tank/inou/master.key"); err != nil {
		fmt.Fprintf(os.Stderr, "CryptoInit: %v\n", err)
		os.Exit(1)
	}
	if err := lib.DBInit("/tank/inou/data/inou.db"); err != nil {
		fmt.Fprintf(os.Stderr, "DBInit: %v\n", err)
		os.Exit(1)
	}
	lib.ConfigInit()

	// Check for --delete flag.
	if len(os.Args) > 1 && os.Args[1] == "--delete" {
		deleteImported()
		return
	}

	// Check for specific files to process.
	var files []string
	if len(os.Args) > 1 {
		for _, arg := range os.Args[1:] {
			files = append(files, filepath.Join(docsDir, arg))
		}
	} else {
		var err error
		files, err = filepath.Glob(filepath.Join(docsDir, "*.pdf"))
		if err != nil {
			fmt.Fprintf(os.Stderr, "Glob: %v\n", err)
			os.Exit(1)
		}
	}

	fmt.Printf("Found %d PDF files\n\n", len(files))

	failed := 0
	for i, file := range files {
		// Small delay between requests to avoid rate limiting. Pausing before
		// every file but the first also covers the case where the previous
		// request failed, and avoids a pointless wait after the last file.
		if i > 0 {
			time.Sleep(2 * time.Second)
		}

		filename := filepath.Base(file)
		fmt.Printf("[%d/%d] Processing: %s\n", i+1, len(files), filename)

		if err := processDocument(file, filename); err != nil {
			fmt.Fprintf(os.Stderr, "  ERROR: %v\n", err)
			failed++
			continue
		}
		fmt.Println("  OK")
	}

	// Surface partial failure through the exit status so wrappers notice it.
	if failed > 0 {
		fmt.Fprintf(os.Stderr, "\n%d of %d files failed\n", failed, len(files))
		os.Exit(1)
	}
}

// processDocument reads the PDF at filePath, asks Gemini to extract structured
// data from it, and stores the result as a document entry in dossierID.
// filename is recorded as the entry's value and as "original_filename" in the
// entry data. It returns a wrapped error if reading, the Gemini call, decoding
// the reply, encoding the entry data, or writing the entry fails.
func processDocument(filePath, filename string) error {
	// Read PDF.
	pdfBytes, err := os.ReadFile(filePath)
	if err != nil {
		return fmt.Errorf("read file: %w", err)
	}

	// Call Gemini with the prompt followed by the PDF as inline base64 data.
	parts := []lib.GeminiPart{
		{Text: importPrompt},
		{
			InlineData: &lib.GeminiInlineData{
				MimeType: "application/pdf",
				Data:     base64.StdEncoding.EncodeToString(pdfBytes),
			},
		},
	}

	// Use higher token limit for full document extraction.
	maxTokens := 8192
	config := &lib.GeminiConfig{
		MaxOutputTokens: &maxTokens,
	}

	respText, err := lib.CallGeminiMultimodal(parts, config)
	if err != nil {
		return fmt.Errorf("gemini: %w", err)
	}

	// Parse response.
	var result ExtractionResult
	if err := json.Unmarshal([]byte(respText), &result); err != nil {
		// The excerpt is cut at a byte offset; ToValidUTF8 drops a multi-byte
		// character that the cut split in half instead of printing garbage.
		excerpt := strings.ToValidUTF8(respText[:min(200, len(respText))], "")
		return fmt.Errorf("parse response: %w (response: %s)", err, excerpt)
	}

	// Build data JSON.
	data := map[string]interface{}{
		"original_filename": filename,
		"content":           result.Content,
		"events":            result.Events,
		"assessments":       result.Assessments,
		"attributes":        result.Attributes,
		"extraction": map[string]interface{}{
			"provider":    result.Document.Provider,
			"institution": result.Document.Institution,
			"specialty":   result.Document.Specialty,
			"patient_age": result.Document.PatientAge,
		},
	}
	dataJSON, err := json.Marshal(data)
	if err != nil {
		return fmt.Errorf("encode entry data: %w", err)
	}

	// Create entry.
	entry := lib.Entry{
		EntryID:   lib.NewID(),
		DossierID: dossierID,
		Category:  lib.CategoryDocument,
		Type:      result.Document.Type,
		Value:     filename, // Use filename as title for now.
		Summary:   result.Document.Summary,
		Timestamp: parseDocumentDate(result.Document.Date),
		Tags:      buildTags(result.Document.Topics),
		Data:      string(dataJSON),
	}

	// NOTE(review): the meaning of the empty first argument is defined by
	// lib.EntryWrite and is not visible from this file.
	if err := lib.EntryWrite("", &entry); err != nil {
		return fmt.Errorf("save entry: %w", err)
	}

	return nil
}

// parseDocumentDate converts a full or partial ISO date ("2006-01-02",
// "2006-01" or "2006") into Unix seconds. It returns 0 when date is empty or
// matches none of those layouts (for example "at 3 months old").
func parseDocumentDate(date string) int64 {
	for _, layout := range []string{"2006-01-02", "2006-01", "2006"} {
		if t, err := time.Parse(layout, date); err == nil {
			return t.Unix()
		}
	}
	return 0
}

// buildTags returns the comma-separated tag string for an imported entry:
// batchTag first, followed by every non-blank topic. Building it from a slice
// avoids the trailing comma a plain concatenation leaves when topics is empty.
func buildTags(topics []string) string {
	tags := make([]string, 0, len(topics)+1)
	tags = append(tags, batchTag)
	for _, topic := range topics {
		if topic = strings.TrimSpace(topic); topic != "" {
			tags = append(tags, topic)
		}
	}
	return strings.Join(tags, ",")
}

// hasTag reports whether the comma-separated list tags contains tag as a whole
// element (surrounding whitespace ignored), not merely as a substring.
func hasTag(tags, tag string) bool {
	for _, t := range strings.Split(tags, ",") {
		if strings.TrimSpace(t) == tag {
			return true
		}
	}
	return false
}

// deleteImported deletes every entry of dossierID that carries batchTag. A
// failed query terminates the process with status 1; a failed delete is
// reported on stderr and the remaining entries are still attempted.
func deleteImported() {
	fmt.Println("Deleting entries with tag:", batchTag)

	// NOTE(review): the -1 and "" arguments are passed through unchanged;
	// their meaning is defined by lib.EntryQuery and is not visible here.
	entries, err := lib.EntryQuery(dossierID, -1, "")
	if err != nil {
		fmt.Fprintf(os.Stderr, "Query: %v\n", err)
		os.Exit(1)
	}

	var toDelete []*lib.Entry
	for _, e := range entries {
		// Exact tag match: a substring test would also select entries whose
		// tags merely contain the batch tag inside a longer tag.
		if hasTag(e.Tags, batchTag) {
			toDelete = append(toDelete, e)
		}
	}

	fmt.Printf("Found %d entries to delete\n", len(toDelete))

	for _, e := range toDelete {
		if err := lib.EntryDelete("", e.DossierID, &lib.Filter{EntryID: e.EntryID}); err != nil {
			fmt.Fprintf(os.Stderr, "Delete %s: %v\n", e.EntryID, err)
		}
	}

	fmt.Println("Done")
}

// min returns the smaller of a and b. It is kept as a local helper so the file
// does not depend on the built-in min added in Go 1.21.
func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}