248 lines
6.5 KiB
Go
248 lines
6.5 KiB
Go
package main
|
|
|
|
import (
|
|
"encoding/base64"
|
|
"encoding/json"
|
|
"fmt"
|
|
"inou/lib"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// importPrompt is the instruction text sent to the model together with each
// PDF; processDocument passes it as the first part of the multimodal request.
// It asks for a JSON-only reply whose top-level keys (document, events,
// assessments, attributes, content) are the keys ExtractionResult decodes.
// Field names stay English, free-text values are requested in Russian, and
// dates are requested in ISO form with partial dates permitted.
const importPrompt = `Analyze this medical document and extract structured data.
|
|
|
|
Output language:
|
|
- Field names and types: always English
|
|
- Descriptions, summaries, content: Russian (document language)
|
|
- Medical terms: keep original
|
|
|
|
Dates:
|
|
- Determine format from document context (language, institution, country)
|
|
- Output as YYYY-MM-DD (ISO) regardless of input format
|
|
- Partial dates OK: "2020-05" or "at 3 months old"
|
|
|
|
Respond with JSON only:
|
|
|
|
{
|
|
"document": {
|
|
"type": "consultation | lab_report | imaging_report | discharge_summary | prescription | other",
|
|
"specialty": "neurology | cardiology | ophthalmology | etc",
|
|
"date": "YYYY-MM-DD",
|
|
"patient_age_at_doc": "9 months | 3 years | etc",
|
|
"institution": "hospital or clinic name",
|
|
"provider": "doctor name",
|
|
"topics": ["searchable keywords"],
|
|
"summary": "1-2 sentences: what information is in this document (not conclusions)"
|
|
},
|
|
|
|
"events": [
|
|
{
|
|
"type": "procedure | medication | hospitalization | lab | vital | symptom | therapy | vaccination | milestone | device",
|
|
"what": "description",
|
|
"when": "YYYY-MM-DD or YYYY-MM or approximate",
|
|
"where": "location if applicable",
|
|
"details": "additional info"
|
|
}
|
|
],
|
|
|
|
"assessments": [
|
|
{"by": "provider name", "states": "their clinical impression or opinion"}
|
|
],
|
|
|
|
"attributes": {
|
|
"birth": {},
|
|
"allergies": [],
|
|
"conditions": [],
|
|
"family_history": {}
|
|
},
|
|
|
|
"content": "Full document text as clean markdown with headers"
|
|
}
|
|
|
|
Guidelines:
|
|
- Events: things that happened, with dates. Include type field.
|
|
- Assessments: opinions attributed to provider, never stated as fact
|
|
- Attributes: characteristics of the person, flexible structure
|
|
- Content: full text as markdown for future reprocessing`
|
|
|
|
// Fixed parameters of this one-off import run.
const (
	// dossierID is the dossier every imported entry is attached to (Anastasiia).
	dossierID = "63d5df76904b1ec5"

	// docsDir is the directory the PDFs are read from; file names given on the
	// command line are resolved relative to it.
	docsDir = "/tank/inou/anastasiia-restored/documents"

	// batchTag is added to the tags of every entry created by this run so the
	// whole batch can be found again and removed with --delete.
	batchTag = "import-2025-01-24"
)
|
|
|
|
// ExtractionResult is the decoded form of the JSON reply requested by
// importPrompt. The JSON keys in the struct tags mirror the keys spelled out
// in that prompt.
type ExtractionResult struct {
	// Document holds the document-level metadata block ("document").
	Document struct {
		Type        string   `json:"type"`
		Specialty   string   `json:"specialty"`
		Date        string   `json:"date"` // kept as text; parsed later by the caller
		PatientAge  string   `json:"patient_age_at_doc"`
		Institution string   `json:"institution"`
		Provider    string   `json:"provider"`
		Topics      []string `json:"topics"`
		Summary     string   `json:"summary"`
	} `json:"document"`

	// Events, Assessments and Attributes are decoded into generic maps rather
	// than typed structs, so whatever keys the model returns are retained.
	Events      []map[string]interface{} `json:"events"`
	Assessments []map[string]interface{} `json:"assessments"`
	Attributes  map[string]interface{}   `json:"attributes"`

	// Content is the "content" string of the reply.
	Content string `json:"content"`
}
|
|
|
|
func main() {
|
|
if err := lib.CryptoInit("/tank/inou/master.key"); err != nil {
|
|
fmt.Fprintf(os.Stderr, "CryptoInit: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
if err := lib.DBInit("/tank/inou/data/inou.db"); err != nil {
|
|
fmt.Fprintf(os.Stderr, "DBInit: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
lib.ConfigInit()
|
|
|
|
// Check for --delete flag
|
|
if len(os.Args) > 1 && os.Args[1] == "--delete" {
|
|
deleteImported()
|
|
return
|
|
}
|
|
|
|
// Check for specific files to process
|
|
var files []string
|
|
if len(os.Args) > 1 {
|
|
for _, arg := range os.Args[1:] {
|
|
files = append(files, filepath.Join(docsDir, arg))
|
|
}
|
|
} else {
|
|
var err error
|
|
files, err = filepath.Glob(filepath.Join(docsDir, "*.pdf"))
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "Glob: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
}
|
|
|
|
fmt.Printf("Found %d PDF files\n\n", len(files))
|
|
|
|
for i, file := range files {
|
|
filename := filepath.Base(file)
|
|
fmt.Printf("[%d/%d] Processing: %s\n", i+1, len(files), filename)
|
|
|
|
if err := processDocument(file, filename); err != nil {
|
|
fmt.Fprintf(os.Stderr, " ERROR: %v\n", err)
|
|
continue
|
|
}
|
|
fmt.Println(" OK")
|
|
|
|
// Small delay to avoid rate limiting
|
|
time.Sleep(2 * time.Second)
|
|
}
|
|
}
|
|
|
|
func processDocument(filePath, filename string) error {
|
|
// Read PDF
|
|
pdfBytes, err := os.ReadFile(filePath)
|
|
if err != nil {
|
|
return fmt.Errorf("read file: %w", err)
|
|
}
|
|
|
|
// Call Gemini
|
|
b64 := base64.StdEncoding.EncodeToString(pdfBytes)
|
|
parts := []lib.GeminiPart{
|
|
{Text: importPrompt},
|
|
{
|
|
InlineData: &lib.GeminiInlineData{
|
|
MimeType: "application/pdf",
|
|
Data: b64,
|
|
},
|
|
},
|
|
}
|
|
|
|
// Use higher token limit for full document extraction
|
|
maxTokens := 8192
|
|
config := &lib.GeminiConfig{
|
|
MaxOutputTokens: &maxTokens,
|
|
}
|
|
respText, err := lib.CallGeminiMultimodal(parts, config)
|
|
if err != nil {
|
|
return fmt.Errorf("gemini: %w", err)
|
|
}
|
|
|
|
// Parse response
|
|
var result ExtractionResult
|
|
if err := json.Unmarshal([]byte(respText), &result); err != nil {
|
|
return fmt.Errorf("parse response: %w (response: %s)", err, respText[:min(200, len(respText))])
|
|
}
|
|
|
|
// Parse document date
|
|
var timestamp int64
|
|
if result.Document.Date != "" {
|
|
if t, err := time.Parse("2006-01-02", result.Document.Date); err == nil {
|
|
timestamp = t.Unix()
|
|
} else if t, err := time.Parse("2006-01", result.Document.Date); err == nil {
|
|
timestamp = t.Unix()
|
|
}
|
|
}
|
|
|
|
// Build data JSON
|
|
data := map[string]interface{}{
|
|
"original_filename": filename,
|
|
"content": result.Content,
|
|
"events": result.Events,
|
|
"assessments": result.Assessments,
|
|
"attributes": result.Attributes,
|
|
"extraction": map[string]interface{}{
|
|
"provider": result.Document.Provider,
|
|
"institution": result.Document.Institution,
|
|
"specialty": result.Document.Specialty,
|
|
"patient_age": result.Document.PatientAge,
|
|
},
|
|
}
|
|
dataJSON, _ := json.Marshal(data)
|
|
|
|
// Create entry
|
|
entry := lib.Entry{
|
|
EntryID: lib.NewID(),
|
|
DossierID: dossierID,
|
|
Category: lib.CategoryDocument,
|
|
Type: result.Document.Type,
|
|
Value: filename, // Use filename as title for now
|
|
Summary: result.Document.Summary,
|
|
Timestamp: timestamp,
|
|
Tags: batchTag + "," + strings.Join(result.Document.Topics, ","),
|
|
Data: string(dataJSON),
|
|
}
|
|
|
|
if err := lib.Save("entries", &entry); err != nil {
|
|
return fmt.Errorf("save entry: %w", err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func deleteImported() {
|
|
fmt.Println("Deleting entries with tag:", batchTag)
|
|
|
|
var entries []lib.Entry
|
|
err := lib.Query("SELECT entry_id FROM entries WHERE tags LIKE ?", []interface{}{"%" + batchTag + "%"}, &entries)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "Query: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
|
|
fmt.Printf("Found %d entries to delete\n", len(entries))
|
|
|
|
for _, e := range entries {
|
|
if err := lib.Delete("entries", "entry_id", e.EntryID); err != nil {
|
|
fmt.Fprintf(os.Stderr, "Delete %s: %v\n", e.EntryID, err)
|
|
}
|
|
}
|
|
|
|
fmt.Println("Done")
|
|
}
|
|
|
|
// min returns the smaller of the two integers a and b.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|