inou/doc-processor/restore/import_docs.go

254 lines
6.5 KiB
Go

package main
import (
"encoding/base64"
"encoding/json"
"fmt"
"inou/lib"
"os"
"path/filepath"
"strings"
"time"
)
// importPrompt is the instruction text sent to Gemini alongside each PDF (see
// processDocument). It asks for a JSON-only reply whose top-level keys
// ("document", "events", "assessments", "attributes", "content") are the ones
// ExtractionResult decodes.
// NOTE(review): the language for free-text fields is fixed to Russian in the
// prompt text below; adjust it if documents in another language are imported.
const importPrompt = `Analyze this medical document and extract structured data.
Output language:
- Field names and types: always English
- Descriptions, summaries, content: Russian (document language)
- Medical terms: keep original
Dates:
- Determine format from document context (language, institution, country)
- Output as YYYY-MM-DD (ISO) regardless of input format
- Partial dates OK: "2020-05" or "at 3 months old"
Respond with JSON only:
{
"document": {
"type": "consultation | lab_report | imaging_report | discharge_summary | prescription | other",
"specialty": "neurology | cardiology | ophthalmology | etc",
"date": "YYYY-MM-DD",
"patient_age_at_doc": "9 months | 3 years | etc",
"institution": "hospital or clinic name",
"provider": "doctor name",
"topics": ["searchable keywords"],
"summary": "1-2 sentences: what information is in this document (not conclusions)"
},
"events": [
{
"type": "procedure | medication | hospitalization | lab | vital | symptom | therapy | vaccination | milestone | device",
"what": "description",
"when": "YYYY-MM-DD or YYYY-MM or approximate",
"where": "location if applicable",
"details": "additional info"
}
],
"assessments": [
{"by": "provider name", "states": "their clinical impression or opinion"}
],
"attributes": {
"birth": {},
"allergies": [],
"conditions": [],
"family_history": {}
},
"content": "Full document text as clean markdown with headers"
}
Guidelines:
- Events: things that happened, with dates. Include type field.
- Assessments: opinions attributed to provider, never stated as fact
- Attributes: characteristics of the person, flexible structure
- Content: full text as markdown for future reprocessing`
// Fixed parameters of this one-off import run.
const (
	// dossierID is the dossier every imported entry is attached to (Anastasiia).
	dossierID = "63d5df76904b1ec5"

	// docsDir is the directory the PDF documents are read from.
	docsDir = "/tank/inou/anastasiia-restored/documents"

	// batchTag is put on every imported entry so the batch can be found and
	// deleted again.
	batchTag = "import-2025-01-24"
)
// ExtractionResult is the decoded form of the JSON reply requested by
// importPrompt. The document header has fixed string fields; events,
// assessments and attributes are kept as free-form maps so whatever keys the
// reply contains are carried through unchanged.
type ExtractionResult struct {
	// Document holds the metadata found under the "document" key.
	Document struct {
		Type        string   `json:"type"`
		Specialty   string   `json:"specialty"`
		Date        string   `json:"date"`
		PatientAge  string   `json:"patient_age_at_doc"`
		Institution string   `json:"institution"`
		Provider    string   `json:"provider"`
		Topics      []string `json:"topics"`
		Summary     string   `json:"summary"`
	} `json:"document"`

	// Events is the list under "events", one map per event object.
	Events []map[string]interface{} `json:"events"`

	// Assessments is the list under "assessments", one map per assessment object.
	Assessments []map[string]interface{} `json:"assessments"`

	// Attributes is the object under "attributes".
	Attributes map[string]interface{} `json:"attributes"`

	// Content is the string under "content".
	Content string `json:"content"`
}
// main initialises crypto, the database and the config, then runs in one of
// three modes chosen from the command line:
//
//   - no arguments: import every *.pdf found in docsDir;
//   - first argument "--delete": delete the previously imported batch;
//   - anything else: treat each argument as a file name inside docsDir and
//     import just those files.
//
// A failure on one document is reported on stderr and the run continues with
// the next one.
func main() {
	if err := lib.CryptoInit("/tank/inou/master.key"); err != nil {
		fmt.Fprintf(os.Stderr, "CryptoInit: %v\n", err)
		os.Exit(1)
	}
	if err := lib.DBInit("/tank/inou/data/inou.db"); err != nil {
		fmt.Fprintf(os.Stderr, "DBInit: %v\n", err)
		os.Exit(1)
	}
	lib.ConfigInit()

	var pdfPaths []string
	switch {
	case len(os.Args) <= 1:
		// Nothing named on the command line: take every PDF in the directory.
		matches, globErr := filepath.Glob(filepath.Join(docsDir, "*.pdf"))
		if globErr != nil {
			fmt.Fprintf(os.Stderr, "Glob: %v\n", globErr)
			os.Exit(1)
		}
		pdfPaths = matches
	case os.Args[1] == "--delete":
		deleteImported()
		return
	default:
		// Explicit file names are resolved relative to docsDir.
		for _, name := range os.Args[1:] {
			pdfPaths = append(pdfPaths, filepath.Join(docsDir, name))
		}
	}

	total := len(pdfPaths)
	fmt.Printf("Found %d PDF files\n\n", total)

	for idx, pdfPath := range pdfPaths {
		name := filepath.Base(pdfPath)
		fmt.Printf("[%d/%d] Processing: %s\n", idx+1, total, name)

		err := processDocument(pdfPath, name)
		if err != nil {
			fmt.Fprintf(os.Stderr, " ERROR: %v\n", err)
			continue
		}
		fmt.Println(" OK")

		// Pause after each successful document to avoid rate limiting.
		time.Sleep(2 * time.Second)
	}
}
// processDocument sends one PDF to Gemini for structured extraction and stores
// the result as a document entry in the dossier.
//
// filePath is the PDF to read. filename is stored as the entry's Value (its
// title) and as "original_filename" inside the entry data.
//
// The returned error names the step that failed: reading the file, the Gemini
// call, decoding the reply, encoding the entry data, or saving the entry. A
// document date that cannot be parsed is not an error: a warning is printed
// and the entry is stored with timestamp 0.
func processDocument(filePath, filename string) error {
	pdfBytes, err := os.ReadFile(filePath)
	if err != nil {
		return fmt.Errorf("read file: %w", err)
	}

	// The PDF is sent inline, base64-encoded, next to the prompt.
	parts := []lib.GeminiPart{
		{Text: importPrompt},
		{
			InlineData: &lib.GeminiInlineData{
				MimeType: "application/pdf",
				Data:     base64.StdEncoding.EncodeToString(pdfBytes),
			},
		},
	}

	// Use higher token limit for full document extraction
	maxTokens := 8192
	config := &lib.GeminiConfig{
		MaxOutputTokens: &maxTokens,
	}
	respText, err := lib.CallGeminiMultimodal(parts, config)
	if err != nil {
		return fmt.Errorf("gemini: %w", err)
	}

	var result ExtractionResult
	if err := json.Unmarshal([]byte(respText), &result); err != nil {
		// The 200 limit counts bytes, so it can land inside a multi-byte
		// character; ToValidUTF8 drops the broken tail instead of printing it.
		preview := strings.ToValidUTF8(respText[:min(200, len(respText))], "")
		return fmt.Errorf("parse response: %w (response: %s)", err, preview)
	}

	// An absent date legitimately leaves the timestamp at 0; a present but
	// unreadable one is reported so it does not go unnoticed.
	timestamp, ok := parseDocumentDate(result.Document.Date)
	if !ok && strings.TrimSpace(result.Document.Date) != "" {
		fmt.Fprintf(os.Stderr, " WARNING: unrecognized document date %q, timestamp left at 0\n", result.Document.Date)
	}

	data := map[string]interface{}{
		"original_filename": filename,
		"content":           result.Content,
		"events":            result.Events,
		"assessments":       result.Assessments,
		"attributes":        result.Attributes,
		"extraction": map[string]interface{}{
			"provider":    result.Document.Provider,
			"institution": result.Document.Institution,
			"specialty":   result.Document.Specialty,
			"patient_age": result.Document.PatientAge,
		},
	}
	dataJSON, err := json.Marshal(data)
	if err != nil {
		return fmt.Errorf("encode entry data: %w", err)
	}

	entry := lib.Entry{
		EntryID:   lib.NewID(),
		DossierID: dossierID,
		Category:  lib.CategoryDocument,
		Type:      result.Document.Type,
		Value:     filename, // Use filename as title for now
		Summary:   result.Document.Summary,
		Timestamp: timestamp,
		Tags:      buildEntryTags(batchTag, result.Document.Topics),
		Data:      string(dataJSON),
	}
	if err := lib.EntryAdd(&entry); err != nil {
		return fmt.Errorf("save entry: %w", err)
	}
	return nil
}

// parseDocumentDate converts an extracted document date to Unix seconds.
// It accepts YYYY-MM-DD, YYYY-MM and YYYY (surrounding whitespace ignored);
// time.Parse fills missing parts with their first value and uses UTC. The
// boolean is false when the string matches none of the layouts.
func parseDocumentDate(date string) (int64, bool) {
	date = strings.TrimSpace(date)
	for _, layout := range []string{"2006-01-02", "2006-01", "2006"} {
		if t, err := time.Parse(layout, date); err == nil {
			return t.Unix(), true
		}
	}
	return 0, false
}

// buildEntryTags joins the batch tag and the non-blank topics with commas.
// Blank topics are skipped so the result never has an empty element or a
// trailing comma, even when topics is empty.
func buildEntryTags(batch string, topics []string) string {
	tags := make([]string, 0, len(topics)+1)
	tags = append(tags, batch)
	for _, topic := range topics {
		if topic = strings.TrimSpace(topic); topic != "" {
			tags = append(tags, topic)
		}
	}
	return strings.Join(tags, ",")
}
// deleteImported deletes every entry of the dossier that carries batchTag,
// undoing an earlier import run.
//
// An entry is selected only when batchTag is one whole element of its
// comma-separated Tags. A plain substring test would also select entries whose
// tags merely contain the batch tag's text (for example a later batch named
// "import-2025-01-24-v2") and delete them by mistake.
//
// A failed query is fatal. A failed delete is reported on stderr and the
// remaining entries are still processed.
func deleteImported() {
	fmt.Println("Deleting entries with tag:", batchTag)

	// NOTE(review): -1 and "" are presumably "no filter" arguments to
	// EntryQuery — confirm against lib.
	entries, err := lib.EntryQuery(dossierID, -1, "")
	if err != nil {
		fmt.Fprintf(os.Stderr, "Query: %v\n", err)
		os.Exit(1)
	}

	var toDelete []*lib.Entry
	for _, e := range entries {
		if hasExactTag(e.Tags, batchTag) {
			toDelete = append(toDelete, e)
		}
	}
	fmt.Printf("Found %d entries to delete\n", len(toDelete))

	for _, e := range toDelete {
		if err := lib.EntryDelete(e.EntryID); err != nil {
			fmt.Fprintf(os.Stderr, "Delete %s: %v\n", e.EntryID, err)
		}
	}
	fmt.Println("Done")
}

// hasExactTag reports whether the comma-separated list tags contains tag as a
// complete element; whitespace around each element is ignored.
func hasExactTag(tags, tag string) bool {
	for _, candidate := range strings.Split(tags, ",") {
		if strings.TrimSpace(candidate) == tag {
			return true
		}
	}
	return false
}
// min returns the smaller of a and b.
func min(a, b int) int {
	smaller := a
	if b <= a {
		smaller = b
	}
	return smaller
}