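// Command doc-processor decrypts an encrypted document, submits it as a PDF
// to the Gemini API for information extraction, and prints the extracted
// fields as indented JSON on standard output.
//
// Usage: doc-processor <encrypted-file-path>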
package main
|
|
|
|
import (
|
|
"encoding/base64"
|
|
"encoding/json"
|
|
"fmt"
|
|
"inou/lib"
|
|
"log"
|
|
"os"
|
|
)
|
|
|
|
// ProcessedDoc is the structured result of extracting one document.
//
// Its JSON keys match the keys requested from the model in extractionPrompt,
// so a model response can be unmarshalled straight into it. The field notes
// below restate what the prompt asks for; nothing in this type validates that
// the model complied.
type ProcessedDoc struct {
	// Title is a brief descriptive title for the document.
	Title string `json:"title"`
	// Type is the document category; the prompt asks for one of
	// consultation, radiology_report, lab_report, ultrasound, or other.
	Type string `json:"type"`
	// DocumentDate is the date taken from the document content, requested in
	// YYYY-MM-DD form. It is kept as a string and is not parsed here.
	DocumentDate string `json:"document_date"`
	// Summary describes what information the document contains, not its
	// findings.
	Summary string `json:"summary"`
	// Tags are searchable terms such as doctor names, body parts and
	// institutions.
	Tags []string `json:"tags"`
	// Text is the full extracted text, in the document's original language.
	Text string `json:"text"`
	// StructuredData holds type-specific data. The prompt asks for null
	// except for lab reports, where it carries a "tests" list.
	StructuredData map[string]interface{} `json:"structured_data"`
}
|
|
|
|
// extractionPrompt is the instruction text sent to the model as the first
// request part, ahead of the PDF itself. It asks for a single JSON object
// whose keys correspond to the json tags on ProcessedDoc.
//
// The text is a raw string literal: every character between the backticks,
// including blank lines, is sent to the model verbatim.
const extractionPrompt = `Analyze this medical document and extract the following information. Respond ONLY with valid JSON, no markdown or explanation.

{
"title": "Brief descriptive title for the document",
"type": "one of: consultation, radiology_report, lab_report, ultrasound, other",
"document_date": "YYYY-MM-DD format, extracted from document content",
"summary": "1-2 sentences describing WHAT information is in the document (not the findings)",
"tags": ["searchable", "terms", "doctor names", "body parts", "institutions"],
"text": "Full extracted text from the document, preserving structure",
"structured_data": null
}

For lab_report type, also populate structured_data with test results:
{
"structured_data": {
"tests": [
{"name": "Hemoglobin", "value": 14.2, "unit": "g/dL", "reference_range": "12.0-16.0", "flag": "normal"}
]
}
}

Summary guidelines:
- Describe WHAT information is in the document, not the medical findings
- Example: "Consultation with Dr. Smith regarding leg pain" NOT "Dr. Smith thinks her leg is broken"
- This helps LLMs decide if they need to read the full text

The document may be in any language. Extract text in the original language.`
|
|
|
|
// masterKeyPath is the filesystem path of the master key file that main
// passes to lib.CryptoInit before any file is decrypted.
const masterKeyPath = "/tank/inou/master.key"
|
|
|
|
func main() {
|
|
if len(os.Args) != 2 {
|
|
fmt.Fprintf(os.Stderr, "Usage: doc-processor <encrypted-file-path>\n")
|
|
os.Exit(1)
|
|
}
|
|
|
|
filePath := os.Args[1]
|
|
|
|
// Initialize crypto
|
|
if err := lib.CryptoInit(masterKeyPath); err != nil {
|
|
lib.SendErrorForAnalysis("doc-processor.CryptoInit", err, map[string]interface{}{
|
|
"key_path": masterKeyPath,
|
|
})
|
|
log.Fatalf("Failed to initialize crypto: %v", err)
|
|
}
|
|
|
|
// Decrypt the file
|
|
log.Printf("Decrypting %s...", filePath)
|
|
pdfBytes, err := lib.DecryptFile(filePath)
|
|
if err != nil {
|
|
log.Fatalf("Failed to decrypt file: %v", err)
|
|
}
|
|
log.Printf("Decrypted %d bytes", len(pdfBytes))
|
|
|
|
// Initialize lib package configuration
|
|
lib.ConfigInit()
|
|
|
|
// Build the Gemini request parts with PDF
|
|
b64 := base64.StdEncoding.EncodeToString(pdfBytes)
|
|
parts := []lib.GeminiPart{
|
|
{Text: extractionPrompt},
|
|
{
|
|
InlineData: &lib.GeminiInlineData{
|
|
MimeType: "application/pdf",
|
|
Data: b64,
|
|
},
|
|
},
|
|
}
|
|
|
|
// Call Gemini API
|
|
log.Printf("Extracting document information via Gemini API...")
|
|
geminiRespText, err := lib.CallGeminiMultimodal(parts, nil) // Use default config
|
|
if err != nil {
|
|
lib.SendErrorForAnalysis("doc-processor.ExtractDocument", err, map[string]interface{}{
|
|
"file_path": filePath,
|
|
"pdf_size": len(pdfBytes),
|
|
})
|
|
log.Fatalf("Failed to extract document: %v", err)
|
|
}
|
|
|
|
// Parse the extracted document
|
|
var doc ProcessedDoc
|
|
if err := json.Unmarshal([]byte(geminiRespText), &doc); err != nil {
|
|
log.Fatalf("Failed to parse extracted doc: %v (response: %s)", err, geminiRespText)
|
|
}
|
|
|
|
// Output JSON to stdout
|
|
output, err := json.MarshalIndent(doc, "", " ")
|
|
if err != nil {
|
|
log.Fatalf("Failed to marshal output: %v", err)
|
|
}
|
|
|
|
fmt.Println(string(output))
|
|
}
|