package main

import (
	"encoding/base64"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"

	"inou/lib"
)

// processProgress holds the latest import progress snapshot per dossier.
var processProgress sync.Map // dossierID → map[string]interface{}

// detectFileType classifies an upload by its leading magic bytes, falling
// back to content heuristics (genome table, JSON, readable text).
//
// It returns one of "dicom", "png", "pdf", "zip", "jpg", "genetics", "json",
// "text", or "" when nothing matches. Inputs shorter than 8 bytes are never
// classified and always yield "".
func detectFileType(data []byte) string {
	n := len(data)
	if n < 8 {
		return ""
	}

	// Checks run top to bottom; the first match wins.
	switch {
	case n > 132 && string(data[128:132]) == "DICM":
		// DICOM: 128-byte preamble followed by "DICM".
		return "dicom"
	case string(data[:8]) == "\x89PNG\r\n\x1a\n":
		// PNG: fixed 8-byte signature.
		return "png"
	case n > 8 && string(data[:5]) == "%PDF-" && data[5] >= '1' && data[5] <= '9' && data[6] == '.':
		// PDF: "%PDF-N." where N is a non-zero digit.
		return "pdf"
	case string(data[:4]) == "PK\x03\x04":
		// ZIP: local file header.
		return "zip"
	case string(data[:3]) == "\xff\xd8\xff":
		// JPEG: SOI followed by the first marker byte.
		return "jpg"
	case isGenomeData(data):
		// Genome: tab-separated rows keyed by rsID.
		return "genetics"
	}

	// JSON: first non-whitespace byte opens an object or an array.
	if lead := firstNonSpace(data); lead == '{' || lead == '[' {
		return "json"
	}

	// Text: fallback for markdown, CSV and plain text.
	if isReadableText(data) {
		return "text"
	}
	return ""
}

// firstNonSpace returns the first byte of data that is not a space, tab,
// newline or carriage return, or 0 when there is no such byte.
func firstNonSpace(data []byte) byte {
	for _, c := range data {
		switch c {
		case ' ', '\t', '\n', '\r':
			continue
		}
		return c
	}
	return 0
}

// isReadableText checks if content is printable UTF-8 text (not binary).
func isReadableText(data []byte) bool { peek := data if len(peek) > 1024 { peek = peek[:1024] } printable := 0 for _, b := range peek { if b == '\n' || b == '\r' || b == '\t' || (b >= 32 && b < 127) { printable++ } else if b >= 128 { printable++ // UTF-8 multibyte } else if b == 0 { return false // null byte = binary } } return printable > len(peek)*9/10 // >90% printable } var rsidRe = regexp.MustCompile(`^rs\d{1,12}\t`) // isGenomeData checks whether text content has the structure of a genome raw data file: // lines of tab-separated values where the first column is an rsID (rs[0-9]+). // Requires at least 10 matching lines to avoid false positives. func isGenomeData(data []byte) bool { peek := data if len(peek) > 8192 { peek = peek[:8192] } matches := 0 for _, line := range strings.SplitN(string(peek), "\n", 100) { line = strings.TrimSpace(line) if line == "" || line[0] == '#' { continue } // Skip header lines if strings.HasPrefix(line, "rsid") || strings.HasPrefix(line, "RSID") { continue } if rsidRe.MatchString(line) { matches++ if matches >= 10 { return true } } else { return false // non-matching data line = not genome } } return false } var sliceRe = regexp.MustCompile(`^(.+?) - (.+?) 
- slice (\d+)/(\d+)`) // Upload represents a file upload entry for display type Upload struct { ID string FileName, FilePath, SizeHuman, UploadedAt, ExpiresAt, DeletedReason string Category, Status string Deleted, CanUndo bool } // UploadData is the JSON structure stored in Entry.Data for uploads type UploadData struct { Path string `json:"path"` RelPath string `json:"rel_path,omitempty"` // Original relative path for folder uploads Size int64 `json:"size"` UploadedBy string `json:"uploaded_by"` Status string `json:"status"` } func formatBytes(b int64) string { const unit = 1024 if b < unit { return fmt.Sprintf("%d B", b) } div, exp := int64(unit), 0 for n := b / unit; n >= unit; n /= unit { div *= unit exp++ } return fmt.Sprintf("%.1f %cB", float64(b)/float64(div), "KMGTPE"[exp]) } // getUploads returns all uploads for a dossier using lib.EntryList func getUploads(dossierID string) []Upload { var uploads []Upload entries, err := lib.EntryList(lib.SystemAccessorID, "", lib.CategoryUpload, &lib.EntryFilter{ // nil ctx - internal operation DossierID: dossierID, Limit: 50, }) if err != nil { return uploads } for _, e := range entries { var data map[string]interface{} json.Unmarshal([]byte(e.Data), &data) size, _ := data["size"].(float64) status, _ := data["status"].(string) path, _ := data["path"].(string) createdEntries, _ := data["created_entries"].([]interface{}) u := Upload{ ID: e.EntryID, FileName: e.Value, FilePath: path, Category: e.Type, Status: status, SizeHuman: formatBytes(int64(size)), UploadedAt: time.Unix(e.Timestamp, 0).Format("Jan 2"), ExpiresAt: time.Unix(e.TimestampEnd, 0).Format("Jan 2"), CanUndo: len(createdEntries) > 0, } if e.Status != 0 { u.Deleted = true u.DeletedReason = "Deleted" } uploads = append(uploads, u) } return uploads } // getUploadEntry retrieves a single upload entry using lib.EntryGet func getUploadEntry(entryID, dossierID string) (filePath, fileName, category, status string, deleted bool) { e, err := lib.EntryGet(nil, entryID) // 
nil ctx - internal operation if err != nil || e == nil { return } // Verify it belongs to the right dossier and is an upload if e.DossierID != dossierID || e.Category != lib.CategoryUpload { return } var data UploadData json.Unmarshal([]byte(e.Data), &data) fileName = e.Value category = e.Type filePath = data.Path status = data.Status deleted = e.Status != 0 return } // findUploadByFilename finds existing uploads with the same filename func findUploadByFilename(dossierID, filename string) []*lib.Entry { entries, err := lib.EntryList(lib.SystemAccessorID, "", lib.CategoryUpload, &lib.EntryFilter{ // nil ctx - internal operation DossierID: dossierID, Value: filename, }) if err != nil { return nil } return entries } func handleUploadPage(w http.ResponseWriter, r *http.Request) { p := getLoggedInDossier(r) if p == nil { http.Redirect(w, r, "/", http.StatusSeeOther) return } parts := strings.Split(r.URL.Path, "/") if len(parts) < 4 { http.NotFound(w, r) return } targetID := parts[2] isSelf := targetID == p.DossierID if !isSelf { if access, found := getAccess(formatHexID(p.DossierID), formatHexID(targetID)); !found || !access.CanEdit { http.Error(w, "Forbidden", http.StatusForbidden) return } } target, _ := lib.DossierGet(p.DossierID, targetID) if target == nil { http.NotFound(w, r) return } lang := getLang(r) data := PageData{ Page: "upload", Lang: lang, T: translations[lang], Dossier: p, TargetDossier: target, UploadList: getUploads(targetID), } w.Header().Set("Content-Type", "text/html; charset=utf-8") templates.ExecuteTemplate(w, "base.tmpl", data) } func handleUploadPost(w http.ResponseWriter, r *http.Request) { p := getLoggedInDossier(r) if p == nil { http.Error(w, "Unauthorized", http.StatusUnauthorized) return } parts := strings.Split(r.URL.Path, "/") if len(parts) < 4 { http.NotFound(w, r) return } targetID := parts[2] isSelf := targetID == p.DossierID if !isSelf { if access, found := getAccess(formatHexID(p.DossierID), formatHexID(targetID)); !found || 
!access.CanEdit { http.Error(w, "Forbidden", http.StatusForbidden) return } } r.ParseMultipartForm(10 << 30) file, header, err := r.FormFile("file") if err != nil { http.Error(w, "No file", http.StatusBadRequest) return } defer file.Close() relPath := r.FormValue("path") if relPath == "" { relPath = header.Filename } fileName := filepath.Base(relPath) category := r.FormValue("category") // Read file content content, err := io.ReadAll(file) if err != nil { http.Error(w, "Failed to read", http.StatusInternalServerError) return } // Auto-detect file type if category == "" { category = detectFileType(content) } // Generate file ID using lib.NewID() (Hex16 format) fileID := lib.NewID() // Store in uploads directory uploadDir := filepath.Join(uploadsDir, formatHexID(targetID)) filePath := filepath.Join(uploadDir, fileID) os.MkdirAll(uploadDir, 0755) if err := lib.EncryptFile(content, filePath); err != nil { w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusInternalServerError) w.Write([]byte(`{"status":"error","message":"Upload failed, we've been notified"}`)) return } written := int64(len(content)) // Delete existing upload with same filename (re-upload cleanup) existingUploads := findUploadByFilename(targetID, fileName) for _, old := range existingUploads { lib.EntryDelete("", targetID, &lib.Filter{EntryID: old.EntryID}) } now := time.Now().Unix() expires := now + 7*24*60*60 uploadData := UploadData{ Path: filePath, RelPath: relPath, Size: written, UploadedBy: p.DossierID, Status: "uploaded", } dataJSON, _ := json.Marshal(uploadData) uploadEntry := &lib.Entry{ DossierID: targetID, Category: lib.CategoryUpload, Type: category, Value: fileName, Timestamp: now, TimestampEnd: expires, Data: string(dataJSON), } lib.EntryWrite("", uploadEntry) // nil ctx - internal operation entryID := uploadEntry.EntryID lib.AuditLog(p.DossierID, "file_upload", targetID, fileName) // Spawn processing for known types (JSON waits for batch processing like DICOM) if 
category == "genetics" { go processGenomeUpload(entryID, targetID, filePath) } if category == "pdf" { go processDocumentUpload(entryID, targetID, filePath, fileName) } w.Header().Set("Content-Type", "application/json") w.Write([]byte(fmt.Sprintf(`{"status":"ok","id":"%s"}`, entryID))) } func handleDeleteFile(w http.ResponseWriter, r *http.Request) { if r.Method != "POST" { http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) return } p := getLoggedInDossier(r) if p == nil { http.Error(w, "Unauthorized", http.StatusUnauthorized) return } // Parse /dossier/{hex}/files/{id}/delete parts := strings.Split(r.URL.Path, "/") if len(parts) < 6 { http.NotFound(w, r) return } targetID := parts[2] fileID := parts[4] isSelf := targetID == p.DossierID if !isSelf { if access, found := getAccess(formatHexID(p.DossierID), formatHexID(targetID)); !found || !access.CanEdit { http.Error(w, "Forbidden", http.StatusForbidden) return } } // Get file info for audit and deletion filePath, fileName, _, _, _ := getUploadEntry(fileID, targetID) if filePath != "" { os.Remove(filePath) } // Mark as deleted using lib.EntryGet + lib.EntryWrite entry, err := lib.EntryGet(nil, fileID) // nil ctx - internal operation if err == nil && entry != nil && entry.DossierID == targetID { entry.Status = 1 // Mark as deleted lib.EntryWrite("", entry) // nil ctx - internal operation } lib.AuditLog(p.DossierID, "file_delete", targetID, fileName) http.Redirect(w, r, fmt.Sprintf("/dossier/%s/upload", formatHexID(targetID)), http.StatusSeeOther) } func handleUpdateFile(w http.ResponseWriter, r *http.Request) { if r.Method != "POST" { http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) return } p := getLoggedInDossier(r) if p == nil { http.Error(w, "Unauthorized", http.StatusUnauthorized) return } // Parse /dossier/{hex}/files/{id}/update parts := strings.Split(r.URL.Path, "/") if len(parts) < 6 { http.NotFound(w, r) return } targetID := parts[2] fileID := parts[4] isSelf := targetID == 
p.DossierID if !isSelf { if access, found := getAccess(formatHexID(p.DossierID), formatHexID(targetID)); !found || !access.CanEdit { http.Error(w, "Forbidden", http.StatusForbidden) return } } // Get entry using lib.EntryGet entry, err := lib.EntryGet(nil, fileID) // nil ctx - internal operation if err != nil || entry == nil || entry.DossierID != targetID || entry.Category != lib.CategoryUpload { http.NotFound(w, r) return } if entry.Status != 0 { http.NotFound(w, r) // Deleted return } var data UploadData json.Unmarshal([]byte(entry.Data), &data) if data.Status != "uploaded" { http.Error(w, "Cannot modify processed file", http.StatusBadRequest) return } newCategory := r.FormValue("category") if newCategory != "" && newCategory != entry.Type { entry.Type = newCategory lib.EntryWrite("", entry) // nil ctx - internal operation lib.AuditLog(p.DossierID, "file_category_change", targetID, entry.Value) } http.Redirect(w, r, fmt.Sprintf("/dossier/%s/upload", formatHexID(targetID)), http.StatusSeeOther) } func handleUndoImport(w http.ResponseWriter, r *http.Request) { if r.Method != "POST" { http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) return } p := getLoggedInDossier(r) if p == nil { http.Error(w, "Unauthorized", http.StatusUnauthorized) return } parts := strings.Split(r.URL.Path, "/") if len(parts) < 6 { http.NotFound(w, r) return } targetID := parts[2] fileID := parts[4] isSelf := targetID == p.DossierID if !isSelf { if access, found := getAccess(formatHexID(p.DossierID), formatHexID(targetID)); !found || !access.CanEdit { http.Error(w, "Forbidden", http.StatusForbidden) return } } entry, err := lib.EntryGet(nil, fileID) if err != nil || entry == nil || entry.DossierID != targetID || entry.Category != lib.CategoryUpload { http.NotFound(w, r) return } // Parse created_entries from upload data var data map[string]interface{} if json.Unmarshal([]byte(entry.Data), &data) != nil { http.Error(w, "Invalid upload data", http.StatusBadRequest) return } ids, _ 
:= data["created_entries"].([]interface{}) if len(ids) == 0 { http.Error(w, "No entries to undo", http.StatusBadRequest) return } // Delete each parent entry (cascading deletes children) for _, id := range ids { parentID, _ := id.(string) if parentID == "" { continue } lib.EntryDelete("", targetID, &lib.Filter{EntryID: parentID}) } // Reset upload status to "uploaded" so it can be reprocessed data["status"] = "uploaded" delete(data, "created_entries") b, _ := json.Marshal(data) entry.Data = string(b) lib.EntryWrite("", entry) lib.AuditLog(p.DossierID, "undo_import", targetID, fmt.Sprintf("upload=%s orders=%d", entry.Value, len(ids))) w.Header().Set("Content-Type", "application/json") w.Write([]byte(`{"status":"ok"}`)) } func handleProcessImaging(w http.ResponseWriter, r *http.Request) { if r.Method != "POST" { http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) return } p := getLoggedInDossier(r) if p == nil { http.Error(w, "Unauthorized", http.StatusUnauthorized) return } parts := strings.Split(r.URL.Path, "/") if len(parts) < 4 { http.NotFound(w, r) return } targetID := parts[2] isSelf := targetID == p.DossierID if !isSelf { if access, found := getAccess(formatHexID(p.DossierID), formatHexID(targetID)); !found || !access.CanEdit { http.Error(w, "Forbidden", http.StatusForbidden) return } } // Don't start if already running if v, ok := processProgress.Load(targetID); ok { prog := v.(map[string]interface{}) if prog["stage"] != "done" { w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(map[string]interface{}{"running": true}) return } } processProgress.Store(targetID, map[string]interface{}{"stage": "starting"}) go runProcessImaging(p.DossierID, targetID) w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(map[string]interface{}{"started": true}) } func handleProcessStatus(w http.ResponseWriter, r *http.Request) { p := getLoggedInDossier(r) if p == nil { http.Error(w, "Unauthorized", 
http.StatusUnauthorized) return } parts := strings.Split(r.URL.Path, "/") if len(parts) < 4 { http.NotFound(w, r) return } targetID := parts[2] v, ok := processProgress.Load(targetID) if !ok { w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(map[string]interface{}{"stage": "idle"}) return } prog := v.(map[string]interface{}) if prog["stage"] == "done" { processProgress.Delete(targetID) } w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(prog) } func runProcessImaging(actorID, targetID string) { var processed int entries, err := lib.EntryList(lib.SystemAccessorID, "", lib.CategoryUpload, &lib.EntryFilter{DossierID: targetID}) if err != nil { processProgress.Store(targetID, map[string]interface{}{"stage": "done", "error": err.Error()}) return } // Split pending uploads into JSON and other (DICOM) result := &lib.ImportResult{} var pendingDICOM, pendingJSON []*lib.Entry for _, e := range entries { if e.Status != 0 || e.Type == "genetics" { continue } var d UploadData json.Unmarshal([]byte(e.Data), &d) if d.Status != "uploaded" { continue } switch e.Type { case "dicom", "imaging", "png", "jpg", "pdf", "zip": pendingDICOM = append(pendingDICOM, e) default: pendingJSON = append(pendingJSON, e) } } if len(pendingDICOM) == 0 && len(pendingJSON) == 0 { processProgress.Store(targetID, map[string]interface{}{"stage": "done", "slices": 0}) return } // --- Process DICOM files --- var importErr error if len(pendingDICOM) > 0 { totalFiles := len(pendingDICOM) logFn := func(format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) 
if m := sliceRe.FindStringSubmatch(msg); m != nil { processed++ processProgress.Store(targetID, map[string]interface{}{ "stage": "importing", "study": m[1], "series": m[2], "slice": m[3], "slice_total": m[4], "processed": processed, "total": totalFiles, }) } } processProgress.Store(targetID, map[string]interface{}{ "stage": "decrypting", "total": totalFiles, }) tempDir, err := os.MkdirTemp("", "dicom-import-*") if err != nil { processProgress.Store(targetID, map[string]interface{}{"stage": "done", "error": err.Error()}) return } defer os.RemoveAll(tempDir) for i, e := range pendingDICOM { processProgress.Store(targetID, map[string]interface{}{ "stage": "decrypting", "processed": i + 1, "total": totalFiles, }) var d UploadData json.Unmarshal([]byte(e.Data), &d) relPath := d.RelPath if relPath == "" { relPath = e.Value } outPath := filepath.Join(tempDir, relPath) os.MkdirAll(filepath.Dir(outPath), 0755) content, err := lib.DecryptFile(d.Path) if err != nil { continue } os.WriteFile(outPath, content, 0644) d.Status = "processing" dataJSON, _ := json.Marshal(d) pendingDICOM[i].Data = string(dataJSON) lib.EntryWrite("", pendingDICOM[i]) } result, importErr = lib.ImportDICOMFromPath(targetID, tempDir, "", logFn) finalStatus := "processed" if result.Slices == 0 && importErr == nil { finalStatus = "pending_importer" } else if importErr != nil { finalStatus = "failed" } for _, e := range pendingDICOM { var d UploadData json.Unmarshal([]byte(e.Data), &d) d.Status = finalStatus dataJSON, _ := json.Marshal(d) e.Data = string(dataJSON) if finalStatus == "processed" { e.Type = "imaging" } lib.EntryWrite("", e) } } // --- Process JSON files in batch --- var jsonImported int if len(pendingJSON) > 0 { processProgress.Store(targetID, map[string]interface{}{ "stage": "importing_json", "processed": 0, "total": len(pendingJSON), }) jsonImported = processJSONBatch(targetID, pendingJSON) } // Report results errStr := "" if importErr != nil { errStr = importErr.Error() } 
processProgress.Store(targetID, map[string]interface{}{ "stage": "done", "studies": result.Studies, "series_count": result.Series, "slices": result.Slices, "json_imported": jsonImported, "error": errStr, }) if importErr != nil { lib.AuditLog(actorID, "imaging_import", targetID, fmt.Sprintf("error=%v", importErr)) } else if result.Slices > 0 { summary := fmt.Sprintf("studies=%d series=%d slices=%d", result.Studies, result.Series, result.Slices) lib.AuditLog(actorID, "imaging_import", targetID, summary) if target, _ := lib.DossierGet(actorID, targetID); target != nil { if actor, _ := lib.DossierGet("", actorID); actor != nil { lib.SendSignal(fmt.Sprintf("Import done: %s → %s (%s)", actor.Name, target.Name, summary)) } } } if jsonImported > 0 { lib.AuditLog(actorID, "json_import", targetID, fmt.Sprintf("files=%d", jsonImported)) } } // extractedEntry is the JSON structure returned by extraction prompts type extractedEntry struct { Type string `json:"type"` Value string `json:"value"` Summary string `json:"summary"` SummaryTranslated string `json:"summary_translated,omitempty"` SearchKey string `json:"search_key,omitempty"` Timestamp string `json:"timestamp,omitempty"` Data map[string]interface{} `json:"data"` DataTranslated map[string]interface{} `json:"data_translated,omitempty"` SourceSpans []sourceSpan `json:"source_spans,omitempty"` } type sourceSpan struct { Start string `json:"start"` End string `json:"end"` } // langName maps ISO 639-1 codes to full language names for LLM prompts. func langName(code string) string { names := map[string]string{ "en": "English", "de": "German", "ru": "Russian", "nl": "Dutch", "fr": "French", "es": "Spanish", "it": "Italian", "pt": "Portuguese", "ja": "Japanese", "zh": "Chinese", "ko": "Korean", "ar": "Arabic", "tr": "Turkish", "pl": "Polish", "uk": "Ukrainian", } if n, ok := names[code]; ok { return n } return code } // extractionPreamble returns common instructions prepended to every extraction prompt. 
func extractionPreamble(targetLang string) string { s := `IMPORTANT RULES (apply to all entries you return): - Do NOT translate. Keep ALL text values (summary, value, data fields) in the ORIGINAL language of the document. - For each entry, include "source_spans": an array of {"start": "...", "end": "..."} where start/end are the VERBATIM first and last 5-8 words of the relevant passage(s) in the source markdown. This is used to highlight the source text. Multiple spans are allowed. - For each entry, include "search_key": a short normalized deduplication key in English lowercase. Format: "thing:qualifier:YYYY-MM" or "thing:qualifier" for undated facts. Examples: "surgery:vp-shunt:2020-07", "device:ommaya-reservoir:2020-04", "diagnosis:hydrocephalus", "provider:peraud:ulm". Same real-world fact across different documents MUST produce the same key. ` if targetLang != "" { s += `- Include "summary_translated": a translation of the summary field into ` + targetLang + `. - Include "data_translated": a copy of the "data" object with all string values translated into ` + targetLang + `. Keep the same keys (in English). Only translate the values. ` } return s } // loadExtractionPrompts discovers all extract_*.md files and returns {categoryID: prompt content}. func loadExtractionPrompts() map[int]string { pattern := filepath.Join(lib.TrackerPromptsDir(), "extract_*.md") files, _ := filepath.Glob(pattern) prompts := make(map[int]string) for _, f := range files { // extract_device.md → "device" base := filepath.Base(f) name := strings.TrimPrefix(base, "extract_") name = strings.TrimSuffix(name, ".md") catID, ok := lib.CategoryFromString[name] if !ok { log.Printf("[doc-import] Unknown category in prompt file: %s", base) continue } data, err := os.ReadFile(f) if err != nil { continue } prompts[catID] = string(data) } return prompts } // parseTimestamp tries to parse a date string into Unix timestamp. 
func parseTimestamp(s string) int64 { if s == "" { return 0 } for _, fmt := range []string{"2006-01-02", "02.01.2006", "01/02/2006", "Jan 2, 2006"} { if t, err := time.Parse(fmt, s); err == nil { return t.Unix() } } return 0 } const ( fireworksVisionModel = "accounts/fireworks/models/qwen3-vl-30b-a3b-instruct" fireworksTextModel = "accounts/fireworks/models/qwen3-vl-30b-a3b-instruct" ) var ocrPrompt = `You are a medical document OCR system. Produce a faithful markdown transcription of this document. The images are sequential pages of the same document. Process them in order: page 1 first, then page 2, etc. Rules: - Read each page top-to-bottom, left-to-right. For multi-column layouts, transcribe the full page as a human would read it. - Preserve ALL text, dates, values, names, addresses, and structure - Translate nothing — keep the original language - Use markdown headers, lists, and formatting to reflect the document structure - For tables, use markdown tables. Preserve numeric values exactly. - Be complete — do not skip or summarize anything - Do not describe visual elements (logos, signatures) — only transcribe text - For handwritten text, transcribe as accurately as possible. Mark uncertain readings with [?]` func processDocumentUpload(uploadID, dossierID, filePath, fileName string) { log.Printf("[doc-import] Starting for %s (%s)", fileName, dossierID) // Update upload status setUploadStatus := func(status string) { if entry, err := lib.EntryGet(nil, uploadID); err == nil { var d UploadData json.Unmarshal([]byte(entry.Data), &d) d.Status = status data, _ := json.Marshal(d) entry.Data = string(data) lib.EntryWrite("", entry) } } setUploadStatus("processing") // 1. Decrypt PDF pdfBytes, err := lib.DecryptFile(filePath) if err != nil { log.Printf("[doc-import] Decrypt failed: %v", err) setUploadStatus("failed") return } // 2. 
Convert PDF to PNG pages via pdftoppm tempDir, err := os.MkdirTemp("", "doc-import-*") if err != nil { log.Printf("[doc-import] MkdirTemp failed: %v", err) setUploadStatus("failed") return } defer os.RemoveAll(tempDir) pdfPath := filepath.Join(tempDir, "input.pdf") if err := os.WriteFile(pdfPath, pdfBytes, 0644); err != nil { log.Printf("[doc-import] WriteFile failed: %v", err) setUploadStatus("failed") return } prefix := filepath.Join(tempDir, "page") cmd := exec.Command("pdftoppm", "-png", "-r", "200", pdfPath, prefix) if out, err := cmd.CombinedOutput(); err != nil { log.Printf("[doc-import] pdftoppm failed: %v: %s", err, out) setUploadStatus("failed") return } // Collect page images sorted by name pageFiles, _ := filepath.Glob(prefix + "*.png") sort.Strings(pageFiles) if len(pageFiles) == 0 { log.Printf("[doc-import] No pages generated") setUploadStatus("failed") return } log.Printf("[doc-import] %d pages converted", len(pageFiles)) // 3. OCR: send pages to Fireworks vision model content := []interface{}{ map[string]string{"type": "text", "text": ocrPrompt}, } for _, pf := range pageFiles { imgBytes, err := os.ReadFile(pf) if err != nil { continue } b64 := base64.StdEncoding.EncodeToString(imgBytes) content = append(content, map[string]interface{}{ "type": "image_url", "image_url": map[string]string{ "url": "data:image/png;base64," + b64, }, }) } messages := []map[string]interface{}{ {"role": "user", "content": content}, } markdown, err := lib.CallFireworks(fireworksVisionModel, messages, 16384) if err != nil { log.Printf("[doc-import] OCR failed: %v", err) setUploadStatus("failed") return } log.Printf("[doc-import] OCR done: %d chars markdown", len(markdown)) // 4. 
Create document entry with markdown now := time.Now().Unix() docData := map[string]interface{}{ "markdown": markdown, "source_upload": uploadID, "pages": len(pageFiles), } docDataJSON, _ := json.Marshal(docData) docEntry := &lib.Entry{ DossierID: dossierID, Category: lib.CategoryDocument, Type: "pdf", Value: fileName, Timestamp: now, Data: string(docDataJSON), } lib.EntryWrite("", docEntry) docID := docEntry.EntryID log.Printf("[doc-import] Document entry created: %s", docID) // 5. Fan out category extraction + optional translation type catResult struct { Category int Entries []extractedEntry } var mu sync.Mutex var results []catResult var wg sync.WaitGroup // Get dossier language for translations var targetLang string if d, err := lib.DossierGet("", dossierID); err == nil && d.Preferences.Language != "" { targetLang = langName(d.Preferences.Language) } preamble := extractionPreamble(targetLang) // Translate full markdown in parallel if target language is set var translatedMarkdown string if targetLang != "" { wg.Add(1) go func() { defer wg.Done() prompt := fmt.Sprintf("Translate this medical document to %s. Preserve all markdown formatting, headers, tables, and structure exactly. Translate ALL text including headers and labels. 
Output ONLY the translated markdown, nothing else.\n\n%s", targetLang, markdown) msgs := []map[string]interface{}{ {"role": "user", "content": prompt}, } resp, err := lib.CallFireworks(fireworksTextModel, msgs, 16384) if err != nil { log.Printf("[doc-import] Translation failed: %v", err) return } mu.Lock() translatedMarkdown = resp mu.Unlock() log.Printf("[doc-import] Translated to %s: %d chars", targetLang, len(resp)) }() } prompts := loadExtractionPrompts() log.Printf("[doc-import] Loaded %d extraction prompts (lang=%s)", len(prompts), targetLang) for catID, promptTmpl := range prompts { wg.Add(1) go func(catID int, promptTmpl string) { defer wg.Done() prompt := preamble + "\n" + strings.ReplaceAll(promptTmpl, "{{MARKDOWN}}", markdown) msgs := []map[string]interface{}{ {"role": "user", "content": prompt}, } resp, err := lib.CallFireworks(fireworksTextModel, msgs, 4096) if err != nil { log.Printf("[doc-import] Category %d failed: %v", catID, err) return } resp = strings.TrimSpace(resp) if resp == "null" || resp == "" { return } // Parse as array of entries var entries []extractedEntry if err := json.Unmarshal([]byte(resp), &entries); err != nil { // Try single object var single extractedEntry if err2 := json.Unmarshal([]byte(resp), &single); err2 == nil && single.Summary != "" { entries = []extractedEntry{single} } else { log.Printf("[doc-import] Category %d: parse failed: %v", catID, err) return } } if len(entries) == 0 { return } mu.Lock() results = append(results, catResult{Category: catID, Entries: entries}) mu.Unlock() }(catID, promptTmpl) } wg.Wait() // Save translated markdown to document entry if available if translatedMarkdown != "" { if docEntry, err := lib.EntryGet(nil, docID); err == nil { var dd map[string]interface{} json.Unmarshal([]byte(docEntry.Data), &dd) dd["markdown_translated"] = translatedMarkdown dd["translated_to"] = targetLang b, _ := json.Marshal(dd) docEntry.Data = string(b) lib.EntryWrite("", docEntry) } } totalEntries := 0 for _, r := 
range results { totalEntries += len(r.Entries) } log.Printf("[doc-import] Extraction done: %d categories, %d entries", len(results), totalEntries) // 6. Create entries for each extracted item var createdIDs []string for _, r := range results { for _, e := range r.Entries { // Build Data JSON with source reference + extracted fields dataMap := map[string]interface{}{ "source_doc_id": docID, } for k, v := range e.Data { dataMap[k] = v } if len(e.SourceSpans) > 0 { dataMap["source_spans"] = e.SourceSpans } if e.SummaryTranslated != "" { dataMap["summary_translated"] = e.SummaryTranslated } if len(e.DataTranslated) > 0 { dataMap["data_translated"] = e.DataTranslated } dataJSON, _ := json.Marshal(dataMap) ts := now if parsed := parseTimestamp(e.Timestamp); parsed > 0 { ts = parsed } entry := &lib.Entry{ DossierID: dossierID, ParentID: docID, Category: r.Category, Type: e.Type, Value: e.Value, Summary: e.Summary, SearchKey: e.SearchKey, Timestamp: ts, Data: string(dataJSON), } lib.EntryWrite("", entry) createdIDs = append(createdIDs, entry.EntryID) } } // 7. Update upload status with created entry IDs (for undo) if entry, err := lib.EntryGet(nil, uploadID); err == nil { var data map[string]interface{} json.Unmarshal([]byte(entry.Data), &data) data["status"] = "processed" data["created_entries"] = append([]string{docID}, createdIDs...) b, _ := json.Marshal(data) entry.Data = string(b) lib.EntryWrite("", entry) } log.Printf("[doc-import] Complete: %s → doc=%s, %d extracts", fileName, docID, len(createdIDs)) lib.AuditLog("", "doc_import", dossierID, fmt.Sprintf("file=%s doc=%s categories=%d", fileName, docID, len(results))) }