// inou/portal/upload.go
package main
import (
"encoding/base64"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"os"
"os/exec"
"path/filepath"
"regexp"
"sort"
"strings"
"sync"
"time"
"inou/lib"
)
// processProgress tracks live import progress per dossier, keyed by
// dossierID. Values are map[string]interface{} payloads carrying at least a
// "stage" key ("starting", "decrypting", "importing", "importing_json",
// "done") plus stage-specific counters; written by runProcessImaging and
// read/cleared by handleProcessStatus.
var processProgress sync.Map // dossierID → map[string]interface{}
// detectFileType identifies file type by magic bytes or data structure.
func detectFileType(data []byte) string {
n := len(data)
if n < 8 {
return ""
}
// DICOM: 128-byte preamble + "DICM"
if n > 132 && string(data[128:132]) == "DICM" {
return "dicom"
}
// PNG: 8-byte signature \x89PNG\r\n\x1a\n
if data[0] == 0x89 && data[1] == 0x50 && data[2] == 0x4e && data[3] == 0x47 &&
data[4] == 0x0d && data[5] == 0x0a && data[6] == 0x1a && data[7] == 0x0a {
return "png"
}
// PDF: %PDF-N.N
if n > 8 && string(data[:5]) == "%PDF-" && data[5] >= '1' && data[5] <= '9' && data[6] == '.' {
return "pdf"
}
// ZIP: PK\x03\x04 (local file header)
if data[0] == 0x50 && data[1] == 0x4b && data[2] == 0x03 && data[3] == 0x04 {
return "zip"
}
// JPEG: \xff\xd8\xff (SOI + first marker)
if data[0] == 0xff && data[1] == 0xd8 && data[2] == 0xff {
return "jpg"
}
// Genome: tab-separated text with rs[0-9]+ in first data column
if isGenomeData(data) {
return "genetics"
}
// JSON: starts with { or [
if b := firstNonSpace(data); b == '{' || b == '[' {
return "json"
}
// Text: printable UTF-8 (fallback for markdown, CSV, plain text)
if isReadableText(data) {
return "text"
}
return ""
}
func firstNonSpace(data []byte) byte {
for _, b := range data {
if b != ' ' && b != '\t' && b != '\n' && b != '\r' {
return b
}
}
return 0
}
// isReadableText checks if content is printable UTF-8 text (not binary).
func isReadableText(data []byte) bool {
peek := data
if len(peek) > 1024 {
peek = peek[:1024]
}
printable := 0
for _, b := range peek {
if b == '\n' || b == '\r' || b == '\t' || (b >= 32 && b < 127) {
printable++
} else if b >= 128 {
printable++ // UTF-8 multibyte
} else if b == 0 {
return false // null byte = binary
}
}
return printable > len(peek)*9/10 // >90% printable
}
var rsidRe = regexp.MustCompile(`^rs\d{1,12}\t`)
// isGenomeData checks whether text content has the structure of a genome raw data file:
// lines of tab-separated values where the first column is an rsID (rs[0-9]+).
// Requires at least 10 matching lines to avoid false positives.
func isGenomeData(data []byte) bool {
peek := data
if len(peek) > 8192 {
peek = peek[:8192]
}
matches := 0
for _, line := range strings.SplitN(string(peek), "\n", 100) {
line = strings.TrimSpace(line)
if line == "" || line[0] == '#' {
continue
}
// Skip header lines
if strings.HasPrefix(line, "rsid") || strings.HasPrefix(line, "RSID") {
continue
}
if rsidRe.MatchString(line) {
matches++
if matches >= 10 {
return true
}
} else {
return false // non-matching data line = not genome
}
}
return false
}
var sliceRe = regexp.MustCompile(`^(.+?) - (.+?) - slice (\d+)/(\d+)`)
// Upload represents a file upload entry for display
type Upload struct {
ID string
FileName, FilePath, SizeHuman, UploadedAt, ExpiresAt, DeletedReason string
Category, Status string
Deleted, CanUndo bool
}
// UploadData is the JSON structure stored in Entry.Data for uploads
type UploadData struct {
Path string `json:"path"`
RelPath string `json:"rel_path,omitempty"` // Original relative path for folder uploads
Size int64 `json:"size"`
UploadedBy string `json:"uploaded_by"`
Status string `json:"status"`
}
// formatBytes renders a byte count as a human-readable string using binary
// units: "512 B", "1.5 KB", "2.0 MB", … up to EB, with one decimal place.
func formatBytes(b int64) string {
	const unit = 1024
	if b < unit {
		return fmt.Sprintf("%d B", b)
	}
	// Count how many times the value can be divided by 1024 beyond the
	// first; that index selects the unit letter and the divisor.
	idx := 0
	for reduced := b / unit; reduced >= unit; reduced /= unit {
		idx++
	}
	divisor := float64(int64(1) << (10 * (idx + 1)))
	return fmt.Sprintf("%.1f %cB", float64(b)/divisor, "KMGTPE"[idx])
}
// getUploads returns all uploads for a dossier using lib.EntryList
func getUploads(dossierID string) []Upload {
var uploads []Upload
entries, err := lib.EntryList(lib.SystemAccessorID, "", lib.CategoryUpload, &lib.EntryFilter{ // nil ctx - internal operation
DossierID: dossierID,
Limit: 50,
})
if err != nil {
return uploads
}
for _, e := range entries {
var data map[string]interface{}
json.Unmarshal([]byte(e.Data), &data)
size, _ := data["size"].(float64)
status, _ := data["status"].(string)
path, _ := data["path"].(string)
createdEntries, _ := data["created_entries"].([]interface{})
u := Upload{
ID: e.EntryID,
FileName: e.Value,
FilePath: path,
Category: e.Type,
Status: status,
SizeHuman: formatBytes(int64(size)),
UploadedAt: time.Unix(e.Timestamp, 0).Format("Jan 2"),
ExpiresAt: time.Unix(e.TimestampEnd, 0).Format("Jan 2"),
CanUndo: len(createdEntries) > 0,
}
if e.Status != 0 {
u.Deleted = true
u.DeletedReason = "Deleted"
}
uploads = append(uploads, u)
}
return uploads
}
// getUploadEntry retrieves a single upload entry using lib.EntryGet
func getUploadEntry(entryID, dossierID string) (filePath, fileName, category, status string, deleted bool) {
e, err := lib.EntryGet(nil, entryID) // nil ctx - internal operation
if err != nil || e == nil {
return
}
// Verify it belongs to the right dossier and is an upload
if e.DossierID != dossierID || e.Category != lib.CategoryUpload {
return
}
var data UploadData
json.Unmarshal([]byte(e.Data), &data)
fileName = e.Value
category = e.Type
filePath = data.Path
status = data.Status
deleted = e.Status != 0
return
}
// findUploadByFilename finds existing uploads with the same filename
func findUploadByFilename(dossierID, filename string) []*lib.Entry {
entries, err := lib.EntryList(lib.SystemAccessorID, "", lib.CategoryUpload, &lib.EntryFilter{ // nil ctx - internal operation
DossierID: dossierID,
Value: filename,
})
if err != nil {
return nil
}
return entries
}
// handleUploadPage renders the uploads page for /dossier/{hex}/upload.
// Non-owners need edit access to the target dossier.
func handleUploadPage(w http.ResponseWriter, r *http.Request) {
	viewer := getLoggedInDossier(r)
	if viewer == nil {
		http.Redirect(w, r, "/", http.StatusSeeOther)
		return
	}
	segments := strings.Split(r.URL.Path, "/")
	if len(segments) < 4 {
		http.NotFound(w, r)
		return
	}
	targetID := segments[2]
	// Viewing someone else's dossier requires CanEdit access.
	if targetID != viewer.DossierID {
		access, found := getAccess(formatHexID(viewer.DossierID), formatHexID(targetID))
		if !found || !access.CanEdit {
			http.Error(w, "Forbidden", http.StatusForbidden)
			return
		}
	}
	target, _ := lib.DossierGet(viewer.DossierID, targetID)
	if target == nil {
		http.NotFound(w, r)
		return
	}
	lang := getLang(r)
	page := PageData{
		Page: "upload", Lang: lang, T: translations[lang],
		Dossier: viewer, TargetDossier: target,
		UploadList: getUploads(targetID),
	}
	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	templates.ExecuteTemplate(w, "base.tmpl", page)
}
// handleUploadPost accepts a multipart file upload for a dossier (self, or
// one the viewer can edit), encrypts the content into uploadsDir, records an
// upload entry that expires after 7 days, and spawns background processing
// for genome and PDF files. Responds with JSON {"status":"ok","id":...}.
func handleUploadPost(w http.ResponseWriter, r *http.Request) {
	p := getLoggedInDossier(r)
	if p == nil {
		http.Error(w, "Unauthorized", http.StatusUnauthorized)
		return
	}
	// Path shape: /dossier/{hex}/upload
	parts := strings.Split(r.URL.Path, "/")
	if len(parts) < 4 {
		http.NotFound(w, r)
		return
	}
	targetID := parts[2]
	isSelf := targetID == p.DossierID
	if !isSelf {
		if access, found := getAccess(formatHexID(p.DossierID), formatHexID(targetID)); !found || !access.CanEdit {
			http.Error(w, "Forbidden", http.StatusForbidden)
			return
		}
	}
	// FIX: the parse error was previously ignored; reject malformed bodies
	// explicitly instead of failing later inside FormFile.
	if err := r.ParseMultipartForm(10 << 30); err != nil {
		http.Error(w, "No file", http.StatusBadRequest)
		return
	}
	file, header, err := r.FormFile("file")
	if err != nil {
		http.Error(w, "No file", http.StatusBadRequest)
		return
	}
	defer file.Close()
	// "path" carries the original relative path for folder uploads.
	relPath := r.FormValue("path")
	if relPath == "" {
		relPath = header.Filename
	}
	fileName := filepath.Base(relPath)
	category := r.FormValue("category")
	// Read file content into memory (bounded by the multipart limit above).
	content, err := io.ReadAll(file)
	if err != nil {
		http.Error(w, "Failed to read", http.StatusInternalServerError)
		return
	}
	// Auto-detect file type when the client did not specify one.
	if category == "" {
		category = detectFileType(content)
	}
	// Generate file ID using lib.NewID() (Hex16 format)
	fileID := lib.NewID()
	// Store encrypted under a per-dossier uploads directory.
	uploadDir := filepath.Join(uploadsDir, formatHexID(targetID))
	filePath := filepath.Join(uploadDir, fileID)
	os.MkdirAll(uploadDir, 0755)
	if err := lib.EncryptFile(content, filePath); err != nil {
		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(http.StatusInternalServerError)
		w.Write([]byte(`{"status":"error","message":"Upload failed, we've been notified"}`))
		return
	}
	written := int64(len(content))
	// Delete existing uploads with the same filename (re-upload cleanup).
	// FIX: also remove the superseded encrypted file from disk; previously
	// only the DB entry was deleted and orphaned blobs accumulated.
	for _, old := range findUploadByFilename(targetID, fileName) {
		var oldData UploadData
		if json.Unmarshal([]byte(old.Data), &oldData) == nil && oldData.Path != "" && oldData.Path != filePath {
			os.Remove(oldData.Path)
		}
		lib.EntryDelete("", targetID, &lib.Filter{EntryID: old.EntryID})
	}
	now := time.Now().Unix()
	expires := now + 7*24*60*60 // uploads are kept for 7 days
	uploadData := UploadData{
		Path:       filePath,
		RelPath:    relPath,
		Size:       written,
		UploadedBy: p.DossierID,
		Status:     "uploaded",
	}
	dataJSON, _ := json.Marshal(uploadData)
	uploadEntry := &lib.Entry{
		DossierID:    targetID,
		Category:     lib.CategoryUpload,
		Type:         category,
		Value:        fileName,
		Timestamp:    now,
		TimestampEnd: expires,
		Data:         string(dataJSON),
	}
	lib.EntryWrite("", uploadEntry) // nil ctx - internal operation
	entryID := uploadEntry.EntryID
	lib.AuditLog(p.DossierID, "file_upload", targetID, fileName)
	// Spawn processing for known types (JSON waits for batch processing like DICOM)
	if category == "genetics" {
		go processGenomeUpload(entryID, targetID, filePath)
	}
	if category == "pdf" {
		go processDocumentUpload(entryID, targetID, filePath, fileName)
	}
	w.Header().Set("Content-Type", "application/json")
	w.Write([]byte(fmt.Sprintf(`{"status":"ok","id":"%s"}`, entryID)))
}
// handleDeleteFile soft-deletes an upload entry and removes its encrypted
// file from disk, then redirects back to the uploads page.
// Route: POST /dossier/{hex}/files/{id}/delete
func handleDeleteFile(w http.ResponseWriter, r *http.Request) {
	if r.Method != "POST" {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}
	p := getLoggedInDossier(r)
	if p == nil {
		http.Error(w, "Unauthorized", http.StatusUnauthorized)
		return
	}
	// Parse /dossier/{hex}/files/{id}/delete
	parts := strings.Split(r.URL.Path, "/")
	if len(parts) < 6 {
		http.NotFound(w, r)
		return
	}
	targetID := parts[2]
	fileID := parts[4]
	isSelf := targetID == p.DossierID
	if !isSelf {
		if access, found := getAccess(formatHexID(p.DossierID), formatHexID(targetID)); !found || !access.CanEdit {
			http.Error(w, "Forbidden", http.StatusForbidden)
			return
		}
	}
	// FIX: fetch the entry once instead of twice (getUploadEntry followed by
	// a second EntryGet), and apply the same ownership + category checks to
	// both the disk removal and the soft delete — previously the soft-delete
	// path did not verify the entry's category.
	var fileName string
	entry, err := lib.EntryGet(nil, fileID) // nil ctx - internal operation
	if err == nil && entry != nil && entry.DossierID == targetID && entry.Category == lib.CategoryUpload {
		fileName = entry.Value
		var data UploadData
		json.Unmarshal([]byte(entry.Data), &data)
		if data.Path != "" {
			os.Remove(data.Path)
		}
		entry.Status = 1          // mark as deleted (soft delete)
		lib.EntryWrite("", entry) // nil ctx - internal operation
	}
	lib.AuditLog(p.DossierID, "file_delete", targetID, fileName)
	http.Redirect(w, r, fmt.Sprintf("/dossier/%s/upload", formatHexID(targetID)), http.StatusSeeOther)
}
// handleUpdateFile changes the category of an upload that has not been
// processed yet, then redirects to the uploads page.
// Route: POST /dossier/{hex}/files/{id}/update
func handleUpdateFile(w http.ResponseWriter, r *http.Request) {
	if r.Method != "POST" {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}
	viewer := getLoggedInDossier(r)
	if viewer == nil {
		http.Error(w, "Unauthorized", http.StatusUnauthorized)
		return
	}
	// Parse /dossier/{hex}/files/{id}/update
	segments := strings.Split(r.URL.Path, "/")
	if len(segments) < 6 {
		http.NotFound(w, r)
		return
	}
	targetID, fileID := segments[2], segments[4]
	if targetID != viewer.DossierID {
		access, found := getAccess(formatHexID(viewer.DossierID), formatHexID(targetID))
		if !found || !access.CanEdit {
			http.Error(w, "Forbidden", http.StatusForbidden)
			return
		}
	}
	entry, err := lib.EntryGet(nil, fileID) // nil ctx - internal operation
	if err != nil || entry == nil || entry.DossierID != targetID || entry.Category != lib.CategoryUpload {
		http.NotFound(w, r)
		return
	}
	if entry.Status != 0 {
		http.NotFound(w, r) // soft-deleted uploads are not editable
		return
	}
	var payload UploadData
	json.Unmarshal([]byte(entry.Data), &payload)
	// Once processing has started/finished, the category is locked.
	if payload.Status != "uploaded" {
		http.Error(w, "Cannot modify processed file", http.StatusBadRequest)
		return
	}
	if requested := r.FormValue("category"); requested != "" && requested != entry.Type {
		entry.Type = requested
		lib.EntryWrite("", entry) // nil ctx - internal operation
		lib.AuditLog(viewer.DossierID, "file_category_change", targetID, entry.Value)
	}
	http.Redirect(w, r, fmt.Sprintf("/dossier/%s/upload", formatHexID(targetID)), http.StatusSeeOther)
}
// handleUndoImport deletes every entry that was created from a processed
// upload (cascading to children) and resets the upload's status to
// "uploaded" so it can be processed again. Responds with JSON.
// Route: POST /dossier/{hex}/files/{id}/undo
func handleUndoImport(w http.ResponseWriter, r *http.Request) {
	if r.Method != "POST" {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}
	viewer := getLoggedInDossier(r)
	if viewer == nil {
		http.Error(w, "Unauthorized", http.StatusUnauthorized)
		return
	}
	segments := strings.Split(r.URL.Path, "/")
	if len(segments) < 6 {
		http.NotFound(w, r)
		return
	}
	targetID, fileID := segments[2], segments[4]
	if targetID != viewer.DossierID {
		access, found := getAccess(formatHexID(viewer.DossierID), formatHexID(targetID))
		if !found || !access.CanEdit {
			http.Error(w, "Forbidden", http.StatusForbidden)
			return
		}
	}
	entry, err := lib.EntryGet(nil, fileID)
	if err != nil || entry == nil || entry.DossierID != targetID || entry.Category != lib.CategoryUpload {
		http.NotFound(w, r)
		return
	}
	// The upload's Data carries the IDs of entries created during processing.
	var payload map[string]interface{}
	if json.Unmarshal([]byte(entry.Data), &payload) != nil {
		http.Error(w, "Invalid upload data", http.StatusBadRequest)
		return
	}
	created, _ := payload["created_entries"].([]interface{})
	if len(created) == 0 {
		http.Error(w, "No entries to undo", http.StatusBadRequest)
		return
	}
	// Delete each parent entry (cascading deletes children).
	for _, raw := range created {
		if parentID, _ := raw.(string); parentID != "" {
			lib.EntryDelete("", targetID, &lib.Filter{EntryID: parentID})
		}
	}
	// Reset upload status to "uploaded" so it can be reprocessed.
	payload["status"] = "uploaded"
	delete(payload, "created_entries")
	encoded, _ := json.Marshal(payload)
	entry.Data = string(encoded)
	lib.EntryWrite("", entry)
	lib.AuditLog(viewer.DossierID, "undo_import", targetID, fmt.Sprintf("upload=%s orders=%d", entry.Value, len(created)))
	w.Header().Set("Content-Type", "application/json")
	w.Write([]byte(`{"status":"ok"}`))
}
// handleProcessImaging kicks off a background imaging/JSON import run for
// the target dossier, unless one is already in flight. Responds with JSON
// {"started":true} or {"running":true}.
func handleProcessImaging(w http.ResponseWriter, r *http.Request) {
	if r.Method != "POST" {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}
	viewer := getLoggedInDossier(r)
	if viewer == nil {
		http.Error(w, "Unauthorized", http.StatusUnauthorized)
		return
	}
	segments := strings.Split(r.URL.Path, "/")
	if len(segments) < 4 {
		http.NotFound(w, r)
		return
	}
	targetID := segments[2]
	if targetID != viewer.DossierID {
		access, found := getAccess(formatHexID(viewer.DossierID), formatHexID(targetID))
		if !found || !access.CanEdit {
			http.Error(w, "Forbidden", http.StatusForbidden)
			return
		}
	}
	// Refuse to start while a previous run has not reached "done".
	if v, ok := processProgress.Load(targetID); ok {
		if state := v.(map[string]interface{}); state["stage"] != "done" {
			w.Header().Set("Content-Type", "application/json")
			json.NewEncoder(w).Encode(map[string]interface{}{"running": true})
			return
		}
	}
	processProgress.Store(targetID, map[string]interface{}{"stage": "starting"})
	go runProcessImaging(viewer.DossierID, targetID)
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]interface{}{"started": true})
}
// handleProcessStatus reports the current import progress for a dossier as
// JSON. A finished run ("done") is returned once and then cleared so the
// next poll sees "idle".
func handleProcessStatus(w http.ResponseWriter, r *http.Request) {
	viewer := getLoggedInDossier(r)
	if viewer == nil {
		http.Error(w, "Unauthorized", http.StatusUnauthorized)
		return
	}
	segments := strings.Split(r.URL.Path, "/")
	if len(segments) < 4 {
		http.NotFound(w, r)
		return
	}
	targetID := segments[2]
	v, ok := processProgress.Load(targetID)
	if !ok {
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(map[string]interface{}{"stage": "idle"})
		return
	}
	state := v.(map[string]interface{})
	if state["stage"] == "done" {
		processProgress.Delete(targetID)
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(state)
}
// runProcessImaging is the background worker started by handleProcessImaging.
// It scans the target dossier's pending uploads, imports DICOM-like files
// via lib.ImportDICOMFromPath and JSON uploads via processJSONBatch, and
// publishes progress into processProgress under targetID (polled by
// handleProcessStatus). The final stored progress map always has stage
// "done". actorID is used for audit logging and notifications only.
func runProcessImaging(actorID, targetID string) {
	var processed int
	entries, err := lib.EntryList(lib.SystemAccessorID, "", lib.CategoryUpload, &lib.EntryFilter{DossierID: targetID})
	if err != nil {
		processProgress.Store(targetID, map[string]interface{}{"stage": "done", "error": err.Error()})
		return
	}
	// Split pending uploads into JSON and other (DICOM)
	result := &lib.ImportResult{}
	var pendingDICOM, pendingJSON []*lib.Entry
	for _, e := range entries {
		// Skip soft-deleted uploads and genome files (genome files are
		// handled separately by processGenomeUpload at upload time).
		if e.Status != 0 || e.Type == "genetics" {
			continue
		}
		var d UploadData
		json.Unmarshal([]byte(e.Data), &d)
		// Only files still in the "uploaded" state are eligible.
		if d.Status != "uploaded" {
			continue
		}
		switch e.Type {
		case "dicom", "imaging", "png", "jpg", "pdf", "zip":
			pendingDICOM = append(pendingDICOM, e)
		default:
			pendingJSON = append(pendingJSON, e)
		}
	}
	if len(pendingDICOM) == 0 && len(pendingJSON) == 0 {
		processProgress.Store(targetID, map[string]interface{}{"stage": "done", "slices": 0})
		return
	}
	// --- Process DICOM files ---
	var importErr error
	if len(pendingDICOM) > 0 {
		totalFiles := len(pendingDICOM)
		// logFn is handed to the importer; it parses progress lines of the
		// form "<study> - <series> - slice i/n" (see sliceRe) and turns them
		// into live progress updates.
		logFn := func(format string, args ...interface{}) {
			msg := fmt.Sprintf(format, args...)
			if m := sliceRe.FindStringSubmatch(msg); m != nil {
				processed++
				processProgress.Store(targetID, map[string]interface{}{
					"stage": "importing", "study": m[1], "series": m[2],
					"slice": m[3], "slice_total": m[4],
					"processed": processed, "total": totalFiles,
				})
			}
		}
		processProgress.Store(targetID, map[string]interface{}{
			"stage": "decrypting", "total": totalFiles,
		})
		// Decrypt each pending file into a temp dir mirroring its original
		// relative path, then hand the whole dir to the importer.
		tempDir, err := os.MkdirTemp("", "dicom-import-*")
		if err != nil {
			processProgress.Store(targetID, map[string]interface{}{"stage": "done", "error": err.Error()})
			return
		}
		defer os.RemoveAll(tempDir)
		for i, e := range pendingDICOM {
			processProgress.Store(targetID, map[string]interface{}{
				"stage": "decrypting", "processed": i + 1, "total": totalFiles,
			})
			var d UploadData
			json.Unmarshal([]byte(e.Data), &d)
			relPath := d.RelPath
			if relPath == "" {
				relPath = e.Value
			}
			outPath := filepath.Join(tempDir, relPath)
			os.MkdirAll(filepath.Dir(outPath), 0755)
			content, err := lib.DecryptFile(d.Path)
			if err != nil {
				// NOTE(review): a decrypt failure skips this file silently;
				// it still gets the shared final status below — confirm that
				// is intended rather than marking it failed individually.
				continue
			}
			os.WriteFile(outPath, content, 0644)
			d.Status = "processing"
			dataJSON, _ := json.Marshal(d)
			pendingDICOM[i].Data = string(dataJSON)
			lib.EntryWrite("", pendingDICOM[i])
		}
		result, importErr = lib.ImportDICOMFromPath(targetID, tempDir, "", logFn)
		// Choose one final status for the whole batch: "processed" on
		// success, "pending_importer" when nothing was imported but no error
		// occurred, "failed" on importer error.
		finalStatus := "processed"
		if result.Slices == 0 && importErr == nil {
			finalStatus = "pending_importer"
		} else if importErr != nil {
			finalStatus = "failed"
		}
		for _, e := range pendingDICOM {
			var d UploadData
			json.Unmarshal([]byte(e.Data), &d)
			d.Status = finalStatus
			dataJSON, _ := json.Marshal(d)
			e.Data = string(dataJSON)
			if finalStatus == "processed" {
				e.Type = "imaging"
			}
			lib.EntryWrite("", e)
		}
	}
	// --- Process JSON files in batch ---
	var jsonImported int
	if len(pendingJSON) > 0 {
		processProgress.Store(targetID, map[string]interface{}{
			"stage": "importing_json", "processed": 0, "total": len(pendingJSON),
		})
		// processJSONBatch is defined elsewhere; presumably it returns the
		// number of files imported — verify at its definition.
		jsonImported = processJSONBatch(targetID, pendingJSON)
	}
	// Report results
	errStr := ""
	if importErr != nil {
		errStr = importErr.Error()
	}
	processProgress.Store(targetID, map[string]interface{}{
		"stage": "done", "studies": result.Studies,
		"series_count": result.Series, "slices": result.Slices,
		"json_imported": jsonImported,
		"error": errStr,
	})
	// Audit: log either the failure or a success summary; notify via signal
	// only when slices were actually imported.
	if importErr != nil {
		lib.AuditLog(actorID, "imaging_import", targetID, fmt.Sprintf("error=%v", importErr))
	} else if result.Slices > 0 {
		summary := fmt.Sprintf("studies=%d series=%d slices=%d", result.Studies, result.Series, result.Slices)
		lib.AuditLog(actorID, "imaging_import", targetID, summary)
		if target, _ := lib.DossierGet(actorID, targetID); target != nil {
			if actor, _ := lib.DossierGet("", actorID); actor != nil {
				lib.SendSignal(fmt.Sprintf("Import done: %s → %s (%s)", actor.Name, target.Name, summary))
			}
		}
	}
	if jsonImported > 0 {
		lib.AuditLog(actorID, "json_import", targetID, fmt.Sprintf("files=%d", jsonImported))
	}
}
// extractedEntry is the JSON structure returned by extraction prompts.
// Field semantics follow the rules in extractionPreamble: text values stay
// in the document's original language, SearchKey is a normalized English
// deduplication key, and SourceSpans point back into the OCR markdown.
type extractedEntry struct {
	Type              string                 `json:"type"`
	Value             string                 `json:"value"`
	Summary           string                 `json:"summary"`
	SummaryTranslated string                 `json:"summary_translated,omitempty"` // only present when a target language was requested
	SearchKey         string                 `json:"search_key,omitempty"`
	Timestamp         string                 `json:"timestamp,omitempty"` // free-form date; parsed by parseTimestamp, falls back to upload time
	Data              map[string]interface{} `json:"data"`
	SourceSpans       []sourceSpan           `json:"source_spans,omitempty"`
}

// sourceSpan marks a passage in the source markdown by its verbatim first
// and last words, used to highlight the provenance of an extracted entry.
type sourceSpan struct {
	Start string `json:"start"`
	End   string `json:"end"`
}
// extractionPreamble returns common instructions prepended to every extraction prompt.
func extractionPreamble(targetLang string) string {
s := `IMPORTANT RULES (apply to all entries you return):
- Do NOT translate. Keep ALL text values (summary, value, data fields) in the ORIGINAL language of the document.
- For each entry, include "source_spans": an array of {"start": "...", "end": "..."} where start/end are the VERBATIM first and last 5-8 words of the relevant passage(s) in the source markdown. This is used to highlight the source text. Multiple spans are allowed.
- For each entry, include "search_key": a short normalized deduplication key in English lowercase. Format: "thing:qualifier:YYYY-MM" or "thing:qualifier" for undated facts. Examples: "surgery:vp-shunt:2020-07", "device:ommaya-reservoir:2020-04", "diagnosis:hydrocephalus", "provider:peraud:ulm". Same real-world fact across different documents MUST produce the same key.
`
if targetLang != "" {
s += `- Include "summary_translated": a translation of the summary field into ` + targetLang + `.
`
}
return s
}
// loadExtractionPrompts discovers all extract_*.md files and returns {categoryID: prompt content}.
func loadExtractionPrompts() map[int]string {
pattern := filepath.Join(lib.TrackerPromptsDir(), "extract_*.md")
files, _ := filepath.Glob(pattern)
prompts := make(map[int]string)
for _, f := range files {
// extract_device.md → "device"
base := filepath.Base(f)
name := strings.TrimPrefix(base, "extract_")
name = strings.TrimSuffix(name, ".md")
catID, ok := lib.CategoryFromString[name]
if !ok {
log.Printf("[doc-import] Unknown category in prompt file: %s", base)
continue
}
data, err := os.ReadFile(f)
if err != nil {
continue
}
prompts[catID] = string(data)
}
return prompts
}
// parseTimestamp tries to parse a date string into Unix timestamp.
func parseTimestamp(s string) int64 {
if s == "" {
return 0
}
for _, fmt := range []string{"2006-01-02", "02.01.2006", "01/02/2006", "Jan 2, 2006"} {
if t, err := time.Parse(fmt, s); err == nil {
return t.Unix()
}
}
return 0
}
// Fireworks model identifiers. Vision and text currently point at the same
// multimodal model; they are kept as separate constants so the two roles can
// diverge without touching call sites.
const (
	fireworksVisionModel = "accounts/fireworks/models/qwen3-vl-30b-a3b-instruct"
	fireworksTextModel   = "accounts/fireworks/models/qwen3-vl-30b-a3b-instruct"
)

// ocrPrompt instructs the vision model to produce a verbatim markdown
// transcription of the uploaded document pages (no translation, no
// summarizing); used by processDocumentUpload.
var ocrPrompt = `You are a medical document OCR system. Produce a faithful markdown transcription of this document.
The images are sequential pages of the same document. Process them in order: page 1 first, then page 2, etc.
Rules:
- Read each page top-to-bottom, left-to-right. For multi-column layouts, transcribe the full page as a human would read it.
- Preserve ALL text, dates, values, names, addresses, and structure
- Translate nothing — keep the original language
- Use markdown headers, lists, and formatting to reflect the document structure
- For tables, use markdown tables. Preserve numeric values exactly.
- Be complete — do not skip or summarize anything
- Do not describe visual elements (logos, signatures) — only transcribe text
- For handwritten text, transcribe as accurately as possible. Mark uncertain readings with [?]`
// processDocumentUpload runs the full PDF import pipeline in the background:
// decrypt → render pages with pdftoppm → OCR via the Fireworks vision model
// → create a document entry → fan out per-category extraction prompts (and
// an optional full translation) → create child entries → record the created
// IDs on the upload for undo. The upload's UploadData.Status tracks
// progress: "processing" → "processed" (or "failed" at any step).
func processDocumentUpload(uploadID, dossierID, filePath, fileName string) {
	log.Printf("[doc-import] Starting for %s (%s)", fileName, dossierID)
	// setUploadStatus rewrites only the Status field inside the upload
	// entry's Data JSON.
	setUploadStatus := func(status string) {
		// NOTE(review): EntryGet may plausibly return (nil, nil); confirm,
		// otherwise this would panic on entry.Data below.
		if entry, err := lib.EntryGet(nil, uploadID); err == nil {
			var d UploadData
			json.Unmarshal([]byte(entry.Data), &d)
			d.Status = status
			data, _ := json.Marshal(d)
			entry.Data = string(data)
			lib.EntryWrite("", entry)
		}
	}
	setUploadStatus("processing")
	// 1. Decrypt PDF
	pdfBytes, err := lib.DecryptFile(filePath)
	if err != nil {
		log.Printf("[doc-import] Decrypt failed: %v", err)
		setUploadStatus("failed")
		return
	}
	// 2. Convert PDF to PNG pages via pdftoppm (requires poppler-utils on
	// the host; 200 dpi).
	tempDir, err := os.MkdirTemp("", "doc-import-*")
	if err != nil {
		log.Printf("[doc-import] MkdirTemp failed: %v", err)
		setUploadStatus("failed")
		return
	}
	defer os.RemoveAll(tempDir)
	pdfPath := filepath.Join(tempDir, "input.pdf")
	if err := os.WriteFile(pdfPath, pdfBytes, 0644); err != nil {
		log.Printf("[doc-import] WriteFile failed: %v", err)
		setUploadStatus("failed")
		return
	}
	prefix := filepath.Join(tempDir, "page")
	cmd := exec.Command("pdftoppm", "-png", "-r", "200", pdfPath, prefix)
	if out, err := cmd.CombinedOutput(); err != nil {
		log.Printf("[doc-import] pdftoppm failed: %v: %s", err, out)
		setUploadStatus("failed")
		return
	}
	// Collect page images sorted by name.
	// NOTE(review): lexicographic sort assumes pdftoppm zero-pads page
	// numbers; for 10+ page documents confirm "page-10" sorts after
	// "page-9" with the installed poppler version.
	pageFiles, _ := filepath.Glob(prefix + "*.png")
	sort.Strings(pageFiles)
	if len(pageFiles) == 0 {
		log.Printf("[doc-import] No pages generated")
		setUploadStatus("failed")
		return
	}
	log.Printf("[doc-import] %d pages converted", len(pageFiles))
	// 3. OCR: send all pages in one request to the Fireworks vision model,
	// as base64 data URLs following the OCR prompt.
	content := []interface{}{
		map[string]string{"type": "text", "text": ocrPrompt},
	}
	for _, pf := range pageFiles {
		imgBytes, err := os.ReadFile(pf)
		if err != nil {
			continue
		}
		b64 := base64.StdEncoding.EncodeToString(imgBytes)
		content = append(content, map[string]interface{}{
			"type": "image_url",
			"image_url": map[string]string{
				"url": "data:image/png;base64," + b64,
			},
		})
	}
	messages := []map[string]interface{}{
		{"role": "user", "content": content},
	}
	markdown, err := lib.CallFireworks(fireworksVisionModel, messages, 16384)
	if err != nil {
		log.Printf("[doc-import] OCR failed: %v", err)
		setUploadStatus("failed")
		return
	}
	log.Printf("[doc-import] OCR done: %d chars markdown", len(markdown))
	// 4. Create the document entry holding the OCR markdown; extracted
	// entries created below reference it via source_doc_id/ParentID.
	now := time.Now().Unix()
	docData := map[string]interface{}{
		"markdown": markdown,
		"source_upload": uploadID,
		"pages": len(pageFiles),
	}
	docDataJSON, _ := json.Marshal(docData)
	docEntry := &lib.Entry{
		DossierID: dossierID,
		Category: lib.CategoryDocument,
		Type: "pdf",
		Value: fileName,
		Timestamp: now,
		Data: string(docDataJSON),
	}
	lib.EntryWrite("", docEntry)
	docID := docEntry.EntryID
	log.Printf("[doc-import] Document entry created: %s", docID)
	// 5. Fan out one goroutine per extraction category, plus an optional
	// whole-document translation; mu guards results/translatedMarkdown,
	// wg joins everything before results are consumed.
	type catResult struct {
		Category int
		Entries []extractedEntry
	}
	var mu sync.Mutex
	var results []catResult
	var wg sync.WaitGroup
	// Get dossier language for translations.
	// NOTE(review): d is dereferenced on err == nil without a nil check —
	// confirm DossierGet never returns (nil, nil).
	var targetLang string
	if d, err := lib.DossierGet("", dossierID); err == nil && d.Preferences.Language != "" {
		targetLang = d.Preferences.Language
	}
	preamble := extractionPreamble(targetLang)
	// Translate full markdown in parallel if target language is set.
	var translatedMarkdown string
	if targetLang != "" {
		wg.Add(1)
		go func() {
			defer wg.Done()
			prompt := fmt.Sprintf("Translate this medical document to %s. Preserve all markdown formatting, headers, tables, and structure exactly. Translate ALL text including headers and labels. Output ONLY the translated markdown, nothing else.\n\n%s", targetLang, markdown)
			msgs := []map[string]interface{}{
				{"role": "user", "content": prompt},
			}
			resp, err := lib.CallFireworks(fireworksTextModel, msgs, 16384)
			if err != nil {
				log.Printf("[doc-import] Translation failed: %v", err)
				return
			}
			mu.Lock()
			translatedMarkdown = resp
			mu.Unlock()
			log.Printf("[doc-import] Translated to %s: %d chars", targetLang, len(resp))
		}()
	}
	prompts := loadExtractionPrompts()
	log.Printf("[doc-import] Loaded %d extraction prompts (lang=%s)", len(prompts), targetLang)
	for catID, promptTmpl := range prompts {
		wg.Add(1)
		go func(catID int, promptTmpl string) {
			defer wg.Done()
			// Each prompt template embeds the OCR text at {{MARKDOWN}}.
			prompt := preamble + "\n" + strings.ReplaceAll(promptTmpl, "{{MARKDOWN}}", markdown)
			msgs := []map[string]interface{}{
				{"role": "user", "content": prompt},
			}
			resp, err := lib.CallFireworks(fireworksTextModel, msgs, 4096)
			if err != nil {
				log.Printf("[doc-import] Category %d failed: %v", catID, err)
				return
			}
			resp = strings.TrimSpace(resp)
			// "null"/empty means the model found nothing for this category.
			if resp == "null" || resp == "" {
				return
			}
			// Parse as array of entries; fall back to a single object.
			var entries []extractedEntry
			if err := json.Unmarshal([]byte(resp), &entries); err != nil {
				var single extractedEntry
				if err2 := json.Unmarshal([]byte(resp), &single); err2 == nil && single.Summary != "" {
					entries = []extractedEntry{single}
				} else {
					log.Printf("[doc-import] Category %d: parse failed: %v", catID, err)
					return
				}
			}
			if len(entries) == 0 {
				return
			}
			mu.Lock()
			results = append(results, catResult{Category: catID, Entries: entries})
			mu.Unlock()
		}(catID, promptTmpl)
	}
	wg.Wait()
	// Save translated markdown to the document entry if the translation
	// goroutine produced one (safe to read without mu after wg.Wait).
	if translatedMarkdown != "" {
		if docEntry, err := lib.EntryGet(nil, docID); err == nil {
			var dd map[string]interface{}
			json.Unmarshal([]byte(docEntry.Data), &dd)
			dd["markdown_translated"] = translatedMarkdown
			dd["translated_to"] = targetLang
			b, _ := json.Marshal(dd)
			docEntry.Data = string(b)
			lib.EntryWrite("", docEntry)
		}
	}
	totalEntries := 0
	for _, r := range results {
		totalEntries += len(r.Entries)
	}
	log.Printf("[doc-import] Extraction done: %d categories, %d entries", len(results), totalEntries)
	// 6. Create one child entry per extracted item, parented to the
	// document entry and carrying provenance (source_doc_id, source_spans).
	var createdIDs []string
	for _, r := range results {
		for _, e := range r.Entries {
			// Build Data JSON with source reference + extracted fields.
			dataMap := map[string]interface{}{
				"source_doc_id": docID,
			}
			for k, v := range e.Data {
				dataMap[k] = v
			}
			if len(e.SourceSpans) > 0 {
				dataMap["source_spans"] = e.SourceSpans
			}
			if e.SummaryTranslated != "" {
				dataMap["summary_translated"] = e.SummaryTranslated
			}
			dataJSON, _ := json.Marshal(dataMap)
			// Prefer the date extracted from the document; fall back to the
			// processing time.
			ts := now
			if parsed := parseTimestamp(e.Timestamp); parsed > 0 {
				ts = parsed
			}
			entry := &lib.Entry{
				DossierID: dossierID,
				ParentID: docID,
				Category: r.Category,
				Type: e.Type,
				Value: e.Value,
				Summary: e.Summary,
				SearchKey: e.SearchKey,
				Timestamp: ts,
				Data: string(dataJSON),
			}
			lib.EntryWrite("", entry)
			createdIDs = append(createdIDs, entry.EntryID)
		}
	}
	// 7. Update upload status with created entry IDs (docID first) so
	// handleUndoImport can delete everything this run produced.
	if entry, err := lib.EntryGet(nil, uploadID); err == nil {
		var data map[string]interface{}
		json.Unmarshal([]byte(entry.Data), &data)
		data["status"] = "processed"
		data["created_entries"] = append([]string{docID}, createdIDs...)
		b, _ := json.Marshal(data)
		entry.Data = string(b)
		lib.EntryWrite("", entry)
	}
	log.Printf("[doc-import] Complete: %s → doc=%s, %d extracts", fileName, docID, len(createdIDs))
	lib.AuditLog("", "doc_import", dossierID, fmt.Sprintf("file=%s doc=%s categories=%d", fileName, docID, len(results)))
}