package lib

import (
	"encoding/json"
	"fmt"
	"log"
	"sort"
	"strings"
)

// Normalize normalizes entry names within a dossier for a given category.
// Uses heuristic pre-grouping + LLM to map variant names to canonical forms.
// Updates Summary (display) and Data JSON (normalized_name, abbreviation).
// The original Type field is never modified.
// If no API key is configured it sends a signal and returns nil without work.
// The optional progress callback receives (processed, total) batch counts.
func Normalize(dossierID string, category int, progress ...func(processed, total int)) error {
	reportProgress := func(p, t int) {
		if len(progress) > 0 && progress[0] != nil {
			progress[0](p, t)
		}
	}

	if FireworksKey == "" {
		SendSignal("normalize: FIREWORKS_API_KEY not configured, skipping normalization")
		return nil
	}

	// 1. Load all entries, collect types only from entries that need normalization.
	entries, err := EntryQueryOld(dossierID, category, "")
	if err != nil {
		return fmt.Errorf("load entries: %w", err)
	}
	seen := make(map[string]bool)
	var allNames []string
	for _, e := range entries {
		if e.ParentID == "" || e.Type == "lab_order" || e.Type == "" {
			continue
		}
		// FIXED(review-2026-02-28): Skip only if FULLY normalized (has both SearchKey2 AND LOINC).
		// Previously skipped on SearchKey2 alone, causing LOINC to never be populated.
		if e.SearchKey2 != "" {
			var data map[string]interface{}
			// Best-effort decode: on a bad payload data stays nil, the loinc
			// lookup below fails, and the entry correctly falls through to
			// normalization.
			json.Unmarshal([]byte(e.Data), &data)
			if loinc, ok := data["loinc"].(string); ok && loinc != "" {
				continue // fully normalized
			}
			// Has SearchKey2 but no LOINC — needs normalization.
		}
		if !seen[e.Type] {
			seen[e.Type] = true
			allNames = append(allNames, e.Type)
		}
	}
	if len(allNames) == 0 {
		log.Printf("normalize: all entries already normalized")
		return nil
	}

	// 2. Pre-group by heuristic key (strip POCT, specimen suffixes, normalize case).
	groups := make(map[string][]string) // cleanKey → [original names]
	for _, name := range allNames {
		key := normalizeKey(name)
		groups[key] = append(groups[key], name)
	}
	// Send just the group keys to the LLM, sorted for deterministic batching.
	keys := make([]string, 0, len(groups))
	for k := range groups {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	log.Printf("normalize: %d unique types → %d groups after pre-grouping", len(allNames), len(keys))

	// 3. Call LLM with group keys (batched to stay within token limits).
	mapping := make(map[string]normMapping)
	batchSize := 50
	for i := 0; i < len(keys); i += batchSize {
		end := i + batchSize
		if end > len(keys) {
			end = len(keys)
		}
		batch := keys[i:end]
		// NOTE(review): progress is reported as the batch STARTS, so the bar
		// reaches a batch's end index before its LLM call completes.
		reportProgress(end, len(keys))
		log.Printf("normalize: LLM batch %d-%d of %d", i+1, end, len(keys))
		batchMap, err := callNormalizeLLM(batch)
		if err != nil {
			SendSignal(fmt.Sprintf("normalize: LLM batch %d-%d failed: %v", i+1, end, err))
			return fmt.Errorf("LLM batch %d-%d: %w", i+1, end, err)
		}
		for k, v := range batchMap {
			mapping[k] = v
		}
	}

	// 4. Expand: each original name in a group gets the group's canonical mapping.
	fullMapping := make(map[string]normMapping)
	for key, origNames := range groups {
		if m, ok := mapping[key]; ok {
			for _, orig := range origNames {
				fullMapping[orig] = m
			}
		}
	}
	log.Printf("normalize: LLM mapped %d groups → %d original names covered", len(mapping), len(fullMapping))

	// 5. Save LabTest entries for any new LOINC codes.
	// (FIXED: this and the following section were both numbered "5".)
	seenLoinc := make(map[string]bool)
	var labTests []LabTest
	for _, m := range fullMapping {
		if m.Loinc == "" || seenLoinc[m.Loinc] {
			continue
		}
		seenLoinc[m.Loinc] = true
		dir := m.Direction
		if dir == "" {
			dir = DirRange
		}
		factor := m.SIFactor
		if factor == 0 {
			factor = 1.0 // missing factor means "no conversion"
		}
		labTests = append(labTests, LabTest{
			LoincID:   m.Loinc,
			Name:      m.Name,
			SIUnit:    m.SIUnit,
			Direction: dir,
			SIFactor:  ToLabScale(factor),
		})
	}
	for _, t := range labTests {
		// INSERT OR IGNORE: existing lab_test rows for a LOINC are kept as-is.
		RefExec(`INSERT OR IGNORE INTO lab_test (loinc_id, name, si_unit, direction, si_factor) VALUES (?, ?, ?, ?, ?)`,
			t.LoincID, t.Name, t.SIUnit, t.Direction, t.SIFactor)
	}
	if len(labTests) > 0 {
		log.Printf("normalize: saved %d lab tests", len(labTests))
	}

	// 6. Apply mapping to loaded entries, save only changed ones.
	var toSave []Entry
	for _, e := range entries {
		if e.ParentID == "" {
			continue
		}
		norm, ok := fullMapping[e.Type]
		if !ok {
			continue
		}
		var data map[string]interface{}
		if json.Unmarshal([]byte(e.Data), &data) != nil {
			data = make(map[string]interface{}) // unparsable payload: rebuild from scratch
		}
		// Skip if already fully normalized (name, LOINC, and both search keys present).
		existingName, _ := data["normalized_name"].(string)
		existingLoinc, _ := data["loinc"].(string)
		needsSearchKey := norm.Loinc != "" && e.SearchKey == ""
		needsSearchKey2 := e.SearchKey2 == ""
		if existingName == norm.Name && (norm.Loinc == "" || existingLoinc == norm.Loinc) &&
			!needsSearchKey && !needsSearchKey2 {
			continue
		}
		data["normalized_name"] = norm.Name
		data["abbreviation"] = norm.Abbr
		if norm.Loinc != "" {
			data["loinc"] = norm.Loinc
		}
		if norm.SIUnit != "" {
			data["si_unit"] = norm.SIUnit
		}
		if norm.SIFactor != 0 && norm.SIFactor != 1.0 {
			data["si_factor"] = norm.SIFactor
		}
		// Marshal of a map round-tripped through json.Unmarshal cannot fail.
		b, _ := json.Marshal(data)
		e.Data = string(b)
		// Update SearchKey with LOINC code, SearchKey2 with canonical test name.
		if norm.Loinc != "" {
			e.SearchKey = norm.Loinc
		}
		e.SearchKey2 = strings.ToLower(norm.Name)
		// Rebuild Summary: "Abbr: value unit".
		unit, _ := data["unit"].(string)
		summary := norm.Abbr + ": " + e.Value
		if unit != "" {
			summary += " " + unit
		}
		e.Summary = summary
		toSave = append(toSave, *e)
	}
	if len(toSave) == 0 {
		log.Printf("normalize: no changes needed")
		return nil
	}
	log.Printf("normalize: updating %d entries", len(toSave))
	ptrs := make([]*Entry, len(toSave))
	for i := range toSave {
		ptrs[i] = &toSave[i]
	}
	return EntryWrite("", ptrs...)
}

// normalizeKey reduces a test name to a heuristic grouping key.
// Groups obvious duplicates: POCT variants, specimen suffixes, case.
func normalizeKey(name string) string {
	s := strings.ToLower(strings.TrimSpace(name))
	s = strings.TrimPrefix(s, "poct ")
	// Strip specimen-type suffixes only (not qualifiers like ", total", ", direct").
	for _, suf := range []string{", whole blood", ", wblood", ", wb", ", wbl", ", blood", ", s/p", " ach"} {
		s = strings.TrimSuffix(s, suf)
	}
	return strings.TrimSpace(s)
}

// normMapping is one canonical-name mapping returned by the LLM for a test name.
type normMapping struct {
	Name      string  `json:"name"`      // canonical display name
	Abbr      string  `json:"abbr"`      // short abbreviation used in Summary
	Loinc     string  `json:"loinc"`     // LOINC code, may be empty if unknown
	SIUnit    string  `json:"si_unit"`   // SI unit, may be empty
	SIFactor  float64 `json:"si_factor"` // conventional→SI multiplier; 0 treated as 1.0
	Direction string  `json:"direction"` // "range", "lower_better", or "higher_better"
}

// callNormalizeLLM asks the LLM to map each input name to a normMapping.
// Returns a map keyed by the exact input names. The response is expected to
// be a bare JSON object, optionally wrapped in a ```json fence.
func callNormalizeLLM(names []string) (map[string]normMapping, error) {
	nameList := strings.Join(names, "\n")
	// NOTE(review): the original raw string's internal line breaks were lost in
	// a formatting mangle; they are reconstructed here at sentence boundaries.
	prompt := fmt.Sprintf(`Normalize these medical test names. Return ONLY a JSON object, no explanation.
Each key is the EXACT input name. Value format: {"name":"Canonical Name","abbr":"Abbreviation","loinc":"LOINC","si_unit":"unit","si_factor":1.0,"direction":"range"}
Key LOINC codes: WBC=6690-2, RBC=789-8, Hemoglobin=718-7, Hematocrit=4544-3, MCV=787-2, MCH=785-6, MCHC=786-4, RDW=788-0, Platelets=777-3, Neutrophils%%=770-8, Lymphocytes%%=736-9, Monocytes%%=5905-5, Eosinophils%%=713-8, Basophils%%=706-2, Glucose=2345-7, BUN=3094-0, Creatinine=2160-0, Sodium=2951-2, Potassium=2823-3, Chloride=2075-0, CO2=2028-9, Calcium=17861-6, Total Protein=2885-2, Albumin=1751-7, Total Bilirubin=1975-2, ALP=6768-6, AST=1920-8, ALT=1742-6.
Abbreviations: WBC, RBC, Hgb, Hct, MCV, MCH, MCHC, RDW, PLT, Neut, Lymph, Mono, Eos, Baso, Glu, BUN, Cr, Na, K, Cl, CO2, Ca, TP, Alb, Bili, ALP, AST, ALT, Mg, Phos, Fe, etc.
si_factor: conventional→SI multiplier (e.g. Hgb g/dL→g/L=10.0). Use 1.0 if same or unknown.
direction: "range" (default), "lower_better" (CRP, LDL, glucose), "higher_better" (HDL).
Test names:
%s`, nameList)
	messages := []map[string]interface{}{
		{"role": "user", "content": prompt},
	}
	resp, err := CallFireworks("accounts/fireworks/models/qwen3-vl-30b-a3b-instruct", messages, 4096)
	if err != nil {
		return nil, err
	}
	// Strip an optional markdown code fence around the JSON payload.
	resp = strings.TrimSpace(resp)
	resp = strings.TrimPrefix(resp, "```json")
	resp = strings.TrimPrefix(resp, "```")
	resp = strings.TrimSuffix(resp, "```")
	resp = strings.TrimSpace(resp)
	var mapping map[string]normMapping
	if err := json.Unmarshal([]byte(resp), &mapping); err != nil {
		return nil, fmt.Errorf("parse response: %w (first 500 chars: %.500s)", err, resp)
	}
	return mapping, nil
}