// inou/lib/normalize.go
package lib
import (
"encoding/json"
"fmt"
"log"
"sort"
"strings"
)
// Normalize normalizes entry names within a dossier for a given category.
// Uses heuristic pre-grouping + LLM to map variant names to canonical forms.
// Updates Summary (display) and Data JSON (normalized_name, abbreviation).
// Original Type field is never modified.
// Silently returns nil if no API key is configured.
// Normalize normalizes entry names within a dossier for a given category.
// Uses heuristic pre-grouping + LLM to map variant names to canonical forms.
// Updates Summary (display) and Data JSON (normalized_name, abbreviation).
// Original Type field is never modified.
// Silently returns nil if no API key is configured.
//
// The optional progress callback (only the first is used; nil is tolerated)
// receives (processed, total) counts in units of LLM group-keys.
func Normalize(dossierID string, category int, progress ...func(processed, total int)) error {
	// reportProgress forwards to the caller's callback, if one was supplied.
	reportProgress := func(p, t int) {
		if len(progress) > 0 && progress[0] != nil {
			progress[0](p, t)
		}
	}
	if FireworksKey == "" {
		SendSignal("normalize: FIREWORKS_API_KEY not configured, skipping normalization")
		return nil
	}
	// 1. Load all entries, collect types only from entries that need normalization
	entries, err := EntryQueryOld(dossierID, category, "")
	if err != nil {
		return fmt.Errorf("load entries: %w", err)
	}
	seen := make(map[string]bool) // Type strings already queued into allNames
	var allNames []string        // unique raw type names still needing normalization
	for _, e := range entries {
		// Skip root entries (no parent), lab orders, and untyped entries.
		if e.ParentID == "" || e.Type == "lab_order" || e.Type == "" {
			continue
		}
		// FIXED(review-2026-02-28): Skip only if FULLY normalized (has both SearchKey2 AND LOINC)
		// Previously skipped on SearchKey2 alone, causing LOINC to never be populated
		if e.SearchKey2 != "" {
			var data map[string]interface{}
			// Unmarshal error deliberately ignored: unparseable Data leaves the
			// map empty, so the entry falls through as "needs normalization".
			json.Unmarshal([]byte(e.Data), &data)
			if loinc, ok := data["loinc"].(string); ok && loinc != "" {
				continue // fully normalized
			}
			// Has SearchKey2 but no LOINC - needs normalization
		}
		if !seen[e.Type] {
			seen[e.Type] = true
			allNames = append(allNames, e.Type)
		}
	}
	if len(allNames) == 0 {
		log.Printf("normalize: all entries already normalized")
		return nil
	}
	// 2. Pre-group by heuristic key (strip POCT, specimen suffixes, normalize case)
	groups := make(map[string][]string) // cleanKey → [original names]
	for _, name := range allNames {
		key := normalizeKey(name)
		groups[key] = append(groups[key], name)
	}
	// Send just the group keys to LLM (sorted for deterministic batch order).
	keys := make([]string, 0, len(groups))
	for k := range groups {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	log.Printf("normalize: %d unique types → %d groups after pre-grouping", len(allNames), len(keys))
	// 3. Call LLM with group keys (batched to stay within token limits)
	mapping := make(map[string]normMapping)
	batchSize := 50
	for i := 0; i < len(keys); i += batchSize {
		end := i + batchSize
		if end > len(keys) {
			end = len(keys)
		}
		batch := keys[i:end]
		// Progress is reported optimistically at batch start (counts this
		// batch as processed before the LLM call completes).
		reportProgress(end, len(keys))
		log.Printf("normalize: LLM batch %d-%d of %d", i+1, end, len(keys))
		batchMap, err := callNormalizeLLM(batch)
		if err != nil {
			// Abort the whole run on any failed batch; the run writes no lab
			// tests or entries until all batches have succeeded (see below).
			SendSignal(fmt.Sprintf("normalize: LLM batch %d-%d failed: %v", i+1, end, err))
			return fmt.Errorf("LLM batch %d-%d: %w", i+1, end, err)
		}
		for k, v := range batchMap {
			mapping[k] = v
		}
	}
	// 4. Expand: each original name in a group gets the group's canonical mapping
	fullMapping := make(map[string]normMapping)
	for key, origNames := range groups {
		if m, ok := mapping[key]; ok {
			for _, orig := range origNames {
				fullMapping[orig] = m
			}
		}
	}
	log.Printf("normalize: LLM mapped %d groups → %d original names covered", len(mapping), len(fullMapping))
	// 5. Save LabTest entries for any new LOINC codes
	seenLoinc := make(map[string]bool)
	var labTests []LabTest
	for _, m := range fullMapping {
		if m.Loinc == "" || seenLoinc[m.Loinc] {
			continue
		}
		seenLoinc[m.Loinc] = true
		// Apply defaults: direction "range", SI conversion factor 1.0.
		dir := m.Direction
		if dir == "" {
			dir = DirRange
		}
		factor := m.SIFactor
		if factor == 0 {
			factor = 1.0
		}
		labTests = append(labTests, LabTest{
			LoincID:   m.Loinc,
			Name:      m.Name,
			SIUnit:    m.SIUnit,
			Direction: dir,
			SIFactor:  ToLabScale(factor),
		})
	}
	for _, t := range labTests {
		// INSERT OR IGNORE: rows for already-known LOINC codes are left as-is.
		// NOTE(review): RefExec's result is discarded — presumably best-effort
		// by design; confirm RefExec logs its own failures.
		RefExec(`INSERT OR IGNORE INTO lab_test (loinc_id, name, si_unit, direction, si_factor) VALUES (?, ?, ?, ?, ?)`,
			t.LoincID, t.Name, t.SIUnit, t.Direction, t.SIFactor)
	}
	if len(labTests) > 0 {
		log.Printf("normalize: saved %d lab tests", len(labTests))
	}
	// 6. Apply mapping to loaded entries, save only changed ones
	// (was mislabeled as a second step "5." — renumbered in review)
	var toSave []Entry
	for _, e := range entries {
		if e.ParentID == "" {
			continue
		}
		// Types filtered out in step 1 (lab_order, empty, already normalized)
		// simply have no mapping entry and are skipped here.
		norm, ok := fullMapping[e.Type]
		if !ok {
			continue
		}
		var data map[string]interface{}
		if json.Unmarshal([]byte(e.Data), &data) != nil {
			data = make(map[string]interface{})
		}
		// Skip if already fully normalized
		existingName, _ := data["normalized_name"].(string)
		existingLoinc, _ := data["loinc"].(string)
		needsSearchKey := (norm.Loinc != "" && e.SearchKey == "")
		needsSearchKey2 := e.SearchKey2 == ""
		if existingName == norm.Name && (norm.Loinc == "" || existingLoinc == norm.Loinc) && !needsSearchKey && !needsSearchKey2 {
			continue
		}
		data["normalized_name"] = norm.Name
		data["abbreviation"] = norm.Abbr
		if norm.Loinc != "" {
			data["loinc"] = norm.Loinc
		}
		if norm.SIUnit != "" {
			data["si_unit"] = norm.SIUnit
		}
		if norm.SIFactor != 0 && norm.SIFactor != 1.0 {
			data["si_factor"] = norm.SIFactor
		}
		// Marshal error discarded: values are strings/floats decoded from
		// JSON, so a marshal failure is not expected here.
		b, _ := json.Marshal(data)
		e.Data = string(b)
		// Update SearchKey with LOINC code, SearchKey2 with canonical test name
		if norm.Loinc != "" {
			e.SearchKey = norm.Loinc
		}
		e.SearchKey2 = strings.ToLower(norm.Name)
		// Rebuild Summary: "Abbr: value unit"
		unit, _ := data["unit"].(string)
		summary := norm.Abbr + ": " + e.Value
		if unit != "" {
			summary += " " + unit
		}
		e.Summary = summary
		toSave = append(toSave, *e)
	}
	if len(toSave) == 0 {
		log.Printf("normalize: no changes needed")
		return nil
	}
	log.Printf("normalize: updating %d entries", len(toSave))
	// EntryWrite takes pointers; point into toSave's stable backing array.
	ptrs := make([]*Entry, len(toSave))
	for i := range toSave {
		ptrs[i] = &toSave[i]
	}
	return EntryWrite("", ptrs...)
}
// normalizeKey reduces a test name to a heuristic grouping key so that
// obvious duplicates — POCT variants, specimen-suffix variants, and case
// differences — collapse into a single group before the LLM call.
func normalizeKey(name string) string {
	key := strings.ToLower(strings.TrimSpace(name))
	key = strings.TrimPrefix(key, "poct ")
	// Strip specimen-type suffixes only; qualifiers such as ", total" or
	// ", direct" are meaningful and must be preserved.
	specimenSuffixes := []string{", whole blood", ", wblood", ", wb", ", wbl", ", blood", ", s/p", " ach"}
	for _, suffix := range specimenSuffixes {
		if strings.HasSuffix(key, suffix) {
			key = key[:len(key)-len(suffix)]
		}
	}
	return strings.TrimSpace(key)
}
// normMapping is the canonical form for one test name as returned by the
// LLM (see callNormalizeLLM); field semantics follow the prompt there.
type normMapping struct {
	Name      string  `json:"name"`      // canonical display name
	Abbr      string  `json:"abbr"`      // short abbreviation used in Summary lines
	Loinc     string  `json:"loinc"`     // LOINC code; empty if unknown
	SIUnit    string  `json:"si_unit"`   // SI unit for the measurement
	SIFactor  float64 `json:"si_factor"` // conventional→SI multiplier (1.0 = same/unknown)
	Direction string  `json:"direction"` // "range", "lower_better", or "higher_better"
}
// callNormalizeLLM asks the LLM to map each raw test name to its canonical
// normMapping. The returned map is keyed by the exact input name; names the
// model omits are simply absent. Returns an error on transport failure or
// when the response is not parseable JSON.
func callNormalizeLLM(names []string) (map[string]normMapping, error) {
	prompt := fmt.Sprintf(`Normalize these medical test names. Return ONLY a JSON object, no explanation.
Each key is the EXACT input name. Value format: {"name":"Canonical Name","abbr":"Abbreviation","loinc":"LOINC","si_unit":"unit","si_factor":1.0,"direction":"range"}
Key LOINC codes: WBC=6690-2, RBC=789-8, Hemoglobin=718-7, Hematocrit=4544-3, MCV=787-2, MCH=785-6, MCHC=786-4, RDW=788-0, Platelets=777-3, Neutrophils%%=770-8, Lymphocytes%%=736-9, Monocytes%%=5905-5, Eosinophils%%=713-8, Basophils%%=706-2, Glucose=2345-7, BUN=3094-0, Creatinine=2160-0, Sodium=2951-2, Potassium=2823-3, Chloride=2075-0, CO2=2028-9, Calcium=17861-6, Total Protein=2885-2, Albumin=1751-7, Total Bilirubin=1975-2, ALP=6768-6, AST=1920-8, ALT=1742-6.
Abbreviations: WBC, RBC, Hgb, Hct, MCV, MCH, MCHC, RDW, PLT, Neut, Lymph, Mono, Eos, Baso, Glu, BUN, Cr, Na, K, Cl, CO2, Ca, TP, Alb, Bili, ALP, AST, ALT, Mg, Phos, Fe, etc.
si_factor: conventional→SI multiplier (e.g. Hgb g/dL→g/L=10.0). Use 1.0 if same or unknown.
direction: "range" (default), "lower_better" (CRP, LDL, glucose), "higher_better" (HDL).
Test names:
%s`, strings.Join(names, "\n"))
	raw, err := CallFireworks(
		"accounts/fireworks/models/qwen3-vl-30b-a3b-instruct",
		[]map[string]interface{}{{"role": "user", "content": prompt}},
		4096,
	)
	if err != nil {
		return nil, err
	}
	// Strip optional markdown code fences wrapping the JSON payload.
	cleaned := strings.TrimSpace(raw)
	cleaned = strings.TrimPrefix(cleaned, "```json")
	cleaned = strings.TrimPrefix(cleaned, "```")
	cleaned = strings.TrimSuffix(cleaned, "```")
	cleaned = strings.TrimSpace(cleaned)
	var result map[string]normMapping
	if err := json.Unmarshal([]byte(cleaned), &result); err != nil {
		return nil, fmt.Errorf("parse response: %w (first 500 chars: %.500s)", err, cleaned)
	}
	return result, nil
}