inou/tools/loinc-lookup/main.go

500 lines
13 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package main
import (
	"database/sql"
	"encoding/csv"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"os"
	"sort"
	"strings"

	_ "github.com/mattn/go-sqlite3"
	"inou/lib"
)
// refDBPath is the SQLite file that main passes to lib.RefDBInit and also
// opens directly as refDB.
const refDBPath = "/tank/inou/data/reference.db"

// refDB is the package-wide handle to reference.db. main opens it; cmdImport
// and loincLookup execute their write statements through it.
var refDB *sql.DB
// main validates the command line, initialises the reference database, and
// dispatches to one of the sub-commands (import, stats, batch) or, for any
// other first argument, to a single lookup.
func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, `Usage:
loinc-lookup import <loinc_lab.csv> Import LOINC lab table into reference.db
loinc-lookup <name> [specimen] [unit] Look up LOINC code for a lab test
loinc-lookup batch <file.jsonl> Batch lookup from JSONL (one {"name","specimen","unit"} per line)
loinc-lookup stats Show cache statistics`)
		os.Exit(1)
	}

	lib.ConfigInit()
	if err := lib.RefDBInit(refDBPath); err != nil {
		log.Fatalf("RefDBInit: %v", err)
	}

	db, err := sql.Open("sqlite3", refDBPath)
	if err != nil {
		log.Fatalf("open reference.db: %v", err)
	}
	refDB = db
	defer refDB.Close()

	// cmd is the first argument; rest holds whatever follows it.
	switch cmd, rest := os.Args[1], os.Args[2:]; cmd {
	case "import":
		if len(rest) == 0 {
			log.Fatal("Usage: loinc-lookup import <loinc_lab.csv>")
		}
		cmdImport(rest[0])

	case "stats":
		cmdStats()

	case "batch":
		if len(rest) == 0 {
			log.Fatal("Usage: loinc-lookup batch <file.jsonl>")
		}
		cmdBatch(rest[0])

	default:
		// Not a sub-command: treat it as "<name> [specimen] [unit]".
		var specimen, unit string
		if len(rest) > 0 {
			specimen = rest[0]
		}
		if len(rest) > 1 {
			unit = rest[1]
		}
		cmdLookup(cmd, specimen, unit)
	}
}
// --- import command ---
// cmdImport creates the loinc_lab and loinc_cache tables if needed and
// replaces the contents of loinc_lab with the rows of the CSV at csvPath.
// Any failure terminates the process via log.Fatal.
func cmdImport(csvPath string) {
	count, err := importLoincCSV(csvPath)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("Imported %d LOINC lab codes", count)
}

// importLoincCSV does the work of cmdImport and returns the number of rows
// inserted. The DELETE and all INSERTs run in one transaction, so a failed
// import rolls back and leaves the previous loinc_lab contents in place.
func importLoincCSV(csvPath string) (int, error) {
	// Create tables
	for _, ddl := range []string{
		`CREATE TABLE IF NOT EXISTS loinc_lab (
loinc_num TEXT PRIMARY KEY,
component TEXT NOT NULL,
property TEXT NOT NULL,
system TEXT NOT NULL,
scale TEXT NOT NULL,
method TEXT NOT NULL,
class TEXT NOT NULL,
long_name TEXT NOT NULL,
short_name TEXT NOT NULL
)`,
		`CREATE TABLE IF NOT EXISTS loinc_cache (
cache_key TEXT PRIMARY KEY,
input_name TEXT NOT NULL,
input_specimen TEXT NOT NULL,
input_unit TEXT NOT NULL,
loinc_code TEXT NOT NULL,
loinc_name TEXT NOT NULL,
confidence TEXT NOT NULL DEFAULT 'llm'
)`,
	} {
		if _, err := refDB.Exec(ddl); err != nil {
			return 0, fmt.Errorf("create table: %w", err)
		}
	}

	f, err := os.Open(csvPath)
	if err != nil {
		return 0, fmt.Errorf("open %s: %w", csvPath, err)
	}
	defer f.Close()

	reader := csv.NewReader(f)
	header, err := reader.Read()
	if err != nil {
		return 0, fmt.Errorf("read header: %w", err)
	}

	// Map column names to indices
	colIdx := make(map[string]int, len(header))
	for i, h := range header {
		colIdx[h] = i
	}

	// Resolve the needed columns once, in INSERT parameter order.
	need := []string{"LOINC_NUM", "COMPONENT", "PROPERTY", "SYSTEM", "SCALE_TYP", "METHOD_TYP", "CLASS", "LONG_COMMON_NAME", "SHORTNAME"}
	cols := make([]int, len(need))
	for i, n := range need {
		idx, ok := colIdx[n]
		if !ok {
			return 0, fmt.Errorf("missing column: %s", n)
		}
		cols[i] = idx
	}

	tx, err := refDB.Begin()
	if err != nil {
		return 0, fmt.Errorf("begin tx: %w", err)
	}
	// After a successful Commit this is a no-op; on every error path it
	// undoes the DELETE and any partial inserts.
	defer tx.Rollback()

	// Clear and re-import
	if _, err := tx.Exec("DELETE FROM loinc_lab"); err != nil {
		return 0, fmt.Errorf("clear loinc_lab: %w", err)
	}

	stmt, err := tx.Prepare(`INSERT INTO loinc_lab (loinc_num, component, property, system, scale, method, class, long_name, short_name)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`)
	if err != nil {
		return 0, fmt.Errorf("prepare: %w", err)
	}
	defer stmt.Close()

	args := make([]any, len(cols))
	count := 0
	for {
		row, err := reader.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			// A parse error is not end-of-file: stop instead of importing a truncated table.
			return 0, fmt.Errorf("read row %d: %w", count+1, err)
		}
		// csv.Reader rejects records whose field count differs from the
		// header's, so every index in cols is in range for row.
		for i, idx := range cols {
			args[i] = row[idx]
		}
		if _, err := stmt.Exec(args...); err != nil {
			return 0, fmt.Errorf("insert %s: %w", row[cols[0]], err)
		}
		count++
	}

	if err := tx.Commit(); err != nil {
		return 0, fmt.Errorf("commit: %w", err)
	}
	return count, nil
}
// --- lookup command ---
// cmdLookup resolves a single lab test via loincLookup and prints the result
// as indented JSON on stdout. On failure it prints the error to stderr and
// exits with status 1.
func cmdLookup(name, specimen, unit string) {
	result, err := loincLookup(name, specimen, unit)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error: %v\n", err)
		os.Exit(1)
	}

	out, err := json.MarshalIndent(result, "", " ")
	if err != nil {
		// Report an encoding failure rather than printing an empty line.
		fmt.Fprintf(os.Stderr, "Error: encode result: %v\n", err)
		os.Exit(1)
	}
	fmt.Println(string(out))
}
// LookupResult is the outcome of one lookup as returned by loincLookup and
// serialised to JSON by cmdLookup.
type LookupResult struct {
	// LoincCode is the selected LOINC number.
	LoincCode string `json:"loinc_code"`
	// LoincName is the long name stored alongside the selected code.
	LoincName string `json:"loinc_name"`
	// Source tells where the answer came from: "cache" or "llm".
	Source string `json:"source"`
	// Candidates is the number of candidates offered to the LLM; it is left
	// at zero for cache hits.
	Candidates int `json:"candidates"`
}
// loincLookup resolves a lab test (name, optional specimen and unit) to a
// LOINC code.
//
// It first consults loinc_cache under a lower-cased "name|specimen|unit" key.
// On a miss it builds search terms (LLM-expanded when llmExpand succeeds, raw
// input tokens otherwise), collects candidates with searchCandidates, lets
// llmPick choose one, and stores the choice in loinc_cache.
//
// It returns an error when no candidates are found or when llmPick fails.
func loincLookup(name, specimen, unit string) (*LookupResult, error) {
	// 1. Check cache
	cacheKey := strings.ToLower(name + "|" + specimen + "|" + unit)
	var cached []struct {
		LoincCode string `db:"loinc_code"`
		LoincName string `db:"loinc_name"`
	}
	lib.RefQuery("SELECT loinc_code, loinc_name FROM loinc_cache WHERE cache_key = ?", []any{cacheKey}, &cached)
	if len(cached) > 0 {
		return &LookupResult{
			LoincCode: cached[0].LoincCode,
			LoincName: cached[0].LoincName,
			Source:    "cache",
		}, nil
	}

	// 2. Expand input to LOINC terminology via LLM.
	// Replace "%" with "percentage" so LLM connects to LOINC's "/100 leukocytes" naming
	lookupUnit := unit
	if lookupUnit == "%" {
		lookupUnit = "percentage"
	}
	tokens := tokenize(name + " " + specimen + " " + lookupUnit)
	// An expansion failure is not fatal: the raw input tokens are used instead.
	if expanded, err := llmExpand(name, specimen, lookupUnit); err == nil {
		tokens = expanded
	}

	// 3. Search the LOINC table for candidates.
	candidates, _ := searchCandidates(tokens)
	// If unit is %, drop candidates that are counts (#/volume, NCnc)
	if unit == "%" {
		var filtered []candidate
		for _, c := range candidates {
			if c.Property == "NCnc" {
				continue
			}
			filtered = append(filtered, c)
		}
		// Keep the unfiltered list when filtering would leave nothing.
		if len(filtered) > 0 {
			candidates = filtered
		}
	}
	if len(candidates) == 0 {
		return nil, fmt.Errorf("no LOINC candidates found for %q", name)
	}

	// 4. LLM pick from candidates
	code, lname, err := llmPick(name, specimen, lookupUnit, candidates)
	if err != nil {
		return nil, err
	}

	// 5. Cache. A failed write does not invalidate the answer, but it is
	// logged: otherwise every later lookup would silently hit the LLM again.
	if _, err := refDB.Exec(`INSERT OR REPLACE INTO loinc_cache (cache_key, input_name, input_specimen, input_unit, loinc_code, loinc_name, confidence)
VALUES (?, ?, ?, ?, ?, ?, 'llm')`, cacheKey, name, specimen, unit, code, lname); err != nil {
		log.Printf("loinc_cache write for %q: %v", cacheKey, err)
	}

	return &LookupResult{
		LoincCode:  code,
		LoincName:  lname,
		Source:     "llm",
		Candidates: len(candidates),
	}, nil
}
// tokenize lower-cases s, treats the punctuation , ; ( ) [ ] / - . : as
// whitespace, and returns the distinct whitespace-separated words that are at
// least two bytes long, in order of first appearance. The result is nil when
// no word qualifies.
func tokenize(s string) []string {
	separators := strings.NewReplacer(
		",", " ", ";", " ",
		"(", " ", ")", " ",
		"[", " ", "]", " ",
		"/", " ", "-", " ",
		".", " ", ":", " ",
	)
	words := strings.Fields(separators.Replace(strings.ToLower(s)))

	var tokens []string
	seen := make(map[string]bool, len(words))
	for _, w := range words {
		if len(w) >= 2 && !seen[w] {
			seen[w] = true
			tokens = append(tokens, w)
		}
	}
	return tokens
}
// candidate is one loinc_lab row as selected by searchCandidates; the db tags
// name the columns each field is filled from.
type candidate struct {
	LoincNum  string `db:"loinc_num"`  // LOINC code
	LongName  string `db:"long_name"`  // long name, shown to the LLM in llmPick
	ShortName string `db:"short_name"` // short name
	System    string `db:"system"`     // system column, shown to the LLM in llmPick
	Component string `db:"component"`  // component, used for scoring bonuses
	Property  string `db:"property"`   // property, e.g. "NCnc"
}
// searchCandidates finds loinc_lab rows matching the given search tokens and
// returns up to 30 of them, best first, together with the highest number of
// tokens matched by any returned row.
//
// Each token is matched as a substring against long_name, short_name and
// component. A row must match at least two tokens (one when only one token is
// given). Rows are scored hits*100 plus small bonuses for short components and
// components without "/". Rows with equal scores are ordered by loinc_num so
// the result is deterministic.
//
// It returns (nil, 0) when tokens is empty or nothing qualifies.
func searchCandidates(tokens []string) ([]candidate, int) {
	if len(tokens) == 0 {
		return nil, 0
	}

	const query = "SELECT loinc_num, long_name, short_name, system, component, property FROM loinc_lab WHERE " +
		"LOWER(long_name) LIKE ? OR LOWER(short_name) LIKE ? OR LOWER(component) LIKE ?"

	// Query per token, collect into a map keyed by loinc_num
	type entry struct {
		c    candidate
		hits int // number of tokens that matched
	}
	entries := map[string]*entry{}
	for _, t := range tokens {
		pattern := "%" + t + "%"
		var results []candidate
		// NOTE(review): whatever lib.RefQuery reports is not inspected here, so
		// a failed query is indistinguishable from "no rows" — confirm intent.
		lib.RefQuery(query, []any{pattern, pattern, pattern}, &results)
		for _, c := range results {
			if e, ok := entries[c.LoincNum]; ok {
				e.hits++
			} else {
				entries[c.LoincNum] = &entry{c: c, hits: 1}
			}
		}
	}

	// Require at least 2 token matches (or 1 if only 1 token)
	minHits := 2
	if len(tokens) <= 1 {
		minHits = 1
	}

	// Score: hits × 100 + bonus for component exactness (shorter component = more specific)
	type scored struct {
		c     candidate
		hits  int
		score int
	}
	ranked := make([]scored, 0, len(entries))
	for _, e := range entries {
		if e.hits < minHits {
			continue
		}
		s := e.hits * 100
		// Bonus: prefer entries where component is a simple term, not a compound like "Carboxyhemoglobin/Hemoglobin.total"
		if n := len(e.c.Component); n > 0 && n < 50 {
			s += 50 - n // shorter component = higher bonus
		}
		// Bonus: prefer entries without "/" in component (simple analytes)
		if !strings.Contains(e.c.Component, "/") {
			s += 20
		}
		ranked = append(ranked, scored{c: e.c, hits: e.hits, score: s})
	}
	if len(ranked) == 0 {
		return nil, 0
	}

	// Sort by score descending; break ties on loinc_num so that the ranking
	// (and therefore the cut-off below) does not depend on map iteration order.
	sort.Slice(ranked, func(i, j int) bool {
		if ranked[i].score != ranked[j].score {
			return ranked[i].score > ranked[j].score
		}
		return ranked[i].c.LoincNum < ranked[j].c.LoincNum
	})

	// Take top 30
	const maxCandidates = 30
	if len(ranked) > maxCandidates {
		ranked = ranked[:maxCandidates]
	}

	top := make([]candidate, 0, len(ranked))
	maxHits := 0
	for _, r := range ranked {
		top = append(top, r.c)
		if r.hits > maxHits {
			maxHits = r.hits
		}
	}
	return top, maxHits
}
// llmExpand asks the LLM (via lib.CallGemini) for LOINC-style search terms for
// the given lab test and returns them lower-cased, trimmed and de-duplicated,
// followed by any tokens of the raw name and specimen not already present.
// It returns an error when the LLM call fails or its reply is not the expected
// JSON object.
func llmExpand(name, specimen, unit string) ([]string, error) {
	const promptTemplate = `Given a lab test, return search terms to find it in the LOINC database.
LOINC uses formal medical terminology (e.g. "Leukocytes" not "White Blood Cells", "Erythrocytes" not "Red Blood Cells", "Oxygen" not "O2" or "pO2").
Lab test:
Name: %s
Specimen: %s
Unit: %s
Return a JSON object: {"terms": ["term1", "term2", ...]}
Include: the LOINC component name, specimen system code (e.g. Bld, BldA, BldC, BldV, Ser/Plas, Urine), and any synonyms that might appear in LOINC long names.
Keep it to 3-6 terms. JSON only.`

	resp, err := lib.CallGemini(fmt.Sprintf(promptTemplate, name, specimen, unit))
	if err != nil {
		return nil, err
	}

	var parsed struct {
		Terms []string `json:"terms"`
	}
	if err := json.Unmarshal([]byte(resp), &parsed); err != nil {
		return nil, fmt.Errorf("parse expand response %q: %w", resp, err)
	}

	var terms []string
	seen := map[string]bool{}
	// add appends term unless it is empty or already collected.
	add := func(term string) {
		if term == "" || seen[term] {
			return
		}
		seen[term] = true
		terms = append(terms, term)
	}

	// LLM-provided terms first, normalised to lower case.
	for _, term := range parsed.Terms {
		add(strings.ToLower(strings.TrimSpace(term)))
	}
	// Then the raw input tokens, so the original wording is never lost.
	for _, tok := range tokenize(name + " " + specimen) {
		add(tok)
	}
	return terms, nil
}
// llmPick shows the LLM (via lib.CallGemini) a numbered list of candidates and
// returns the LOINC code and long name of the one it selects.
//
// The reply's "loinc" field is trusted first, provided it names a candidate;
// otherwise the 1-based "pick" number is used. An error is returned when the
// LLM call fails, the reply is not valid JSON, or neither field identifies a
// candidate.
func llmPick(name, specimen, unit string, candidates []candidate) (string, string, error) {
	// Numbered candidate list. "/100" and "fraction" are shown as "percentage"
	// so the LLM can relate them to a "%" unit.
	listing := make([]string, 0, len(candidates))
	for idx, cand := range candidates {
		shown := strings.ReplaceAll(cand.LongName, "/100 ", "percentage of ")
		shown = strings.ReplaceAll(shown, "fraction", "percentage")
		listing = append(listing, fmt.Sprintf("%d. %s — %s [System: %s]", idx+1, cand.LoincNum, shown, cand.System))
	}

	const promptTemplate = `You are a clinical laboratory informatics system. Given a lab test, pick the BEST matching LOINC code from the candidate list.
Lab test:
Name: %s
Specimen: %s
Unit: %s
Candidates:
%s
Return ONLY a JSON object: {"pick": <number>, "loinc": "<code>", "name": "<long name>"}
Pick the candidate that best matches the test name, specimen type, and unit. If none match well, pick the closest.
JSON only, no explanation.`

	resp, err := lib.CallGemini(fmt.Sprintf(promptTemplate, name, specimen, unit, strings.Join(listing, "\n")))
	if err != nil {
		return "", "", fmt.Errorf("LLM call failed: %w", err)
	}

	var answer struct {
		Pick  int    `json:"pick"`
		Loinc string `json:"loinc"`
		Name  string `json:"name"`
	}
	if err := json.Unmarshal([]byte(resp), &answer); err != nil {
		return "", "", fmt.Errorf("parse LLM response %q: %w", resp, err)
	}

	pickInRange := answer.Pick > 0 && answer.Pick <= len(candidates)

	// When the reply carries no code, derive it from the pick number.
	wanted := answer.Loinc
	if wanted == "" && pickInRange {
		wanted = candidates[answer.Pick-1].LoincNum
	}

	// Accept the code only if it is actually one of the candidates.
	for _, cand := range candidates {
		if cand.LoincNum == wanted {
			return wanted, cand.LongName, nil
		}
	}

	// The code was not a candidate: fall back to the pick number.
	if pickInRange {
		chosen := candidates[answer.Pick-1]
		return chosen.LoincNum, chosen.LongName, nil
	}

	return "", "", fmt.Errorf("LLM returned %q (pick %d) — not in %d candidates", answer.Loinc, answer.Pick, len(candidates))
}
// --- batch command ---
// cmdBatch reads the JSONL file at path and runs loincLookup for every
// non-blank line. Successful lookups are printed to stdout; unparsable lines
// and failed lookups are reported on stderr and skipped. An unreadable file is
// fatal.
func cmdBatch(path string) {
	raw, err := os.ReadFile(path)
	if err != nil {
		log.Fatalf("read %s: %v", path, err)
	}

	// request is the shape of one JSONL record.
	type request struct {
		Name     string `json:"name"`
		Specimen string `json:"specimen"`
		Unit     string `json:"unit"`
	}

	for _, rec := range strings.Split(string(raw), "\n") {
		rec = strings.TrimSpace(rec)
		if rec == "" {
			continue
		}

		// Fresh value per line so fields never carry over from the previous record.
		var req request
		if err := json.Unmarshal([]byte(rec), &req); err != nil {
			fmt.Fprintf(os.Stderr, "skip bad line: %s\n", rec)
			continue
		}

		res, err := loincLookup(req.Name, req.Specimen, req.Unit)
		if err != nil {
			fmt.Fprintf(os.Stderr, "%s: %v\n", req.Name, err)
			continue
		}
		fmt.Printf("%-40s → %s %s [%s]\n", req.Name, res.LoincCode, res.LoincName, res.Source)
	}
}
// --- stats command ---
// cmdStats prints the row counts of loinc_lab and loinc_cache. A count is
// printed only when its query yields a row.
func cmdStats() {
	// rowCount runs a "SELECT COUNT(*) as n ..." query and reports whether a
	// row came back, along with its n value.
	rowCount := func(query string) (int, bool) {
		var rows []struct {
			N int `db:"n"`
		}
		lib.RefQuery(query, nil, &rows)
		if len(rows) == 0 {
			return 0, false
		}
		return rows[0].N, true
	}

	if n, ok := rowCount("SELECT COUNT(*) as n FROM loinc_lab"); ok {
		fmt.Printf("LOINC lab codes: %d\n", n)
	}
	if n, ok := rowCount("SELECT COUNT(*) as n FROM loinc_cache"); ok {
		fmt.Printf("Cached lookups: %d\n", n)
	}
}