500 lines
13 KiB
Go
500 lines
13 KiB
Go
package main
|
||
|
||
import (
	"database/sql"
	"encoding/csv"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"os"
	"sort"
	"strings"

	_ "github.com/mattn/go-sqlite3"
	"inou/lib"
)
|
||
|
||
// refDBPath is the on-disk location of the SQLite reference database that
// holds the loinc_lab and loinc_cache tables.
const refDBPath = "/tank/inou/data/reference.db"

// refDB is the process-wide handle to reference.db. main opens it before
// dispatching to a subcommand; cmdImport and loincLookup write through it.
var refDB *sql.DB
|
||
|
||
func main() {
|
||
if len(os.Args) < 2 {
|
||
fmt.Fprintln(os.Stderr, `Usage:
|
||
loinc-lookup import <loinc_lab.csv> Import LOINC lab table into reference.db
|
||
loinc-lookup <name> [specimen] [unit] Look up LOINC code for a lab test
|
||
loinc-lookup batch <file.jsonl> Batch lookup from JSONL (one {"name","specimen","unit"} per line)
|
||
loinc-lookup stats Show cache statistics`)
|
||
os.Exit(1)
|
||
}
|
||
|
||
lib.ConfigInit()
|
||
if err := lib.RefDBInit(refDBPath); err != nil {
|
||
log.Fatalf("RefDBInit: %v", err)
|
||
}
|
||
var err error
|
||
refDB, err = sql.Open("sqlite3", refDBPath)
|
||
if err != nil {
|
||
log.Fatalf("open reference.db: %v", err)
|
||
}
|
||
defer refDB.Close()
|
||
|
||
switch os.Args[1] {
|
||
case "import":
|
||
if len(os.Args) < 3 {
|
||
log.Fatal("Usage: loinc-lookup import <loinc_lab.csv>")
|
||
}
|
||
cmdImport(os.Args[2])
|
||
case "stats":
|
||
cmdStats()
|
||
case "batch":
|
||
if len(os.Args) < 3 {
|
||
log.Fatal("Usage: loinc-lookup batch <file.jsonl>")
|
||
}
|
||
cmdBatch(os.Args[2])
|
||
default:
|
||
name := os.Args[1]
|
||
specimen := ""
|
||
unit := ""
|
||
if len(os.Args) > 2 {
|
||
specimen = os.Args[2]
|
||
}
|
||
if len(os.Args) > 3 {
|
||
unit = os.Args[3]
|
||
}
|
||
cmdLookup(name, specimen, unit)
|
||
}
|
||
}
|
||
|
||
// --- import command ---
|
||
|
||
func cmdImport(csvPath string) {
|
||
// Create tables
|
||
for _, stmt := range []string{
|
||
`CREATE TABLE IF NOT EXISTS loinc_lab (
|
||
loinc_num TEXT PRIMARY KEY,
|
||
component TEXT NOT NULL,
|
||
property TEXT NOT NULL,
|
||
system TEXT NOT NULL,
|
||
scale TEXT NOT NULL,
|
||
method TEXT NOT NULL,
|
||
class TEXT NOT NULL,
|
||
long_name TEXT NOT NULL,
|
||
short_name TEXT NOT NULL
|
||
)`,
|
||
`CREATE TABLE IF NOT EXISTS loinc_cache (
|
||
cache_key TEXT PRIMARY KEY,
|
||
input_name TEXT NOT NULL,
|
||
input_specimen TEXT NOT NULL,
|
||
input_unit TEXT NOT NULL,
|
||
loinc_code TEXT NOT NULL,
|
||
loinc_name TEXT NOT NULL,
|
||
confidence TEXT NOT NULL DEFAULT 'llm'
|
||
)`,
|
||
} {
|
||
if _, err := refDB.Exec(stmt); err != nil {
|
||
log.Fatalf("create table: %v", err)
|
||
}
|
||
}
|
||
|
||
f, err := os.Open(csvPath)
|
||
if err != nil {
|
||
log.Fatalf("open %s: %v", csvPath, err)
|
||
}
|
||
defer f.Close()
|
||
|
||
reader := csv.NewReader(f)
|
||
header, err := reader.Read()
|
||
if err != nil {
|
||
log.Fatalf("read header: %v", err)
|
||
}
|
||
|
||
// Map column names to indices
|
||
colIdx := map[string]int{}
|
||
for i, h := range header {
|
||
colIdx[h] = i
|
||
}
|
||
need := []string{"LOINC_NUM", "COMPONENT", "PROPERTY", "SYSTEM", "SCALE_TYP", "METHOD_TYP", "CLASS", "LONG_COMMON_NAME", "SHORTNAME"}
|
||
for _, n := range need {
|
||
if _, ok := colIdx[n]; !ok {
|
||
log.Fatalf("missing column: %s", n)
|
||
}
|
||
}
|
||
|
||
// Clear and re-import
|
||
refDB.Exec("DELETE FROM loinc_lab")
|
||
|
||
tx, err := refDB.Begin()
|
||
if err != nil {
|
||
log.Fatalf("begin tx: %v", err)
|
||
}
|
||
stmt, err := tx.Prepare(`INSERT INTO loinc_lab (loinc_num, component, property, system, scale, method, class, long_name, short_name)
|
||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`)
|
||
if err != nil {
|
||
log.Fatalf("prepare: %v", err)
|
||
}
|
||
|
||
count := 0
|
||
for {
|
||
row, err := reader.Read()
|
||
if err != nil {
|
||
break
|
||
}
|
||
stmt.Exec(
|
||
row[colIdx["LOINC_NUM"]],
|
||
row[colIdx["COMPONENT"]],
|
||
row[colIdx["PROPERTY"]],
|
||
row[colIdx["SYSTEM"]],
|
||
row[colIdx["SCALE_TYP"]],
|
||
row[colIdx["METHOD_TYP"]],
|
||
row[colIdx["CLASS"]],
|
||
row[colIdx["LONG_COMMON_NAME"]],
|
||
row[colIdx["SHORTNAME"]],
|
||
)
|
||
count++
|
||
}
|
||
stmt.Close()
|
||
if err := tx.Commit(); err != nil {
|
||
log.Fatalf("commit: %v", err)
|
||
}
|
||
log.Printf("Imported %d LOINC lab codes", count)
|
||
}
|
||
|
||
// --- lookup command ---
|
||
|
||
func cmdLookup(name, specimen, unit string) {
|
||
result, err := loincLookup(name, specimen, unit)
|
||
if err != nil {
|
||
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
|
||
os.Exit(1)
|
||
}
|
||
out, _ := json.MarshalIndent(result, "", " ")
|
||
fmt.Println(string(out))
|
||
}
|
||
|
||
// LookupResult is the outcome of resolving one lab test to a LOINC code.
type LookupResult struct {
	// LoincCode is the selected LOINC number.
	LoincCode string `json:"loinc_code"`
	// LoincName is the LOINC long name stored for LoincCode.
	LoincName string `json:"loinc_name"`
	// Source tells where the answer came from: "cache" or "llm".
	Source string `json:"source"`
	// Candidates is the number of candidates offered to the LLM; it is left
	// at zero for cache hits.
	Candidates int `json:"candidates"`
}
|
||
|
||
func loincLookup(name, specimen, unit string) (*LookupResult, error) {
|
||
// 1. Check cache
|
||
cacheKey := strings.ToLower(name + "|" + specimen + "|" + unit)
|
||
var cached []struct {
|
||
LoincCode string `db:"loinc_code"`
|
||
LoincName string `db:"loinc_name"`
|
||
}
|
||
lib.RefQuery("SELECT loinc_code, loinc_name FROM loinc_cache WHERE cache_key = ?", []any{cacheKey}, &cached)
|
||
if len(cached) > 0 {
|
||
return &LookupResult{
|
||
LoincCode: cached[0].LoincCode,
|
||
LoincName: cached[0].LoincName,
|
||
Source: "cache",
|
||
}, nil
|
||
}
|
||
|
||
// 2. Expand input to LOINC terminology via LLM, then search
|
||
// Replace "%" with "percentage" so LLM connects to LOINC's "/100 leukocytes" naming
|
||
lookupUnit := unit
|
||
if lookupUnit == "%" {
|
||
lookupUnit = "percentage"
|
||
}
|
||
tokens := tokenize(name + " " + specimen + " " + lookupUnit)
|
||
if expanded, err := llmExpand(name, specimen, lookupUnit); err == nil {
|
||
tokens = expanded
|
||
}
|
||
candidates, _ := searchCandidates(tokens)
|
||
|
||
// If unit is %, drop candidates that are counts (#/volume, NCnc)
|
||
if unit == "%" {
|
||
var filtered []candidate
|
||
for _, c := range candidates {
|
||
if c.Property == "NCnc" {
|
||
continue
|
||
}
|
||
filtered = append(filtered, c)
|
||
}
|
||
if len(filtered) > 0 {
|
||
candidates = filtered
|
||
}
|
||
}
|
||
|
||
if len(candidates) == 0 {
|
||
return nil, fmt.Errorf("no LOINC candidates found for %q", name)
|
||
}
|
||
|
||
// 4. LLM pick from candidates
|
||
code, lname, err := llmPick(name, specimen, lookupUnit, candidates)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
// 5. Cache
|
||
refDB.Exec(`INSERT OR REPLACE INTO loinc_cache (cache_key, input_name, input_specimen, input_unit, loinc_code, loinc_name, confidence)
|
||
VALUES (?, ?, ?, ?, ?, ?, 'llm')`, cacheKey, name, specimen, unit, code, lname)
|
||
|
||
return &LookupResult{
|
||
LoincCode: code,
|
||
LoincName: lname,
|
||
Source: "llm",
|
||
Candidates: len(candidates),
|
||
}, nil
|
||
}
|
||
|
||
// tokenize lower-cases s, treats , ; ( ) [ ] / - . : as whitespace, and
// returns the whitespace-separated words in order of first appearance.
// Words shorter than two bytes and repeated words are dropped. The result is
// nil when no word qualifies.
func tokenize(s string) []string {
	separators := strings.NewReplacer(
		",", " ", ";", " ",
		"(", " ", ")", " ",
		"[", " ", "]", " ",
		"/", " ", "-", " ",
		".", " ", ":", " ",
	)
	words := strings.Fields(separators.Replace(strings.ToLower(s)))

	var tokens []string
	known := make(map[string]struct{}, len(words))
	for _, w := range words {
		if len(w) < 2 {
			continue
		}
		if _, dup := known[w]; dup {
			continue
		}
		known[w] = struct{}{}
		tokens = append(tokens, w)
	}
	return tokens
}
|
||
|
||
// candidate is one loinc_lab row considered as a possible match for a lab
// test. The db tags name the loinc_lab columns the fields are read from.
type candidate struct {
	// LoincNum is the LOINC number; it identifies the row.
	LoincNum string `db:"loinc_num"`
	// LongName is the row's long_name column.
	LongName string `db:"long_name"`
	// ShortName is the row's short_name column.
	ShortName string `db:"short_name"`
	// System is the row's system column.
	System string `db:"system"`
	// Component is the row's component column.
	Component string `db:"component"`
	// Property is the row's property column, e.g. "NCnc".
	Property string `db:"property"`
}
|
||
|
||
func searchCandidates(tokens []string) ([]candidate, int) {
|
||
if len(tokens) == 0 {
|
||
return nil, 0
|
||
}
|
||
|
||
// Query per token, collect into a map keyed by loinc_num
|
||
type entry struct {
|
||
c candidate
|
||
hits int // number of distinct tokens that matched
|
||
bonus int // extra score for quality of match
|
||
}
|
||
entries := map[string]*entry{}
|
||
|
||
for _, t := range tokens {
|
||
pattern := "%" + t + "%"
|
||
query := "SELECT loinc_num, long_name, short_name, system, component, property FROM loinc_lab WHERE " +
|
||
"LOWER(long_name) LIKE ? OR LOWER(short_name) LIKE ? OR LOWER(component) LIKE ?"
|
||
var results []candidate
|
||
lib.RefQuery(query, []any{pattern, pattern, pattern}, &results)
|
||
for _, c := range results {
|
||
if e, ok := entries[c.LoincNum]; ok {
|
||
e.hits++
|
||
} else {
|
||
entries[c.LoincNum] = &entry{c: c, hits: 1}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Require at least 2 token matches (or 1 if only 1 token)
|
||
minHits := 2
|
||
if len(tokens) <= 1 {
|
||
minHits = 1
|
||
}
|
||
|
||
// Score: hits × 100 + bonus for component exactness (shorter component = more specific)
|
||
type scored struct {
|
||
c candidate
|
||
score int
|
||
}
|
||
var scoredResults []scored
|
||
for _, e := range entries {
|
||
if e.hits < minHits {
|
||
continue
|
||
}
|
||
s := e.hits * 100
|
||
// Bonus: prefer entries where component is a simple term, not a compound like "Carboxyhemoglobin/Hemoglobin.total"
|
||
compLen := len(e.c.Component)
|
||
if compLen > 0 && compLen < 50 {
|
||
s += 50 - compLen // shorter component = higher bonus
|
||
}
|
||
// Bonus: prefer entries without "/" in component (simple analytes)
|
||
if !strings.Contains(e.c.Component, "/") {
|
||
s += 20
|
||
}
|
||
scoredResults = append(scoredResults, scored{e.c, s})
|
||
}
|
||
|
||
// Sort by score descending, take top 30
|
||
for i := range scoredResults {
|
||
for j := i + 1; j < len(scoredResults); j++ {
|
||
if scoredResults[j].score > scoredResults[i].score {
|
||
scoredResults[i], scoredResults[j] = scoredResults[j], scoredResults[i]
|
||
}
|
||
}
|
||
}
|
||
var top []candidate
|
||
maxHits := 0
|
||
for i, s := range scoredResults {
|
||
if i >= 30 {
|
||
break
|
||
}
|
||
top = append(top, s.c)
|
||
hits := s.score / 100 // extract hit count from score
|
||
if hits > maxHits {
|
||
maxHits = hits
|
||
}
|
||
}
|
||
return top, maxHits
|
||
}
|
||
|
||
func llmExpand(name, specimen, unit string) ([]string, error) {
|
||
prompt := fmt.Sprintf(`Given a lab test, return search terms to find it in the LOINC database.
|
||
LOINC uses formal medical terminology (e.g. "Leukocytes" not "White Blood Cells", "Erythrocytes" not "Red Blood Cells", "Oxygen" not "O2" or "pO2").
|
||
|
||
Lab test:
|
||
Name: %s
|
||
Specimen: %s
|
||
Unit: %s
|
||
|
||
Return a JSON object: {"terms": ["term1", "term2", ...]}
|
||
Include: the LOINC component name, specimen system code (e.g. Bld, BldA, BldC, BldV, Ser/Plas, Urine), and any synonyms that might appear in LOINC long names.
|
||
Keep it to 3-6 terms. JSON only.`, name, specimen, unit)
|
||
|
||
resp, err := lib.CallGemini(prompt)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
var result struct {
|
||
Terms []string `json:"terms"`
|
||
}
|
||
if err := json.Unmarshal([]byte(resp), &result); err != nil {
|
||
return nil, fmt.Errorf("parse expand response %q: %w", resp, err)
|
||
}
|
||
|
||
// Lowercase all terms and add original input tokens as fallback
|
||
var terms []string
|
||
seen := map[string]bool{}
|
||
for _, t := range result.Terms {
|
||
t = strings.ToLower(strings.TrimSpace(t))
|
||
if t != "" && !seen[t] {
|
||
terms = append(terms, t)
|
||
seen[t] = true
|
||
}
|
||
}
|
||
// Also include original tokens so we never lose the raw input
|
||
for _, t := range tokenize(name + " " + specimen) {
|
||
if !seen[t] {
|
||
terms = append(terms, t)
|
||
seen[t] = true
|
||
}
|
||
}
|
||
return terms, nil
|
||
}
|
||
|
||
func llmPick(name, specimen, unit string, candidates []candidate) (string, string, error) {
|
||
// Format candidates as a numbered list
|
||
// Replace "/100" and "fraction" with "percentage" so LLM connects them to "%" unit
|
||
var lines []string
|
||
for i, c := range candidates {
|
||
display := c.LongName
|
||
display = strings.ReplaceAll(display, "/100 ", "percentage of ")
|
||
display = strings.ReplaceAll(display, "fraction", "percentage")
|
||
lines = append(lines, fmt.Sprintf("%d. %s — %s [System: %s]", i+1, c.LoincNum, display, c.System))
|
||
}
|
||
|
||
prompt := fmt.Sprintf(`You are a clinical laboratory informatics system. Given a lab test, pick the BEST matching LOINC code from the candidate list.
|
||
|
||
Lab test:
|
||
Name: %s
|
||
Specimen: %s
|
||
Unit: %s
|
||
|
||
Candidates:
|
||
%s
|
||
|
||
Return ONLY a JSON object: {"pick": <number>, "loinc": "<code>", "name": "<long name>"}
|
||
Pick the candidate that best matches the test name, specimen type, and unit. If none match well, pick the closest.
|
||
JSON only, no explanation.`, name, specimen, unit, strings.Join(lines, "\n"))
|
||
|
||
resp, err := lib.CallGemini(prompt)
|
||
if err != nil {
|
||
return "", "", fmt.Errorf("LLM call failed: %w", err)
|
||
}
|
||
|
||
var result struct {
|
||
Pick int `json:"pick"`
|
||
Loinc string `json:"loinc"`
|
||
Name string `json:"name"`
|
||
}
|
||
if err := json.Unmarshal([]byte(resp), &result); err != nil {
|
||
return "", "", fmt.Errorf("parse LLM response %q: %w", resp, err)
|
||
}
|
||
|
||
// Resolve by pick number if loinc field is empty
|
||
if result.Loinc == "" && result.Pick > 0 && result.Pick <= len(candidates) {
|
||
result.Loinc = candidates[result.Pick-1].LoincNum
|
||
result.Name = candidates[result.Pick-1].LongName
|
||
}
|
||
// Verify the code is actually in our candidate list
|
||
for _, c := range candidates {
|
||
if c.LoincNum == result.Loinc {
|
||
return result.Loinc, c.LongName, nil
|
||
}
|
||
}
|
||
// Pick number as fallback
|
||
if result.Pick > 0 && result.Pick <= len(candidates) {
|
||
c := candidates[result.Pick-1]
|
||
return c.LoincNum, c.LongName, nil
|
||
}
|
||
return "", "", fmt.Errorf("LLM returned %q (pick %d) — not in %d candidates", result.Loinc, result.Pick, len(candidates))
|
||
}
|
||
|
||
// --- batch command ---
|
||
|
||
func cmdBatch(path string) {
|
||
data, err := os.ReadFile(path)
|
||
if err != nil {
|
||
log.Fatalf("read %s: %v", path, err)
|
||
}
|
||
for _, line := range strings.Split(string(data), "\n") {
|
||
line = strings.TrimSpace(line)
|
||
if line == "" {
|
||
continue
|
||
}
|
||
var input struct {
|
||
Name string `json:"name"`
|
||
Specimen string `json:"specimen"`
|
||
Unit string `json:"unit"`
|
||
}
|
||
if err := json.Unmarshal([]byte(line), &input); err != nil {
|
||
fmt.Fprintf(os.Stderr, "skip bad line: %s\n", line)
|
||
continue
|
||
}
|
||
result, err := loincLookup(input.Name, input.Specimen, input.Unit)
|
||
if err != nil {
|
||
fmt.Fprintf(os.Stderr, "%s: %v\n", input.Name, err)
|
||
continue
|
||
}
|
||
fmt.Printf("%-40s → %s %s [%s]\n", input.Name, result.LoincCode, result.LoincName, result.Source)
|
||
}
|
||
}
|
||
|
||
// --- stats command ---
|
||
|
||
func cmdStats() {
|
||
var total []struct{ N int `db:"n"` }
|
||
lib.RefQuery("SELECT COUNT(*) as n FROM loinc_lab", nil, &total)
|
||
if len(total) > 0 {
|
||
fmt.Printf("LOINC lab codes: %d\n", total[0].N)
|
||
}
|
||
|
||
var cached []struct{ N int `db:"n"` }
|
||
lib.RefQuery("SELECT COUNT(*) as n FROM loinc_cache", nil, &cached)
|
||
if len(cached) > 0 {
|
||
fmt.Printf("Cached lookups: %d\n", cached[0].N)
|
||
}
|
||
}
|