inou/import-genome/main.go

576 lines
14 KiB
Go

package main
import (
"bufio"
"bytes"
"database/sql"
"encoding/json"
"flag"
"fmt"
"os"
"sort"
"strings"
"time"
_ "github.com/mattn/go-sqlite3"
"inou/lib"
)
const version = "5.0.0"
type Variant struct {
RSID string
Genotype string
}
type SNPediaMatch struct {
RSID string
Genotype string
Gene string
Magnitude float64
Repute string
Summary string
Category string
Subcategory string
}
type CategoryCount struct {
Shown int `json:"shown"`
Hidden int `json:"hidden"`
}
func usage() {
fmt.Println(`import-genome - Import genetic data with SNPedia enrichment
USAGE:
import-genome <plain-file> <dossier-id>
SUPPORTED FORMATS:
AncestryDNA Tab-delimited, 5 columns (alleles split)
23andMe Tab-delimited, 4 columns (alleles combined)
MyHeritage CSV with quotes, 4 columns
FTDNA CSV clean, 4 columns
FORMAT AUTO-DETECTION:
The tool automatically detects the format from the file structure.
EXAMPLE:
import-genome /path/to/dna.txt 3b38234f2b0f7ee6
DATABASE:
SNPedia reference: /tank/inou/data/reference.db (genotypes table, read-only)
Entries: via lib.EntryAddBatchValues() to /tank/inou/data/inou.db
VERSION: ` + version)
}
func detectFormat(firstLine string) string {
if strings.Contains(firstLine, "\"") {
return "myheritage"
}
if strings.Contains(firstLine, "\t") {
parts := strings.Split(firstLine, "\t")
if len(parts) >= 5 {
return "ancestry"
}
return "23andme"
}
return "ftdna"
}
func complement(b byte) byte {
switch b {
case 'A':
return 'T'
case 'T':
return 'A'
case 'C':
return 'G'
case 'G':
return 'C'
}
return b
}
func normalizeGenotype(genotype, alleles string) string {
if len(genotype) != 2 || alleles == "" {
if len(genotype) == 2 && genotype[0] > genotype[1] {
return string(genotype[1]) + string(genotype[0])
}
return genotype
}
valid := make(map[byte]bool)
for _, a := range strings.Split(alleles, "/") {
if len(a) == 1 {
valid[a[0]] = true
}
}
var result [2]byte
for i := 0; i < 2; i++ {
b := genotype[i]
if valid[b] {
result[i] = b
} else {
result[i] = complement(b)
}
}
if result[0] > result[1] {
result[0], result[1] = result[1], result[0]
}
return string(result[0]) + string(result[1])
}
func parseVariant(line, format string) (string, string, bool) {
if strings.HasPrefix(line, "#") || strings.HasPrefix(line, "rsid") || strings.HasPrefix(line, "RSID") || (strings.HasPrefix(line, "\"") && strings.Contains(line, "RSID")) {
return "", "", false
}
var parts []string
var rsid, genotype string
switch format {
case "ancestry":
parts = strings.Split(line, "\t")
if len(parts) < 5 {
return "", "", false
}
rsid = parts[0]
allele1, allele2 := parts[3], parts[4]
if allele1 == "0" || allele2 == "0" {
return "", "", false
}
genotype = allele1 + allele2
case "23andme":
parts = strings.Split(line, "\t")
if len(parts) < 4 {
return "", "", false
}
rsid = parts[0]
genotype = parts[3]
if genotype == "--" {
return "", "", false
}
case "myheritage":
line = strings.ReplaceAll(line, "\"", "")
parts = strings.Split(line, ",")
if len(parts) < 4 {
return "", "", false
}
rsid = parts[0]
genotype = parts[3]
case "ftdna":
parts = strings.Split(line, ",")
if len(parts) < 4 {
return "", "", false
}
rsid = parts[0]
genotype = parts[3]
}
if !strings.HasPrefix(rsid, "rs") {
return "", "", false
}
if len(genotype) == 2 && genotype[0] > genotype[1] {
genotype = string(genotype[1]) + string(genotype[0])
}
return rsid, genotype, true
}
// shouldShow returns true if variant should be shown by default (not hidden)
func shouldShow(mag float64, repute string) bool {
if mag > 4.0 {
return false
}
if strings.EqualFold(repute, "bad") {
return false
}
return true
}
func main() {
help := flag.Bool("help", false, "Show help")
flag.BoolVar(help, "h", false, "Show help")
flag.Usage = usage
flag.Parse()
if *help {
usage()
os.Exit(0)
}
args := flag.Args()
if len(args) < 2 {
usage()
os.Exit(1)
}
filePath := args[0]
dossierID := args[1]
totalStart := time.Now()
// ===== PHASE 1: Read file =====
phase1Start := time.Now()
data, err := os.ReadFile(filePath)
if err != nil {
fmt.Println("Read failed:", err)
os.Exit(1)
}
fmt.Printf("Phase 1 - Read: %v (%d bytes)\n", time.Since(phase1Start), len(data))
// ===== PHASE 2: Parse variants =====
phase2Start := time.Now()
scanner := bufio.NewScanner(bytes.NewReader(data))
scanner.Buffer(make([]byte, 1024*1024), 1024*1024)
var format string
var firstDataLine string
for scanner.Scan() {
line := scanner.Text()
if !strings.HasPrefix(line, "#") && len(line) > 0 {
firstDataLine = line
break
}
}
format = detectFormat(firstDataLine)
fmt.Printf("Detected format: %s\n", format)
variants := make([]Variant, 0, 800000)
if rsid, geno, ok := parseVariant(firstDataLine, format); ok {
variants = append(variants, Variant{rsid, geno})
}
for scanner.Scan() {
if rsid, geno, ok := parseVariant(scanner.Text(), format); ok {
variants = append(variants, Variant{rsid, geno})
}
}
fmt.Printf("Phase 2 - Parse: %v (%d variants)\n", time.Since(phase2Start), len(variants))
// ===== PHASE 3: Sort by rsid =====
phase3Start := time.Now()
sort.Slice(variants, func(i, j int) bool {
return variants[i].RSID < variants[j].RSID
})
fmt.Printf("Phase 3 - Sort: %v\n", time.Since(phase3Start))
// ===== PHASE 4: Load SNPedia and match =====
phase4Start := time.Now()
snpediaDB, err := sql.Open("sqlite3", "/tank/inou/data/reference.db?mode=ro")
if err != nil {
fmt.Println("SNPedia DB open failed:", err)
os.Exit(1)
}
defer snpediaDB.Close()
// Load alleles for normalization
snpediaAlleles := make(map[string]string, 15000)
rows, err := snpediaDB.Query("SELECT DISTINCT rsid, alleles FROM genotypes")
if err != nil {
fmt.Println("SNPedia alleles query failed:", err)
os.Exit(1)
}
for rows.Next() {
var rsid, alleles string
rows.Scan(&rsid, &alleles)
snpediaAlleles[rsid] = alleles
}
rows.Close()
// Match variants with SNPedia genotypes
matched := make([]SNPediaMatch, 0, 2000)
matchedRsids := make(map[string]bool) // track which rsids had positive matches
for _, v := range variants {
alleles, ok := snpediaAlleles[v.RSID]
if !ok {
continue
}
normalized := normalizeGenotype(v.Genotype, alleles)
// Query for this specific rsid+genotype
rows, err := snpediaDB.Query(`
SELECT gene, magnitude, repute, summary, category, subcategory
FROM genotypes
WHERE rsid = ? AND genotype_norm = ?`,
v.RSID, normalized)
if err != nil {
continue
}
for rows.Next() {
var gene, repute, summary, category, subcategory sql.NullString
var magnitude float64
rows.Scan(&gene, &magnitude, &repute, &summary, &category, &subcategory)
if category.String == "" {
continue
}
matchedRsids[v.RSID] = true
matched = append(matched, SNPediaMatch{
RSID: v.RSID,
Genotype: normalized,
Gene: gene.String,
Magnitude: magnitude,
Repute: repute.String,
Summary: summary.String,
Category: category.String,
Subcategory: subcategory.String,
})
}
rows.Close()
}
positiveMatches := len(matched)
// Find "clear" findings: rsids in SNPedia where user's genotype doesn't match any risk variant
clearFindings := 0
for _, v := range variants {
if matchedRsids[v.RSID] {
continue // already has positive matches
}
alleles, ok := snpediaAlleles[v.RSID]
if !ok {
continue // not in SNPedia
}
normalized := normalizeGenotype(v.Genotype, alleles)
// Get what SNPedia DOES have for this rsid (the risk variants user doesn't have)
rows, err := snpediaDB.Query(`
SELECT gene, genotype_norm, magnitude, repute, summary, category, subcategory
FROM genotypes
WHERE rsid = ?
ORDER BY magnitude DESC`,
v.RSID)
if err != nil {
continue
}
// Collect risk variants to build the "clear" message
type riskInfo struct {
genotype string
mag float64
summary string
}
var risks []riskInfo
var gene, topCategory, topSubcategory string
var topMag float64
for rows.Next() {
var g, geno, rep, sum, cat, sub sql.NullString
var mag float64
rows.Scan(&g, &geno, &mag, &rep, &sum, &cat, &sub)
if cat.String == "" {
continue
}
// Track highest magnitude category for this clear finding
if mag > topMag || topCategory == "" {
topMag = mag
topCategory = cat.String
topSubcategory = sub.String
gene = g.String
}
// Collect unique risk genotypes
found := false
for _, r := range risks {
if r.genotype == geno.String {
found = true
break
}
}
if !found && len(risks) < 3 {
risks = append(risks, riskInfo{geno.String, mag, sum.String})
}
}
rows.Close()
if len(risks) == 0 || topCategory == "" {
continue
}
// Build the "clear" summary
var riskDescs []string
for _, r := range risks {
desc := r.genotype
if r.summary != "" {
// Truncate summary
s := r.summary
if len(s) > 40 {
s = s[:40] + "..."
}
desc += ": " + s
}
riskDescs = append(riskDescs, desc)
}
clearSummary := fmt.Sprintf("No risk variant detected. You have %s. (Documented risks: %s)",
normalized, strings.Join(riskDescs, "; "))
clearFindings++
matched = append(matched, SNPediaMatch{
RSID: v.RSID,
Genotype: normalized,
Gene: gene,
Magnitude: 0,
Repute: "Clear",
Summary: clearSummary,
Category: topCategory,
Subcategory: topSubcategory,
})
}
fmt.Printf("Phase 4 - Load SNPedia & match: %v (%d positive, %d clear)\n", time.Since(phase4Start), positiveMatches, clearFindings)
// ===== PHASE 5: Group by category and calculate counts =====
phase5Start := time.Now()
byCategory := make(map[string][]SNPediaMatch)
for _, m := range matched {
byCategory[m.Category] = append(byCategory[m.Category], m)
}
// Calculate counts per category
counts := make(map[string]CategoryCount)
for cat, variants := range byCategory {
c := CategoryCount{}
for _, v := range variants {
if shouldShow(v.Magnitude, v.Repute) {
c.Shown++
} else {
c.Hidden++
}
}
counts[cat] = c
}
fmt.Printf("Phase 5 - Group & count: %v (%d categories)\n", time.Since(phase5Start), len(byCategory))
// ===== PHASE 6: Initialize lib and delete existing =====
phase6Start := time.Now()
if err := lib.Init(); err != nil {
fmt.Println("lib.Init failed:", err)
os.Exit(1)
}
if err := lib.EntryDelete("", dossierID, &lib.Filter{Category: lib.CategoryGenome}); err != nil {
fmt.Println("Delete existing failed:", err)
os.Exit(1)
}
fmt.Printf("Phase 6 - Init & delete existing: %v\n", time.Since(phase6Start))
// ===== PHASE 7: Build entries =====
phase7Start := time.Now()
now := time.Now().Unix()
// Extraction entry with counts
extractionID := lib.NewID()
extractionData := struct {
Source string `json:"source"`
Total int `json:"total"`
Matched int `json:"matched"`
Positive int `json:"positive"`
Clear int `json:"clear"`
Counts map[string]CategoryCount `json:"counts"`
}{
Source: format,
Total: len(variants),
Matched: len(matched),
Positive: positiveMatches,
Clear: clearFindings,
Counts: counts,
}
extractionJSON, _ := json.Marshal(extractionData)
entries := make([]*lib.Entry, 0, len(matched)+len(byCategory)+1)
entries = append(entries, &lib.Entry{
EntryID: extractionID,
DossierID: dossierID,
Category: lib.CategoryGenome,
Type: "extraction",
Value: format,
Timestamp: now,
Data: string(extractionJSON),
})
// Tier entries (one per category, category = GenomeTier for ordering)
tierIDs := make(map[string]string)
for cat := range byCategory {
tierID := lib.NewID()
tierIDs[cat] = tierID
c := counts[cat]
tierData, _ := json.Marshal(c)
entries = append(entries, &lib.Entry{
EntryID: tierID,
DossierID: dossierID,
ParentID: extractionID,
Category: lib.CategoryGenome,
Type: "tier",
Value: cat,
Ordinal: lib.GenomeTierFromString[cat],
Timestamp: now,
Data: string(tierData),
})
}
// Variant entries (under their category tier)
for cat, variants := range byCategory {
tierID := tierIDs[cat]
for i, v := range variants {
variantData := struct {
Mag float64 `json:"mag,omitempty"`
Rep string `json:"rep,omitempty"`
Sum string `json:"sum,omitempty"`
Sub string `json:"sub,omitempty"`
}{
Mag: v.Magnitude,
Rep: v.Repute,
Sum: v.Summary,
Sub: v.Subcategory,
}
dataJSON, _ := json.Marshal(variantData)
entries = append(entries, &lib.Entry{
EntryID: lib.NewID(),
DossierID: dossierID,
ParentID: tierID,
Category: lib.CategoryGenome,
Type: v.RSID,
Value: v.Genotype,
Tags: v.Gene,
SearchKey: cat,
Ordinal: i + 1,
Timestamp: now,
Data: string(dataJSON),
})
}
}
fmt.Printf("Phase 7 - Build entries: %v (%d entries)\n", time.Since(phase7Start), len(entries))
// ===== PHASE 8: Save to database =====
phase8Start := time.Now()
importID := lib.NextImportID()
for _, e := range entries {
e.Import = importID
}
if err := lib.EntryWrite("", entries...); err != nil {
fmt.Println("EntryWrite failed:", err)
os.Exit(1)
}
fmt.Printf("Phase 8 - Save: %v (%d entries saved)\n", time.Since(phase8Start), len(entries))
fmt.Printf("\nTOTAL: %v\n", time.Since(totalStart))
fmt.Printf("Extraction ID: %s\n", extractionID)
fmt.Printf("Categories: %d\n", len(byCategory))
for cat, c := range counts {
fmt.Printf(" %s: %d shown, %d hidden\n", cat, c.Shown, c.Hidden)
}
}