package main import ( "bufio" "bytes" "database/sql" "encoding/json" "flag" "fmt" "os" "sort" "strings" "time" _ "github.com/mattn/go-sqlite3" "inou/lib" ) const version = "5.0.0" type Variant struct { RSID string Genotype string } type SNPediaMatch struct { RSID string Genotype string Gene string Magnitude float64 Repute string Summary string Category string Subcategory string } type CategoryCount struct { Shown int `json:"shown"` Hidden int `json:"hidden"` } func usage() { fmt.Println(`import-genome - Import genetic data with SNPedia enrichment USAGE: import-genome SUPPORTED FORMATS: AncestryDNA Tab-delimited, 5 columns (alleles split) 23andMe Tab-delimited, 4 columns (alleles combined) MyHeritage CSV with quotes, 4 columns FTDNA CSV clean, 4 columns FORMAT AUTO-DETECTION: The tool automatically detects the format from the file structure. EXAMPLE: import-genome /path/to/dna.txt 3b38234f2b0f7ee6 DATABASE: SNPedia reference: ~/dev/inou/snpedia-genotypes/genotypes.db (read-only) Entries: via lib.Save() to /tank/inou/data/inou.db VERSION: ` + version) } func detectFormat(firstLine string) string { if strings.Contains(firstLine, "\"") { return "myheritage" } if strings.Contains(firstLine, "\t") { parts := strings.Split(firstLine, "\t") if len(parts) >= 5 { return "ancestry" } return "23andme" } return "ftdna" } func complement(b byte) byte { switch b { case 'A': return 'T' case 'T': return 'A' case 'C': return 'G' case 'G': return 'C' } return b } func normalizeGenotype(genotype, alleles string) string { if len(genotype) != 2 || alleles == "" { if len(genotype) == 2 && genotype[0] > genotype[1] { return string(genotype[1]) + string(genotype[0]) } return genotype } valid := make(map[byte]bool) for _, a := range strings.Split(alleles, "/") { if len(a) == 1 { valid[a[0]] = true } } var result [2]byte for i := 0; i < 2; i++ { b := genotype[i] if valid[b] { result[i] = b } else { result[i] = complement(b) } } if result[0] > result[1] { result[0], result[1] = result[1], result[0] } return string(result[0]) + string(result[1]) } func parseVariant(line, format string) (string, string, bool) { if strings.HasPrefix(line, "#") || strings.HasPrefix(line, "rsid") || strings.HasPrefix(line, "RSID") || (strings.HasPrefix(line, "\"") && strings.Contains(line, "RSID")) { return "", "", false } var parts []string var rsid, genotype string switch format { case "ancestry": parts = strings.Split(line, "\t") if len(parts) < 5 { return "", "", false } rsid = parts[0] allele1, allele2 := parts[3], parts[4] if allele1 == "0" || allele2 == "0" { return "", "", false } genotype = allele1 + allele2 case "23andme": parts = strings.Split(line, "\t") if len(parts) < 4 { return "", "", false } rsid = parts[0] genotype = parts[3] if genotype == "--" { return "", "", false } case "myheritage": line = strings.ReplaceAll(line, "\"", "") parts = strings.Split(line, ",") if len(parts) < 4 { return "", "", false } rsid = parts[0] genotype = parts[3] case "ftdna": parts = strings.Split(line, ",") if len(parts) < 4 { return "", "", false } rsid = parts[0] genotype = parts[3] } if !strings.HasPrefix(rsid, "rs") { return "", "", false } if len(genotype) == 2 && genotype[0] > genotype[1] { genotype = string(genotype[1]) + string(genotype[0]) } return rsid, genotype, true } // shouldShow returns true if variant should be shown by default (not hidden) func shouldShow(mag float64, repute string) bool { if mag > 4.0 { return false } if strings.EqualFold(repute, "bad") { return false } return true } func main() { help := flag.Bool("help", false, "Show help") flag.BoolVar(help, "h", false, "Show help") flag.Usage = usage flag.Parse() if *help { usage() os.Exit(0) } args := flag.Args() if len(args) < 2 { usage() os.Exit(1) } filePath := args[0] dossierID := args[1] totalStart := time.Now() // ===== PHASE 1: Read file ===== phase1Start := time.Now() data, err := os.ReadFile(filePath) if err != nil { fmt.Println("Read failed:", err) os.Exit(1) } fmt.Printf("Phase 1 - Read: %v (%d bytes)\n", time.Since(phase1Start), len(data)) // ===== PHASE 2: Parse variants ===== phase2Start := time.Now() scanner := bufio.NewScanner(bytes.NewReader(data)) scanner.Buffer(make([]byte, 1024*1024), 1024*1024) var format string var firstDataLine string for scanner.Scan() { line := scanner.Text() if !strings.HasPrefix(line, "#") && len(line) > 0 { firstDataLine = line break } } format = detectFormat(firstDataLine) fmt.Printf("Detected format: %s\n", format) variants := make([]Variant, 0, 800000) if rsid, geno, ok := parseVariant(firstDataLine, format); ok { variants = append(variants, Variant{rsid, geno}) } for scanner.Scan() { if rsid, geno, ok := parseVariant(scanner.Text(), format); ok { variants = append(variants, Variant{rsid, geno}) } } fmt.Printf("Phase 2 - Parse: %v (%d variants)\n", time.Since(phase2Start), len(variants)) // ===== PHASE 3: Sort by rsid ===== phase3Start := time.Now() sort.Slice(variants, func(i, j int) bool { return variants[i].RSID < variants[j].RSID }) fmt.Printf("Phase 3 - Sort: %v\n", time.Since(phase3Start)) // ===== PHASE 4: Load SNPedia and match ===== phase4Start := time.Now() snpediaDB, err := sql.Open("sqlite3", "/home/johan/dev/inou/snpedia-genotypes/genotypes.db?mode=ro") if err != nil { fmt.Println("SNPedia DB open failed:", err) os.Exit(1) } defer snpediaDB.Close() // Load alleles for normalization snpediaAlleles := make(map[string]string, 15000) rows, err := snpediaDB.Query("SELECT DISTINCT rsid, alleles FROM genotypes") if err != nil { fmt.Println("SNPedia alleles query failed:", err) os.Exit(1) } for rows.Next() { var rsid, alleles string rows.Scan(&rsid, &alleles) snpediaAlleles[rsid] = alleles } rows.Close() // Match variants with SNPedia genotypes matched := make([]SNPediaMatch, 0, 2000) matchedRsids := make(map[string]bool) // track which rsids had positive matches for _, v := range variants { alleles, ok := snpediaAlleles[v.RSID] if !ok { continue } normalized := normalizeGenotype(v.Genotype, alleles) // Query for this specific rsid+genotype rows, err := snpediaDB.Query(` SELECT gene, magnitude, repute, summary, category, subcategory FROM genotypes WHERE rsid = ? AND genotype_norm = ?`, v.RSID, normalized) if err != nil { continue } for rows.Next() { var gene, repute, summary, category, subcategory sql.NullString var magnitude float64 rows.Scan(&gene, &magnitude, &repute, &summary, &category, &subcategory) if category.String == "" { continue } matchedRsids[v.RSID] = true matched = append(matched, SNPediaMatch{ RSID: v.RSID, Genotype: normalized, Gene: gene.String, Magnitude: magnitude, Repute: repute.String, Summary: summary.String, Category: category.String, Subcategory: subcategory.String, }) } rows.Close() } positiveMatches := len(matched) // Find "clear" findings: rsids in SNPedia where user's genotype doesn't match any risk variant clearFindings := 0 for _, v := range variants { if matchedRsids[v.RSID] { continue // already has positive matches } alleles, ok := snpediaAlleles[v.RSID] if !ok { continue // not in SNPedia } normalized := normalizeGenotype(v.Genotype, alleles) // Get what SNPedia DOES have for this rsid (the risk variants user doesn't have) rows, err := snpediaDB.Query(` SELECT gene, genotype_norm, magnitude, repute, summary, category, subcategory FROM genotypes WHERE rsid = ? ORDER BY magnitude DESC`, v.RSID) if err != nil { continue } // Collect risk variants to build the "clear" message type riskInfo struct { genotype string mag float64 summary string } var risks []riskInfo var gene, topCategory, topSubcategory string var topMag float64 for rows.Next() { var g, geno, rep, sum, cat, sub sql.NullString var mag float64 rows.Scan(&g, &geno, &mag, &rep, &sum, &cat, &sub) if cat.String == "" { continue } // Track highest magnitude category for this clear finding if mag > topMag || topCategory == "" { topMag = mag topCategory = cat.String topSubcategory = sub.String gene = g.String } // Collect unique risk genotypes found := false for _, r := range risks { if r.genotype == geno.String { found = true break } } if !found && len(risks) < 3 { risks = append(risks, riskInfo{geno.String, mag, sum.String}) } } rows.Close() if len(risks) == 0 || topCategory == "" { continue } // Build the "clear" summary var riskDescs []string for _, r := range risks { desc := r.genotype if r.summary != "" { // Truncate summary s := r.summary if len(s) > 40 { s = s[:40] + "..." } desc += ": " + s } riskDescs = append(riskDescs, desc) } clearSummary := fmt.Sprintf("No risk variant detected. You have %s. (Documented risks: %s)", normalized, strings.Join(riskDescs, "; ")) clearFindings++ matched = append(matched, SNPediaMatch{ RSID: v.RSID, Genotype: normalized, Gene: gene, Magnitude: 0, Repute: "Clear", Summary: clearSummary, Category: topCategory, Subcategory: topSubcategory, }) } fmt.Printf("Phase 4 - Load SNPedia & match: %v (%d positive, %d clear)\n", time.Since(phase4Start), positiveMatches, clearFindings) // ===== PHASE 5: Group by category and calculate counts ===== phase5Start := time.Now() byCategory := make(map[string][]SNPediaMatch) for _, m := range matched { byCategory[m.Category] = append(byCategory[m.Category], m) } // Calculate counts per category counts := make(map[string]CategoryCount) for cat, variants := range byCategory { c := CategoryCount{} for _, v := range variants { if shouldShow(v.Magnitude, v.Repute) { c.Shown++ } else { c.Hidden++ } } counts[cat] = c } fmt.Printf("Phase 5 - Group & count: %v (%d categories)\n", time.Since(phase5Start), len(byCategory)) // ===== PHASE 6: Initialize lib and delete existing ===== phase6Start := time.Now() if err := lib.Init(); err != nil { fmt.Println("lib.Init failed:", err) os.Exit(1) } if err := lib.EntryDeleteByCategory(dossierID, lib.CategoryGenome); err != nil { fmt.Println("Delete existing failed:", err) os.Exit(1) } fmt.Printf("Phase 6 - Init & delete existing: %v\n", time.Since(phase6Start)) // ===== PHASE 7: Build entries ===== phase7Start := time.Now() now := time.Now().Unix() // Extraction entry with counts extractionID := lib.NewID() extractionData := struct { Source string `json:"source"` Total int `json:"total"` Matched int `json:"matched"` Positive int `json:"positive"` Clear int `json:"clear"` Counts map[string]CategoryCount `json:"counts"` }{ Source: format, Total: len(variants), Matched: len(matched), Positive: positiveMatches, Clear: clearFindings, Counts: counts, } extractionJSON, _ := json.Marshal(extractionData) entries := make([]lib.Entry, 0, len(matched)+len(byCategory)+1) entries = append(entries, lib.Entry{ EntryID: extractionID, DossierID: dossierID, Category: lib.CategoryGenome, Type: "extraction", Value: format, Timestamp: now, Data: string(extractionJSON), }) // Tier entries (one per category, category = GenomeTier for ordering) tierIDs := make(map[string]string) for cat := range byCategory { tierID := lib.NewID() tierIDs[cat] = tierID entries = append(entries, lib.Entry{ EntryID: tierID, DossierID: dossierID, ParentID: extractionID, Category: lib.CategoryGenome, Type: "tier", Value: cat, Ordinal: lib.GenomeTierFromString[cat], // for sorting Timestamp: now, }) } // Variant entries (under their category tier) for cat, variants := range byCategory { tierID := tierIDs[cat] for i, v := range variants { variantData := struct { Mag float64 `json:"mag,omitempty"` Rep string `json:"rep,omitempty"` Sum string `json:"sum,omitempty"` Sub string `json:"sub,omitempty"` }{ Mag: v.Magnitude, Rep: v.Repute, Sum: v.Summary, Sub: v.Subcategory, } dataJSON, _ := json.Marshal(variantData) entries = append(entries, lib.Entry{ EntryID: lib.NewID(), DossierID: dossierID, ParentID: tierID, Category: lib.CategoryGenome, Type: v.RSID, Value: v.Genotype, Tags: v.Gene, Ordinal: i + 1, Timestamp: now, Data: string(dataJSON), }) } } fmt.Printf("Phase 7 - Build entries: %v (%d entries)\n", time.Since(phase7Start), len(entries)) // ===== PHASE 8: Save to database ===== phase8Start := time.Now() if err := lib.Save("entries", entries); err != nil { fmt.Println("lib.Save failed:", err) os.Exit(1) } fmt.Printf("Phase 8 - lib.Save: %v (%d entries saved)\n", time.Since(phase8Start), len(entries)) fmt.Printf("\nTOTAL: %v\n", time.Since(totalStart)) fmt.Printf("Extraction ID: %s\n", extractionID) fmt.Printf("Categories: %d\n", len(byCategory)) for cat, c := range counts { fmt.Printf(" %s: %d shown, %d hidden\n", cat, c.Shown, c.Hidden) } }