569 lines
14 KiB
Go
569 lines
14 KiB
Go
package main
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"database/sql"
|
|
"encoding/json"
|
|
"flag"
|
|
"fmt"
|
|
"os"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
|
|
_ "github.com/mattn/go-sqlite3"
|
|
|
|
"inou/lib"
|
|
)
|
|
|
|
const version = "5.0.0"
|
|
|
|
type Variant struct {
|
|
RSID string
|
|
Genotype string
|
|
}
|
|
|
|
type SNPediaMatch struct {
|
|
RSID string
|
|
Genotype string
|
|
Gene string
|
|
Magnitude float64
|
|
Repute string
|
|
Summary string
|
|
Category string
|
|
Subcategory string
|
|
}
|
|
|
|
type CategoryCount struct {
|
|
Shown int `json:"shown"`
|
|
Hidden int `json:"hidden"`
|
|
}
|
|
|
|
func usage() {
|
|
fmt.Println(`import-genome - Import genetic data with SNPedia enrichment
|
|
|
|
USAGE:
|
|
import-genome <plain-file> <dossier-id>
|
|
|
|
SUPPORTED FORMATS:
|
|
AncestryDNA Tab-delimited, 5 columns (alleles split)
|
|
23andMe Tab-delimited, 4 columns (alleles combined)
|
|
MyHeritage CSV with quotes, 4 columns
|
|
FTDNA CSV clean, 4 columns
|
|
|
|
FORMAT AUTO-DETECTION:
|
|
The tool automatically detects the format from the file structure.
|
|
|
|
EXAMPLE:
|
|
import-genome /path/to/dna.txt 3b38234f2b0f7ee6
|
|
|
|
DATABASE:
|
|
SNPedia reference: ~/dev/inou/snpedia-genotypes/genotypes.db (read-only)
|
|
Entries: via lib.EntryAddBatchValues() to /tank/inou/data/inou.db
|
|
|
|
VERSION: ` + version)
|
|
}
|
|
|
|
func detectFormat(firstLine string) string {
|
|
if strings.Contains(firstLine, "\"") {
|
|
return "myheritage"
|
|
}
|
|
if strings.Contains(firstLine, "\t") {
|
|
parts := strings.Split(firstLine, "\t")
|
|
if len(parts) >= 5 {
|
|
return "ancestry"
|
|
}
|
|
return "23andme"
|
|
}
|
|
return "ftdna"
|
|
}
|
|
|
|
func complement(b byte) byte {
|
|
switch b {
|
|
case 'A':
|
|
return 'T'
|
|
case 'T':
|
|
return 'A'
|
|
case 'C':
|
|
return 'G'
|
|
case 'G':
|
|
return 'C'
|
|
}
|
|
return b
|
|
}
|
|
|
|
func normalizeGenotype(genotype, alleles string) string {
|
|
if len(genotype) != 2 || alleles == "" {
|
|
if len(genotype) == 2 && genotype[0] > genotype[1] {
|
|
return string(genotype[1]) + string(genotype[0])
|
|
}
|
|
return genotype
|
|
}
|
|
|
|
valid := make(map[byte]bool)
|
|
for _, a := range strings.Split(alleles, "/") {
|
|
if len(a) == 1 {
|
|
valid[a[0]] = true
|
|
}
|
|
}
|
|
|
|
var result [2]byte
|
|
for i := 0; i < 2; i++ {
|
|
b := genotype[i]
|
|
if valid[b] {
|
|
result[i] = b
|
|
} else {
|
|
result[i] = complement(b)
|
|
}
|
|
}
|
|
|
|
if result[0] > result[1] {
|
|
result[0], result[1] = result[1], result[0]
|
|
}
|
|
|
|
return string(result[0]) + string(result[1])
|
|
}
|
|
|
|
func parseVariant(line, format string) (string, string, bool) {
|
|
if strings.HasPrefix(line, "#") || strings.HasPrefix(line, "rsid") || strings.HasPrefix(line, "RSID") || (strings.HasPrefix(line, "\"") && strings.Contains(line, "RSID")) {
|
|
return "", "", false
|
|
}
|
|
|
|
var parts []string
|
|
var rsid, genotype string
|
|
|
|
switch format {
|
|
case "ancestry":
|
|
parts = strings.Split(line, "\t")
|
|
if len(parts) < 5 {
|
|
return "", "", false
|
|
}
|
|
rsid = parts[0]
|
|
allele1, allele2 := parts[3], parts[4]
|
|
if allele1 == "0" || allele2 == "0" {
|
|
return "", "", false
|
|
}
|
|
genotype = allele1 + allele2
|
|
|
|
case "23andme":
|
|
parts = strings.Split(line, "\t")
|
|
if len(parts) < 4 {
|
|
return "", "", false
|
|
}
|
|
rsid = parts[0]
|
|
genotype = parts[3]
|
|
if genotype == "--" {
|
|
return "", "", false
|
|
}
|
|
|
|
case "myheritage":
|
|
line = strings.ReplaceAll(line, "\"", "")
|
|
parts = strings.Split(line, ",")
|
|
if len(parts) < 4 {
|
|
return "", "", false
|
|
}
|
|
rsid = parts[0]
|
|
genotype = parts[3]
|
|
|
|
case "ftdna":
|
|
parts = strings.Split(line, ",")
|
|
if len(parts) < 4 {
|
|
return "", "", false
|
|
}
|
|
rsid = parts[0]
|
|
genotype = parts[3]
|
|
}
|
|
|
|
if !strings.HasPrefix(rsid, "rs") {
|
|
return "", "", false
|
|
}
|
|
|
|
if len(genotype) == 2 && genotype[0] > genotype[1] {
|
|
genotype = string(genotype[1]) + string(genotype[0])
|
|
}
|
|
|
|
return rsid, genotype, true
|
|
}
|
|
|
|
// shouldShow returns true if variant should be shown by default (not hidden)
|
|
func shouldShow(mag float64, repute string) bool {
|
|
if mag > 4.0 {
|
|
return false
|
|
}
|
|
if strings.EqualFold(repute, "bad") {
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
func main() {
|
|
help := flag.Bool("help", false, "Show help")
|
|
flag.BoolVar(help, "h", false, "Show help")
|
|
flag.Usage = usage
|
|
flag.Parse()
|
|
|
|
if *help {
|
|
usage()
|
|
os.Exit(0)
|
|
}
|
|
|
|
args := flag.Args()
|
|
if len(args) < 2 {
|
|
usage()
|
|
os.Exit(1)
|
|
}
|
|
|
|
filePath := args[0]
|
|
dossierID := args[1]
|
|
|
|
totalStart := time.Now()
|
|
|
|
// ===== PHASE 1: Read file =====
|
|
phase1Start := time.Now()
|
|
data, err := os.ReadFile(filePath)
|
|
if err != nil {
|
|
fmt.Println("Read failed:", err)
|
|
os.Exit(1)
|
|
}
|
|
fmt.Printf("Phase 1 - Read: %v (%d bytes)\n", time.Since(phase1Start), len(data))
|
|
|
|
// ===== PHASE 2: Parse variants =====
|
|
phase2Start := time.Now()
|
|
scanner := bufio.NewScanner(bytes.NewReader(data))
|
|
scanner.Buffer(make([]byte, 1024*1024), 1024*1024)
|
|
|
|
var format string
|
|
var firstDataLine string
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
if !strings.HasPrefix(line, "#") && len(line) > 0 {
|
|
firstDataLine = line
|
|
break
|
|
}
|
|
}
|
|
format = detectFormat(firstDataLine)
|
|
fmt.Printf("Detected format: %s\n", format)
|
|
|
|
variants := make([]Variant, 0, 800000)
|
|
if rsid, geno, ok := parseVariant(firstDataLine, format); ok {
|
|
variants = append(variants, Variant{rsid, geno})
|
|
}
|
|
for scanner.Scan() {
|
|
if rsid, geno, ok := parseVariant(scanner.Text(), format); ok {
|
|
variants = append(variants, Variant{rsid, geno})
|
|
}
|
|
}
|
|
fmt.Printf("Phase 2 - Parse: %v (%d variants)\n", time.Since(phase2Start), len(variants))
|
|
|
|
// ===== PHASE 3: Sort by rsid =====
|
|
phase3Start := time.Now()
|
|
sort.Slice(variants, func(i, j int) bool {
|
|
return variants[i].RSID < variants[j].RSID
|
|
})
|
|
fmt.Printf("Phase 3 - Sort: %v\n", time.Since(phase3Start))
|
|
|
|
// ===== PHASE 4: Load SNPedia and match =====
|
|
phase4Start := time.Now()
|
|
snpediaDB, err := sql.Open("sqlite3", "/tank/inou/data/genotypes.db?mode=ro")
|
|
if err != nil {
|
|
fmt.Println("SNPedia DB open failed:", err)
|
|
os.Exit(1)
|
|
}
|
|
defer snpediaDB.Close()
|
|
|
|
// Load alleles for normalization
|
|
snpediaAlleles := make(map[string]string, 15000)
|
|
rows, err := snpediaDB.Query("SELECT DISTINCT rsid, alleles FROM genotypes")
|
|
if err != nil {
|
|
fmt.Println("SNPedia alleles query failed:", err)
|
|
os.Exit(1)
|
|
}
|
|
for rows.Next() {
|
|
var rsid, alleles string
|
|
rows.Scan(&rsid, &alleles)
|
|
snpediaAlleles[rsid] = alleles
|
|
}
|
|
rows.Close()
|
|
|
|
// Match variants with SNPedia genotypes
|
|
matched := make([]SNPediaMatch, 0, 2000)
|
|
matchedRsids := make(map[string]bool) // track which rsids had positive matches
|
|
|
|
for _, v := range variants {
|
|
alleles, ok := snpediaAlleles[v.RSID]
|
|
if !ok {
|
|
continue
|
|
}
|
|
normalized := normalizeGenotype(v.Genotype, alleles)
|
|
|
|
// Query for this specific rsid+genotype
|
|
rows, err := snpediaDB.Query(`
|
|
SELECT gene, magnitude, repute, summary, category, subcategory
|
|
FROM genotypes
|
|
WHERE rsid = ? AND genotype_norm = ?`,
|
|
v.RSID, normalized)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
for rows.Next() {
|
|
var gene, repute, summary, category, subcategory sql.NullString
|
|
var magnitude float64
|
|
rows.Scan(&gene, &magnitude, &repute, &summary, &category, &subcategory)
|
|
|
|
if category.String == "" {
|
|
continue
|
|
}
|
|
|
|
matchedRsids[v.RSID] = true
|
|
matched = append(matched, SNPediaMatch{
|
|
RSID: v.RSID,
|
|
Genotype: normalized,
|
|
Gene: gene.String,
|
|
Magnitude: magnitude,
|
|
Repute: repute.String,
|
|
Summary: summary.String,
|
|
Category: category.String,
|
|
Subcategory: subcategory.String,
|
|
})
|
|
}
|
|
rows.Close()
|
|
}
|
|
positiveMatches := len(matched)
|
|
|
|
// Find "clear" findings: rsids in SNPedia where user's genotype doesn't match any risk variant
|
|
clearFindings := 0
|
|
for _, v := range variants {
|
|
if matchedRsids[v.RSID] {
|
|
continue // already has positive matches
|
|
}
|
|
alleles, ok := snpediaAlleles[v.RSID]
|
|
if !ok {
|
|
continue // not in SNPedia
|
|
}
|
|
normalized := normalizeGenotype(v.Genotype, alleles)
|
|
|
|
// Get what SNPedia DOES have for this rsid (the risk variants user doesn't have)
|
|
rows, err := snpediaDB.Query(`
|
|
SELECT gene, genotype_norm, magnitude, repute, summary, category, subcategory
|
|
FROM genotypes
|
|
WHERE rsid = ?
|
|
ORDER BY magnitude DESC`,
|
|
v.RSID)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
// Collect risk variants to build the "clear" message
|
|
type riskInfo struct {
|
|
genotype string
|
|
mag float64
|
|
summary string
|
|
}
|
|
var risks []riskInfo
|
|
var gene, topCategory, topSubcategory string
|
|
var topMag float64
|
|
|
|
for rows.Next() {
|
|
var g, geno, rep, sum, cat, sub sql.NullString
|
|
var mag float64
|
|
rows.Scan(&g, &geno, &mag, &rep, &sum, &cat, &sub)
|
|
|
|
if cat.String == "" {
|
|
continue
|
|
}
|
|
|
|
// Track highest magnitude category for this clear finding
|
|
if mag > topMag || topCategory == "" {
|
|
topMag = mag
|
|
topCategory = cat.String
|
|
topSubcategory = sub.String
|
|
gene = g.String
|
|
}
|
|
|
|
// Collect unique risk genotypes
|
|
found := false
|
|
for _, r := range risks {
|
|
if r.genotype == geno.String {
|
|
found = true
|
|
break
|
|
}
|
|
}
|
|
if !found && len(risks) < 3 {
|
|
risks = append(risks, riskInfo{geno.String, mag, sum.String})
|
|
}
|
|
}
|
|
rows.Close()
|
|
|
|
if len(risks) == 0 || topCategory == "" {
|
|
continue
|
|
}
|
|
|
|
// Build the "clear" summary
|
|
var riskDescs []string
|
|
for _, r := range risks {
|
|
desc := r.genotype
|
|
if r.summary != "" {
|
|
// Truncate summary
|
|
s := r.summary
|
|
if len(s) > 40 {
|
|
s = s[:40] + "..."
|
|
}
|
|
desc += ": " + s
|
|
}
|
|
riskDescs = append(riskDescs, desc)
|
|
}
|
|
clearSummary := fmt.Sprintf("No risk variant detected. You have %s. (Documented risks: %s)",
|
|
normalized, strings.Join(riskDescs, "; "))
|
|
|
|
clearFindings++
|
|
matched = append(matched, SNPediaMatch{
|
|
RSID: v.RSID,
|
|
Genotype: normalized,
|
|
Gene: gene,
|
|
Magnitude: 0,
|
|
Repute: "Clear",
|
|
Summary: clearSummary,
|
|
Category: topCategory,
|
|
Subcategory: topSubcategory,
|
|
})
|
|
}
|
|
fmt.Printf("Phase 4 - Load SNPedia & match: %v (%d positive, %d clear)\n", time.Since(phase4Start), positiveMatches, clearFindings)
|
|
|
|
// ===== PHASE 5: Group by category and calculate counts =====
|
|
phase5Start := time.Now()
|
|
byCategory := make(map[string][]SNPediaMatch)
|
|
for _, m := range matched {
|
|
byCategory[m.Category] = append(byCategory[m.Category], m)
|
|
}
|
|
|
|
// Calculate counts per category
|
|
counts := make(map[string]CategoryCount)
|
|
for cat, variants := range byCategory {
|
|
c := CategoryCount{}
|
|
for _, v := range variants {
|
|
if shouldShow(v.Magnitude, v.Repute) {
|
|
c.Shown++
|
|
} else {
|
|
c.Hidden++
|
|
}
|
|
}
|
|
counts[cat] = c
|
|
}
|
|
fmt.Printf("Phase 5 - Group & count: %v (%d categories)\n", time.Since(phase5Start), len(byCategory))
|
|
|
|
// ===== PHASE 6: Initialize lib and delete existing =====
|
|
phase6Start := time.Now()
|
|
if err := lib.Init(); err != nil {
|
|
fmt.Println("lib.Init failed:", err)
|
|
os.Exit(1)
|
|
}
|
|
|
|
if err := lib.EntryDeleteByCategory(nil, dossierID, lib.CategoryGenome); err != nil { // nil ctx = system import
|
|
fmt.Println("Delete existing failed:", err)
|
|
os.Exit(1)
|
|
}
|
|
fmt.Printf("Phase 6 - Init & delete existing: %v\n", time.Since(phase6Start))
|
|
|
|
// ===== PHASE 7: Build entries =====
|
|
phase7Start := time.Now()
|
|
now := time.Now().Unix()
|
|
|
|
// Extraction entry with counts
|
|
extractionID := lib.NewID()
|
|
extractionData := struct {
|
|
Source string `json:"source"`
|
|
Total int `json:"total"`
|
|
Matched int `json:"matched"`
|
|
Positive int `json:"positive"`
|
|
Clear int `json:"clear"`
|
|
Counts map[string]CategoryCount `json:"counts"`
|
|
}{
|
|
Source: format,
|
|
Total: len(variants),
|
|
Matched: len(matched),
|
|
Positive: positiveMatches,
|
|
Clear: clearFindings,
|
|
Counts: counts,
|
|
}
|
|
extractionJSON, _ := json.Marshal(extractionData)
|
|
|
|
entries := make([]lib.Entry, 0, len(matched)+len(byCategory)+1)
|
|
entries = append(entries, lib.Entry{
|
|
EntryID: extractionID,
|
|
DossierID: dossierID,
|
|
Category: lib.CategoryGenome,
|
|
Type: "extraction",
|
|
Value: format,
|
|
Timestamp: now,
|
|
Data: string(extractionJSON),
|
|
})
|
|
|
|
// Tier entries (one per category, category = GenomeTier for ordering)
|
|
tierIDs := make(map[string]string)
|
|
for cat := range byCategory {
|
|
tierID := lib.NewID()
|
|
tierIDs[cat] = tierID
|
|
entries = append(entries, lib.Entry{
|
|
EntryID: tierID,
|
|
DossierID: dossierID,
|
|
ParentID: extractionID,
|
|
Category: lib.CategoryGenome,
|
|
Type: "tier",
|
|
Value: cat,
|
|
Ordinal: lib.GenomeTierFromString[cat], // for sorting
|
|
Timestamp: now,
|
|
})
|
|
}
|
|
|
|
// Variant entries (under their category tier)
|
|
for cat, variants := range byCategory {
|
|
tierID := tierIDs[cat]
|
|
for i, v := range variants {
|
|
variantData := struct {
|
|
Mag float64 `json:"mag,omitempty"`
|
|
Rep string `json:"rep,omitempty"`
|
|
Sum string `json:"sum,omitempty"`
|
|
Sub string `json:"sub,omitempty"`
|
|
}{
|
|
Mag: v.Magnitude,
|
|
Rep: v.Repute,
|
|
Sum: v.Summary,
|
|
Sub: v.Subcategory,
|
|
}
|
|
dataJSON, _ := json.Marshal(variantData)
|
|
|
|
entries = append(entries, lib.Entry{
|
|
EntryID: lib.NewID(),
|
|
DossierID: dossierID,
|
|
ParentID: tierID,
|
|
Category: lib.CategoryGenome,
|
|
Type: v.RSID,
|
|
Value: v.Genotype,
|
|
Tags: v.Gene,
|
|
SearchKey: v.Gene,
|
|
Ordinal: i + 1,
|
|
Timestamp: now,
|
|
Data: string(dataJSON),
|
|
})
|
|
}
|
|
}
|
|
fmt.Printf("Phase 7 - Build entries: %v (%d entries)\n", time.Since(phase7Start), len(entries))
|
|
|
|
// ===== PHASE 8: Save to database =====
|
|
phase8Start := time.Now()
|
|
if err := lib.EntryAddBatchValues(entries); err != nil {
|
|
fmt.Println("EntryAddBatchValues failed:", err)
|
|
os.Exit(1)
|
|
}
|
|
fmt.Printf("Phase 8 - Save: %v (%d entries saved)\n", time.Since(phase8Start), len(entries))
|
|
|
|
fmt.Printf("\nTOTAL: %v\n", time.Since(totalStart))
|
|
fmt.Printf("Extraction ID: %s\n", extractionID)
|
|
fmt.Printf("Categories: %d\n", len(byCategory))
|
|
for cat, c := range counts {
|
|
fmt.Printf(" %s: %d shown, %d hidden\n", cat, c.Shown, c.Hidden)
|
|
}
|
|
}
|