package main import ( "bufio" "bytes" "database/sql" "encoding/json" "fmt" "os" "sort" "strings" "time" "inou/lib" ) type Variant struct { RSID string Genotype string } // detectGenomeFormat returns format name based on first data line func detectGenomeFormat(firstLine string) string { if strings.Contains(firstLine, "\"") { return "myheritage" } if strings.Contains(firstLine, "\t") { parts := strings.Split(firstLine, "\t") if len(parts) >= 5 { return "ancestry" } return "23andme" } return "ftdna" } // parseGenomeVariant extracts rsid and genotype from a line func parseGenomeVariant(line, format string) (string, string, bool) { if strings.HasPrefix(line, "#") || strings.HasPrefix(line, "rsid") || strings.HasPrefix(line, "RSID") || (strings.HasPrefix(line, "\"") && strings.Contains(line, "RSID")) { return "", "", false } var parts []string var rsid, genotype string switch format { case "ancestry": parts = strings.Split(line, "\t") if len(parts) < 5 { return "", "", false } rsid = parts[0] allele1, allele2 := parts[3], parts[4] if allele1 == "0" || allele2 == "0" { return "", "", false } genotype = allele1 + allele2 case "23andme": parts = strings.Split(line, "\t") if len(parts) == 2 { rsid = parts[0] genotype = parts[1] } else if len(parts) >= 4 { rsid = parts[0] genotype = parts[3] } else { return "", "", false } if genotype == "--" { return "", "", false } case "myheritage": line = strings.ReplaceAll(line, "\"", "") parts = strings.Split(line, ",") if len(parts) < 4 { return "", "", false } rsid = parts[0] genotype = parts[3] case "ftdna": parts = strings.Split(line, ",") if len(parts) < 4 { return "", "", false } rsid = parts[0] genotype = parts[3] } if !strings.HasPrefix(rsid, "rs") { return "", "", false } // Normalize: sort alleles (GA -> AG) if len(genotype) == 2 && genotype[0] > genotype[1] { genotype = string(genotype[1]) + string(genotype[0]) } return rsid, genotype, true } // normalizeGenotype complements alleles to match the reference strand, then sorts. func normalizeGenotype(genotype, alleles string) string { if len(genotype) != 2 || alleles == "" { if len(genotype) == 2 && genotype[0] > genotype[1] { return string(genotype[1]) + string(genotype[0]) } return genotype } valid := make(map[byte]bool) for i := 0; i < len(alleles); i++ { valid[alleles[i]] = true } comp := [256]byte{'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'} var result [2]byte for i := 0; i < 2; i++ { b := genotype[i] if valid[b] { result[i] = b } else if c := comp[b]; c != 0 { result[i] = c } else { result[i] = b } } if result[0] > result[1] { result[0], result[1] = result[1], result[0] } return string(result[0]) + string(result[1]) } // updateUploadStatus updates the status in the upload entry Data JSON func updateUploadStatus(uploadID string, status string, details string) { entry, err := lib.EntryGet(nil, uploadID) // nil ctx - internal operation if err != nil || entry == nil { return } // Parse existing data, update status, preserve other fields var data map[string]interface{} json.Unmarshal([]byte(entry.Data), &data) if data == nil { data = make(map[string]interface{}) } data["status"] = status if details != "" { data["details"] = details } newData, _ := json.Marshal(data) entry.Data = string(newData) lib.EntryWrite("", entry) } // processGenomeUpload processes a genetics file in the background func processGenomeUpload(uploadID string, dossierID string, filePath string) { updateUploadStatus(uploadID, "processing", "") startTime := time.Now() // Read and decrypt encrypted, err := os.ReadFile(filePath) if err != nil { updateUploadStatus(uploadID, "failed", "Read failed: "+err.Error()) return } decrypted, err := lib.CryptoDecryptBytes(encrypted) if err != nil { updateUploadStatus(uploadID, "failed", "Decrypt failed: "+err.Error()) return } // Parse variants scanner := bufio.NewScanner(bytes.NewReader(decrypted)) scanner.Buffer(make([]byte, 1024*1024), 1024*1024) var format string var firstDataLine string for scanner.Scan() { line := scanner.Text() if !strings.HasPrefix(line, "#") && len(line) > 0 { firstDataLine = line break } } format = detectGenomeFormat(firstDataLine) variants := make([]Variant, 0, 800000) if rsid, geno, ok := parseGenomeVariant(firstDataLine, format); ok { variants = append(variants, Variant{rsid, geno}) } for scanner.Scan() { if rsid, geno, ok := parseGenomeVariant(scanner.Text(), format); ok { variants = append(variants, Variant{rsid, geno}) } } // Sort by rsid sort.Slice(variants, func(i, j int) bool { return variants[i].RSID < variants[j].RSID }) // Load SNPedia data from reference DB (initialized at portal startup) type CatInfo struct { Category string Subcategory string Gene string Magnitude float64 Repute string Summary string } // Key: rsid+genotype -> slice of category associations snpediaMap := make(map[string][]CatInfo, 50000) snpediaRsids := make(map[string]bool, 15000) indelRsids := make(map[string]bool) // rsIDs that have any indel genotype if snpDB := lib.RefDB(); snpDB != nil { // First pass: identify indel rsIDs indelRows, _ := snpDB.Query("SELECT DISTINCT rsid FROM genotypes WHERE genotype LIKE '%-%'") if indelRows != nil { for indelRows.Next() { var rsid string indelRows.Scan(&rsid) indelRsids[rsid] = true } indelRows.Close() } rows, _ := snpDB.Query("SELECT rsid, genotype_norm, gene, magnitude, repute, summary, category, subcategory FROM genotypes") if rows != nil { for rows.Next() { var rsid, geno string var gene, repute, summary, cat, subcat sql.NullString var mag float64 rows.Scan(&rsid, &geno, &gene, &mag, &repute, &summary, &cat, &subcat) if cat.String == "" || indelRsids[rsid] { continue // skip entries without category or indel variants } key := rsid + ":" + geno snpediaMap[key] = append(snpediaMap[key], CatInfo{ Category: cat.String, Subcategory: subcat.String, Gene: gene.String, Magnitude: mag, Repute: repute.String, Summary: summary.String, }) snpediaRsids[rsid] = true } rows.Close() } } // Build valid alleles per rsid from actual genotype entries (not the alleles column, // which includes the reference allele that SNPedia doesn't use in genotype notation) snpediaAlleles := make(map[string]string, len(snpediaRsids)) for key := range snpediaMap { parts := strings.SplitN(key, ":", 2) if len(parts) == 2 { rsid, geno := parts[0], parts[1] existing := snpediaAlleles[rsid] for i := 0; i < len(geno); i++ { if !strings.ContainsRune(existing, rune(geno[i])) { existing += string(geno[i]) } } snpediaAlleles[rsid] = existing } } // Match variants (only those with rsid in SNPedia), normalizing genotype to reference strand matched := make([]Variant, 0, len(snpediaRsids)) for _, v := range variants { if snpediaRsids[v.RSID] { v.Genotype = normalizeGenotype(v.Genotype, snpediaAlleles[v.RSID]) matched = append(matched, v) } } // Split into positive matches (genotype in SNPedia) and clear findings (rsid in SNPedia but genotype doesn't match any risk variant) type clearInfo struct { RSID, Genotype, Gene, Category, Subcategory string TopMag float64 RiskDescs []string } positiveRsids := make(map[string]bool) for _, v := range matched { if len(snpediaMap[v.RSID+":"+v.Genotype]) > 0 { positiveRsids[v.RSID] = true } } // Build clear findings: rsids in SNPedia where user has no matching risk genotype var clearFindings []clearInfo for _, v := range matched { if positiveRsids[v.RSID] { continue } // Find what risk variants SNPedia documents for this rsid var topCat, topSub, gene string var topMag float64 seen := make(map[string]bool) var descs []string for key, infos := range snpediaMap { if !strings.HasPrefix(key, v.RSID+":") { continue } for _, info := range infos { if topCat == "" || info.Magnitude > topMag { topMag = info.Magnitude topCat = info.Category topSub = info.Subcategory gene = info.Gene } geno := strings.TrimPrefix(key, v.RSID+":") if !seen[geno] && len(descs) < 3 { seen[geno] = true d := geno if info.Summary != "" { s := info.Summary if len(s) > 40 { s = s[:40] + "..." } d += ": " + s } descs = append(descs, d) } } } if topCat != "" { clearFindings = append(clearFindings, clearInfo{ RSID: v.RSID, Genotype: v.Genotype, Gene: gene, Category: topCat, Subcategory: topSub, TopMag: topMag, RiskDescs: descs, }) } } // Delete existing genome entries (all genome data uses CategoryGenome with different Types) lib.EntryDelete("", dossierID, &lib.Filter{Category: lib.CategoryGenome}) // Create extraction entry (tier 1) now := time.Now().Unix() totalAssoc := 0 for _, v := range matched { totalAssoc += len(snpediaMap[v.RSID+":"+v.Genotype]) } parentEntry := &lib.Entry{ DossierID: dossierID, ParentID: dossierID, Category: lib.CategoryGenome, Type: "extraction", Value: format, Timestamp: now, Data: fmt.Sprintf(`{"upload_id":"%s","variants":%d,"associations":%d,"clear":%d,"total_parsed":%d}`, uploadID, len(matched), totalAssoc, len(clearFindings), len(variants)), } lib.EntryWrite("", parentEntry) extractionID := parentEntry.EntryID // Count shown/hidden per category (deduplicated by category+rsid) type catCount struct{ Shown, Hidden int } catCounts := map[string]*catCount{} type catRsid struct{ cat, rsid string } counted := map[catRsid]bool{} for _, v := range matched { for _, info := range snpediaMap[v.RSID+":"+v.Genotype] { key := catRsid{info.Category, v.RSID} if counted[key] { continue } counted[key] = true c, ok := catCounts[info.Category] if !ok { c = &catCount{} catCounts[info.Category] = c } if info.Magnitude > 4.0 || strings.EqualFold(info.Repute, "bad") { c.Hidden++ } else { c.Shown++ } } } // Clear findings count as shown for _, cf := range clearFindings { c, ok := catCounts[cf.Category] if !ok { c = &catCount{} catCounts[cf.Category] = c } c.Shown++ } tierMap := make(map[string]string) // category -> tier entry_id for cat, c := range catCounts { tierEntry := &lib.Entry{ DossierID: dossierID, ParentID: extractionID, Category: lib.CategoryGenome, Type: "tier", Value: cat, Ordinal: lib.GenomeTierFromString[cat], Timestamp: now, Data: fmt.Sprintf(`{"shown":%d,"hidden":%d}`, c.Shown, c.Hidden), } lib.EntryWrite("", tierEntry) tierMap[cat] = tierEntry.EntryID } // Batch insert variants (tier 3) - Type="rsid", Value=genotype // Deduplicate: one entry per tier+rsid (merge subcategories, keep highest magnitude) type variantKey struct{ tier, rsid string } deduped := make(map[variantKey]*lib.Entry) for _, v := range matched { for _, info := range snpediaMap[v.RSID+":"+v.Genotype] { tierID := tierMap[info.Category] key := variantKey{tierID, v.RSID} if existing, ok := deduped[key]; ok { // Keep higher magnitude entry if info.Magnitude > float64(100-existing.Ordinal)/10 { data := fmt.Sprintf(`{"mag":%.1f,"rep":"%s","sum":"%s","sub":"%s"}`, info.Magnitude, info.Repute, strings.ReplaceAll(info.Summary, `"`, `\"`), info.Subcategory) existing.Ordinal = int(100 - info.Magnitude*10) existing.Data = data } continue } data := fmt.Sprintf(`{"mag":%.1f,"rep":"%s","sum":"%s","sub":"%s"}`, info.Magnitude, info.Repute, strings.ReplaceAll(info.Summary, `"`, `\"`), info.Subcategory) deduped[key] = &lib.Entry{ DossierID: dossierID, ParentID: tierID, Category: lib.CategoryGenome, Type: "variant", Value: v.Genotype, Ordinal: int(100 - info.Magnitude*10), Timestamp: now, SearchKey: strings.ToLower(info.Gene), SearchKey2: strings.ToLower(v.RSID), Data: data, } } } // Add clear findings as variant entries (repute "Clear", magnitude from the risk they cleared) for _, cf := range clearFindings { tierID := tierMap[cf.Category] key := variantKey{tierID, cf.RSID} if _, ok := deduped[key]; ok { continue // already has a positive match in this tier } summary := fmt.Sprintf("No risk variant detected. You have %s. (Risks: %s)", cf.Genotype, strings.Join(cf.RiskDescs, "; ")) data := fmt.Sprintf(`{"mag":%.1f,"rep":"Clear","sum":"%s","sub":"%s"}`, cf.TopMag, strings.ReplaceAll(summary, `"`, `\"`), cf.Subcategory) deduped[key] = &lib.Entry{ DossierID: dossierID, ParentID: tierID, Category: lib.CategoryGenome, Type: "variant", Value: cf.Genotype, Ordinal: int(100 - cf.TopMag*10), Timestamp: now, SearchKey: strings.ToLower(cf.Gene), SearchKey2: strings.ToLower(cf.RSID), Data: data, } } var batch []*lib.Entry for _, e := range deduped { batch = append(batch, e) if len(batch) >= 500 { lib.EntryWrite("", batch...) batch = batch[:0] } } if len(batch) > 0 { lib.EntryWrite("", batch...) } elapsed := time.Since(startTime) details := fmt.Sprintf("parent=%s variants=%d parsed=%d elapsed=%v", extractionID, len(matched), len(variants), elapsed) updateUploadStatus(uploadID, "completed", details) lib.AuditLog(dossierID, "genome_import", dossierID, fmt.Sprintf("%d", len(matched))) }