inou/portal/genome.go

491 lines
13 KiB
Go

package main
import (
"bufio"
"bytes"
"database/sql"
"encoding/json"
"fmt"
"os"
"sort"
"strings"
"time"
"inou/lib"
)
// Variant is one parsed genome line: a dbSNP identifier ("rs...") and the
// user's genotype as produced by parseGenomeVariant (two allele characters,
// byte-sorted, e.g. "AG").
type Variant struct {
	RSID     string
	Genotype string
}
// detectGenomeFormat returns the vendor format name based on the first
// data line of the upload: quoted CSV is MyHeritage, tab-separated data is
// AncestryDNA (5+ columns) or 23andMe, and anything else is FamilyTreeDNA.
func detectGenomeFormat(firstLine string) string {
	switch {
	case strings.Contains(firstLine, `"`):
		// MyHeritage CSV quotes its fields.
		return "myheritage"
	case strings.Contains(firstLine, "\t"):
		// Tab-separated: AncestryDNA splits the two alleles into separate
		// columns (5 fields); 23andMe keeps a single genotype column.
		if fields := strings.Split(firstLine, "\t"); len(fields) >= 5 {
			return "ancestry"
		}
		return "23andme"
	default:
		// Unquoted comma-separated data: FamilyTreeDNA.
		return "ftdna"
	}
}
// parseGenomeVariant extracts the rsid and genotype from one data line in
// the given vendor format. It returns ok=false for comment/header lines,
// malformed rows, no-calls, and identifiers that are not dbSNP "rs" IDs.
// The returned genotype has its two alleles byte-sorted (GA -> AG).
func parseGenomeVariant(line, format string) (string, string, bool) {
	// Reject comment lines and the various vendor header rows.
	header := strings.HasPrefix(line, "#") ||
		strings.HasPrefix(line, "rsid") ||
		strings.HasPrefix(line, "RSID") ||
		(strings.HasPrefix(line, "\"") && strings.Contains(line, "RSID"))
	if header {
		return "", "", false
	}
	var rsid, genotype string
	switch format {
	case "ancestry":
		cols := strings.Split(line, "\t")
		if len(cols) < 5 {
			return "", "", false
		}
		// Alleles arrive in two separate columns; "0" marks a no-call.
		if cols[3] == "0" || cols[4] == "0" {
			return "", "", false
		}
		rsid, genotype = cols[0], cols[3]+cols[4]
	case "23andme":
		cols := strings.Split(line, "\t")
		switch {
		case len(cols) == 2:
			rsid, genotype = cols[0], cols[1]
		case len(cols) >= 4:
			rsid, genotype = cols[0], cols[3]
		default:
			return "", "", false
		}
		// "--" marks a no-call in this format.
		if genotype == "--" {
			return "", "", false
		}
	case "myheritage":
		// Strip the CSV quoting, then treat it like plain comma-separated data.
		cols := strings.Split(strings.ReplaceAll(line, "\"", ""), ",")
		if len(cols) < 4 {
			return "", "", false
		}
		rsid, genotype = cols[0], cols[3]
	case "ftdna":
		cols := strings.Split(line, ",")
		if len(cols) < 4 {
			return "", "", false
		}
		rsid, genotype = cols[0], cols[3]
	}
	// Only keep dbSNP-style identifiers (also rejects unknown formats,
	// which leave rsid empty).
	if !strings.HasPrefix(rsid, "rs") {
		return "", "", false
	}
	// Normalize: sort alleles (GA -> AG)
	if len(genotype) == 2 && genotype[0] > genotype[1] {
		genotype = string(genotype[1]) + string(genotype[0])
	}
	return rsid, genotype, true
}
// normalizeGenotype complements alleles to match the reference strand, then
// sorts the pair. alleles is the set of allele characters SNPedia documents
// for this rsid; a genotype byte not in that set is flipped to its DNA
// complement when one exists, otherwise kept as-is. With no allele set (or a
// genotype that isn't exactly two bytes) only the sort is applied.
func normalizeGenotype(genotype, alleles string) string {
	if len(genotype) != 2 || alleles == "" {
		// Nothing to strand-correct; canonicalize order when we can.
		if len(genotype) == 2 && genotype[0] > genotype[1] {
			return string(genotype[1]) + string(genotype[0])
		}
		return genotype
	}
	inRef := func(b byte) bool { return strings.IndexByte(alleles, b) >= 0 }
	complement := func(b byte) byte {
		switch b {
		case 'A':
			return 'T'
		case 'T':
			return 'A'
		case 'C':
			return 'G'
		case 'G':
			return 'C'
		}
		return 0
	}
	first, second := genotype[0], genotype[1]
	if !inRef(first) {
		if c := complement(first); c != 0 {
			first = c
		}
	}
	if !inRef(second) {
		if c := complement(second); c != 0 {
			second = c
		}
	}
	if first > second {
		first, second = second, first
	}
	return string([]byte{first, second})
}
// updateUploadStatus updates the status in the upload entry Data JSON.
// It merges into the existing payload so other fields in Data survive;
// details is only written when non-empty.
func updateUploadStatus(uploadID string, status string, details string) {
	entry, err := lib.EntryGet(nil, uploadID) // nil ctx - internal operation
	if err != nil || entry == nil {
		return
	}
	// Best-effort decode of the current payload; malformed JSON just
	// starts a fresh map.
	var payload map[string]interface{}
	json.Unmarshal([]byte(entry.Data), &payload)
	if payload == nil {
		payload = make(map[string]interface{})
	}
	payload["status"] = status
	if details != "" {
		payload["details"] = details
	}
	encoded, _ := json.Marshal(payload)
	entry.Data = string(encoded)
	lib.EntryWrite("", entry)
}
// processGenomeUpload processes a genetics file in the background.
//
// Pipeline: read and decrypt the uploaded raw file, parse vendor-specific
// variant lines, match rsIDs against the SNPedia reference DB, and write a
// three-tier entry hierarchy under the dossier (extraction -> category tier
// -> variant), replacing any previous genome import. Progress/outcome is
// reported back through the upload entry's Data JSON via updateUploadStatus.
func processGenomeUpload(uploadID string, dossierID string, filePath string) {
	updateUploadStatus(uploadID, "processing", "")
	startTime := time.Now()
	// Read and decrypt
	encrypted, err := os.ReadFile(filePath)
	if err != nil {
		updateUploadStatus(uploadID, "failed", "Read failed: "+err.Error())
		return
	}
	decrypted, err := lib.CryptoDecryptBytes(encrypted)
	if err != nil {
		updateUploadStatus(uploadID, "failed", "Decrypt failed: "+err.Error())
		return
	}
	// Parse variants
	scanner := bufio.NewScanner(bytes.NewReader(decrypted))
	// Raise the line-length cap above bufio's 64 KiB default.
	scanner.Buffer(make([]byte, 1024*1024), 1024*1024)
	var format string
	var firstDataLine string
	// Skip leading '#' comment lines; the first real line decides the format.
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, "#") && len(line) > 0 {
			firstDataLine = line
			break
		}
	}
	format = detectGenomeFormat(firstDataLine)
	variants := make([]Variant, 0, 800000) // pre-size for a typical consumer array
	// The sniffed line is itself data (or a header, which parse rejects) —
	// handle it before resuming the scan.
	if rsid, geno, ok := parseGenomeVariant(firstDataLine, format); ok {
		variants = append(variants, Variant{rsid, geno})
	}
	for scanner.Scan() {
		if rsid, geno, ok := parseGenomeVariant(scanner.Text(), format); ok {
			variants = append(variants, Variant{rsid, geno})
		}
	}
	// NOTE(review): scanner.Err() is never checked — an I/O error or an
	// over-long line would silently truncate the variant list instead of
	// failing the upload. Confirm whether that is acceptable here.
	// Sort by rsid
	sort.Slice(variants, func(i, j int) bool {
		return variants[i].RSID < variants[j].RSID
	})
	// Load SNPedia data from reference DB (initialized at portal startup)
	// CatInfo is one category association for an (rsid, genotype) pair.
	type CatInfo struct {
		Category    string
		Subcategory string
		Gene        string
		Magnitude   float64
		Repute      string
		Summary     string
	}
	// Key: rsid+genotype -> slice of category associations
	snpediaMap := make(map[string][]CatInfo, 50000)
	snpediaRsids := make(map[string]bool, 15000)
	indelRsids := make(map[string]bool) // rsIDs that have any indel genotype
	if snpDB := lib.RefDB(); snpDB != nil {
		// First pass: identify indel rsIDs (any genotype containing '-'),
		// which are excluded entirely in the second pass.
		// NOTE(review): Query/Scan errors are ignored throughout this section;
		// a failed query silently yields an empty reference set.
		indelRows, _ := snpDB.Query("SELECT DISTINCT rsid FROM genotypes WHERE genotype LIKE '%-%'")
		if indelRows != nil {
			for indelRows.Next() {
				var rsid string
				indelRows.Scan(&rsid)
				indelRsids[rsid] = true
			}
			indelRows.Close()
		}
		rows, _ := snpDB.Query("SELECT rsid, genotype_norm, gene, magnitude, repute, summary, category, subcategory FROM genotypes")
		if rows != nil {
			for rows.Next() {
				var rsid, geno string
				var gene, repute, summary, cat, subcat sql.NullString
				var mag float64
				rows.Scan(&rsid, &geno, &gene, &mag, &repute, &summary, &cat, &subcat)
				if cat.String == "" || indelRsids[rsid] {
					continue // skip entries without category or indel variants
				}
				key := rsid + ":" + geno
				snpediaMap[key] = append(snpediaMap[key], CatInfo{
					Category:    cat.String,
					Subcategory: subcat.String,
					Gene:        gene.String,
					Magnitude:   mag,
					Repute:      repute.String,
					Summary:     summary.String,
				})
				snpediaRsids[rsid] = true
			}
			rows.Close()
		}
	}
	// Build valid alleles per rsid from actual genotype entries (not the alleles column,
	// which includes the reference allele that SNPedia doesn't use in genotype notation)
	snpediaAlleles := make(map[string]string, len(snpediaRsids))
	for key := range snpediaMap {
		parts := strings.SplitN(key, ":", 2)
		if len(parts) == 2 {
			rsid, geno := parts[0], parts[1]
			existing := snpediaAlleles[rsid]
			// Accumulate each distinct allele character seen for this rsid.
			for i := 0; i < len(geno); i++ {
				if !strings.ContainsRune(existing, rune(geno[i])) {
					existing += string(geno[i])
				}
			}
			snpediaAlleles[rsid] = existing
		}
	}
	// Match variants (only those with rsid in SNPedia), normalizing genotype to reference strand
	matched := make([]Variant, 0, len(snpediaRsids))
	for _, v := range variants {
		if snpediaRsids[v.RSID] {
			v.Genotype = normalizeGenotype(v.Genotype, snpediaAlleles[v.RSID])
			matched = append(matched, v)
		}
	}
	// Split into positive matches (genotype in SNPedia) and clear findings (rsid in SNPedia but genotype doesn't match any risk variant)
	type clearInfo struct {
		RSID, Genotype, Gene, Category, Subcategory string
		TopMag                                      float64
		RiskDescs                                   []string
	}
	positiveRsids := make(map[string]bool)
	for _, v := range matched {
		if len(snpediaMap[v.RSID+":"+v.Genotype]) > 0 {
			positiveRsids[v.RSID] = true
		}
	}
	// Build clear findings: rsids in SNPedia where user has no matching risk genotype
	var clearFindings []clearInfo
	for _, v := range matched {
		if positiveRsids[v.RSID] {
			continue
		}
		// Find what risk variants SNPedia documents for this rsid
		var topCat, topSub, gene string
		var topMag float64
		seen := make(map[string]bool)
		var descs []string
		// NOTE(review): this rescans the entire snpediaMap once per clear
		// rsid — O(clear x map) overall; an rsid -> keys index built once
		// would avoid it. Also, map iteration order is random, so which
		// genotypes end up in the (max 3) RiskDescs varies between runs.
		for key, infos := range snpediaMap {
			if !strings.HasPrefix(key, v.RSID+":") {
				continue
			}
			for _, info := range infos {
				// Track the highest-magnitude association for placement.
				if topCat == "" || info.Magnitude > topMag {
					topMag = info.Magnitude
					topCat = info.Category
					topSub = info.Subcategory
					gene = info.Gene
				}
				geno := strings.TrimPrefix(key, v.RSID+":")
				// Describe up to 3 distinct risk genotypes, summaries
				// truncated to 40 characters.
				if !seen[geno] && len(descs) < 3 {
					seen[geno] = true
					d := geno
					if info.Summary != "" {
						s := info.Summary
						if len(s) > 40 {
							s = s[:40] + "..."
						}
						d += ": " + s
					}
					descs = append(descs, d)
				}
			}
		}
		if topCat != "" {
			clearFindings = append(clearFindings, clearInfo{
				RSID: v.RSID, Genotype: v.Genotype, Gene: gene,
				Category: topCat, Subcategory: topSub, TopMag: topMag, RiskDescs: descs,
			})
		}
	}
	// Delete existing genome entries (all genome data uses CategoryGenome with different Types)
	lib.EntryDelete("", dossierID, &lib.Filter{Category: lib.CategoryGenome})
	// Create extraction entry (tier 1)
	now := time.Now().Unix()
	totalAssoc := 0
	for _, v := range matched {
		totalAssoc += len(snpediaMap[v.RSID+":"+v.Genotype])
	}
	// NOTE(review): Data JSON is built with Sprintf; uploadID is embedded
	// unescaped — assumes IDs never contain quotes/backslashes. Confirm.
	parentEntry := &lib.Entry{
		DossierID: dossierID,
		ParentID:  dossierID,
		Category:  lib.CategoryGenome,
		Type:      "extraction",
		Value:     format,
		Timestamp: now,
		Data:      fmt.Sprintf(`{"upload_id":"%s","variants":%d,"associations":%d,"clear":%d,"total_parsed":%d}`, uploadID, len(matched), totalAssoc, len(clearFindings), len(variants)),
	}
	lib.EntryWrite("", parentEntry)
	extractionID := parentEntry.EntryID
	// Count shown/hidden per category (deduplicated by category+rsid)
	type catCount struct{ Shown, Hidden int }
	catCounts := map[string]*catCount{}
	type catRsid struct{ cat, rsid string }
	counted := map[catRsid]bool{}
	for _, v := range matched {
		for _, info := range snpediaMap[v.RSID+":"+v.Genotype] {
			key := catRsid{info.Category, v.RSID}
			if counted[key] {
				continue
			}
			counted[key] = true
			c, ok := catCounts[info.Category]
			if !ok {
				c = &catCount{}
				catCounts[info.Category] = c
			}
			// High-magnitude or bad-repute findings are counted as hidden
			// (presumably gated in the UI until revealed — confirm with portal code).
			if info.Magnitude > 4.0 || strings.EqualFold(info.Repute, "bad") {
				c.Hidden++
			} else {
				c.Shown++
			}
		}
	}
	// Clear findings count as shown
	for _, cf := range clearFindings {
		c, ok := catCounts[cf.Category]
		if !ok {
			c = &catCount{}
			catCounts[cf.Category] = c
		}
		c.Shown++
	}
	// Create one tier entry (tier 2) per category carrying shown/hidden counts.
	tierMap := make(map[string]string) // category -> tier entry_id
	for cat, c := range catCounts {
		tierEntry := &lib.Entry{
			DossierID: dossierID,
			ParentID:  extractionID,
			Category:  lib.CategoryGenome,
			Type:      "tier",
			Value:     cat,
			Ordinal:   lib.GenomeTierFromString[cat],
			Timestamp: now,
			Data:      fmt.Sprintf(`{"shown":%d,"hidden":%d}`, c.Shown, c.Hidden),
		}
		lib.EntryWrite("", tierEntry)
		tierMap[cat] = tierEntry.EntryID
	}
	// Batch insert variants (tier 3) - Type="variant", Value=genotype
	// Deduplicate: one entry per tier+rsid (merge subcategories, keep highest magnitude)
	type variantKey struct{ tier, rsid string }
	deduped := make(map[variantKey]*lib.Entry)
	for _, v := range matched {
		for _, info := range snpediaMap[v.RSID+":"+v.Genotype] {
			tierID := tierMap[info.Category]
			key := variantKey{tierID, v.RSID}
			if existing, ok := deduped[key]; ok {
				// Keep higher magnitude entry.
				// Ordinal encodes magnitude as int(100 - mag*10), so lower
				// ordinal = higher magnitude; decode it back to compare.
				if info.Magnitude > float64(100-existing.Ordinal)/10 {
					data := fmt.Sprintf(`{"mag":%.1f,"rep":"%s","sum":"%s","sub":"%s"}`,
						info.Magnitude, info.Repute, strings.ReplaceAll(info.Summary, `"`, `\"`), info.Subcategory)
					existing.Ordinal = int(100 - info.Magnitude*10)
					existing.Data = data
				}
				continue
			}
			data := fmt.Sprintf(`{"mag":%.1f,"rep":"%s","sum":"%s","sub":"%s"}`,
				info.Magnitude, info.Repute, strings.ReplaceAll(info.Summary, `"`, `\"`), info.Subcategory)
			deduped[key] = &lib.Entry{
				DossierID:  dossierID,
				ParentID:   tierID,
				Category:   lib.CategoryGenome,
				Type:       "variant",
				Value:      v.Genotype,
				Ordinal:    int(100 - info.Magnitude*10),
				Timestamp:  now,
				SearchKey:  strings.ToLower(info.Gene),
				SearchKey2: strings.ToLower(v.RSID),
				Data:       data,
			}
		}
	}
	// Add clear findings as variant entries (repute "Clear", magnitude from the risk they cleared)
	for _, cf := range clearFindings {
		tierID := tierMap[cf.Category]
		key := variantKey{tierID, cf.RSID}
		if _, ok := deduped[key]; ok {
			continue // already has a positive match in this tier
		}
		summary := fmt.Sprintf("No risk variant detected. You have %s. (Risks: %s)",
			cf.Genotype, strings.Join(cf.RiskDescs, "; "))
		data := fmt.Sprintf(`{"mag":%.1f,"rep":"Clear","sum":"%s","sub":"%s"}`,
			cf.TopMag, strings.ReplaceAll(summary, `"`, `\"`), cf.Subcategory)
		deduped[key] = &lib.Entry{
			DossierID:  dossierID,
			ParentID:   tierID,
			Category:   lib.CategoryGenome,
			Type:       "variant",
			Value:      cf.Genotype,
			Ordinal:    int(100 - cf.TopMag*10),
			Timestamp:  now,
			SearchKey:  strings.ToLower(cf.Gene),
			SearchKey2: strings.ToLower(cf.RSID),
			Data:       data,
		}
	}
	// Flush variant entries in batches of 500 to bound write size.
	var batch []*lib.Entry
	for _, e := range deduped {
		batch = append(batch, e)
		if len(batch) >= 500 {
			lib.EntryWrite("", batch...)
			batch = batch[:0]
		}
	}
	if len(batch) > 0 {
		lib.EntryWrite("", batch...)
	}
	// Record completion details on the upload entry and leave an audit trail.
	elapsed := time.Since(startTime)
	details := fmt.Sprintf("parent=%s variants=%d parsed=%d elapsed=%v", extractionID, len(matched), len(variants), elapsed)
	updateUploadStatus(uploadID, "completed", details)
	lib.AuditLog(dossierID, "genome_import", dossierID, fmt.Sprintf("%d", len(matched)))
}