491 lines
13 KiB
Go
491 lines
13 KiB
Go
package main
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"database/sql"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
|
|
"inou/lib"
|
|
)
|
|
|
|
// Variant is one genetic variant parsed from an uploaded genome file.
type Variant struct {
	RSID     string // dbSNP reference SNP identifier (always "rs"-prefixed after parsing)
	Genotype string // allele pair; two-letter genotypes are sorted alphabetically by the parser (e.g. "GA" -> "AG")
}
|
|
|
|
// detectGenomeFormat guesses the vendor format from the first data line:
// quoted fields indicate MyHeritage, tab separation indicates AncestryDNA
// (five or more columns) or 23andMe, and anything else is treated as FTDNA.
func detectGenomeFormat(firstLine string) string {
	switch {
	case strings.Contains(firstLine, `"`):
		return "myheritage"
	case strings.Contains(firstLine, "\t"):
		if len(strings.Split(firstLine, "\t")) >= 5 {
			return "ancestry"
		}
		return "23andme"
	default:
		return "ftdna"
	}
}
|
|
|
|
// parseGenomeVariant extracts the rsid and genotype from a single data line of
// a consumer genome file in the given format ("ancestry", "23andme",
// "myheritage", or "ftdna").
//
// It returns ok=false for comment/header lines, malformed or short rows,
// no-calls ("--" genotypes or "0" alleles), empty genotypes, non-"rs"
// identifiers, and unknown formats. Two-letter genotypes are normalized to
// alphabetical allele order (e.g. "GA" -> "AG") so each heterozygote has one
// canonical form.
//
// Fix over the previous version: the "--" no-call marker was only rejected for
// the 23andme format, but MyHeritage and FTDNA exports use the same marker;
// no-calls and empty genotypes are now rejected uniformly for all formats.
// FTDNA lines are also quote-stripped like MyHeritage, so quoted FTDNA CSVs
// parse instead of being dropped.
func parseGenomeVariant(line, format string) (string, string, bool) {
	// Skip comments and the various vendor header rows.
	if strings.HasPrefix(line, "#") || strings.HasPrefix(line, "rsid") || strings.HasPrefix(line, "RSID") ||
		(strings.HasPrefix(line, "\"") && strings.Contains(line, "RSID")) {
		return "", "", false
	}

	var rsid, genotype string

	switch format {
	case "ancestry":
		// AncestryDNA: rsid <tab> chrom <tab> pos <tab> allele1 <tab> allele2
		parts := strings.Split(line, "\t")
		if len(parts) < 5 {
			return "", "", false
		}
		rsid = parts[0]
		allele1, allele2 := parts[3], parts[4]
		if allele1 == "0" || allele2 == "0" { // "0" marks a no-call allele
			return "", "", false
		}
		genotype = allele1 + allele2

	case "23andme":
		// 23andMe: either rsid <tab> genotype, or rsid <tab> chrom <tab> pos <tab> genotype
		parts := strings.Split(line, "\t")
		switch {
		case len(parts) == 2:
			rsid, genotype = parts[0], parts[1]
		case len(parts) >= 4:
			rsid, genotype = parts[0], parts[3]
		default:
			return "", "", false
		}

	case "myheritage", "ftdna":
		// Both are CSV: rsid, chrom, pos, genotype. MyHeritage (and some FTDNA
		// exports) quote every field, so strip quotes before splitting.
		cleaned := strings.ReplaceAll(line, "\"", "")
		parts := strings.Split(cleaned, ",")
		if len(parts) < 4 {
			return "", "", false
		}
		rsid = parts[0]
		genotype = parts[3]

	default:
		// Unknown format: nothing parsed (matches prior behavior, where the
		// empty rsid failed the "rs" prefix check below).
		return "", "", false
	}

	// Reject no-calls and empty genotypes uniformly across formats.
	if genotype == "" || genotype == "--" {
		return "", "", false
	}

	// Only dbSNP "rs" identifiers are usable downstream (e.g. vendor-internal
	// "i..." ids are dropped).
	if !strings.HasPrefix(rsid, "rs") {
		return "", "", false
	}

	// Normalize: sort alleles (GA -> AG) so lookups have one canonical key.
	if len(genotype) == 2 && genotype[0] > genotype[1] {
		genotype = string(genotype[1]) + string(genotype[0])
	}

	return rsid, genotype, true
}
|
|
|
|
// normalizeGenotype maps a two-letter genotype onto the reference strand and
// returns it with alleles in alphabetical order. A base that is not in the
// rsid's known allele set is replaced by its complement (A<->T, C<->G);
// unknown symbols pass through untouched. Genotypes that are not exactly two
// bytes, or an empty allele set, get only the alphabetical sort (when
// applicable) and are otherwise returned unchanged.
func normalizeGenotype(genotype, alleles string) string {
	if len(genotype) != 2 || alleles == "" {
		// No allele reference available: just canonicalize the order.
		if len(genotype) == 2 && genotype[0] > genotype[1] {
			return string(genotype[1]) + string(genotype[0])
		}
		return genotype
	}

	complement := map[byte]byte{'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

	// flip keeps a base that already matches the reference strand and
	// complements one that does not; anything non-ACGT is left as-is.
	flip := func(base byte) byte {
		if strings.IndexByte(alleles, base) >= 0 {
			return base
		}
		if c, ok := complement[base]; ok {
			return c
		}
		return base
	}

	first, second := flip(genotype[0]), flip(genotype[1])
	if first > second {
		first, second = second, first
	}
	return string([]byte{first, second})
}
|
|
|
|
// updateUploadStatus updates the status in the upload entry Data JSON
|
|
func updateUploadStatus(uploadID string, status string, details string) {
|
|
entry, err := lib.EntryGet(nil, uploadID) // nil ctx - internal operation
|
|
if err != nil || entry == nil {
|
|
return
|
|
}
|
|
// Parse existing data, update status, preserve other fields
|
|
var data map[string]interface{}
|
|
json.Unmarshal([]byte(entry.Data), &data)
|
|
if data == nil {
|
|
data = make(map[string]interface{})
|
|
}
|
|
data["status"] = status
|
|
if details != "" {
|
|
data["details"] = details
|
|
}
|
|
newData, _ := json.Marshal(data)
|
|
entry.Data = string(newData)
|
|
lib.EntryWrite("", entry)
|
|
}
|
|
|
|
// processGenomeUpload processes an encrypted genetics file in the background:
// it decrypts and parses the file, matches variants against the SNPedia
// reference DB, and replaces the dossier's genome entries with a three-level
// hierarchy (extraction -> per-category tier -> per-variant), updating the
// upload entry's status along the way. Errors are reported via
// updateUploadStatus rather than returned.
func processGenomeUpload(uploadID string, dossierID string, filePath string) {
	updateUploadStatus(uploadID, "processing", "")
	startTime := time.Now()

	// Read and decrypt the uploaded file; abort (with status) on failure.
	encrypted, err := os.ReadFile(filePath)
	if err != nil {
		updateUploadStatus(uploadID, "failed", "Read failed: "+err.Error())
		return
	}

	decrypted, err := lib.CryptoDecryptBytes(encrypted)
	if err != nil {
		updateUploadStatus(uploadID, "failed", "Decrypt failed: "+err.Error())
		return
	}

	// Parse variants line by line. 1 MiB buffer: raw genome lines are short,
	// but this guards against unusually long lines exceeding Scanner's default.
	scanner := bufio.NewScanner(bytes.NewReader(decrypted))
	scanner.Buffer(make([]byte, 1024*1024), 1024*1024)

	// Find the first non-comment, non-empty line to sniff the vendor format.
	var format string
	var firstDataLine string
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, "#") && len(line) > 0 {
			firstDataLine = line
			break
		}
	}
	format = detectGenomeFormat(firstDataLine)

	// Pre-size for a typical consumer array (~600-700k SNPs).
	variants := make([]Variant, 0, 800000)

	// The sniffed line is itself data (or a header that the parser rejects).
	if rsid, geno, ok := parseGenomeVariant(firstDataLine, format); ok {
		variants = append(variants, Variant{rsid, geno})
	}

	for scanner.Scan() {
		if rsid, geno, ok := parseGenomeVariant(scanner.Text(), format); ok {
			variants = append(variants, Variant{rsid, geno})
		}
	}

	// Sort by rsid for deterministic downstream output.
	sort.Slice(variants, func(i, j int) bool {
		return variants[i].RSID < variants[j].RSID
	})

	// Load SNPedia data from reference DB (initialized at portal startup).
	// CatInfo is one category association for a specific rsid+genotype.
	type CatInfo struct {
		Category    string
		Subcategory string
		Gene        string
		Magnitude   float64
		Repute      string
		Summary     string
	}
	// Key: rsid+genotype -> slice of category associations
	snpediaMap := make(map[string][]CatInfo, 50000)
	snpediaRsids := make(map[string]bool, 15000)
	indelRsids := make(map[string]bool) // rsIDs that have any indel genotype
	if snpDB := lib.RefDB(); snpDB != nil {
		// First pass: identify indel rsIDs (genotypes containing '-'), which
		// are excluded below because array genotypes can't represent them.
		// Query/Scan errors are ignored: a missing table just yields no rows.
		indelRows, _ := snpDB.Query("SELECT DISTINCT rsid FROM genotypes WHERE genotype LIKE '%-%'")
		if indelRows != nil {
			for indelRows.Next() {
				var rsid string
				indelRows.Scan(&rsid)
				indelRsids[rsid] = true
			}
			indelRows.Close()
		}
		// Second pass: load every categorized genotype association.
		rows, _ := snpDB.Query("SELECT rsid, genotype_norm, gene, magnitude, repute, summary, category, subcategory FROM genotypes")
		if rows != nil {
			for rows.Next() {
				var rsid, geno string
				var gene, repute, summary, cat, subcat sql.NullString
				var mag float64
				rows.Scan(&rsid, &geno, &gene, &mag, &repute, &summary, &cat, &subcat)
				if cat.String == "" || indelRsids[rsid] {
					continue // skip entries without category or indel variants
				}
				key := rsid + ":" + geno
				snpediaMap[key] = append(snpediaMap[key], CatInfo{
					Category:    cat.String,
					Subcategory: subcat.String,
					Gene:        gene.String,
					Magnitude:   mag,
					Repute:      repute.String,
					Summary:     summary.String,
				})
				snpediaRsids[rsid] = true
			}
			rows.Close()
		}
	}

	// Build valid alleles per rsid from actual genotype entries (not the alleles column,
	// which includes the reference allele that SNPedia doesn't use in genotype notation).
	// Result: rsid -> string of unique allele letters seen in any documented genotype.
	snpediaAlleles := make(map[string]string, len(snpediaRsids))
	for key := range snpediaMap {
		parts := strings.SplitN(key, ":", 2)
		if len(parts) == 2 {
			rsid, geno := parts[0], parts[1]
			existing := snpediaAlleles[rsid]
			for i := 0; i < len(geno); i++ {
				if !strings.ContainsRune(existing, rune(geno[i])) {
					existing += string(geno[i])
				}
			}
			snpediaAlleles[rsid] = existing
		}
	}

	// Match variants (only those with rsid in SNPedia), normalizing genotype to reference strand
	matched := make([]Variant, 0, len(snpediaRsids))
	for _, v := range variants {
		if snpediaRsids[v.RSID] {
			v.Genotype = normalizeGenotype(v.Genotype, snpediaAlleles[v.RSID])
			matched = append(matched, v)
		}
	}

	// Split into positive matches (genotype in SNPedia) and clear findings (rsid in SNPedia but genotype doesn't match any risk variant)
	type clearInfo struct {
		RSID, Genotype, Gene, Category, Subcategory string
		TopMag                                      float64
		RiskDescs                                   []string
	}
	positiveRsids := make(map[string]bool)
	for _, v := range matched {
		if len(snpediaMap[v.RSID+":"+v.Genotype]) > 0 {
			positiveRsids[v.RSID] = true
		}
	}

	// Build clear findings: rsids in SNPedia where user has no matching risk genotype.
	// NOTE(review): this scans all of snpediaMap per cleared variant (O(matched ×
	// map size)); acceptable for current sizes but a per-rsid index would be faster.
	var clearFindings []clearInfo
	for _, v := range matched {
		if positiveRsids[v.RSID] {
			continue
		}
		// Find what risk variants SNPedia documents for this rsid; keep the
		// highest-magnitude category and up to 3 short risk descriptions.
		var topCat, topSub, gene string
		var topMag float64
		seen := make(map[string]bool)
		var descs []string
		for key, infos := range snpediaMap {
			if !strings.HasPrefix(key, v.RSID+":") {
				continue
			}
			for _, info := range infos {
				if topCat == "" || info.Magnitude > topMag {
					topMag = info.Magnitude
					topCat = info.Category
					topSub = info.Subcategory
					gene = info.Gene
				}
				geno := strings.TrimPrefix(key, v.RSID+":")
				if !seen[geno] && len(descs) < 3 {
					seen[geno] = true
					d := geno
					if info.Summary != "" {
						// Truncate summaries to 40 chars for compact display.
						s := info.Summary
						if len(s) > 40 {
							s = s[:40] + "..."
						}
						d += ": " + s
					}
					descs = append(descs, d)
				}
			}
		}
		if topCat != "" {
			clearFindings = append(clearFindings, clearInfo{
				RSID: v.RSID, Genotype: v.Genotype, Gene: gene,
				Category: topCat, Subcategory: topSub, TopMag: topMag, RiskDescs: descs,
			})
		}
	}

	// Delete existing genome entries (all genome data uses CategoryGenome with different Types),
	// so re-uploading replaces rather than duplicates.
	lib.EntryDelete("", dossierID, &lib.Filter{Category: lib.CategoryGenome})

	// Create extraction entry (tier 1): one root record summarizing the import.
	now := time.Now().Unix()
	totalAssoc := 0
	for _, v := range matched {
		totalAssoc += len(snpediaMap[v.RSID+":"+v.Genotype])
	}

	parentEntry := &lib.Entry{
		DossierID: dossierID,
		ParentID:  dossierID,
		Category:  lib.CategoryGenome,
		Type:      "extraction",
		Value:     format,
		Timestamp: now,
		Data:      fmt.Sprintf(`{"upload_id":"%s","variants":%d,"associations":%d,"clear":%d,"total_parsed":%d}`, uploadID, len(matched), totalAssoc, len(clearFindings), len(variants)),
	}
	lib.EntryWrite("", parentEntry)
	// EntryID is read back after the write — presumably EntryWrite assigns it;
	// TODO(review): confirm against lib.EntryWrite.
	extractionID := parentEntry.EntryID

	// Count shown/hidden per category (deduplicated by category+rsid).
	// "Hidden" = high-magnitude or bad-repute findings gated from default view.
	type catCount struct{ Shown, Hidden int }
	catCounts := map[string]*catCount{}
	type catRsid struct{ cat, rsid string }
	counted := map[catRsid]bool{}
	for _, v := range matched {
		for _, info := range snpediaMap[v.RSID+":"+v.Genotype] {
			key := catRsid{info.Category, v.RSID}
			if counted[key] {
				continue
			}
			counted[key] = true
			c, ok := catCounts[info.Category]
			if !ok {
				c = &catCount{}
				catCounts[info.Category] = c
			}
			if info.Magnitude > 4.0 || strings.EqualFold(info.Repute, "bad") {
				c.Hidden++
			} else {
				c.Shown++
			}
		}
	}
	// Clear findings count as shown
	for _, cf := range clearFindings {
		c, ok := catCounts[cf.Category]
		if !ok {
			c = &catCount{}
			catCounts[cf.Category] = c
		}
		c.Shown++
	}

	// Create one tier entry (level 2) per category with shown/hidden counts.
	tierMap := make(map[string]string) // category -> tier entry_id
	for cat, c := range catCounts {
		tierEntry := &lib.Entry{
			DossierID: dossierID,
			ParentID:  extractionID,
			Category:  lib.CategoryGenome,
			Type:      "tier",
			Value:     cat,
			Ordinal:   lib.GenomeTierFromString[cat],
			Timestamp: now,
			Data:      fmt.Sprintf(`{"shown":%d,"hidden":%d}`, c.Shown, c.Hidden),
		}
		lib.EntryWrite("", tierEntry)
		tierMap[cat] = tierEntry.EntryID
	}

	// Batch insert variants (tier 3) - Type="variant", Value=genotype.
	// Deduplicate: one entry per tier+rsid (keep the highest-magnitude association).
	type variantKey struct{ tier, rsid string }
	deduped := make(map[variantKey]*lib.Entry)

	for _, v := range matched {
		for _, info := range snpediaMap[v.RSID+":"+v.Genotype] {
			tierID := tierMap[info.Category]
			key := variantKey{tierID, v.RSID}

			if existing, ok := deduped[key]; ok {
				// Keep higher magnitude entry. Ordinal encodes magnitude as
				// 100 - mag*10, so this recovers the stored magnitude for comparison.
				if info.Magnitude > float64(100-existing.Ordinal)/10 {
					data := fmt.Sprintf(`{"mag":%.1f,"rep":"%s","sum":"%s","sub":"%s"}`,
						info.Magnitude, info.Repute, strings.ReplaceAll(info.Summary, `"`, `\"`), info.Subcategory)
					existing.Ordinal = int(100 - info.Magnitude*10)
					existing.Data = data
				}
				continue
			}

			data := fmt.Sprintf(`{"mag":%.1f,"rep":"%s","sum":"%s","sub":"%s"}`,
				info.Magnitude, info.Repute, strings.ReplaceAll(info.Summary, `"`, `\"`), info.Subcategory)

			deduped[key] = &lib.Entry{
				DossierID:  dossierID,
				ParentID:   tierID,
				Category:   lib.CategoryGenome,
				Type:       "variant",
				Value:      v.Genotype,
				Ordinal:    int(100 - info.Magnitude*10), // lower ordinal = higher magnitude (sorts first)
				Timestamp:  now,
				SearchKey:  strings.ToLower(info.Gene),
				SearchKey2: strings.ToLower(v.RSID),
				Data:       data,
			}
		}
	}

	// Add clear findings as variant entries (repute "Clear", magnitude from the risk they cleared)
	for _, cf := range clearFindings {
		tierID := tierMap[cf.Category]
		key := variantKey{tierID, cf.RSID}
		if _, ok := deduped[key]; ok {
			continue // already has a positive match in this tier
		}
		summary := fmt.Sprintf("No risk variant detected. You have %s. (Risks: %s)",
			cf.Genotype, strings.Join(cf.RiskDescs, "; "))
		data := fmt.Sprintf(`{"mag":%.1f,"rep":"Clear","sum":"%s","sub":"%s"}`,
			cf.TopMag, strings.ReplaceAll(summary, `"`, `\"`), cf.Subcategory)
		deduped[key] = &lib.Entry{
			DossierID:  dossierID,
			ParentID:   tierID,
			Category:   lib.CategoryGenome,
			Type:       "variant",
			Value:      cf.Genotype,
			Ordinal:    int(100 - cf.TopMag*10),
			Timestamp:  now,
			SearchKey:  strings.ToLower(cf.Gene),
			SearchKey2: strings.ToLower(cf.RSID),
			Data:       data,
		}
	}

	// Flush variant entries in batches of 500 to bound per-call work.
	var batch []*lib.Entry
	for _, e := range deduped {
		batch = append(batch, e)
		if len(batch) >= 500 {
			lib.EntryWrite("", batch...)
			batch = batch[:0]
		}
	}
	if len(batch) > 0 {
		lib.EntryWrite("", batch...)
	}

	// Record completion with timing/size details and audit the import.
	elapsed := time.Since(startTime)
	details := fmt.Sprintf("parent=%s variants=%d parsed=%d elapsed=%v", extractionID, len(matched), len(variants), elapsed)
	updateUploadStatus(uploadID, "completed", details)
	lib.AuditLog(dossierID, "genome_import", dossierID, fmt.Sprintf("%d", len(matched)))
}
|