inou/portal/genome.go

491 lines
13 KiB
Go

package main
import (
"bufio"
"bytes"
"database/sql"
"encoding/json"
"fmt"
"os"
"sort"
"strings"
"time"
"inou/lib"
)
// Variant is one parsed genome line: a dbSNP identifier ("rs...") and the
// user's genotype as produced by parseGenomeVariant (two allele characters,
// byte-sorted, e.g. "AG").
type Variant struct {
	RSID     string
	Genotype string
}
// detectGenomeFormat returns the vendor format name based on the first
// data line of the upload: quoted CSV is MyHeritage, tab-separated data is
// AncestryDNA (5+ columns) or 23andMe, and anything else is FamilyTreeDNA.
func detectGenomeFormat(firstLine string) string {
	switch {
	case strings.Contains(firstLine, `"`):
		// MyHeritage CSV quotes its fields.
		return "myheritage"
	case strings.Contains(firstLine, "\t"):
		// Tab-separated: AncestryDNA splits the two alleles into separate
		// columns (5 fields); 23andMe keeps a single genotype column.
		if fields := strings.Split(firstLine, "\t"); len(fields) >= 5 {
			return "ancestry"
		}
		return "23andme"
	default:
		// Unquoted comma-separated data: FamilyTreeDNA.
		return "ftdna"
	}
}
// parseGenomeVariant extracts the rsid and genotype from one data line in
// the given vendor format. It returns ok=false for comment/header lines,
// malformed rows, no-calls, and identifiers that are not dbSNP "rs" IDs.
// The returned genotype has its two alleles byte-sorted (GA -> AG).
func parseGenomeVariant(line, format string) (string, string, bool) {
	// Reject comment lines and the various vendor header rows.
	header := strings.HasPrefix(line, "#") ||
		strings.HasPrefix(line, "rsid") ||
		strings.HasPrefix(line, "RSID") ||
		(strings.HasPrefix(line, "\"") && strings.Contains(line, "RSID"))
	if header {
		return "", "", false
	}
	var rsid, genotype string
	switch format {
	case "ancestry":
		cols := strings.Split(line, "\t")
		if len(cols) < 5 {
			return "", "", false
		}
		// Alleles arrive in two separate columns; "0" marks a no-call.
		if cols[3] == "0" || cols[4] == "0" {
			return "", "", false
		}
		rsid, genotype = cols[0], cols[3]+cols[4]
	case "23andme":
		cols := strings.Split(line, "\t")
		switch {
		case len(cols) == 2:
			rsid, genotype = cols[0], cols[1]
		case len(cols) >= 4:
			rsid, genotype = cols[0], cols[3]
		default:
			return "", "", false
		}
		// "--" marks a no-call in this format.
		if genotype == "--" {
			return "", "", false
		}
	case "myheritage":
		// Strip the CSV quoting, then treat it like plain comma-separated data.
		cols := strings.Split(strings.ReplaceAll(line, "\"", ""), ",")
		if len(cols) < 4 {
			return "", "", false
		}
		rsid, genotype = cols[0], cols[3]
	case "ftdna":
		cols := strings.Split(line, ",")
		if len(cols) < 4 {
			return "", "", false
		}
		rsid, genotype = cols[0], cols[3]
	}
	// Only keep dbSNP-style identifiers (also rejects unknown formats,
	// which leave rsid empty).
	if !strings.HasPrefix(rsid, "rs") {
		return "", "", false
	}
	// Normalize: sort alleles (GA -> AG)
	if len(genotype) == 2 && genotype[0] > genotype[1] {
		genotype = string(genotype[1]) + string(genotype[0])
	}
	return rsid, genotype, true
}
// normalizeGenotype complements alleles to match the reference strand, then
// sorts the pair. alleles is the set of allele characters SNPedia documents
// for this rsid; a genotype byte not in that set is flipped to its DNA
// complement when one exists, otherwise kept as-is. With no allele set (or a
// genotype that isn't exactly two bytes) only the sort is applied.
func normalizeGenotype(genotype, alleles string) string {
	if len(genotype) != 2 || alleles == "" {
		// Nothing to strand-correct; canonicalize order when we can.
		if len(genotype) == 2 && genotype[0] > genotype[1] {
			return string(genotype[1]) + string(genotype[0])
		}
		return genotype
	}
	inRef := func(b byte) bool { return strings.IndexByte(alleles, b) >= 0 }
	complement := func(b byte) byte {
		switch b {
		case 'A':
			return 'T'
		case 'T':
			return 'A'
		case 'C':
			return 'G'
		case 'G':
			return 'C'
		}
		return 0
	}
	first, second := genotype[0], genotype[1]
	if !inRef(first) {
		if c := complement(first); c != 0 {
			first = c
		}
	}
	if !inRef(second) {
		if c := complement(second); c != 0 {
			second = c
		}
	}
	if first > second {
		first, second = second, first
	}
	return string([]byte{first, second})
}
// updateUploadStatus updates the status in the upload entry Data JSON.
// It merges into the existing payload so other fields in Data survive;
// details is only written when non-empty.
func updateUploadStatus(uploadID string, status string, details string) {
	entry, err := lib.EntryGet(nil, uploadID) // nil ctx - internal operation
	if err != nil || entry == nil {
		return
	}
	// Best-effort decode of the current payload; malformed JSON just
	// starts a fresh map.
	var payload map[string]interface{}
	json.Unmarshal([]byte(entry.Data), &payload)
	if payload == nil {
		payload = make(map[string]interface{})
	}
	payload["status"] = status
	if details != "" {
		payload["details"] = details
	}
	encoded, _ := json.Marshal(payload)
	entry.Data = string(encoded)
	lib.EntryWrite("", entry)
}
// processGenomeUpload processes a genetics file in the background.
//
// Pipeline: read and decrypt the uploaded raw file, parse vendor-specific
// variant lines, match rsIDs against the SNPedia reference DB, and write a
// three-tier entry hierarchy under the dossier (extraction -> category tier
// -> variant), replacing any previous genome import. Progress/outcome is
// reported back through the upload entry's Data JSON via updateUploadStatus.
func processGenomeUpload(uploadID string, dossierID string, filePath string) {
	updateUploadStatus(uploadID, "processing", "")
	startTime := time.Now()
	// Read and decrypt
	encrypted, err := os.ReadFile(filePath)
	if err != nil {
		updateUploadStatus(uploadID, "failed", "Read failed: "+err.Error())
		return
	}
	decrypted, err := lib.CryptoDecryptBytes(encrypted)
	if err != nil {
		updateUploadStatus(uploadID, "failed", "Decrypt failed: "+err.Error())
		return
	}
	// Parse variants
	scanner := bufio.NewScanner(bytes.NewReader(decrypted))
	// Raise the line-length cap above bufio's 64 KiB default.
	scanner.Buffer(make([]byte, 1024*1024), 1024*1024)
	var format string
	var firstDataLine string
	// Skip leading '#' comment lines; the first real line decides the format.
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, "#") && len(line) > 0 {
			firstDataLine = line
			break
		}
	}
	format = detectGenomeFormat(firstDataLine)
	variants := make([]Variant, 0, 800000) // pre-size for a typical consumer array
	// The sniffed line is itself data (or a header, which parse rejects) —
	// handle it before resuming the scan.
	if rsid, geno, ok := parseGenomeVariant(firstDataLine, format); ok {
		variants = append(variants, Variant{rsid, geno})
	}
	for scanner.Scan() {
		if rsid, geno, ok := parseGenomeVariant(scanner.Text(), format); ok {
			variants = append(variants, Variant{rsid, geno})
		}
	}
	// NOTE(review): scanner.Err() is never checked — an I/O error or an
	// over-long line would silently truncate the variant list instead of
	// failing the upload. Confirm whether that is acceptable here.
	// Sort by rsid
	sort.Slice(variants, func(i, j int) bool {
		return variants[i].RSID < variants[j].RSID
	})
	// Load SNPedia data from reference DB (initialized at portal startup)
	// CatInfo is one category association for an (rsid, genotype) pair.
	type CatInfo struct {
		Category    string
		Subcategory string
		Gene        string
		Magnitude   float64
		Repute      string
		Summary     string
	}
	// Key: rsid+genotype -> slice of category associations
	snpediaMap := make(map[string][]CatInfo, 50000)
	snpediaRsids := make(map[string]bool, 15000)
	indelRsids := make(map[string]bool) // rsIDs that have any indel genotype
	if snpDB := lib.RefDB(); snpDB != nil {
		// First pass: identify indel rsIDs (any genotype containing '-'),
		// which are excluded entirely in the second pass.
		// NOTE(review): Query/Scan errors are ignored throughout this section;
		// a failed query silently yields an empty reference set.
		indelRows, _ := snpDB.Query("SELECT DISTINCT rsid FROM genotypes WHERE genotype LIKE '%-%'")
		if indelRows != nil {
			for indelRows.Next() {
				var rsid string
				indelRows.Scan(&rsid)
				indelRsids[rsid] = true
			}
			indelRows.Close()
		}
		rows, _ := snpDB.Query("SELECT rsid, genotype_norm, gene, magnitude, repute, summary, category, subcategory FROM genotypes")
		if rows != nil {
			for rows.Next() {
				var rsid, geno string
				var gene, repute, summary, cat, subcat sql.NullString
				var mag float64
				rows.Scan(&rsid, &geno, &gene, &mag, &repute, &summary, &cat, &subcat)
				if cat.String == "" || indelRsids[rsid] {
					continue // skip entries without category or indel variants
				}
				key := rsid + ":" + geno
				snpediaMap[key] = append(snpediaMap[key], CatInfo{
					Category:    cat.String,
					Subcategory: subcat.String,
					Gene:        gene.String,
					Magnitude:   mag,
					Repute:      repute.String,
					Summary:     summary.String,
				})
				snpediaRsids[rsid] = true
			}
			rows.Close()
		}
	}
	// Build valid alleles per rsid from actual genotype entries (not the alleles column,
	// which includes the reference allele that SNPedia doesn't use in genotype notation)
	snpediaAlleles := make(map[string]string, len(snpediaRsids))
	for key := range snpediaMap {
		parts := strings.SplitN(key, ":", 2)
		if len(parts) == 2 {
			rsid, geno := parts[0], parts[1]
			existing := snpediaAlleles[rsid]
			// Accumulate each distinct allele character seen for this rsid.
			for i := 0; i < len(geno); i++ {
				if !strings.ContainsRune(existing, rune(geno[i])) {
					existing += string(geno[i])
				}
			}
			snpediaAlleles[rsid] = existing
		}
	}
	// Match variants (only those with rsid in SNPedia), normalizing genotype to reference strand
	matched := make([]Variant, 0, len(snpediaRsids))
	for _, v := range variants {
		if snpediaRsids[v.RSID] {
			v.Genotype = normalizeGenotype(v.Genotype, snpediaAlleles[v.RSID])
			matched = append(matched, v)
		}
	}
	// Split into positive matches (genotype in SNPedia) and clear findings (rsid in SNPedia but genotype doesn't match any risk variant)
	type clearInfo struct {
		RSID, Genotype, Gene, Category, Subcategory string
		TopMag                                      float64
		RiskDescs                                   []string
	}
	positiveRsids := make(map[string]bool)
	for _, v := range matched {
		if len(snpediaMap[v.RSID+":"+v.Genotype]) > 0 {
			positiveRsids[v.RSID] = true
		}
	}
	// Build clear findings: rsids in SNPedia where user has no matching risk genotype
	var clearFindings []clearInfo
	for _, v := range matched {
		if positiveRsids[v.RSID] {
			continue
		}
		// Find what risk variants SNPedia documents for this rsid
		var topCat, topSub, gene string
		var topMag float64
		seen := make(map[string]bool)
		var descs []string
		// NOTE(review): this rescans the entire snpediaMap once per clear
		// rsid — O(clear x map) overall; an rsid -> keys index built once
		// would avoid it. Also, map iteration order is random, so which
		// genotypes end up in the (max 3) RiskDescs varies between runs.
		for key, infos := range snpediaMap {
			if !strings.HasPrefix(key, v.RSID+":") {
				continue
			}
			for _, info := range infos {
				// Track the highest-magnitude association for placement.
				if topCat == "" || info.Magnitude > topMag {
					topMag = info.Magnitude
					topCat = info.Category
					topSub = info.Subcategory
					gene = info.Gene
				}
				geno := strings.TrimPrefix(key, v.RSID+":")
				// Describe up to 3 distinct risk genotypes, summaries
				// truncated to 40 characters.
				if !seen[geno] && len(descs) < 3 {
					seen[geno] = true
					d := geno
					if info.Summary != "" {
						s := info.Summary
						if len(s) > 40 {
							s = s[:40] + "..."
						}
						d += ": " + s
					}
					descs = append(descs, d)
				}
			}
		}
		if topCat != "" {
			clearFindings = append(clearFindings, clearInfo{
				RSID: v.RSID, Genotype: v.Genotype, Gene: gene,
				Category: topCat, Subcategory: topSub, TopMag: topMag, RiskDescs: descs,
			})
		}
	}
	// Delete existing genome entries (all genome data uses CategoryGenome with different Types)
	lib.EntryDelete("", dossierID, &lib.Filter{Category: lib.CategoryGenome})
	// Create extraction entry (tier 1)
	now := time.Now().Unix()
	totalAssoc := 0
	for _, v := range matched {
		totalAssoc += len(snpediaMap[v.RSID+":"+v.Genotype])
	}
	// NOTE(review): Data JSON is built with Sprintf; uploadID is embedded
	// unescaped — assumes IDs never contain quotes/backslashes. Confirm.
	parentEntry := &lib.Entry{
		DossierID: dossierID,
		ParentID:  dossierID,
		Category:  lib.CategoryGenome,
		Type:      "extraction",
		Value:     format,
		Timestamp: now,
		Data:      fmt.Sprintf(`{"upload_id":"%s","variants":%d,"associations":%d,"clear":%d,"total_parsed":%d}`, uploadID, len(matched), totalAssoc, len(clearFindings), len(variants)),
	}
	lib.EntryWrite("", parentEntry)
	extractionID := parentEntry.EntryID
	// Count shown/hidden per category (deduplicated by category+rsid)
	type catCount struct{ Shown, Hidden int }
	catCounts := map[string]*catCount{}
	type catRsid struct{ cat, rsid string }
	counted := map[catRsid]bool{}
	for _, v := range matched {
		for _, info := range snpediaMap[v.RSID+":"+v.Genotype] {
			key := catRsid{info.Category, v.RSID}
			if counted[key] {
				continue
			}
			counted[key] = true
			c, ok := catCounts[info.Category]
			if !ok {
				c = &catCount{}
				catCounts[info.Category] = c
			}
			// High-magnitude or bad-repute findings are counted as hidden
			// (presumably gated in the UI until revealed — confirm with portal code).
			if info.Magnitude > 4.0 || strings.EqualFold(info.Repute, "bad") {
				c.Hidden++
			} else {
				c.Shown++
			}
		}
	}
	// Clear findings count as shown
	for _, cf := range clearFindings {
		c, ok := catCounts[cf.Category]
		if !ok {
			c = &catCount{}
			catCounts[cf.Category] = c
		}
		c.Shown++
	}
	// Create one tier entry (tier 2) per category carrying shown/hidden counts.
	tierMap := make(map[string]string) // category -> tier entry_id
	for cat, c := range catCounts {
		tierEntry := &lib.Entry{
			DossierID: dossierID,
			ParentID:  extractionID,
			Category:  lib.CategoryGenome,
			Type:      "tier",
			Value:     cat,
			Ordinal:   lib.GenomeTierFromString[cat],
			Timestamp: now,
			Data:      fmt.Sprintf(`{"shown":%d,"hidden":%d}`, c.Shown, c.Hidden),
		}
		lib.EntryWrite("", tierEntry)
		tierMap[cat] = tierEntry.EntryID
	}
	// Batch insert variants (tier 3) - Type="variant", Value=genotype
	// Deduplicate: one entry per tier+rsid (merge subcategories, keep highest magnitude)
	type variantKey struct{ tier, rsid string }
	deduped := make(map[variantKey]*lib.Entry)
	for _, v := range matched {
		for _, info := range snpediaMap[v.RSID+":"+v.Genotype] {
			tierID := tierMap[info.Category]
			key := variantKey{tierID, v.RSID}
			if existing, ok := deduped[key]; ok {
				// Keep higher magnitude entry.
				// Ordinal encodes magnitude as int(100 - mag*10), so lower
				// ordinal = higher magnitude; decode it back to compare.
				if info.Magnitude > float64(100-existing.Ordinal)/10 {
					data := fmt.Sprintf(`{"mag":%.1f,"rep":"%s","sum":"%s","sub":"%s"}`,
						info.Magnitude, info.Repute, strings.ReplaceAll(info.Summary, `"`, `\"`), info.Subcategory)
					existing.Ordinal = int(100 - info.Magnitude*10)
					existing.Data = data
				}
				continue
			}
			data := fmt.Sprintf(`{"mag":%.1f,"rep":"%s","sum":"%s","sub":"%s"}`,
				info.Magnitude, info.Repute, strings.ReplaceAll(info.Summary, `"`, `\"`), info.Subcategory)
			deduped[key] = &lib.Entry{
				DossierID:  dossierID,
				ParentID:   tierID,
				Category:   lib.CategoryGenome,
				Type:       "variant",
				Value:      v.Genotype,
				Ordinal:    int(100 - info.Magnitude*10),
				Timestamp:  now,
				SearchKey:  strings.ToLower(info.Gene),
				SearchKey2: strings.ToLower(v.RSID),
				Data:       data,
			}
		}
	}
	// Add clear findings as variant entries (repute "Clear", magnitude from the risk they cleared)
	for _, cf := range clearFindings {
		tierID := tierMap[cf.Category]
		key := variantKey{tierID, cf.RSID}
		if _, ok := deduped[key]; ok {
			continue // already has a positive match in this tier
		}
		summary := fmt.Sprintf("No risk variant detected. You have %s. (Risks: %s)",
			cf.Genotype, strings.Join(cf.RiskDescs, "; "))
		data := fmt.Sprintf(`{"mag":%.1f,"rep":"Clear","sum":"%s","sub":"%s"}`,
			cf.TopMag, strings.ReplaceAll(summary, `"`, `\"`), cf.Subcategory)
		deduped[key] = &lib.Entry{
			DossierID:  dossierID,
			ParentID:   tierID,
			Category:   lib.CategoryGenome,
			Type:       "variant",
			Value:      cf.Genotype,
			Ordinal:    int(100 - cf.TopMag*10),
			Timestamp:  now,
			SearchKey:  strings.ToLower(cf.Gene),
			SearchKey2: strings.ToLower(cf.RSID),
			Data:       data,
		}
	}
	// Flush variant entries in batches of 500 to bound write size.
	var batch []*lib.Entry
	for _, e := range deduped {
		batch = append(batch, e)
		if len(batch) >= 500 {
			lib.EntryWrite("", batch...)
			batch = batch[:0]
		}
	}
	if len(batch) > 0 {
		lib.EntryWrite("", batch...)
	}
	// Record completion details on the upload entry and leave an audit trail.
	elapsed := time.Since(startTime)
	details := fmt.Sprintf("parent=%s variants=%d parsed=%d elapsed=%v", extractionID, len(matched), len(variants), elapsed)
	updateUploadStatus(uploadID, "completed", details)
	lib.AuditLog(dossierID, "genome_import", dossierID, fmt.Sprintf("%d", len(matched)))
}