diff --git a/clavis/clavis-vault/clavitor-linux-arm64 b/clavis/clavis-vault/clavitor-linux-arm64 index 5a2793f..859afb8 100755 Binary files a/clavis/clavis-vault/clavitor-linux-arm64 and b/clavis/clavis-vault/clavitor-linux-arm64 differ diff --git a/clavitor.com/clavitor-web b/clavitor.com/clavitor-web index cbb4325..f2201c5 100755 Binary files a/clavitor.com/clavitor-web and b/clavitor.com/clavitor-web differ diff --git a/clavitor.com/clavitor-web-linux-amd64 b/clavitor.com/clavitor-web-linux-amd64 index 5f03778..ec7c6da 100755 Binary files a/clavitor.com/clavitor-web-linux-amd64 and b/clavitor.com/clavitor-web-linux-amd64 differ diff --git a/clavitor.com/clavitor.db-shm b/clavitor.com/clavitor.db-shm index 4d60724..e6078b5 100644 Binary files a/clavitor.com/clavitor.db-shm and b/clavitor.com/clavitor.db-shm differ diff --git a/clavitor.com/clavitor.db-wal b/clavitor.com/clavitor.db-wal index 6df1d49..24f7b84 100644 Binary files a/clavitor.com/clavitor.db-wal and b/clavitor.com/clavitor.db-wal differ diff --git a/clavitor.com/main.go b/clavitor.com/main.go index 025754e..d93bc9b 100644 --- a/clavitor.com/main.go +++ b/clavitor.com/main.go @@ -78,7 +78,7 @@ func loadTemplates() { } func loadPops() []Pop { - rows, err := db.Query("SELECT pop_id, city, country, lat, lon, region_name, ip, dns, status, provider FROM pops ORDER BY CASE status WHEN 'live' THEN 0 ELSE 1 END, lon") + rows, err := db.Query("SELECT pop_id, city, country, lat, lon, region_name, ip, dns, status, provider FROM pops ORDER BY CASE status WHEN 'live' THEN 0 ELSE 1 END, lon DESC") if err != nil { log.Printf("pops query error: %v", err) return nil @@ -378,9 +378,10 @@ func main() { if !nocPin(r) { http.NotFound(w, r); return } pops := loadPops() type N struct { - ID string `json:"ID"` - City string `json:"City"` - Status string `json:"Status"` + ID string `json:"ID"` + City string `json:"City"` + Country string `json:"Country"` + Status string `json:"Status"` } var nodes []N for _, p := range pops { @@ -391,7 +392,7 @@ func main() { if id == "" { id = p.City } - nodes = append(nodes, N{ID: id, City: p.City, Status: p.Status}) + nodes = append(nodes, N{ID: id, City: p.City, Country: countryName(p.Country), Status: p.Status}) } w.Header().Set("Content-Type", "application/json") json.NewEncoder(w).Encode(map[string]any{"nodes": nodes}) @@ -471,6 +472,24 @@ func main() { return -1 } + // Find when this node first came online (first span ever) + var firstEver int64 + db.QueryRow(`SELECT MIN(start_at) FROM uptime_spans WHERE node_id = ?`, nodeID).Scan(&firstEver) + + // If the node didn't exist yet on this day, no data + if firstEver == 0 || firstEver >= deUnix { + return -1 + } + + // If the node came online partway through this day, start counting from then + if firstEver > dsUnix { + dsUnix = firstEver + totalSeconds = deUnix - dsUnix + if totalSeconds <= 0 { + return -1 + } + } + // Sum span overlap with this day var upSeconds int64 var hasSpans bool @@ -724,6 +743,13 @@ func main() { deUnix = time.Now().Unix() } + // Don't count time before the node first came online + var firstEver int64 + db.QueryRow(`SELECT MIN(start_at) FROM uptime_spans WHERE node_id = ?`, node).Scan(&firstEver) + if firstEver > 0 && firstEver > dsUnix { + dsUnix = firstEver + } + type Span struct { Start int64 `json:"start"` End int64 `json:"end"` diff --git a/clavitor.com/templates/noc.tmpl b/clavitor.com/templates/noc.tmpl index b357577..1690dfd 100644 --- a/clavitor.com/templates/noc.tmpl +++ b/clavitor.com/templates/noc.tmpl @@ -49,9 +49,17 @@ ctx.scale(devicePixelRatio, devicePixelRatio); if (!points || points.length < 2) return; const vals = points.map(p => p[key]); - const max = Math.max(...vals, 5); + const dataMax = Math.max(...vals); + // Snap ceiling to nice round number above data + const max = dataMax <= 5 ? 5 : dataMax <= 10 ? 10 : dataMax <= 25 ? 25 : dataMax <= 50 ? 50 : 100; const xs = i => (i / (points.length-1)) * W; const ys = v => H - 2 - (v/max) * (H-4); + // Guide lines at 25%, 50%, 75% of ceiling + ctx.strokeStyle = 'rgba(0,0,0,0.06)'; ctx.lineWidth = 1; + for (const f of [0.25, 0.5, 0.75]) { + const y = ys(max * f); + ctx.beginPath(); ctx.moveTo(0, y); ctx.lineTo(W, y); ctx.stroke(); + } ctx.beginPath(); ctx.strokeStyle = color; ctx.lineWidth = 1.5; ctx.moveTo(xs(0), ys(vals[0])); for (let i=1;i -
${t._city || t.node_id}PENDING
+
${t._city || t.node_id}${t._country || ''}
PENDING
Awaiting telemetry
`; const ageS = Math.round(Date.now()/1000 - t.received_at); @@ -77,18 +85,16 @@ return `
- ${t._city || t.node_id} +
${t._city || t.node_id}${t._country || ''}
${statusText}
${t.hostname || ''} · v${t.version || ''}
CPU
${t.cpu_percent.toFixed(1)}%
load ${t.load_1m.toFixed(2)}
-
Memory
${memPct}%
${t.memory_used_mb} / ${t.memory_total_mb} MB
+
Memory
${memPct}%
${t.memory_used_mb} / ${t.memory_total_mb} MB
-
Mem${memPct}%
Disk${diskPct}% · ${t.disk_used_mb} / ${t.disk_total_mb} MB
-
CPU % — last ${hist ? hist.length : 0} samples
-
Mem % — last ${hist ? hist.length : 0} samples
+
CPU % — last 30 min
↑ ${fmtUptime(t.uptime_seconds)} ◈ ${t.vault_count} vaults · ${t.vault_size_mb.toFixed(1)} MB @@ -118,6 +124,7 @@ const nodes = liveNodes.map(n => { const t = tMap[n.ID] || {node_id:n.ID, _pending:true}; t._city = n.City || n.ID; + t._country = n.Country || ''; return t; }); document.getElementById('noc-error').style.display = 'none'; @@ -138,9 +145,7 @@ nodes.forEach(t => { const hist = history[t.node_id] || []; const cpu = document.getElementById('spark-cpu-'+t.node_id); - const mem = document.getElementById('spark-mem-'+t.node_id); if (cpu) drawSpark(cpu, hist, 'cpu', 'rgb(220,38,38)'); - if (mem) drawSpark(mem, hist, 'mem_pct', 'rgb(10,10,10)'); }); document.getElementById('noc-updated').textContent = 'Updated ' + new Date().toLocaleTimeString(); } catch(e) { diff --git a/operations/pop-sync/main.go b/operations/pop-sync/main.go index 98ee91e..7ca80d7 100644 --- a/operations/pop-sync/main.go +++ b/operations/pop-sync/main.go @@ -797,7 +797,46 @@ func provisionNode(cfg Config, pop POP) NodeResult { ec2Client := ec2.NewFromConfig(awsCfg) ssmClient := ssm.NewFromConfig(awsCfg) - // 1. Look up latest AL2023 ARM64 AMI + // Auto-generate DNS subdomain if not set + sub := pop.Subdomain() + if sub == "" { + prefix := strings.ToLower(pop.Country) + // Find next available ordinal + allPOPs, _ := readPOPs(cfg.DBPath) + ordinal := 1 + for { + candidate := fmt.Sprintf("%s%d", prefix, ordinal) + taken := false + for _, p := range allPOPs { + if p.Subdomain() == candidate { + taken = true + break + } + } + if !taken { + sub = candidate + break + } + ordinal++ + } + // Persist to DB immediately + dns := sub + "." + cfg.Zone + pop.DNS = dns + localDB, _ := sql.Open("sqlite", cfg.DBPath) + localDB.Exec(`UPDATE pops SET dns=? WHERE pop_id=?`, dns, pop.PopID) + localDB.Close() + log(cfg, " [%s] auto-assigned DNS: %s", pop.City, dns) + } + dns := sub + "." + cfg.Zone + + if cfg.DryRun { + r.OK = true + r.Message = fmt.Sprintf("would provision %s (%s) as %s", pop.City, region, dns) + log(cfg, " [%s] DRY RUN: %s", pop.City, r.Message) + return r + } + + // --- Step 1: Launch EC2 --- log(cfg, " [%s] looking up AMI in %s...", pop.City, region) amiParam, err := ssmClient.GetParameter(ctx, &ssm.GetParameterInput{ Name: strPtr("/aws/service/ami-amazon-linux-latest/al2023-ami-kernel-default-arm64"), @@ -807,32 +846,14 @@ func provisionNode(cfg Config, pop POP) NodeResult { return r } amiID := *amiParam.Parameter.Value - log(cfg, " [%s] AMI: %s", pop.City, amiID) - // 2. Find or create security group sgID, err := ensureSecurityGroup(ctx, ec2Client, region, pop.City, cfg) if err != nil { r.Error = fmt.Sprintf("security group: %v", err) return r } - log(cfg, " [%s] SG: %s", pop.City, sgID) - // 3. Determine subdomain for the node - sub := pop.Subdomain() - if sub == "" { - // Generate from country code + ordinal - sub = strings.ToLower(pop.Country) + "1" - } - - if cfg.DryRun { - r.OK = true - r.Message = fmt.Sprintf("would launch t4g.nano in %s (AMI: %s, SG: %s)", region, amiID, sgID) - log(cfg, " [%s] DRY RUN: %s", pop.City, r.Message) - return r - } - - // 4. Launch instance - log(cfg, " [%s] launching t4g.nano...", pop.City) + log(cfg, " [%s] launching t4g.nano (AMI: %s, SG: %s)...", pop.City, amiID, sgID) runOut, err := ec2Client.RunInstances(ctx, &ec2.RunInstancesInput{ ImageId: &amiID, InstanceType: ec2types.InstanceTypeT4gNano, @@ -855,55 +876,154 @@ func provisionNode(cfg Config, pop POP) NodeResult { return r } instanceID := *runOut.Instances[0].InstanceId - log(cfg, " [%s] launched: %s, waiting for public IP...", pop.City, instanceID) + log(cfg, " [%s] instance: %s — waiting for IP...", pop.City, instanceID) - // 5. Wait for running + public IP waiter := ec2.NewInstanceRunningWaiter(ec2Client) - err = waiter.Wait(ctx, &ec2.DescribeInstancesInput{ - InstanceIds: []string{instanceID}, - }, 3*time.Minute) - if err != nil { - r.Error = fmt.Sprintf("wait running: %v", err) + if err := waiter.Wait(ctx, &ec2.DescribeInstancesInput{InstanceIds: []string{instanceID}}, 3*time.Minute); err != nil { + r.Error = fmt.Sprintf("wait: %v", err) return r } - // Get public IP - descOut, err := ec2Client.DescribeInstances(ctx, &ec2.DescribeInstancesInput{ - InstanceIds: []string{instanceID}, - }) + descOut, err := ec2Client.DescribeInstances(ctx, &ec2.DescribeInstancesInput{InstanceIds: []string{instanceID}}) if err != nil || len(descOut.Reservations) == 0 || len(descOut.Reservations[0].Instances) == 0 { r.Error = fmt.Sprintf("describe: %v", err) return r } - inst := descOut.Reservations[0].Instances[0] publicIP := "" - if inst.PublicIpAddress != nil { - publicIP = *inst.PublicIpAddress + if descOut.Reservations[0].Instances[0].PublicIpAddress != nil { + publicIP = *descOut.Reservations[0].Instances[0].PublicIpAddress } if publicIP == "" { - r.Error = "no public IP assigned" + r.Error = "no public IP" return r } log(cfg, " [%s] IP: %s", pop.City, publicIP) - // 6. Update DB - dns := sub + "." + cfg.Zone - db, err := sql.Open("sqlite", cfg.DBPath) + // --- Step 2: Update local DB --- + log(cfg, " [%s] updating DB...", pop.City) + localDB, err := sql.Open("sqlite", cfg.DBPath) if err != nil { r.Error = fmt.Sprintf("open db: %v", err) return r } - defer db.Close() - _, err = db.Exec(`UPDATE pops SET instance_id=?, ip=?, dns=?, status='live' WHERE pop_id=?`, + localDB.Exec(`UPDATE pops SET instance_id=?, ip=?, dns=?, status='live' WHERE pop_id=?`, instanceID, publicIP, dns, pop.PopID) - if err != nil { - r.Error = fmt.Sprintf("update db: %v", err) + localDB.Close() + + // Reload pop with updated fields + pop.InstanceID = instanceID + pop.IP = publicIP + pop.DNS = dns + pop.Status = "live" + + // --- Step 3: DNS --- + log(cfg, " [%s] creating DNS %s → %s...", pop.City, dns, publicIP) + if cfg.CFToken != "" { + zoneID := cfg.CFZoneID + if zoneID == "" { + zoneID, _ = cfResolveZoneID(cfg.CFToken, cfg.Zone) + } + if zoneID != "" { + cfCreateRecord(cfg.CFToken, zoneID, dns, publicIP) + } + } + + // --- Step 4: Wait for SSM agent, then bootstrap Tailscale --- + log(cfg, " [%s] waiting for SSM agent (up to 90s)...", pop.City) + time.Sleep(30 * time.Second) // SSM agent takes ~30s to register after boot + if cfg.TSAuthKey != "" { + if err := ssmBootstrapTailscale(cfg, pop); err != nil { + log(cfg, " [%s] tailscale warning: %v", pop.City, err) + } + } + + // --- Step 5: Deploy vault binary --- + log(cfg, " [%s] deploying vault...", pop.City) + binaryPath := buildVault(cfg) + + hansDir := "/tmp/clavitor-serve" + hansFile := hansDir + "/clavitor" + sshExec(cfg.HansHost, "mkdir -p "+hansDir) + if err := scpToHost(cfg.HansHost, binaryPath, hansFile); err != nil { + r.Error = fmt.Sprintf("scp to hans: %v", err) return r } + sshExec(cfg.HansHost, "pkill -f 'http.server 9876' 2>/dev/null; exit 0") + sshExec(cfg.HansHost, "sudo iptables -I INPUT -p tcp --dport 9876 -j ACCEPT") + time.Sleep(500 * time.Millisecond) + sshBackground(cfg.HansHost, fmt.Sprintf("cd %s && exec python3 -m http.server 9876 --bind 0.0.0.0 >/dev/null 2>&1", hansDir)) + time.Sleep(2 * time.Second) + downloadURL := fmt.Sprintf("http://185.218.204.47:9876/clavitor") + + installCmds := []string{ + "mkdir -p /opt/clavitor/bin /opt/clavitor/data /opt/clavitor/certs", + fmt.Sprintf("curl -sf -o /tmp/clavitor-new %s", downloadURL), + "mv /tmp/clavitor-new /opt/clavitor/bin/clavitor", + "chmod +x /opt/clavitor/bin/clavitor", + fmt.Sprintf(`cat > /opt/clavitor/env << 'ENVEOF' +PORT=1984 +VAULT_MODE=hosted +DATA_DIR=/opt/clavitor/data +TELEMETRY_FREQ=30 +TELEMETRY_HOST=https://clavitor.ai/telemetry +TELEMETRY_TOKEN=clavitor-fleet-2026 +TLS_DOMAIN=%s +CF_API_TOKEN=dSVz7JZtyK023q7kh4MMNmIggK1dahWdnBxVnP3O +TLS_CERT_DIR=/opt/clavitor/certs +TLS_EMAIL=ops@clavitor.ai +ENVEOF`, dns), + `cat > /etc/systemd/system/clavitor.service << 'UNITEOF' +[Unit] +Description=Clavitor Vault +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +ExecStart=/opt/clavitor/bin/clavitor +EnvironmentFile=/opt/clavitor/env +WorkingDirectory=/opt/clavitor/data +Restart=always +RestartSec=5 +User=root + +[Install] +WantedBy=multi-user.target +UNITEOF`, + "systemctl daemon-reload", + "systemctl enable clavitor", + "systemctl restart clavitor", + "sleep 3", + "systemctl is-active clavitor", + } + + out, err := ssmRunCommand(cfg.HansHost, instanceID, region, installCmds) + // Clean up Hans + sshExec(cfg.HansHost, "pkill -f 'http.server 9876' 2>/dev/null; exit 0") + sshExec(cfg.HansHost, "sudo iptables -D INPUT -p tcp --dport 9876 -j ACCEPT 2>/dev/null; exit 0") + sshExec(cfg.HansHost, "rm -rf "+hansDir) + + if err != nil { + r.Error = fmt.Sprintf("deploy: %v\n%s", err, out) + return r + } + log(cfg, " [%s] service: %s", pop.City, strings.TrimSpace(out)) + + // --- Step 6: Verify --- + log(cfg, " [%s] verifying TLS...", pop.City) + time.Sleep(5 * time.Second) + verifyURL := fmt.Sprintf("https://%s:1984/ping", dns) + resp, err := http.Get(verifyURL) + if err == nil { + resp.Body.Close() + log(cfg, " [%s] TLS verified ✓", pop.City) + } else { + log(cfg, " [%s] TLS not ready yet (cert may take a minute): %v", pop.City, err) + } r.OK = true - r.Message = fmt.Sprintf("launched %s (%s) → %s → %s", instanceID, region, publicIP, dns) - log(cfg, " [%s] DONE: %s", pop.City, r.Message) + r.Message = fmt.Sprintf("%s → %s → %s (%s) — live", instanceID, publicIP, dns, region) + log(cfg, "\n [%s] DONE: %s\n", pop.City, r.Message) return r } diff --git a/operations/pop-sync/pop-sync b/operations/pop-sync/pop-sync index cc7e003..1e93ac2 100755 Binary files a/operations/pop-sync/pop-sync and b/operations/pop-sync/pop-sync differ