From b4aced5c03644cdee891d37bc13048c5e949177c Mon Sep 17 00:00:00 2001 From: James Date: Thu, 9 Apr 2026 01:20:08 -0400 Subject: [PATCH] telemetry: fix CRITICAL silent failures (Cardinal Rule #1) Fixes #2, #3, #4 Issue #2 - Silent database errors in handleTelemetry() and ensureTables(): - Add error handling for telemetry INSERT (ERR-TELEMETRY-004) - Add error handling for all table/index creation (ERR-TELEMETRY-005 to -010) - Return HTTP 500 to client on insert failure Issue #3 - Silent failure in Kuma push: - Return early on non-OK status from Kuma - Proper error logging with body close handling Issue #4 - Unchecked flush error in tarpit: - Verify http.Flusher available before tarpit - Log ERR-TELEMETRY-040 and abort if flusher unavailable - Remove redundant flusher checks in loop All changes: security failures are now LOUD (Cardinal Rule #1) Author: Hans --- clavis/clavis-telemetry/kuma.go | 4 +++ clavis/clavis-telemetry/main.go | 54 ++++++++++++++++++++++----------- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/clavis/clavis-telemetry/kuma.go b/clavis/clavis-telemetry/kuma.go index c58f25e..bf2c09f 100644 --- a/clavis/clavis-telemetry/kuma.go +++ b/clavis/clavis-telemetry/kuma.go @@ -63,6 +63,10 @@ func sendKumaPush(kumaURL string) { } if resp.StatusCode != http.StatusOK { log.Printf("ERR-TELEMETRY-031: Kuma returned non-OK status %d from %s", resp.StatusCode, kumaURL) + if err := resp.Body.Close(); err != nil { + log.Printf("ERR-TELEMETRY-032: Failed to close Kuma response body after non-OK status - %v", err) + } + return } if err := resp.Body.Close(); err != nil { log.Printf("ERR-TELEMETRY-032: Failed to close Kuma response body - %v", err) diff --git a/clavis/clavis-telemetry/main.go b/clavis/clavis-telemetry/main.go index ce1a52e..08890a7 100644 --- a/clavis/clavis-telemetry/main.go +++ b/clavis/clavis-telemetry/main.go @@ -130,9 +130,13 @@ func tarpit(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain") w.WriteHeader(200) - if 
flusher, ok := w.(http.Flusher); ok { - flusher.Flush() + // Verify flusher is available - otherwise tarpit is ineffective + flusher, ok := w.(http.Flusher) + if !ok { + log.Printf("ERR-TELEMETRY-040: tarpit called with ResponseWriter that does not implement http.Flusher - aborting") + return } + flusher.Flush() // Drip one byte per second for 30 seconds for i := 0; i < 30; i++ { @@ -140,18 +144,16 @@ func tarpit(w http.ResponseWriter, r *http.Request) { if err != nil { return // Client disconnected } - if flusher, ok := w.(http.Flusher); ok { - // Flush has no return value per http.Flusher interface - // Write error above is the primary signal for client disconnect - flusher.Flush() - } + // Flush has no return value per http.Flusher interface + // Write error above is the primary signal for client disconnect + flusher.Flush() time.Sleep(time.Second) } } func ensureTables() { // Telemetry table - db.Exec(`CREATE TABLE IF NOT EXISTS telemetry ( + if _, err := db.Exec(`CREATE TABLE IF NOT EXISTS telemetry ( id INTEGER PRIMARY KEY AUTOINCREMENT, node_id TEXT NOT NULL, received_at INTEGER NOT NULL DEFAULT (strftime('%s','now')), @@ -168,28 +170,40 @@ func ensureTables() { vault_size_mb REAL NOT NULL DEFAULT 0, vault_entries INTEGER NOT NULL DEFAULT 0, mode TEXT NOT NULL DEFAULT '' - )`) - db.Exec(`CREATE INDEX IF NOT EXISTS idx_telemetry_node_id ON telemetry(node_id)`) - db.Exec(`CREATE INDEX IF NOT EXISTS idx_telemetry_node_latest ON telemetry(node_id, id DESC)`) + )`); err != nil { + log.Fatalf("ERR-TELEMETRY-005: Failed to create telemetry table - %v", err) + } + if _, err := db.Exec(`CREATE INDEX IF NOT EXISTS idx_telemetry_node_id ON telemetry(node_id)`); err != nil { + log.Fatalf("ERR-TELEMETRY-006: Failed to create telemetry node_id index - %v", err) + } + if _, err := db.Exec(`CREATE INDEX IF NOT EXISTS idx_telemetry_node_latest ON telemetry(node_id, id DESC)`); err != nil { + log.Fatalf("ERR-TELEMETRY-007: Failed to create telemetry node_latest index - %v", 
err) + } // Uptime spans table - db.Exec(`CREATE TABLE IF NOT EXISTS uptime_spans ( + if _, err := db.Exec(`CREATE TABLE IF NOT EXISTS uptime_spans ( id INTEGER PRIMARY KEY AUTOINCREMENT, node_id TEXT NOT NULL, start_at INTEGER NOT NULL, end_at INTEGER NOT NULL - )`) - db.Exec(`CREATE INDEX IF NOT EXISTS idx_spans_node_end ON uptime_spans(node_id, end_at DESC)`) + )`); err != nil { + log.Fatalf("ERR-TELEMETRY-008: Failed to create uptime_spans table - %v", err) + } + if _, err := db.Exec(`CREATE INDEX IF NOT EXISTS idx_spans_node_end ON uptime_spans(node_id, end_at DESC)`); err != nil { + log.Fatalf("ERR-TELEMETRY-009: Failed to create uptime_spans index - %v", err) + } // Maintenance table - db.Exec(`CREATE TABLE IF NOT EXISTS maintenance ( + if _, err := db.Exec(`CREATE TABLE IF NOT EXISTS maintenance ( id INTEGER PRIMARY KEY AUTOINCREMENT, start_at INTEGER NOT NULL DEFAULT (strftime('%s','now')), end_at INTEGER, reason TEXT NOT NULL DEFAULT '', started_by TEXT NOT NULL DEFAULT '', ended_by TEXT NOT NULL DEFAULT '' - )`) + )`); err != nil { + log.Fatalf("ERR-TELEMETRY-010: Failed to create maintenance table - %v", err) + } } func handleHealth(w http.ResponseWriter, r *http.Request) { @@ -313,8 +327,12 @@ func handleTelemetry(w http.ResponseWriter, r *http.Request) { } // Insert telemetry - db.Exec(`INSERT INTO telemetry (node_id, version, hostname, uptime_seconds, cpu_percent, memory_total_mb, memory_used_mb, disk_total_mb, disk_used_mb, load_1m, vault_count, vault_size_mb, vault_entries, mode) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)`, - t.NodeID, t.Version, t.Hostname, t.UptimeSeconds, t.CPUPercent, t.MemTotalMB, t.MemUsedMB, t.DiskTotalMB, t.DiskUsedMB, t.Load1m, t.VaultCount, t.VaultSizeMB, t.VaultEntries, t.Mode) + if _, err := db.Exec(`INSERT INTO telemetry (node_id, version, hostname, uptime_seconds, cpu_percent, memory_total_mb, memory_used_mb, disk_total_mb, disk_used_mb, load_1m, vault_count, vault_size_mb, vault_entries, mode) VALUES 
(?,?,?,?,?,?,?,?,?,?,?,?,?,?)`, + t.NodeID, t.Version, t.Hostname, t.UptimeSeconds, t.CPUPercent, t.MemTotalMB, t.MemUsedMB, t.DiskTotalMB, t.DiskUsedMB, t.Load1m, t.VaultCount, t.VaultSizeMB, t.VaultEntries, t.Mode); err != nil { + log.Printf("ERR-TELEMETRY-004: Failed to insert telemetry for node=%s - %v", t.NodeID, err) + http.Error(w, `{"error":"database error"}`, 500) + return + } // Uptime span tracking updateSpan(t.NodeID, t.Hostname, t.Version, t.CPUPercent, t.MemUsedMB, t.MemTotalMB, t.DiskUsedMB, t.DiskTotalMB, t.Load1m, t.UptimeSeconds)