telemetry: fix CRITICAL silent failures (Cardinal Rule #1)

Fixes #2, #3, #4

Issue #2 - Silent database errors around updateSpan() (telemetry INSERT in handleTelemetry(), schema setup in ensureTables()):
- Add error handling for telemetry INSERT (ERR-TELEMETRY-004)
- Add error handling for all table/index creation (ERR-TELEMETRY-005 through ERR-TELEMETRY-010); each failure is fatal via log.Fatalf
- Return HTTP 500 to client on insert failure

Issue #3 - Silent failure in Kuma push:
- Return early on non-OK status from Kuma
- Close the response body on the non-OK path, logging ERR-TELEMETRY-032 if the close fails

Issue #4 - Unverified http.Flusher availability in tarpit (Flush itself returns no error):
- Verify http.Flusher available before tarpit
- Log ERR-TELEMETRY-040 and abort if flusher unavailable
- Remove redundant flusher checks in loop

All changes: security failures are now LOUD (Cardinal Rule #1)

Author: Hans <hans-20250409-001>
This commit is contained in:
James 2026-04-09 01:20:08 -04:00
parent 6d5837c7b4
commit b4aced5c03
2 changed files with 40 additions and 18 deletions

View File

@ -63,6 +63,10 @@ func sendKumaPush(kumaURL string) {
} }
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
log.Printf("ERR-TELEMETRY-031: Kuma returned non-OK status %d from %s", resp.StatusCode, kumaURL) log.Printf("ERR-TELEMETRY-031: Kuma returned non-OK status %d from %s", resp.StatusCode, kumaURL)
if err := resp.Body.Close(); err != nil {
log.Printf("ERR-TELEMETRY-032: Failed to close Kuma response body after non-OK status - %v", err)
}
return
} }
if err := resp.Body.Close(); err != nil { if err := resp.Body.Close(); err != nil {
log.Printf("ERR-TELEMETRY-032: Failed to close Kuma response body - %v", err) log.Printf("ERR-TELEMETRY-032: Failed to close Kuma response body - %v", err)

View File

@ -130,9 +130,13 @@ func tarpit(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/plain") w.Header().Set("Content-Type", "text/plain")
w.WriteHeader(200) w.WriteHeader(200)
if flusher, ok := w.(http.Flusher); ok { // Verify flusher is available - otherwise tarpit is ineffective
flusher.Flush() flusher, ok := w.(http.Flusher)
if !ok {
log.Printf("ERR-TELEMETRY-040: tarpit called with ResponseWriter that does not implement http.Flusher - aborting")
return
} }
flusher.Flush()
// Drip one byte per second for 30 seconds // Drip one byte per second for 30 seconds
for i := 0; i < 30; i++ { for i := 0; i < 30; i++ {
@ -140,18 +144,16 @@ func tarpit(w http.ResponseWriter, r *http.Request) {
if err != nil { if err != nil {
return // Client disconnected return // Client disconnected
} }
if flusher, ok := w.(http.Flusher); ok { // Flush has no return value per http.Flusher interface
// Flush has no return value per http.Flusher interface // Write error above is the primary signal for client disconnect
// Write error above is the primary signal for client disconnect flusher.Flush()
flusher.Flush()
}
time.Sleep(time.Second) time.Sleep(time.Second)
} }
} }
func ensureTables() { func ensureTables() {
// Telemetry table // Telemetry table
db.Exec(`CREATE TABLE IF NOT EXISTS telemetry ( if _, err := db.Exec(`CREATE TABLE IF NOT EXISTS telemetry (
id INTEGER PRIMARY KEY AUTOINCREMENT, id INTEGER PRIMARY KEY AUTOINCREMENT,
node_id TEXT NOT NULL, node_id TEXT NOT NULL,
received_at INTEGER NOT NULL DEFAULT (strftime('%s','now')), received_at INTEGER NOT NULL DEFAULT (strftime('%s','now')),
@ -168,28 +170,40 @@ func ensureTables() {
vault_size_mb REAL NOT NULL DEFAULT 0, vault_size_mb REAL NOT NULL DEFAULT 0,
vault_entries INTEGER NOT NULL DEFAULT 0, vault_entries INTEGER NOT NULL DEFAULT 0,
mode TEXT NOT NULL DEFAULT '' mode TEXT NOT NULL DEFAULT ''
)`) )`); err != nil {
db.Exec(`CREATE INDEX IF NOT EXISTS idx_telemetry_node_id ON telemetry(node_id)`) log.Fatalf("ERR-TELEMETRY-005: Failed to create telemetry table - %v", err)
db.Exec(`CREATE INDEX IF NOT EXISTS idx_telemetry_node_latest ON telemetry(node_id, id DESC)`) }
if _, err := db.Exec(`CREATE INDEX IF NOT EXISTS idx_telemetry_node_id ON telemetry(node_id)`); err != nil {
log.Fatalf("ERR-TELEMETRY-006: Failed to create telemetry node_id index - %v", err)
}
if _, err := db.Exec(`CREATE INDEX IF NOT EXISTS idx_telemetry_node_latest ON telemetry(node_id, id DESC)`); err != nil {
log.Fatalf("ERR-TELEMETRY-007: Failed to create telemetry node_latest index - %v", err)
}
// Uptime spans table // Uptime spans table
db.Exec(`CREATE TABLE IF NOT EXISTS uptime_spans ( if _, err := db.Exec(`CREATE TABLE IF NOT EXISTS uptime_spans (
id INTEGER PRIMARY KEY AUTOINCREMENT, id INTEGER PRIMARY KEY AUTOINCREMENT,
node_id TEXT NOT NULL, node_id TEXT NOT NULL,
start_at INTEGER NOT NULL, start_at INTEGER NOT NULL,
end_at INTEGER NOT NULL end_at INTEGER NOT NULL
)`) )`); err != nil {
db.Exec(`CREATE INDEX IF NOT EXISTS idx_spans_node_end ON uptime_spans(node_id, end_at DESC)`) log.Fatalf("ERR-TELEMETRY-008: Failed to create uptime_spans table - %v", err)
}
if _, err := db.Exec(`CREATE INDEX IF NOT EXISTS idx_spans_node_end ON uptime_spans(node_id, end_at DESC)`); err != nil {
log.Fatalf("ERR-TELEMETRY-009: Failed to create uptime_spans index - %v", err)
}
// Maintenance table // Maintenance table
db.Exec(`CREATE TABLE IF NOT EXISTS maintenance ( if _, err := db.Exec(`CREATE TABLE IF NOT EXISTS maintenance (
id INTEGER PRIMARY KEY AUTOINCREMENT, id INTEGER PRIMARY KEY AUTOINCREMENT,
start_at INTEGER NOT NULL DEFAULT (strftime('%s','now')), start_at INTEGER NOT NULL DEFAULT (strftime('%s','now')),
end_at INTEGER, end_at INTEGER,
reason TEXT NOT NULL DEFAULT '', reason TEXT NOT NULL DEFAULT '',
started_by TEXT NOT NULL DEFAULT '', started_by TEXT NOT NULL DEFAULT '',
ended_by TEXT NOT NULL DEFAULT '' ended_by TEXT NOT NULL DEFAULT ''
)`) )`); err != nil {
log.Fatalf("ERR-TELEMETRY-010: Failed to create maintenance table - %v", err)
}
} }
func handleHealth(w http.ResponseWriter, r *http.Request) { func handleHealth(w http.ResponseWriter, r *http.Request) {
@ -313,8 +327,12 @@ func handleTelemetry(w http.ResponseWriter, r *http.Request) {
} }
// Insert telemetry // Insert telemetry
db.Exec(`INSERT INTO telemetry (node_id, version, hostname, uptime_seconds, cpu_percent, memory_total_mb, memory_used_mb, disk_total_mb, disk_used_mb, load_1m, vault_count, vault_size_mb, vault_entries, mode) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)`, if _, err := db.Exec(`INSERT INTO telemetry (node_id, version, hostname, uptime_seconds, cpu_percent, memory_total_mb, memory_used_mb, disk_total_mb, disk_used_mb, load_1m, vault_count, vault_size_mb, vault_entries, mode) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)`,
t.NodeID, t.Version, t.Hostname, t.UptimeSeconds, t.CPUPercent, t.MemTotalMB, t.MemUsedMB, t.DiskTotalMB, t.DiskUsedMB, t.Load1m, t.VaultCount, t.VaultSizeMB, t.VaultEntries, t.Mode) t.NodeID, t.Version, t.Hostname, t.UptimeSeconds, t.CPUPercent, t.MemTotalMB, t.MemUsedMB, t.DiskTotalMB, t.DiskUsedMB, t.Load1m, t.VaultCount, t.VaultSizeMB, t.VaultEntries, t.Mode); err != nil {
log.Printf("ERR-TELEMETRY-004: Failed to insert telemetry for node=%s - %v", t.NodeID, err)
http.Error(w, `{"error":"database error"}`, 500)
return
}
// Uptime span tracking // Uptime span tracking
updateSpan(t.NodeID, t.Hostname, t.Version, t.CPUPercent, t.MemUsedMB, t.MemTotalMB, t.DiskUsedMB, t.DiskTotalMB, t.Load1m, t.UptimeSeconds) updateSpan(t.NodeID, t.Hostname, t.Version, t.CPUPercent, t.MemUsedMB, t.MemTotalMB, t.DiskUsedMB, t.DiskTotalMB, t.Load1m, t.UptimeSeconds)