telemetry: fix CRITICAL silent failures (Cardinal Rule #1)
Fixes #2, #3, #4

Issue #2 - Silent database errors in updateSpan():
- Add error handling for the telemetry INSERT (ERR-TELEMETRY-004)
- Add error handling for all table/index creation (ERR-TELEMETRY-005 to -010)
- Return HTTP 500 to the client on insert failure

Issue #3 - Silent failure in Kuma push:
- Return early on a non-OK status from Kuma
- Log errors properly, including failures to close the response body

Issue #4 - Unchecked flush error in tarpit:
- Verify that http.Flusher is available before starting the tarpit
- Log ERR-TELEMETRY-040 and abort if the flusher is unavailable
- Remove redundant flusher checks in the loop

All changes: security failures are now LOUD (Cardinal Rule #1)

Author: Hans <hans-20250409-001>
This commit is contained in:
parent
6d5837c7b4
commit
b4aced5c03
|
|
@ -63,6 +63,10 @@ func sendKumaPush(kumaURL string) {
|
|||
}
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
log.Printf("ERR-TELEMETRY-031: Kuma returned non-OK status %d from %s", resp.StatusCode, kumaURL)
|
||||
if err := resp.Body.Close(); err != nil {
|
||||
log.Printf("ERR-TELEMETRY-032: Failed to close Kuma response body after non-OK status - %v", err)
|
||||
}
|
||||
return
|
||||
}
|
||||
if err := resp.Body.Close(); err != nil {
|
||||
log.Printf("ERR-TELEMETRY-032: Failed to close Kuma response body - %v", err)
|
||||
|
|
|
|||
|
|
@ -130,9 +130,13 @@ func tarpit(w http.ResponseWriter, r *http.Request) {
|
|||
w.Header().Set("Content-Type", "text/plain")
|
||||
w.WriteHeader(200)
|
||||
|
||||
if flusher, ok := w.(http.Flusher); ok {
|
||||
flusher.Flush()
|
||||
// Verify flusher is available - otherwise tarpit is ineffective
|
||||
flusher, ok := w.(http.Flusher)
|
||||
if !ok {
|
||||
log.Printf("ERR-TELEMETRY-040: tarpit called with ResponseWriter that does not implement http.Flusher - aborting")
|
||||
return
|
||||
}
|
||||
flusher.Flush()
|
||||
|
||||
// Drip one byte per second for 30 seconds
|
||||
for i := 0; i < 30; i++ {
|
||||
|
|
@ -140,18 +144,16 @@ func tarpit(w http.ResponseWriter, r *http.Request) {
|
|||
if err != nil {
|
||||
return // Client disconnected
|
||||
}
|
||||
if flusher, ok := w.(http.Flusher); ok {
|
||||
// Flush has no return value per http.Flusher interface
|
||||
// Write error above is the primary signal for client disconnect
|
||||
flusher.Flush()
|
||||
}
|
||||
time.Sleep(time.Second)
|
||||
}
|
||||
}
|
||||
|
||||
func ensureTables() {
|
||||
// Telemetry table
|
||||
db.Exec(`CREATE TABLE IF NOT EXISTS telemetry (
|
||||
if _, err := db.Exec(`CREATE TABLE IF NOT EXISTS telemetry (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
node_id TEXT NOT NULL,
|
||||
received_at INTEGER NOT NULL DEFAULT (strftime('%s','now')),
|
||||
|
|
@ -168,28 +170,40 @@ func ensureTables() {
|
|||
vault_size_mb REAL NOT NULL DEFAULT 0,
|
||||
vault_entries INTEGER NOT NULL DEFAULT 0,
|
||||
mode TEXT NOT NULL DEFAULT ''
|
||||
)`)
|
||||
db.Exec(`CREATE INDEX IF NOT EXISTS idx_telemetry_node_id ON telemetry(node_id)`)
|
||||
db.Exec(`CREATE INDEX IF NOT EXISTS idx_telemetry_node_latest ON telemetry(node_id, id DESC)`)
|
||||
)`); err != nil {
|
||||
log.Fatalf("ERR-TELEMETRY-005: Failed to create telemetry table - %v", err)
|
||||
}
|
||||
if _, err := db.Exec(`CREATE INDEX IF NOT EXISTS idx_telemetry_node_id ON telemetry(node_id)`); err != nil {
|
||||
log.Fatalf("ERR-TELEMETRY-006: Failed to create telemetry node_id index - %v", err)
|
||||
}
|
||||
if _, err := db.Exec(`CREATE INDEX IF NOT EXISTS idx_telemetry_node_latest ON telemetry(node_id, id DESC)`); err != nil {
|
||||
log.Fatalf("ERR-TELEMETRY-007: Failed to create telemetry node_latest index - %v", err)
|
||||
}
|
||||
|
||||
// Uptime spans table
|
||||
db.Exec(`CREATE TABLE IF NOT EXISTS uptime_spans (
|
||||
if _, err := db.Exec(`CREATE TABLE IF NOT EXISTS uptime_spans (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
node_id TEXT NOT NULL,
|
||||
start_at INTEGER NOT NULL,
|
||||
end_at INTEGER NOT NULL
|
||||
)`)
|
||||
db.Exec(`CREATE INDEX IF NOT EXISTS idx_spans_node_end ON uptime_spans(node_id, end_at DESC)`)
|
||||
)`); err != nil {
|
||||
log.Fatalf("ERR-TELEMETRY-008: Failed to create uptime_spans table - %v", err)
|
||||
}
|
||||
if _, err := db.Exec(`CREATE INDEX IF NOT EXISTS idx_spans_node_end ON uptime_spans(node_id, end_at DESC)`); err != nil {
|
||||
log.Fatalf("ERR-TELEMETRY-009: Failed to create uptime_spans index - %v", err)
|
||||
}
|
||||
|
||||
// Maintenance table
|
||||
db.Exec(`CREATE TABLE IF NOT EXISTS maintenance (
|
||||
if _, err := db.Exec(`CREATE TABLE IF NOT EXISTS maintenance (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
start_at INTEGER NOT NULL DEFAULT (strftime('%s','now')),
|
||||
end_at INTEGER,
|
||||
reason TEXT NOT NULL DEFAULT '',
|
||||
started_by TEXT NOT NULL DEFAULT '',
|
||||
ended_by TEXT NOT NULL DEFAULT ''
|
||||
)`)
|
||||
)`); err != nil {
|
||||
log.Fatalf("ERR-TELEMETRY-010: Failed to create maintenance table - %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func handleHealth(w http.ResponseWriter, r *http.Request) {
|
||||
|
|
@ -313,8 +327,12 @@ func handleTelemetry(w http.ResponseWriter, r *http.Request) {
|
|||
}
|
||||
|
||||
// Insert telemetry
|
||||
db.Exec(`INSERT INTO telemetry (node_id, version, hostname, uptime_seconds, cpu_percent, memory_total_mb, memory_used_mb, disk_total_mb, disk_used_mb, load_1m, vault_count, vault_size_mb, vault_entries, mode) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)`,
|
||||
t.NodeID, t.Version, t.Hostname, t.UptimeSeconds, t.CPUPercent, t.MemTotalMB, t.MemUsedMB, t.DiskTotalMB, t.DiskUsedMB, t.Load1m, t.VaultCount, t.VaultSizeMB, t.VaultEntries, t.Mode)
|
||||
if _, err := db.Exec(`INSERT INTO telemetry (node_id, version, hostname, uptime_seconds, cpu_percent, memory_total_mb, memory_used_mb, disk_total_mb, disk_used_mb, load_1m, vault_count, vault_size_mb, vault_entries, mode) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)`,
|
||||
t.NodeID, t.Version, t.Hostname, t.UptimeSeconds, t.CPUPercent, t.MemTotalMB, t.MemUsedMB, t.DiskTotalMB, t.DiskUsedMB, t.Load1m, t.VaultCount, t.VaultSizeMB, t.VaultEntries, t.Mode); err != nil {
|
||||
log.Printf("ERR-TELEMETRY-004: Failed to insert telemetry for node=%s - %v", t.NodeID, err)
|
||||
http.Error(w, `{"error":"database error"}`, 500)
|
||||
return
|
||||
}
|
||||
|
||||
// Uptime span tracking
|
||||
updateSpan(t.NodeID, t.Hostname, t.Version, t.CPUPercent, t.MemUsedMB, t.MemTotalMB, t.DiskUsedMB, t.DiskTotalMB, t.Load1m, t.UptimeSeconds)
|
||||
|
|
|
|||
Loading…
Reference in New Issue