telemetry: fix CRITICAL silent failures (Cardinal Rule #1)
Fixes #2, #3, #4.

Issue #2 - Silent database errors in updateSpan():
- Add error handling for telemetry INSERT (ERR-TELEMETRY-004)
- Add error handling for all table/index creation (ERR-TELEMETRY-005 to -010)
- Return HTTP 500 to client on insert failure

Issue #3 - Silent failure in Kuma push:
- Return early on non-OK status from Kuma
- Log errors properly, including failures to close the response body

Issue #4 - Unchecked flush error in tarpit:
- Verify http.Flusher is available before starting the tarpit
- Log ERR-TELEMETRY-040 and abort if the flusher is unavailable
- Remove redundant flusher checks in the loop

All changes: security failures are now LOUD (Cardinal Rule #1).

Author: Hans <hans-20250409-001>
This commit is contained in:
parent
6d5837c7b4
commit
b4aced5c03
|
|
@@ -63,6 +63,10 @@ func sendKumaPush(kumaURL string) {
|
||||||
}
|
}
|
||||||
if resp.StatusCode != http.StatusOK {
|
if resp.StatusCode != http.StatusOK {
|
||||||
log.Printf("ERR-TELEMETRY-031: Kuma returned non-OK status %d from %s", resp.StatusCode, kumaURL)
|
log.Printf("ERR-TELEMETRY-031: Kuma returned non-OK status %d from %s", resp.StatusCode, kumaURL)
|
||||||
|
if err := resp.Body.Close(); err != nil {
|
||||||
|
log.Printf("ERR-TELEMETRY-032: Failed to close Kuma response body after non-OK status - %v", err)
|
||||||
|
}
|
||||||
|
return
|
||||||
}
|
}
|
||||||
if err := resp.Body.Close(); err != nil {
|
if err := resp.Body.Close(); err != nil {
|
||||||
log.Printf("ERR-TELEMETRY-032: Failed to close Kuma response body - %v", err)
|
log.Printf("ERR-TELEMETRY-032: Failed to close Kuma response body - %v", err)
|
||||||
|
|
|
||||||
|
|
@@ -130,9 +130,13 @@ func tarpit(w http.ResponseWriter, r *http.Request) {
|
||||||
w.Header().Set("Content-Type", "text/plain")
|
w.Header().Set("Content-Type", "text/plain")
|
||||||
w.WriteHeader(200)
|
w.WriteHeader(200)
|
||||||
|
|
||||||
if flusher, ok := w.(http.Flusher); ok {
|
// Verify flusher is available - otherwise tarpit is ineffective
|
||||||
flusher.Flush()
|
flusher, ok := w.(http.Flusher)
|
||||||
|
if !ok {
|
||||||
|
log.Printf("ERR-TELEMETRY-040: tarpit called with ResponseWriter that does not implement http.Flusher - aborting")
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
flusher.Flush()
|
||||||
|
|
||||||
// Drip one byte per second for 30 seconds
|
// Drip one byte per second for 30 seconds
|
||||||
for i := 0; i < 30; i++ {
|
for i := 0; i < 30; i++ {
|
||||||
|
|
@@ -140,18 +144,16 @@ func tarpit(w http.ResponseWriter, r *http.Request) {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return // Client disconnected
|
return // Client disconnected
|
||||||
}
|
}
|
||||||
if flusher, ok := w.(http.Flusher); ok {
|
// Flush has no return value per http.Flusher interface
|
||||||
// Flush has no return value per http.Flusher interface
|
// Write error above is the primary signal for client disconnect
|
||||||
// Write error above is the primary signal for client disconnect
|
flusher.Flush()
|
||||||
flusher.Flush()
|
|
||||||
}
|
|
||||||
time.Sleep(time.Second)
|
time.Sleep(time.Second)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func ensureTables() {
|
func ensureTables() {
|
||||||
// Telemetry table
|
// Telemetry table
|
||||||
db.Exec(`CREATE TABLE IF NOT EXISTS telemetry (
|
if _, err := db.Exec(`CREATE TABLE IF NOT EXISTS telemetry (
|
||||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
node_id TEXT NOT NULL,
|
node_id TEXT NOT NULL,
|
||||||
received_at INTEGER NOT NULL DEFAULT (strftime('%s','now')),
|
received_at INTEGER NOT NULL DEFAULT (strftime('%s','now')),
|
||||||
|
|
@@ -168,28 +170,40 @@ func ensureTables() {
|
||||||
vault_size_mb REAL NOT NULL DEFAULT 0,
|
vault_size_mb REAL NOT NULL DEFAULT 0,
|
||||||
vault_entries INTEGER NOT NULL DEFAULT 0,
|
vault_entries INTEGER NOT NULL DEFAULT 0,
|
||||||
mode TEXT NOT NULL DEFAULT ''
|
mode TEXT NOT NULL DEFAULT ''
|
||||||
)`)
|
)`); err != nil {
|
||||||
db.Exec(`CREATE INDEX IF NOT EXISTS idx_telemetry_node_id ON telemetry(node_id)`)
|
log.Fatalf("ERR-TELEMETRY-005: Failed to create telemetry table - %v", err)
|
||||||
db.Exec(`CREATE INDEX IF NOT EXISTS idx_telemetry_node_latest ON telemetry(node_id, id DESC)`)
|
}
|
||||||
|
if _, err := db.Exec(`CREATE INDEX IF NOT EXISTS idx_telemetry_node_id ON telemetry(node_id)`); err != nil {
|
||||||
|
log.Fatalf("ERR-TELEMETRY-006: Failed to create telemetry node_id index - %v", err)
|
||||||
|
}
|
||||||
|
if _, err := db.Exec(`CREATE INDEX IF NOT EXISTS idx_telemetry_node_latest ON telemetry(node_id, id DESC)`); err != nil {
|
||||||
|
log.Fatalf("ERR-TELEMETRY-007: Failed to create telemetry node_latest index - %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
// Uptime spans table
|
// Uptime spans table
|
||||||
db.Exec(`CREATE TABLE IF NOT EXISTS uptime_spans (
|
if _, err := db.Exec(`CREATE TABLE IF NOT EXISTS uptime_spans (
|
||||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
node_id TEXT NOT NULL,
|
node_id TEXT NOT NULL,
|
||||||
start_at INTEGER NOT NULL,
|
start_at INTEGER NOT NULL,
|
||||||
end_at INTEGER NOT NULL
|
end_at INTEGER NOT NULL
|
||||||
)`)
|
)`); err != nil {
|
||||||
db.Exec(`CREATE INDEX IF NOT EXISTS idx_spans_node_end ON uptime_spans(node_id, end_at DESC)`)
|
log.Fatalf("ERR-TELEMETRY-008: Failed to create uptime_spans table - %v", err)
|
||||||
|
}
|
||||||
|
if _, err := db.Exec(`CREATE INDEX IF NOT EXISTS idx_spans_node_end ON uptime_spans(node_id, end_at DESC)`); err != nil {
|
||||||
|
log.Fatalf("ERR-TELEMETRY-009: Failed to create uptime_spans index - %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
// Maintenance table
|
// Maintenance table
|
||||||
db.Exec(`CREATE TABLE IF NOT EXISTS maintenance (
|
if _, err := db.Exec(`CREATE TABLE IF NOT EXISTS maintenance (
|
||||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
start_at INTEGER NOT NULL DEFAULT (strftime('%s','now')),
|
start_at INTEGER NOT NULL DEFAULT (strftime('%s','now')),
|
||||||
end_at INTEGER,
|
end_at INTEGER,
|
||||||
reason TEXT NOT NULL DEFAULT '',
|
reason TEXT NOT NULL DEFAULT '',
|
||||||
started_by TEXT NOT NULL DEFAULT '',
|
started_by TEXT NOT NULL DEFAULT '',
|
||||||
ended_by TEXT NOT NULL DEFAULT ''
|
ended_by TEXT NOT NULL DEFAULT ''
|
||||||
)`)
|
)`); err != nil {
|
||||||
|
log.Fatalf("ERR-TELEMETRY-010: Failed to create maintenance table - %v", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func handleHealth(w http.ResponseWriter, r *http.Request) {
|
func handleHealth(w http.ResponseWriter, r *http.Request) {
|
||||||
|
|
@@ -313,8 +327,12 @@ func handleTelemetry(w http.ResponseWriter, r *http.Request) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Insert telemetry
|
// Insert telemetry
|
||||||
db.Exec(`INSERT INTO telemetry (node_id, version, hostname, uptime_seconds, cpu_percent, memory_total_mb, memory_used_mb, disk_total_mb, disk_used_mb, load_1m, vault_count, vault_size_mb, vault_entries, mode) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)`,
|
if _, err := db.Exec(`INSERT INTO telemetry (node_id, version, hostname, uptime_seconds, cpu_percent, memory_total_mb, memory_used_mb, disk_total_mb, disk_used_mb, load_1m, vault_count, vault_size_mb, vault_entries, mode) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)`,
|
||||||
t.NodeID, t.Version, t.Hostname, t.UptimeSeconds, t.CPUPercent, t.MemTotalMB, t.MemUsedMB, t.DiskTotalMB, t.DiskUsedMB, t.Load1m, t.VaultCount, t.VaultSizeMB, t.VaultEntries, t.Mode)
|
t.NodeID, t.Version, t.Hostname, t.UptimeSeconds, t.CPUPercent, t.MemTotalMB, t.MemUsedMB, t.DiskTotalMB, t.DiskUsedMB, t.Load1m, t.VaultCount, t.VaultSizeMB, t.VaultEntries, t.Mode); err != nil {
|
||||||
|
log.Printf("ERR-TELEMETRY-004: Failed to insert telemetry for node=%s - %v", t.NodeID, err)
|
||||||
|
http.Error(w, `{"error":"database error"}`, 500)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
// Uptime span tracking
|
// Uptime span tracking
|
||||||
updateSpan(t.NodeID, t.Hostname, t.Version, t.CPUPercent, t.MemUsedMB, t.MemTotalMB, t.DiskUsedMB, t.DiskTotalMB, t.Load1m, t.UptimeSeconds)
|
updateSpan(t.NodeID, t.Hostname, t.Version, t.CPUPercent, t.MemUsedMB, t.MemTotalMB, t.DiskUsedMB, t.DiskTotalMB, t.Load1m, t.UptimeSeconds)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue