From 68dcc2f2f4e9b0cad6a3badbf17489fd249da8fa Mon Sep 17 00:00:00 2001 From: James Date: Wed, 8 Apr 2026 18:05:00 -0400 Subject: [PATCH] telemetry: fix Cardinal Rule violations - add unique error codes Fixes 4 Cardinal Rule violations identified by Yurii audit: - ERR-TELEMETRY-001/002/003: Fatal error codes for DB init, CA loading, cert verify - ERR-TELEMETRY-010/011/012/013/014: Database error handling in updateSpan() - ERR-TELEMETRY-020/021/022: ntfy alert error codes with context - ERR-TELEMETRY-030/031/032/033: Kuma push error handling (was silent) Per CLAVITOR-AGENT-HANDBOOK.md Part 1: - Every error now has unique ERR-TELEMETRY-XXX code - Database errors in updateSpan() no longer silent - Kuma push failures now logged (was silent with misleading comment) - All errors include actionable context Assignee: Hans Auditor: Yurii Refs: issues/001, issues/002, issues/003, issues/004 --- clavis/clavis-telemetry/kuma.go | 15 ++- clavis/clavis-telemetry/main.go | 38 ++++++-- issues/001-missing-error-codes.md | 91 ++++++++++++++++++ issues/002-unhandled-db-errors.md | 113 ++++++++++++++++++++++ issues/003-silent-kuma-failure.md | 112 ++++++++++++++++++++++ issues/004-tarpit-flush-error.md | 96 +++++++++++++++++++ issues/AUDIT-REPORT-clavis-telemetry.md | 120 ++++++++++++++++++++++++ 7 files changed, 572 insertions(+), 13 deletions(-) create mode 100644 issues/001-missing-error-codes.md create mode 100644 issues/002-unhandled-db-errors.md create mode 100644 issues/003-silent-kuma-failure.md create mode 100644 issues/004-tarpit-flush-error.md create mode 100644 issues/AUDIT-REPORT-clavis-telemetry.md diff --git a/clavis/clavis-telemetry/kuma.go b/clavis/clavis-telemetry/kuma.go index 2ebb404..c58f25e 100644 --- a/clavis/clavis-telemetry/kuma.go +++ b/clavis/clavis-telemetry/kuma.go @@ -3,6 +3,7 @@ package main import ( + "log" "net/http" "os" "strings" @@ -41,7 +42,10 @@ func sendKumaPush(kumaURL string) { // Check last telemetry received var lastBeat int64 - db.QueryRow(`SELECT MAX(received_at) FROM telemetry`).Scan(&lastBeat) + if err := db.QueryRow(`SELECT MAX(received_at) FROM telemetry`).Scan(&lastBeat); err != nil { + log.Printf("ERR-TELEMETRY-033: Failed to query last telemetry timestamp - %v", err) + // Continue with lastBeat=0, will show warning status + } now := time.Now().Unix() if now-lastBeat > 300 { // No telemetry in 5 minutes - still up but warning @@ -54,8 +58,13 @@ func sendKumaPush(kumaURL string) { payload := `{"status":"` + status + `","msg":"` + strings.ReplaceAll(msg, `"`, `\"`) + `","ping":60}` resp, err := http.Post(kumaURL, "application/json", strings.NewReader(payload)) if err != nil { - // Silent fail - Kuma will detect silence as down + log.Printf("ERR-TELEMETRY-030: Failed to push health status to Kuma at %s - %v", kumaURL, err) return } - resp.Body.Close() + if resp.StatusCode != http.StatusOK { + log.Printf("ERR-TELEMETRY-031: Kuma returned non-OK status %d from %s", resp.StatusCode, kumaURL) + } + if err := resp.Body.Close(); err != nil { + log.Printf("ERR-TELEMETRY-032: Failed to close Kuma response body - %v", err) + } } diff --git a/clavis/clavis-telemetry/main.go b/clavis/clavis-telemetry/main.go index a6e9cc2..ce1a52e 100644 --- a/clavis/clavis-telemetry/main.go +++ b/clavis/clavis-telemetry/main.go @@ -38,13 +38,13 @@ func main() { var err error db, err = sql.Open("sqlite3", dbPath) if err != nil { - log.Fatalf("Failed to open operations.db: %v", err) + log.Fatalf("ERR-TELEMETRY-001: Failed to open operations.db at %s - %v. Check permissions and disk space.", dbPath, err) } defer db.Close() // Load CA chain for mTLS - mandatory, no fallback if err := loadCA(caChainPath); err != nil { - log.Fatalf("Failed to load CA chain for mTLS: %v", err) + log.Fatalf("ERR-TELEMETRY-002: Failed to load CA chain from %s - %v. Ensure CA certificate exists and is valid PEM.", caChainPath, err) } // Ensure telemetry table exists @@ -141,6 +141,8 @@ func tarpit(w http.ResponseWriter, r *http.Request) { return // Client disconnected } if flusher, ok := w.(http.Flusher); ok { + // Flush has no return value per http.Flusher interface + // Write error above is the primary signal for client disconnect flusher.Flush() } time.Sleep(time.Second) @@ -225,7 +227,7 @@ func handleTelemetry(w http.ResponseWriter, r *http.Request) { // Verify certificate is valid and not expired if _, err := cert.Verify(x509.VerifyOptions{Roots: caPool}); err != nil { - log.Printf("Invalid certificate from %s: %v", popID, err) + log.Printf("ERR-TELEMETRY-003: Invalid or expired certificate from %s - %v", popID, err) tarpit(w, r) return } @@ -325,17 +327,28 @@ func updateSpan(nodeID, hostname, version string, cpuPercent float64, memUsedMB, serverAge := now - processStartTime var inMaint bool - db.QueryRow(`SELECT COUNT(*) > 0 FROM maintenance WHERE end_at IS NULL`).Scan(&inMaint) + if err := db.QueryRow(`SELECT COUNT(*) > 0 FROM maintenance WHERE end_at IS NULL`).Scan(&inMaint); err != nil { + log.Printf("ERR-TELEMETRY-010: Failed to check maintenance mode - %v", err) + // Continue with inMaint=false as safe default + inMaint = false + } var spanID int64 var spanEnd int64 err := db.QueryRow(`SELECT id, end_at FROM uptime_spans WHERE node_id = ? ORDER BY end_at DESC LIMIT 1`, nodeID).Scan(&spanID, &spanEnd) + if err != nil && err != sql.ErrNoRows { + log.Printf("ERR-TELEMETRY-011: Failed to query latest uptime span for node=%s - %v", nodeID, err) + } if err == nil && (inMaint || (now-spanEnd) <= 60) { - db.Exec(`UPDATE uptime_spans SET end_at = ? WHERE id = ?`, now, spanID) + if _, execErr := db.Exec(`UPDATE uptime_spans SET end_at = ? WHERE id = ?`, now, spanID); execErr != nil { + log.Printf("ERR-TELEMETRY-012: Failed to extend uptime span id=%d for node=%s - %v", spanID, nodeID, execErr) + } } else if err == nil && serverAge < 60 { log.Printf("SPAN EXTEND node=%s gap=%ds (server up %ds, too early to judge)", nodeID, now-spanEnd, serverAge) - db.Exec(`UPDATE uptime_spans SET end_at = ? WHERE id = ?`, now, spanID) + if _, execErr := db.Exec(`UPDATE uptime_spans SET end_at = ? WHERE id = ?`, now, spanID); execErr != nil { + log.Printf("ERR-TELEMETRY-013: Failed to extend early-judgment span id=%d for node=%s - %v", spanID, nodeID, execErr) + } } else if !inMaint { gapSeconds := now - spanEnd if err == nil { @@ -351,7 +364,10 @@ func updateSpan(nodeID, hostname, version string, cpuPercent float64, memUsedMB, nodeID, hostname, version) } - db.Exec(`INSERT INTO uptime_spans (node_id, start_at, end_at) VALUES (?, ?, ?)`, nodeID, now, now) + if _, execErr := db.Exec(`INSERT INTO uptime_spans (node_id, start_at, end_at) VALUES (?, ?, ?)`, nodeID, now, now); execErr != nil { + log.Printf("ERR-TELEMETRY-014: Failed to insert new uptime span for node=%s - %v", nodeID, execErr) + return // Don't alert if we couldn't record the span + } go alertOutage(nodeID, hostname, gapSeconds, err != nil) } @@ -380,7 +396,7 @@ func alertOutage(nodeID, hostname string, gap int64, firstSpan bool) { req, err := http.NewRequest("POST", ntfyURL, strings.NewReader(body)) if err != nil { - log.Printf("OUTAGE SPAN ntfy error creating request: %v", err) + log.Printf("ERR-TELEMETRY-020: Failed to create ntfy alert request for node=%s - %v", nodeID, err) return } req.Header.Set("Authorization", "Bearer "+ntfyToken) @@ -392,9 +408,11 @@ func alertOutage(nodeID, hostname string, gap int64, firstSpan bool) { client := &http.Client{Timeout: 10 * time.Second} resp, err := client.Do(req) if err != nil { - log.Printf("OUTAGE SPAN ntfy error sending alert: %v", err) + log.Printf("ERR-TELEMETRY-021: Failed to send ntfy alert for node=%s to %s - %v", nodeID, ntfyURL, err) return } - resp.Body.Close() + if err := resp.Body.Close(); err != nil { + log.Printf("ERR-TELEMETRY-022: Failed to close ntfy response body for node=%s - %v", nodeID, err) + } log.Printf("OUTAGE SPAN ntfy alert sent for node=%s", nodeID) } diff --git a/issues/001-missing-error-codes.md b/issues/001-missing-error-codes.md new file mode 100644 index 0000000..e848d04 --- /dev/null +++ b/issues/001-missing-error-codes.md @@ -0,0 +1,91 @@ +# Issue: Missing unique error codes in clavis-telemetry + +**Domain:** clavis-telemetry +**Assignee:** @hans +**Labels:** `violation`, `cardinal-rule-part-1`, `error-handling` +**Priority:** Medium +**Date:** 2026-04-08 + +--- + +## Violation + +**Cardinal Rule Violated:** Part 1 — "Mandatory error handling with unique codes" + +Per CLAVITOR-AGENT-HANDBOOK.md Part 1: +> Mandatory error handling with unique codes: +> - Every `if` needs an `else`. The `if` exists because the condition IS possible +> - Use unique error codes: `ToLog("ERR-12345: L3 unavailable in decrypt")` +> - When your "impossible" case triggers in production, you need to know exactly which assumption failed and where. + +**Error messages that actually help:** +> Every error message shown to a user must be: +> 1. **Uniquely recognizable** — include an error code: `ERR-12345: ...` +> 2. **Actionable** — the user must know what to do next +> 3. **Routed to the actor who can resolve it** + +--- + +## Location + +File: `clavis/clavis-telemetry/main.go` + +Lines with violations: + +| Line | Current Code | Violation | +|------|--------------|-----------| +| 41 | `log.Fatalf("Failed to open operations.db: %v", err)` | No unique error code | +| 47 | `log.Fatalf("Failed to load CA chain for mTLS: %v", err)` | No unique error code | +| 228 | `log.Printf("Invalid certificate from %s: %v", popID, err)` | No unique error code | +| 337 | `log.Printf("SPAN EXTEND node=%s gap=%ds...")` | No unique error code | +| 342-351 | `log.Printf("OUTAGE SPAN node=%s...")` | No unique error code | +| 367-370 | `log.Printf("OUTAGE SPAN... alerting disabled")` | No unique error code | +| 383 | `log.Printf("OUTAGE SPAN ntfy error creating request: %v", err)` | No unique error code | +| 395 | `log.Printf("OUTAGE SPAN ntfy error sending alert: %v", err)` | No unique error code | +| 398 | `log.Printf("OUTAGE SPAN ntfy alert sent for node=%s", nodeID)` | No unique error code | + +File: `clavis/clavis-telemetry/kuma.go` + +| Line | Current Code | Violation | +|------|--------------|-----------| +| 56-58 | Silent fail on Kuma push error | Missing error handling entirely | + +--- + +## Required Fix + +1. Assign unique error codes for each error path (e.g., `ERR-TELEMETRY-001` through `ERR-TELEMETRY-020`) +2. Format: `ERR-TELEMETRY-XXX: ` +3. Include error codes in: + - Fatal logs (database/CA loading failures) + - Certificate validation failures + - External alerting failures (ntfy) + - Kuma push failures (currently silent) + +--- + +## Example Fix + +```go +// Before: +log.Fatalf("Failed to open operations.db: %v", err) + +// After: +log.Fatalf("ERR-TELEMETRY-001: Failed to open operations.db at %s - %v. Check permissions and disk space.", dbPath, err) +``` + +--- + +## Verification Checklist + +- [ ] All `log.Fatalf` calls include `ERR-TELEMETRY-XXX` codes +- [ ] All `log.Printf` error logs include `ERR-TELEMETRY-XXX` codes +- [ ] Kuma push errors are no longer silent (line 56-58 kuma.go) +- [ ] Certificate validation failures include error codes +- [ ] External alert failures (ntfy) include error codes +- [ ] Test cases verify error codes appear in output + +--- + +**Reporter:** Yurii (Code & Principle Review) +**Reference:** CLAVITOR-AGENT-HANDBOOK.md Part 1, "Mandatory error handling with unique codes" diff --git a/issues/002-unhandled-db-errors.md b/issues/002-unhandled-db-errors.md new file mode 100644 index 0000000..006aa26 --- /dev/null +++ b/issues/002-unhandled-db-errors.md @@ -0,0 +1,113 @@ +# Issue: Database errors not properly handled in updateSpan + +**Domain:** clavis-telemetry +**Assignee:** @hans +**Labels:** `violation`, `cardinal-rule-part-1`, `error-handling`, `sql` +**Priority:** High +**Date:** 2026-04-08 + +--- + +## Violation + +**Cardinal Rule Violated:** Part 1 — "Mandatory error handling with unique codes" AND "Every `if` needs an `else`" + +Per CLAVITOR-AGENT-HANDBOOK.md Part 1: +> Mandatory error handling with unique codes: +> - Every `if` needs an `else`. The `if` exists because the condition IS possible — "impossible" just means "I can't imagine it happening." The else handles the case where your imagination was wrong. +> - Use unique error codes: `ToLog("ERR-12345: L3 unavailable in decrypt")` + +--- + +## Location + +File: `clavis/clavis-telemetry/main.go` + +Function: `updateSpan()` (lines 323-358) + +**Violation 1:** Line 328 — Error from `QueryRow` not checked: +```go +db.QueryRow(`SELECT COUNT(*) > 0 FROM maintenance WHERE end_at IS NULL`).Scan(&inMaint) +``` +This returns an error that is silently ignored. + +**Violation 2:** Line 332 — Error from `QueryRow` checked but not logged with unique code: +```go +err := db.QueryRow(`SELECT id, end_at FROM uptime_spans WHERE node_id = ? ORDER BY end_at DESC LIMIT 1`, nodeID).Scan(&spanID, &spanEnd) +``` +The `err` is used for flow control but never logged with an error code when it's non-nil. + +**Violation 3:** Line 335 — Error from `Exec` not checked: +```go +db.Exec(`UPDATE uptime_spans SET end_at = ? WHERE id = ?`, now, spanID) +``` + +**Violation 4:** Line 338 — Error from `Exec` not checked: +```go +db.Exec(`UPDATE uptime_spans SET end_at = ? WHERE id = ?`, now, spanID) +``` + +**Violation 5:** Line 354 — Error from `Exec` not checked: +```go +db.Exec(`INSERT INTO uptime_spans (node_id, start_at, end_at) VALUES (?, ?, ?)`, nodeID, now, now) +``` + +--- + +## Why This Matters + +In telemetry/ops infrastructure, silent database failures mean: +1. Outage tracking becomes unreliable +2. Maintenance mode detection fails silently +3. Uptime span data becomes corrupt without anyone knowing +4. No forensic trail when investigating incidents + +When the "impossible" happens (disk full, DB locked, schema mismatch), we need to know immediately with a unique error code. + +--- + +## Required Fix + +1. Check and handle ALL `db.QueryRow()` errors with unique codes +2. Check and handle ALL `db.Exec()` errors with unique codes +3. Log errors with format: `ERR-TELEMETRY-XXX: -
` +4. Consider whether failures should affect the span tracking logic + +--- + +## Example Fix + +```go +func updateSpan(...) { + now := time.Now().Unix() + serverAge := now - processStartTime + + var inMaint bool + err := db.QueryRow(`SELECT COUNT(*) > 0 FROM maintenance WHERE end_at IS NULL`).Scan(&inMaint) + if err != nil { + log.Printf("ERR-TELEMETRY-010: Failed to check maintenance mode - %v", err) + // Continue with inMaint=false as safe default + } + + var spanID int64 + var spanEnd int64 + err = db.QueryRow(`SELECT id, end_at FROM uptime_spans WHERE node_id = ? ORDER BY end_at DESC LIMIT 1`, nodeID).Scan(&spanID, &spanEnd) + + // ... rest of function with proper error handling +} +``` + +--- + +## Verification Checklist + +- [ ] All `db.QueryRow()` calls check and log errors with unique codes +- [ ] All `db.Exec()` calls check and log errors with unique codes +- [ ] Error handling doesn't break span tracking flow +- [ ] Test cases added for database failure scenarios +- [ ] Code reviewed by Yurii before merge + +--- + +**Reporter:** Yurii (Code & Principle Review) +**Reference:** CLAVITOR-AGENT-HANDBOOK.md Part 1, "Mandatory error handling with unique codes" diff --git a/issues/003-silent-kuma-failure.md b/issues/003-silent-kuma-failure.md new file mode 100644 index 0000000..ac6fe8f --- /dev/null +++ b/issues/003-silent-kuma-failure.md @@ -0,0 +1,112 @@ +# Issue: Silent failure in Kuma push — no error handling + +**Domain:** clavis-telemetry +**Assignee:** @hans +**Labels:** `violation`, `cardinal-rule-part-1`, `error-handling`, `silent-failure` +**Priority:** High +**Date:** 2026-04-08 + +--- + +## Violation + +**Cardinal Rule Violated:** Part 1 — "Mandatory error handling with unique codes" AND Part 1 — "Silent fallbacks are not fixes" + +Per CLAVITOR-AGENT-HANDBOOK.md Part 1: +> Quick fixes are not fixes. A "temporary" hack that ships is permanent. +> Every `if` needs an `else`. The `if` exists because the condition IS possible. + +--- + +## Location + +File: `clavis/clavis-telemetry/kuma.go` + +Lines 53-59: + +```go +// POST to Kuma +payload := `{"status":"` + status + `","msg":"` + strings.ReplaceAll(msg, `"`, `\"`) + `","ping":60}` +resp, err := http.Post(kumaURL, "application/json", strings.NewReader(payload)) +if err != nil { + // Silent fail - Kuma will detect silence as down + return +} +resp.Body.Close() +``` + +--- + +## The Violation + +1. **Silent failure:** The error is caught and completely ignored with only a comment +2. **No error code:** No `ERR-XXXXX` code for operational forensics +3. **No logging:** The failure is invisible in logs +4. **Comment is misleading:** "Kuma will detect silence as down" — but operators won't know WHY Kuma shows down + +--- + +## Why This Matters + +When Kuma shows "down", operators need to know if it's because: +- The telemetry service is actually down (DB failure) +- The telemetry service can't reach Kuma (network issue) +- Kuma itself is having issues + +Silent failures create blind spots in operational monitoring. The telemetry service could be failing to report health for hours, and the only symptom would be Kuma showing red — with no logs explaining why. + +--- + +## Required Fix + +1. Log Kuma push failures with unique error code +2. Include the error details in the log +3. Consider retry logic or backoff (optional) +4. Document the failure mode + +--- + +## Example Fix + +```go +// POST to Kuma +payload := `{"status":"` + status + `","msg":"` + strings.ReplaceAll(msg, `"`, `\"`) + `","ping":60}` +resp, err := http.Post(kumaURL, "application/json", strings.NewReader(payload)) +if err != nil { + log.Printf("ERR-TELEMETRY-020: Failed to push health to Kuma at %s - %v", kumaURL, err) + return +} +defer resp.Body.Close() + +if resp.StatusCode != http.StatusOK { + log.Printf("ERR-TELEMETRY-021: Kuma returned non-OK status %d from %s", resp.StatusCode, kumaURL) +} +``` + +--- + +## Additional Issue: resp.Body.Close() Error Ignored + +Line 60: `resp.Body.Close()` returns an error that is silently discarded. + +Fix: +```go +if err := resp.Body.Close(); err != nil { + log.Printf("ERR-TELEMETRY-022: Failed to close Kuma response body - %v", err) +} +``` + +--- + +## Verification Checklist + +- [ ] Kuma push failures logged with `ERR-TELEMETRY-020` +- [ ] Non-OK HTTP responses logged with `ERR-TELEMETRY-021` +- [ ] Response body close errors handled with `ERR-TELEMETRY-022` +- [ ] All errors include actionable context (URL, status, error details) +- [ ] Test case added for Kuma push failure scenario + +--- + +**Reporter:** Yurii (Code & Principle Review) +**Reference:** CLAVITOR-AGENT-HANDBOOK.md Part 1, "Mandatory error handling with unique codes" and "Silent fallbacks are not fixes" diff --git a/issues/004-tarpit-flush-error.md b/issues/004-tarpit-flush-error.md new file mode 100644 index 0000000..769aa0e --- /dev/null +++ b/issues/004-tarpit-flush-error.md @@ -0,0 +1,96 @@ +# Issue: Tarpit handler writes to response after client disconnect without checking error + +**Domain:** clavis-telemetry +**Assignee:** @hans +**Labels:** `violation`, `cardinal-rule-part-1`, `error-handling` +**Priority:** Medium +**Date:** 2026-04-08 + +--- + +## Violation + +**Cardinal Rule Violated:** Part 1 — "Mandatory error handling with unique codes" AND "Every `if` needs an `else`" + +Per CLAVITOR-AGENT-HANDBOOK.md Part 1: +> Mandatory error handling with unique codes: +> - Every `if` needs an `else`. + +--- + +## Location + +File: `clavis/clavis-telemetry/main.go` + +Function: `tarpit()` (lines 121-148) + +Lines 139-147: +```go +// Drip one byte per second for 30 seconds +for i := 0; i < 30; i++ { + _, err := w.Write([]byte(" ")) + if err != nil { + return // Client disconnected + } + if flusher, ok := w.(http.Flusher); ok { + flusher.Flush() + } + time.Sleep(time.Second) +} +``` + +--- + +## The Violation + +1. The `w.Write()` error is properly checked (good!) +2. But the `flusher.Flush()` error is **completely ignored** +3. No unique error code for the flush failure case + +While flush errors are less common in tarpit scenarios (we're deliberately wasting scanner resources), the principle states: "Every `if` needs an `else`." + +--- + +## Why This (Minor) Fix Matters + +The tarpit is a security feature. If the flush fails: +- We might be wasting CPU cycles on a broken connection +- The scanner might detect the tarpit by timing anomalies +- We lose the "one byte per second" rate that makes tarpits effective + +This is a low-severity fix, but it's about honoring the principle consistently. + +--- + +## Required Fix + +```go +for i := 0; i < 30; i++ { + _, err := w.Write([]byte(" ")) + if err != nil { + return // Client disconnected - expected, no log needed + } + if flusher, ok := w.(http.Flusher); ok { + err = flusher.Flush() + if err != nil { + return // Client disconnected during flush + } + } + time.Sleep(time.Second) +} +``` + +Note: Since tarpit is intentionally wasting resources on scanners, we don't need unique error codes for client disconnects (that's the expected outcome). But we should acknowledge the error rather than ignore it. + +--- + +## Verification Checklist + +- [ ] `flusher.Flush()` error is checked +- [ ] Early return on flush error (like write error) +- [ ] Test case verifies tarpit handles early disconnect gracefully + +--- + +**Reporter:** Yurii (Code & Principle Review) +**Reference:** CLAVITOR-AGENT-HANDBOOK.md Part 1, "Mandatory error handling with unique codes" diff --git a/issues/AUDIT-REPORT-clavis-telemetry.md b/issues/AUDIT-REPORT-clavis-telemetry.md new file mode 100644 index 0000000..e634ac4 --- /dev/null +++ b/issues/AUDIT-REPORT-clavis-telemetry.md @@ -0,0 +1,120 @@ +# Cardinal Rule Violations Audit Report — clavis-telemetry + +**Auditor:** Yurii (Code & Principle Review) +**Domain:** clavis-telemetry +**Domain Owner:** Hans (operations, monitoring, NOC) +**Date:** 2026-04-08 +**Handbook Version:** 1.0 + +--- + +## Audit Summary + +4 Cardinal Rule violations identified in `clavis/clavis-telemetry/`: + +| Issue | File | Violation | Severity | Assignee | +|-------|------|-----------|----------|----------| +| #001 | main.go | Missing unique error codes | Medium | @hans | +| #002 | main.go | Unhandled database errors | **High** | @hans | +| #003 | kuma.go | Silent failure in Kuma push | **High** | @hans | +| #004 | main.go | Unchecked flush error | Low | @hans | + +--- + +## Cardinal Rules Status + +| Rule | Status | Notes | +|------|--------|-------| +| #1 — Security failures are LOUD | ⚠️ Partial | Error handling present but lacks unique codes | +| #2 — Server never holds L2/L3 | ✅ Pass | Telemetry doesn't handle key material | +| #3 — Browser is trust anchor | N/A | Not applicable to telemetry service | +| #4 — Threat defenses | ✅ Pass | mTLS properly implemented | +| #5 — WL3 GDPR-out-of-scope | ✅ Pass | No PII in telemetry schema | +| #6 — Admin over Tailscale | ✅ Pass | Separate from public interface | +| #7 — Webhooks signed | N/A | No webhook handlers in telemetry | + +--- + +## Part 1 Violations (Culture/Foundation) + +### Mandatory Error Handling with Unique Codes — VIOLATED + +**Rule:** +> Mandatory error handling with unique codes: +> - Every `if` needs an `else`. The `if` exists because the condition IS possible +> - Use unique error codes: `ToLog("ERR-12345: L3 unavailable in decrypt")` + +**Violations Found:** + +1. **Generic error messages** (main.go:41, 47, 228, etc.) + - `log.Fatalf("Failed to open operations.db: %v", err)` + - Missing `ERR-TELEMETRY-XXX` prefix + +2. **Silent failures** (kuma.go:56-58) + - Kuma push errors completely ignored + - Comment rationalizes the silence + +3. **Unchecked errors** (main.go:328, 335, 338, 354) + - Database `QueryRow` and `Exec` errors not handled + - Silent data corruption risk + +--- + +## Recommended Fix Priority + +### Immediate (Before Next Deploy) +1. **Issue #002** — Unhandled database errors in `updateSpan()` + - Risk: Silent outage tracking failures + - Impact: Operations blind spot during incidents + +2. **Issue #003** — Silent Kuma push failure + - Risk: Monitoring blind spot + - Impact: "Down" status with no logs explaining why + +### This Sprint +3. **Issue #001** — Missing unique error codes + - Add `ERR-TELEMETRY-XXX` codes to all error paths + - Update test cases to verify codes appear + +4. **Issue #004** — Unchecked tarpit flush error + - Minor fix, principle consistency + +--- + +## Files Created + +- `issues/001-missing-error-codes.md` — Assignee: Hans +- `issues/002-unhandled-db-errors.md` — Assignee: Hans +- `issues/003-silent-kuma-failure.md` — Assignee: Hans +- `issues/004-tarpit-flush-error.md` — Assignee: Hans + +--- + +## Domain Compliance + +Per CLAVITOR-AGENT-HANDBOOK.md Section V: + +> ### `clavis-telemetry` — operator telemetry +> **Owns:** +> - Heartbeat metrics from POPs to central +> - Per-POP system metrics (CPU, memory, disk, vault count) +> +> **Must never:** +> - Send any vault content. ✅ Telemetry is operational, not data. +> - Send raw IP addresses of users. ✅ Aggregate counts only. +> - Run in community edition by default. ✅ `//go:build commercial` tag present. + +**Domain compliance: PASS** — The telemetry service properly adheres to its domain constraints. The violations are in error handling hygiene, not in security model violations. + +--- + +## Next Steps + +1. Hans addresses issues #002 and #003 (High priority) +2. Hans addresses issues #001 and #004 (Medium/Low priority) +3. Yurii reviews PRs before merge +4. Add daily review checklist to telemetry directory per Section III + +--- + +*Foundation First. No mediocrity. Ever.*