clavitor/clavis/clavis-telemetry/error_handling_test.go

582 lines
16 KiB
Go

//go:build commercial
package main
import (
"bytes"
"database/sql"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"net/http/httptest"
"os"
"strings"
"sync"
"testing"
"time"
_ "github.com/mattn/go-sqlite3"
)
// logCapture captures log output for verification
type logCapture struct {
original *os.File
reader *os.File
writer *os.File
buffer chan string
done chan bool
}
func captureLogs() *logCapture {
// Save original stderr
original := os.Stderr
// Create pipe to capture log output
r, w, _ := os.Pipe()
// Redirect log output
os.Stderr = w
log.SetOutput(w)
lc := &logCapture{
original: original,
reader: r,
writer: w,
buffer: make(chan string, 100),
done: make(chan bool),
}
// Start goroutine to read from pipe
go func() {
var buf bytes.Buffer
io.Copy(&buf, r)
lc.buffer <- buf.String()
close(lc.buffer)
lc.done <- true
}()
return lc
}
func (lc *logCapture) restore() string {
// Restore original stderr
os.Stderr = lc.original
log.SetOutput(lc.original)
// Close writer to signal EOF to reader
lc.writer.Close()
// Wait for reader goroutine to finish
<-lc.done
// Return captured output
return <-lc.buffer
}
// TestUpdateSpan_DatabaseErrors verifies ERR-TELEMETRY-010 through ERR-TELEMETRY-014
func TestUpdateSpan_DatabaseErrors(t *testing.T) {
tests := []struct {
name string
setupDB func(*sql.DB)
expectedErrors []string
description string
}{
{
name: "ERR-TELEMETRY-010: Maintenance check fails",
setupDB: func(testDB *sql.DB) {
// Drop maintenance table to cause query failure
testDB.Exec(`DROP TABLE IF EXISTS maintenance`)
},
expectedErrors: []string{"ERR-TELEMETRY-010"},
description: "Should log ERR-TELEMETRY-010 when maintenance check fails",
},
{
name: "ERR-TELEMETRY-011: Uptime span query fails",
setupDB: func(testDB *sql.DB) {
// Drop uptime_spans table to cause query failure
testDB.Exec(`DROP TABLE IF EXISTS uptime_spans`)
},
expectedErrors: []string{"ERR-TELEMETRY-011"},
description: "Should log ERR-TELEMETRY-011 when uptime span query fails",
},
{
name: "ERR-TELEMETRY-012: Span extend fails",
setupDB: func(testDB *sql.DB) {
// Insert a span but make the table read-only by corrupting it
testDB.Exec(`INSERT INTO uptime_spans (node_id, start_at, end_at) VALUES (?, ?, ?)`,
"test-node-err", time.Now().Unix()-30, time.Now().Unix()-30)
// Now drop and recreate to cause update to reference non-existent row
testDB.Exec(`DELETE FROM uptime_spans WHERE node_id = ?`, "test-node-err")
},
expectedErrors: []string{}, // May or may not trigger depending on timing
description: "Database error handling for span extend",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Setup test database
setupTestDB(t)
defer cleanupTestDB()
// Apply database corruption/setup
tt.setupDB(db)
// Capture logs
lc := captureLogs()
// Call updateSpan which should trigger error logging
updateSpan("test-node-err", "test-host", "1.0.0", 50.0, 4096, 8192, 50000, 100000, 0.5, 3600)
// Restore and get logs
logs := lc.restore()
// Verify expected errors appear in logs
for _, errCode := range tt.expectedErrors {
if !strings.Contains(logs, errCode) {
t.Errorf("%s: expected log to contain %s, but got:\n%s", tt.description, errCode, logs)
}
}
})
}
}
// TestUpdateSpan_MaintenanceModeError specifically tests ERR-TELEMETRY-010
func TestUpdateSpan_MaintenanceModeError(t *testing.T) {
setupTestDB(t)
defer cleanupTestDB()
// Capture logs
var logBuffer bytes.Buffer
originalOutput := log.Writer()
log.SetOutput(&logBuffer)
defer log.SetOutput(originalOutput)
// Corrupt the database by dropping the maintenance table
_, err := db.Exec(`DROP TABLE maintenance`)
if err != nil {
t.Fatalf("Failed to drop maintenance table: %v", err)
}
// Call updateSpan
updateSpan("test-node-maint", "test-host", "1.0.0", 50.0, 4096, 8192, 50000, 100000, 0.5, 3600)
// Verify ERR-TELEMETRY-010 appears in logs
logs := logBuffer.String()
if !strings.Contains(logs, "ERR-TELEMETRY-010") {
t.Errorf("Expected ERR-TELEMETRY-010 in logs when maintenance check fails, got:\n%s", logs)
}
if !strings.Contains(logs, "Failed to check maintenance mode") {
t.Errorf("Expected 'Failed to check maintenance mode' message, got:\n%s", logs)
}
}
// TestUpdateSpan_UptimeSpanQueryError specifically tests ERR-TELEMETRY-011
func TestUpdateSpan_UptimeSpanQueryError(t *testing.T) {
setupTestDB(t)
defer cleanupTestDB()
// Capture logs
var logBuffer bytes.Buffer
originalOutput := log.Writer()
log.SetOutput(&logBuffer)
defer log.SetOutput(originalOutput)
// Corrupt the database by making it read-only via closing and reopening with bad path
db.Close()
var err error
db, err = sql.Open("sqlite3", ":memory:")
if err != nil {
t.Fatalf("Failed to reopen database: %v", err)
}
// Don't create tables - queries will fail
// Call updateSpan
updateSpan("test-node-span", "test-host", "1.0.0", 50.0, 4096, 8192, 50000, 100000, 0.5, 3600)
// Verify ERR-TELEMETRY-011 appears in logs
logs := logBuffer.String()
if !strings.Contains(logs, "ERR-TELEMETRY-011") {
t.Errorf("Expected ERR-TELEMETRY-011 in logs when uptime span query fails, got:\n%s", logs)
}
if !strings.Contains(logs, "Failed to query latest uptime span") {
t.Errorf("Expected 'Failed to query latest uptime span' message, got:\n%s", logs)
}
}
// TestSendKumaPush_DatabaseError tests ERR-TELEMETRY-033
func TestSendKumaPush_DatabaseError(t *testing.T) {
// Setup test database then corrupt it
var err error
db, err = sql.Open("sqlite3", ":memory:")
if err != nil {
t.Fatalf("Failed to open test database: %v", err)
}
defer db.Close()
// Don't create tables - this will cause queries to fail
// Capture logs
var logBuffer bytes.Buffer
originalOutput := log.Writer()
log.SetOutput(&logBuffer)
defer log.SetOutput(originalOutput)
// Create a test server that accepts Kuma pushes
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
}))
defer server.Close()
// Call sendKumaPush
sendKumaPush(server.URL)
// Verify ERR-TELEMETRY-033 appears in logs (failed to query last telemetry)
logs := logBuffer.String()
if !strings.Contains(logs, "ERR-TELEMETRY-033") {
t.Errorf("Expected ERR-TELEMETRY-033 in logs when telemetry timestamp query fails, got:\n%s", logs)
}
}
// TestSendKumaPush_NetworkError tests ERR-TELEMETRY-030
func TestSendKumaPush_NetworkError(t *testing.T) {
setupTestDB(t)
defer cleanupTestDB()
// Capture logs
var logBuffer bytes.Buffer
originalOutput := log.Writer()
log.SetOutput(&logBuffer)
defer log.SetOutput(originalOutput)
// Use an invalid URL that will cause network error
invalidURL := "http://localhost:1/invalid" // Port 1 is typically invalid/unused
// Call sendKumaPush with invalid URL
sendKumaPush(invalidURL)
// Verify ERR-TELEMETRY-030 appears in logs
logs := logBuffer.String()
if !strings.Contains(logs, "ERR-TELEMETRY-030") {
t.Errorf("Expected ERR-TELEMETRY-030 in logs when Kuma push fails, got:\n%s", logs)
}
if !strings.Contains(logs, "Failed to push health status to Kuma") {
t.Errorf("Expected 'Failed to push health status to Kuma' message, got:\n%s", logs)
}
}
// TestSendKumaPush_NonOKStatus tests ERR-TELEMETRY-031
func TestSendKumaPush_NonOKStatus(t *testing.T) {
setupTestDB(t)
defer cleanupTestDB()
// Capture logs
var logBuffer bytes.Buffer
originalOutput := log.Writer()
log.SetOutput(&logBuffer)
defer log.SetOutput(originalOutput)
// Create a test server that returns non-OK status
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusServiceUnavailable) // 503
w.Write([]byte("Service Unavailable"))
}))
defer server.Close()
// Call sendKumaPush
sendKumaPush(server.URL)
// Verify ERR-TELEMETRY-031 appears in logs
logs := logBuffer.String()
if !strings.Contains(logs, "ERR-TELEMETRY-031") {
t.Errorf("Expected ERR-TELEMETRY-031 in logs when Kuma returns non-OK status, got:\n%s", logs)
}
if !strings.Contains(logs, "non-OK status 503") {
t.Errorf("Expected 'non-OK status 503' message, got:\n%s", logs)
}
}
// TestSendKumaPush_ResponseBodyCloseError tests response body handling
func TestSendKumaPush_ResponseBodyCloseError(t *testing.T) {
setupTestDB(t)
defer cleanupTestDB()
// Capture logs
var logBuffer bytes.Buffer
originalOutput := log.Writer()
log.SetOutput(&logBuffer)
defer log.SetOutput(originalOutput)
// Create a test server that returns OK
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
}))
defer server.Close()
// Call sendKumaPush
sendKumaPush(server.URL)
// Should complete without ERR-TELEMETRY-032 (successful close doesn't log error)
logs := logBuffer.String()
if strings.Contains(logs, "ERR-TELEMETRY-032") {
t.Errorf("Did not expect ERR-TELEMETRY-032 for successful response body close, got:\n%s", logs)
}
}
// TestTarpit_EarlyDisconnect tests that tarpit handles client disconnect gracefully
func TestTarpit_EarlyDisconnect(t *testing.T) {
// Create a custom ResponseWriter that simulates disconnect after first write
disconnectWriter := &disconnectSimulatingWriter{
headers: make(http.Header),
failAfter: 1, // Fail after first write
}
req := httptest.NewRequest("GET", "/unknown", nil)
// Use a goroutine since tarpit blocks
done := make(chan bool)
go func() {
tarpit(disconnectWriter, req)
done <- true
}()
// Wait for tarpit to complete (should be quick due to disconnect)
select {
case <-done:
// Expected - tarpit returned early due to disconnect
case <-time.After(2 * time.Second):
t.Error("tarpit did not return early on client disconnect")
}
// Verify that at least one write was attempted
if !disconnectWriter.written {
t.Error("tarpit should have attempted at least one write")
}
}
// disconnectSimulatingWriter simulates a client that disconnects after N writes
type disconnectSimulatingWriter struct {
headers http.Header
status int
written bool
writeCount int
failAfter int
flushCount int
}
func (m *disconnectSimulatingWriter) Header() http.Header {
return m.headers
}
func (m *disconnectSimulatingWriter) Write(p []byte) (int, error) {
m.written = true
m.writeCount++
if m.writeCount >= m.failAfter {
return 0, io.ErrClosedPipe // Simulate client disconnect
}
return len(p), nil
}
func (m *disconnectSimulatingWriter) WriteHeader(status int) {
m.status = status
}
func (m *disconnectSimulatingWriter) Flush() {
m.flushCount++
}
// TestErrorCodes_Unique verifies that all error codes are unique
func TestErrorCodes_Unique(t *testing.T) {
// Read the main.go and kuma.go files to extract error codes
mainContent, err := os.ReadFile("main.go")
if err != nil {
t.Fatalf("Failed to read main.go: %v", err)
}
kumaContent, err := os.ReadFile("kuma.go")
if err != nil {
t.Fatalf("Failed to read kuma.go: %v", err)
}
content := string(mainContent) + string(kumaContent)
// Extract all ERR-TELEMETRY-XXX codes
codes := make(map[string]int)
for i := 0; i < len(content)-15; i++ {
if content[i:i+14] == "ERR-TELEMETRY-" {
// Find the end of the code (3 digits)
end := i + 14
for end < len(content) && content[end] >= '0' && content[end] <= '9' {
end++
}
if end > i+14 {
code := content[i:end]
codes[code]++
}
}
}
// Verify each code appears only once (unique)
for code, count := range codes {
if count > 1 {
t.Errorf("Error code %s appears %d times - should be unique", code, count)
}
}
// Verify expected codes exist
expectedCodes := []string{
"ERR-TELEMETRY-001", // Failed to open operations.db
"ERR-TELEMETRY-002", // Failed to load CA chain
"ERR-TELEMETRY-003", // Invalid certificate
"ERR-TELEMETRY-004", // Failed to insert telemetry
"ERR-TELEMETRY-005", // Failed to create telemetry table
"ERR-TELEMETRY-006", // Failed to create telemetry index
"ERR-TELEMETRY-007", // Failed to create telemetry node_latest index
"ERR-TELEMETRY-008", // Failed to create uptime_spans table
"ERR-TELEMETRY-009", // Failed to create uptime_spans index
"ERR-TELEMETRY-010", // Failed to check maintenance mode
"ERR-TELEMETRY-011", // Failed to query latest uptime span
"ERR-TELEMETRY-012", // Failed to extend uptime span
"ERR-TELEMETRY-013", // Failed to extend early-judgment span
"ERR-TELEMETRY-014", // Failed to insert new uptime span
"ERR-TELEMETRY-015", // Failed to create maintenance table
"ERR-TELEMETRY-020", // Failed to create ntfy alert request
"ERR-TELEMETRY-021", // Failed to send ntfy alert
"ERR-TELEMETRY-022", // Failed to close ntfy response body
"ERR-TELEMETRY-030", // Failed to push health to Kuma
"ERR-TELEMETRY-031", // Kuma returned non-OK status
"ERR-TELEMETRY-032", // Failed to close Kuma response body (OK status)
"ERR-TELEMETRY-033", // Failed to query last telemetry timestamp
"ERR-TELEMETRY-034", // Failed to close Kuma response body (non-OK status)
"ERR-TELEMETRY-040", // tarpit called without Flusher
}
for _, code := range expectedCodes {
if _, exists := codes[code]; !exists {
t.Errorf("Expected error code %s not found in source", code)
}
}
t.Logf("Found %d unique error codes", len(codes))
}
// TestErrorCodes_Format verifies error codes follow the correct format
func TestErrorCodes_Format(t *testing.T) {
// Read source files
mainContent, err := os.ReadFile("main.go")
if err != nil {
t.Fatalf("Failed to read main.go: %v", err)
}
kumaContent, err := os.ReadFile("kuma.go")
if err != nil {
t.Fatalf("Failed to read kuma.go: %v", err)
}
content := string(mainContent) + string(kumaContent)
// Check that error messages follow format: ERR-TELEMETRY-XXX: Actionable message
lines := strings.Split(content, "\n")
for i, line := range lines {
if strings.Contains(line, "ERR-TELEMETRY-") && strings.Contains(line, "log.") {
// Verify format includes colon after error code
if !strings.Contains(line, "ERR-TELEMETRY-") || !strings.Contains(line, ":") {
t.Errorf("Line %d: Error message should follow format 'ERR-TELEMETRY-XXX: message', got: %s", i+1, strings.TrimSpace(line))
}
}
}
}
// TestHandleTelemetry_DatabaseInsertError tests ERR-TELEMETRY-004
func TestHandleTelemetry_DatabaseInsertError(t *testing.T) {
setupTestDB(t)
defer cleanupTestDB()
// Temporarily disable mTLS
oldCAPool := caPool
caPool = nil
defer func() { caPool = oldCAPool }()
// Corrupt database by making it read-only
db.Exec(`PRAGMA query_only = ON`)
payload := map[string]interface{}{
"node_id": "test-node-db-err",
"version": "1.0.0",
"hostname": "test-host",
"uptime_seconds": 3600,
"cpu_percent": 25.5,
"memory_total_mb": 8192,
"memory_used_mb": 4096,
"disk_total_mb": 100000,
"disk_used_mb": 50000,
"load_1m": 0.5,
"vault_count": 5,
"vault_size_mb": 10.5,
"vault_entries": 100,
"mode": "commercial",
}
body, _ := json.Marshal(payload)
req := httptest.NewRequest("POST", "/telemetry", bytes.NewReader(body))
w := httptest.NewRecorder()
// Capture logs
var logBuffer bytes.Buffer
originalOutput := log.Writer()
log.SetOutput(&logBuffer)
defer log.SetOutput(originalOutput)
handleTelemetry(w, req)
// Restore query_only mode
db.Exec(`PRAGMA query_only = OFF`)
// Should return 500 error
resp := w.Result()
if resp.StatusCode != 500 {
t.Errorf("Expected 500 status for database error, got %d", resp.StatusCode)
}
// Verify ERR-TELEMETRY-004 appears in logs
logs := logBuffer.String()
if !strings.Contains(logs, "ERR-TELEMETRY-004") {
t.Errorf("Expected ERR-TELEMETRY-004 in logs when telemetry insert fails, got:\n%s", logs)
}
}
// TestConcurrentErrorHandling verifies error handling is thread-safe
func TestConcurrentErrorHandling(t *testing.T) {
setupTestDB(t)
defer cleanupTestDB()
// Capture logs
var logBuffer bytes.Buffer
originalOutput := log.Writer()
log.SetOutput(&logBuffer)
defer log.SetOutput(originalOutput)
// Run multiple goroutines that trigger error handling
var wg sync.WaitGroup
for i := 0; i < 10; i++ {
wg.Add(1)
go func(id int) {
defer wg.Done()
nodeID := fmt.Sprintf("concurrent-node-%d", id)
updateSpan(nodeID, "test-host", "1.0.0", 50.0, 4096, 8192, 50000, 100000, 0.5, 3600)
}(i)
}
wg.Wait()
// Verify no panics occurred and logs were written safely
logs := logBuffer.String()
// Should have OUTAGE SPAN logs for each node
for i := 0; i < 10; i++ {
expectedNode := fmt.Sprintf("concurrent-node-%d", i)
if !strings.Contains(logs, expectedNode) {
t.Errorf("Expected logs to contain node %s", expectedNode)
}
}
}