clavitor/clavis/clavis-telemetry/error_handling_test.go

//go:build commercial

package main

import (
	"bytes"
	"database/sql"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/http/httptest"
	"os"
	"strings"
	"sync"
	"testing"
	"time"

	_ "github.com/mattn/go-sqlite3"
)

// logCapture captures log output for verification
type logCapture struct {
	original *os.File
	reader   *os.File
	writer   *os.File
	buffer   chan string
	done     chan bool
}

func captureLogs() *logCapture {
	// Save original stderr
	original := os.Stderr

	// Create pipe to capture log output
	r, w, _ := os.Pipe()

	// Redirect log output
	os.Stderr = w
	log.SetOutput(w)

	lc := &logCapture{
		original: original,
		reader:   r,
		writer:   w,
		buffer:   make(chan string, 100),
		done:     make(chan bool),
	}

	// Start goroutine to read from pipe
	go func() {
		var buf bytes.Buffer
		io.Copy(&buf, r)
		lc.buffer <- buf.String()
		close(lc.buffer)
		lc.done <- true
	}()

	return lc
}

func (lc *logCapture) restore() string {
	// Restore original stderr
	os.Stderr = lc.original
	log.SetOutput(lc.original)

	// Close writer to signal EOF to reader
	lc.writer.Close()

	// Wait for reader goroutine to finish
	<-lc.done

	// Return captured output
	return <-lc.buffer
}

// TestUpdateSpan_DatabaseErrors verifies ERR-TELEMETRY-010 through ERR-TELEMETRY-014
func TestUpdateSpan_DatabaseErrors(t *testing.T) {
	tests := []struct {
		name           string
		setupDB        func(*sql.DB)
		expectedErrors []string
		description    string
	}{
		{
			name: "ERR-TELEMETRY-010: Maintenance check fails",
			setupDB: func(testDB *sql.DB) {
				// Drop maintenance table to cause query failure
				testDB.Exec(`DROP TABLE IF EXISTS maintenance`)
			},
			expectedErrors: []string{"ERR-TELEMETRY-010"},
			description:    "Should log ERR-TELEMETRY-010 when maintenance check fails",
		},
		{
			name: "ERR-TELEMETRY-011: Uptime span query fails",
			setupDB: func(testDB *sql.DB) {
				// Drop uptime_spans table to cause query failure
				testDB.Exec(`DROP TABLE IF EXISTS uptime_spans`)
			},
			expectedErrors: []string{"ERR-TELEMETRY-011"},
			description:    "Should log ERR-TELEMETRY-011 when uptime span query fails",
		},
		{
			name: "ERR-TELEMETRY-012: Span extend fails",
			setupDB: func(testDB *sql.DB) {
				// Insert a span but make the table read-only by corrupting it
				testDB.Exec(`INSERT INTO uptime_spans (node_id, start_at, end_at) VALUES (?, ?, ?)`,
					"test-node-err", time.Now().Unix()-30, time.Now().Unix()-30)
				// Now drop and recreate to cause update to reference non-existent row
				testDB.Exec(`DELETE FROM uptime_spans WHERE node_id = ?`, "test-node-err")
			},
			expectedErrors: []string{}, // May or may not trigger depending on timing
			description:    "Database error handling for span extend",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Setup test database
			setupTestDB(t)
			defer cleanupTestDB()

			// Apply database corruption/setup
			tt.setupDB(db)

			// Capture logs
			lc := captureLogs()

			// Call updateSpan which should trigger error logging
			updateSpan("test-node-err", "test-host", "1.0.0", 50.0, 4096, 8192, 50000, 100000, 0.5, 3600)

			// Restore and get logs
			logs := lc.restore()

			// Verify expected errors appear in logs
			for _, errCode := range tt.expectedErrors {
				if !strings.Contains(logs, errCode) {
					t.Errorf("%s: expected log to contain %s, but got:\n%s", tt.description, errCode, logs)
				}
			}
		})
	}
}

// TestUpdateSpan_MaintenanceModeError specifically tests ERR-TELEMETRY-010
func TestUpdateSpan_MaintenanceModeError(t *testing.T) {
	setupTestDB(t)
	defer cleanupTestDB()

	// Capture logs
	var logBuffer bytes.Buffer
	originalOutput := log.Writer()
	log.SetOutput(&logBuffer)
	defer log.SetOutput(originalOutput)

	// Corrupt the database by dropping the maintenance table
	_, err := db.Exec(`DROP TABLE maintenance`)
	if err != nil {
		t.Fatalf("Failed to drop maintenance table: %v", err)
	}

	// Call updateSpan
	updateSpan("test-node-maint", "test-host", "1.0.0", 50.0, 4096, 8192, 50000, 100000, 0.5, 3600)

	// Verify ERR-TELEMETRY-010 appears in logs
	logs := logBuffer.String()
	if !strings.Contains(logs, "ERR-TELEMETRY-010") {
		t.Errorf("Expected ERR-TELEMETRY-010 in logs when maintenance check fails, got:\n%s", logs)
	}
	if !strings.Contains(logs, "Failed to check maintenance mode") {
		t.Errorf("Expected 'Failed to check maintenance mode' message, got:\n%s", logs)
	}
}

// TestUpdateSpan_UptimeSpanQueryError specifically tests ERR-TELEMETRY-011
func TestUpdateSpan_UptimeSpanQueryError(t *testing.T) {
	setupTestDB(t)
	defer cleanupTestDB()

	// Capture logs
	var logBuffer bytes.Buffer
	originalOutput := log.Writer()
	log.SetOutput(&logBuffer)
	defer log.SetOutput(originalOutput)

	// Corrupt the database by making it read-only via closing and reopening with bad path
	db.Close()
	var err error
	db, err = sql.Open("sqlite3", ":memory:")
	if err != nil {
		t.Fatalf("Failed to reopen database: %v", err)
	}
	// Don't create tables - queries will fail

	// Call updateSpan
	updateSpan("test-node-span", "test-host", "1.0.0", 50.0, 4096, 8192, 50000, 100000, 0.5, 3600)

	// Verify ERR-TELEMETRY-011 appears in logs
	logs := logBuffer.String()
	if !strings.Contains(logs, "ERR-TELEMETRY-011") {
		t.Errorf("Expected ERR-TELEMETRY-011 in logs when uptime span query fails, got:\n%s", logs)
	}
	if !strings.Contains(logs, "Failed to query latest uptime span") {
		t.Errorf("Expected 'Failed to query latest uptime span' message, got:\n%s", logs)
	}
}

// TestSendKumaPush_DatabaseError tests ERR-TELEMETRY-033
func TestSendKumaPush_DatabaseError(t *testing.T) {
	// Setup test database then corrupt it
	var err error
	db, err = sql.Open("sqlite3", ":memory:")
	if err != nil {
		t.Fatalf("Failed to open test database: %v", err)
	}
	defer db.Close()
	// Don't create tables - this will cause queries to fail

	// Capture logs
	var logBuffer bytes.Buffer
	originalOutput := log.Writer()
	log.SetOutput(&logBuffer)
	defer log.SetOutput(originalOutput)

	// Create a test server that accepts Kuma pushes
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
	}))
	defer server.Close()

	// Call sendKumaPush
	sendKumaPush(server.URL)

	// Verify ERR-TELEMETRY-033 appears in logs (failed to query last telemetry)
	logs := logBuffer.String()
	if !strings.Contains(logs, "ERR-TELEMETRY-033") {
		t.Errorf("Expected ERR-TELEMETRY-033 in logs when telemetry timestamp query fails, got:\n%s", logs)
	}
}

// TestSendKumaPush_NetworkError tests ERR-TELEMETRY-030
func TestSendKumaPush_NetworkError(t *testing.T) {
	setupTestDB(t)
	defer cleanupTestDB()

	// Capture logs
	var logBuffer bytes.Buffer
	originalOutput := log.Writer()
	log.SetOutput(&logBuffer)
	defer log.SetOutput(originalOutput)

	// Use an invalid URL that will cause network error
	invalidURL := "http://localhost:1/invalid" // Port 1 is typically invalid/unused

	// Call sendKumaPush with invalid URL
	sendKumaPush(invalidURL)

	// Verify ERR-TELEMETRY-030 appears in logs
	logs := logBuffer.String()
	if !strings.Contains(logs, "ERR-TELEMETRY-030") {
		t.Errorf("Expected ERR-TELEMETRY-030 in logs when Kuma push fails, got:\n%s", logs)
	}
	if !strings.Contains(logs, "Failed to push health status to Kuma") {
		t.Errorf("Expected 'Failed to push health status to Kuma' message, got:\n%s", logs)
	}
}

// TestSendKumaPush_NonOKStatus tests ERR-TELEMETRY-031
func TestSendKumaPush_NonOKStatus(t *testing.T) {
	setupTestDB(t)
	defer cleanupTestDB()

	// Capture logs
	var logBuffer bytes.Buffer
	originalOutput := log.Writer()
	log.SetOutput(&logBuffer)
	defer log.SetOutput(originalOutput)

	// Create a test server that returns non-OK status
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusServiceUnavailable) // 503
		w.Write([]byte("Service Unavailable"))
	}))
	defer server.Close()

	// Call sendKumaPush
	sendKumaPush(server.URL)

	// Verify ERR-TELEMETRY-031 appears in logs
	logs := logBuffer.String()
	if !strings.Contains(logs, "ERR-TELEMETRY-031") {
		t.Errorf("Expected ERR-TELEMETRY-031 in logs when Kuma returns non-OK status, got:\n%s", logs)
	}
	if !strings.Contains(logs, "non-OK status 503") {
		t.Errorf("Expected 'non-OK status 503' message, got:\n%s", logs)
	}
}

// TestSendKumaPush_ResponseBodyCloseError tests response body handling
func TestSendKumaPush_ResponseBodyCloseError(t *testing.T) {
	setupTestDB(t)
	defer cleanupTestDB()

	// Capture logs
	var logBuffer bytes.Buffer
	originalOutput := log.Writer()
	log.SetOutput(&logBuffer)
	defer log.SetOutput(originalOutput)

	// Create a test server that returns OK
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
	}))
	defer server.Close()

	// Call sendKumaPush
	sendKumaPush(server.URL)

	// Should complete without ERR-TELEMETRY-032 (successful close doesn't log error)
	logs := logBuffer.String()
	if strings.Contains(logs, "ERR-TELEMETRY-032") {
		t.Errorf("Did not expect ERR-TELEMETRY-032 for successful response body close, got:\n%s", logs)
	}
}

// TestTarpit_EarlyDisconnect tests that tarpit handles client disconnect gracefully
func TestTarpit_EarlyDisconnect(t *testing.T) {
	// Create a custom ResponseWriter that simulates disconnect after first write
	disconnectWriter := &disconnectSimulatingWriter{
		headers: make(http.Header),
		failAfter: 1, // Fail after first write
	}

	req := httptest.NewRequest("GET", "/unknown", nil)

	// Use a goroutine since tarpit blocks
	done := make(chan bool)
	go func() {
		tarpit(disconnectWriter, req)
		done <- true
	}()

	// Wait for tarpit to complete (should be quick due to disconnect)
	select {
	case <-done:
		// Expected - tarpit returned early due to disconnect
	case <-time.After(2 * time.Second):
		t.Error("tarpit did not return early on client disconnect")
	}

	// Verify that at least one write was attempted
	if !disconnectWriter.written {
		t.Error("tarpit should have attempted at least one write")
	}
}

// disconnectSimulatingWriter simulates a client that disconnects after N writes
type disconnectSimulatingWriter struct {
	headers    http.Header
	status     int
	written    bool
	writeCount int
	failAfter  int
	flushCount int
}

func (m *disconnectSimulatingWriter) Header() http.Header {
	return m.headers
}

func (m *disconnectSimulatingWriter) Write(p []byte) (int, error) {
	m.written = true
	m.writeCount++
	if m.writeCount >= m.failAfter {
		return 0, io.ErrClosedPipe // Simulate client disconnect
	}
	return len(p), nil
}

func (m *disconnectSimulatingWriter) WriteHeader(status int) {
	m.status = status
}

func (m *disconnectSimulatingWriter) Flush() {
	m.flushCount++
}

// TestErrorCodes_Unique verifies that all error codes are unique
func TestErrorCodes_Unique(t *testing.T) {
	// Read the main.go and kuma.go files to extract error codes
	mainContent, err := os.ReadFile("main.go")
	if err != nil {
		t.Fatalf("Failed to read main.go: %v", err)
	}
	kumaContent, err := os.ReadFile("kuma.go")
	if err != nil {
		t.Fatalf("Failed to read kuma.go: %v", err)
	}

	content := string(mainContent) + string(kumaContent)

	// Extract all ERR-TELEMETRY-XXX codes
	codes := make(map[string]int)
	for i := 0; i < len(content)-15; i++ {
		if content[i:i+14] == "ERR-TELEMETRY-" {
			// Find the end of the code (3 digits)
			end := i + 14
			for end < len(content) && content[end] >= '0' && content[end] <= '9' {
				end++
			}
			if end > i+14 {
				code := content[i:end]
				codes[code]++
			}
		}
	}

	// Verify each code appears only once (unique)
	for code, count := range codes {
		if count > 1 {
			t.Errorf("Error code %s appears %d times - should be unique", code, count)
		}
	}

	// Verify expected codes exist
	expectedCodes := []string{
		"ERR-TELEMETRY-001", // Failed to open operations.db
		"ERR-TELEMETRY-002", // Failed to load CA chain
		"ERR-TELEMETRY-003", // Invalid certificate
		"ERR-TELEMETRY-004", // Failed to insert telemetry
		"ERR-TELEMETRY-005", // Failed to create telemetry table
		"ERR-TELEMETRY-006", // Failed to create telemetry index
		"ERR-TELEMETRY-007", // Failed to create telemetry node_latest index
		"ERR-TELEMETRY-008", // Failed to create uptime_spans table
		"ERR-TELEMETRY-009", // Failed to create uptime_spans index
		"ERR-TELEMETRY-010", // Failed to check maintenance mode
		"ERR-TELEMETRY-011", // Failed to query latest uptime span
		"ERR-TELEMETRY-012", // Failed to extend uptime span
		"ERR-TELEMETRY-013", // Failed to extend early-judgment span
		"ERR-TELEMETRY-014", // Failed to insert new uptime span
		"ERR-TELEMETRY-015", // Failed to create maintenance table
		"ERR-TELEMETRY-020", // Failed to create ntfy alert request
		"ERR-TELEMETRY-021", // Failed to send ntfy alert
		"ERR-TELEMETRY-022", // Failed to close ntfy response body
		"ERR-TELEMETRY-030", // Failed to push health to Kuma
		"ERR-TELEMETRY-031", // Kuma returned non-OK status
		"ERR-TELEMETRY-032", // Failed to close Kuma response body (OK status)
		"ERR-TELEMETRY-033", // Failed to query last telemetry timestamp
		"ERR-TELEMETRY-034", // Failed to close Kuma response body (non-OK status)
		"ERR-TELEMETRY-040", // tarpit called without Flusher
	}

	for _, code := range expectedCodes {
		if _, exists := codes[code]; !exists {
			t.Errorf("Expected error code %s not found in source", code)
		}
	}

	t.Logf("Found %d unique error codes", len(codes))
}

// TestErrorCodes_Format verifies error codes follow the correct format
func TestErrorCodes_Format(t *testing.T) {
	// Read source files
	mainContent, err := os.ReadFile("main.go")
	if err != nil {
		t.Fatalf("Failed to read main.go: %v", err)
	}
	kumaContent, err := os.ReadFile("kuma.go")
	if err != nil {
		t.Fatalf("Failed to read kuma.go: %v", err)
	}

	content := string(mainContent) + string(kumaContent)

	// Check that error messages follow format: ERR-TELEMETRY-XXX: Actionable message
	lines := strings.Split(content, "\n")
	for i, line := range lines {
		if strings.Contains(line, "ERR-TELEMETRY-") && strings.Contains(line, "log.") {
			// Verify format includes colon after error code
			if !strings.Contains(line, "ERR-TELEMETRY-") || !strings.Contains(line, ":") {
				t.Errorf("Line %d: Error message should follow format 'ERR-TELEMETRY-XXX: message', got: %s", i+1, strings.TrimSpace(line))
			}
		}
	}
}

// TestHandleTelemetry_DatabaseInsertError tests ERR-TELEMETRY-004
func TestHandleTelemetry_DatabaseInsertError(t *testing.T) {
	setupTestDB(t)
	defer cleanupTestDB()

	// Temporarily disable mTLS
	oldCAPool := caPool
	caPool = nil
	defer func() { caPool = oldCAPool }()

	// Corrupt database by making it read-only
	db.Exec(`PRAGMA query_only = ON`)

	payload := map[string]interface{}{
		"node_id":         "test-node-db-err",
		"version":         "1.0.0",
		"hostname":        "test-host",
		"uptime_seconds":  3600,
		"cpu_percent":     25.5,
		"memory_total_mb": 8192,
		"memory_used_mb":  4096,
		"disk_total_mb":   100000,
		"disk_used_mb":    50000,
		"load_1m":         0.5,
		"vault_count":     5,
		"vault_size_mb":   10.5,
		"vault_entries":   100,
		"mode":            "commercial",
	}

	body, _ := json.Marshal(payload)
	req := httptest.NewRequest("POST", "/telemetry", bytes.NewReader(body))
	w := httptest.NewRecorder()

	// Capture logs
	var logBuffer bytes.Buffer
	originalOutput := log.Writer()
	log.SetOutput(&logBuffer)
	defer log.SetOutput(originalOutput)

	handleTelemetry(w, req)

	// Restore query_only mode
	db.Exec(`PRAGMA query_only = OFF`)

	// Should return 500 error
	resp := w.Result()
	if resp.StatusCode != 500 {
		t.Errorf("Expected 500 status for database error, got %d", resp.StatusCode)
	}

	// Verify ERR-TELEMETRY-004 appears in logs
	logs := logBuffer.String()
	if !strings.Contains(logs, "ERR-TELEMETRY-004") {
		t.Errorf("Expected ERR-TELEMETRY-004 in logs when telemetry insert fails, got:\n%s", logs)
	}
}

// TestConcurrentErrorHandling verifies error handling is thread-safe
func TestConcurrentErrorHandling(t *testing.T) {
	setupTestDB(t)
	defer cleanupTestDB()

	// Capture logs
	var logBuffer bytes.Buffer
	originalOutput := log.Writer()
	log.SetOutput(&logBuffer)
	defer log.SetOutput(originalOutput)

	// Run multiple goroutines that trigger error handling
	var wg sync.WaitGroup
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			nodeID := fmt.Sprintf("concurrent-node-%d", id)
			updateSpan(nodeID, "test-host", "1.0.0", 50.0, 4096, 8192, 50000, 100000, 0.5, 3600)
		}(i)
	}
	wg.Wait()

	// Verify no panics occurred and logs were written safely
	logs := logBuffer.String()
	// Should have OUTAGE SPAN logs for each node
	for i := 0; i < 10; i++ {
		expectedNode := fmt.Sprintf("concurrent-node-%d", i)
		if !strings.Contains(logs, expectedNode) {
			t.Errorf("Expected logs to contain node %s", expectedNode)
		}
	}
}