// Source listing: clavitor/clavis/clavis-vault/lib/errors.go (617 lines, 20 KiB, Go)

package lib
import (
"bytes"
"context"
"encoding/json"
"fmt"
"log"
"net/http"
"os"
"sync"
"time"
)
// ---------------------------------------------------------------------------
// Severity Levels
// ---------------------------------------------------------------------------
// Severity ranks how serious an event is. The levels are declared in
// ascending order of urgency, so values can be compared with the usual
// relational operators (for example `sev >= SeverityError`).
type Severity int

// Severity levels, from least to most urgent.
const (
	SeverityDebug    Severity = iota // Development/debug only
	SeverityInfo                     // Informational, no action needed
	SeverityWarning                  // Anomaly detected, monitoring required
	SeverityError                    // User-impacting issue
	SeverityCritical                 // System down / security breach
)

// String returns the lowercase name of the level ("debug", "info",
// "warning", "error", "critical"), or "unknown" for any value outside the
// declared range.
func (s Severity) String() string {
	// Indexed by level; the array length tracks the highest declared level.
	names := [...]string{
		SeverityDebug:    "debug",
		SeverityInfo:     "info",
		SeverityWarning:  "warning",
		SeverityError:    "error",
		SeverityCritical: "critical",
	}
	if s < SeverityDebug || int(s) >= len(names) {
		return "unknown"
	}
	return names[s]
}
// ---------------------------------------------------------------------------
// Error/Event Definition Registry
// ---------------------------------------------------------------------------
// EventDef defines a class of events in the registry.
// This is the single source of truth for all errors and warnings: code in
// this file looks a definition up by its Code and takes the severity,
// category, user-facing message and ticketing policy from it.
type EventDef struct {
	Code string // Unique code: ERR-12345 or WRN-12345 (same value as the EventRegistry key)
	Category string // auth, database, network, security, etc.
	Severity Severity // How serious
	Message string // User-facing message (for errors shown to users); may be empty, e.g. for warnings
	Description string // Internal description for operators; written to the audit and operator logs
	AutoResolve bool // True if system can auto-resolve (e.g., retry succeeded)
	CreateTicket bool // True if this creates a trackable incident (event is posted to central)
}
// EventRegistry is the canonical list of all events, keyed by event code.
// Each key must equal the Code field of its value.
//
// Code prefixes:
//   - WRN-xxxxx: anomalies that may indicate future issues.
//   - ERR-xxxxx: user-impacting errors.
//
// Whether an event opens a ticket (and is therefore posted to central) is
// decided per entry by CreateTicket, not by the prefix: several ERR entries
// caused by user error do not create tickets, and WRN-10003 does.
var EventRegistry = map[string]EventDef{
	// WARNINGS (WRN-1xxxx) - Anomalies, not yet user-impacting.
	// These entries define no user-facing Message.
	"WRN-10001": {
		Code: "WRN-10001",
		Category: "performance",
		Severity: SeverityWarning,
		Description: "Response time >500ms for credential lookup",
		AutoResolve: true,
		CreateTicket: false, // Log only, no ticket unless sustained
	},
	"WRN-10002": {
		Code: "WRN-10002",
		Category: "security",
		Severity: SeverityWarning,
		Description: "Agent rate limit at 80% of threshold",
		AutoResolve: true,
		CreateTicket: false,
	},
	"WRN-10003": {
		Code: "WRN-10003",
		Category: "storage",
		Severity: SeverityWarning,
		Description: "Disk usage >80% on vault storage",
		AutoResolve: false,
		CreateTicket: true, // Creates ticket for ops to expand storage
	},
	"WRN-10004": {
		Code: "WRN-10004",
		Category: "network",
		Severity: SeverityWarning,
		Description: "TLS handshake latency elevated",
		AutoResolve: true,
		CreateTicket: false,
	},
	// AUTH ERRORS (ERR-1xxxx) - authentication, authorization and agent security
	"ERR-10001": {
		Code: "ERR-10001",
		Category: "auth",
		Severity: SeverityError,
		Message: "Authentication failed. Please check your credentials and try again.",
		Description: "WebAuthn challenge verification failed",
		AutoResolve: false,
		CreateTicket: false, // User error, not system error
	},
	"ERR-10002": {
		Code: "ERR-10002",
		Category: "auth",
		Severity: SeverityError,
		Message: "Access denied. You don't have permission for this operation.",
		Description: "Actor attempted operation outside their scope",
		AutoResolve: false,
		CreateTicket: false,
	},
	"ERR-10003": {
		Code: "ERR-10003",
		Category: "auth",
		Severity: SeverityError,
		Message: "Invalid agent token. The agent may need to be re-enrolled.",
		Description: "CVT token validation failed",
		AutoResolve: false,
		CreateTicket: true,
	},
	"ERR-10004": {
		Code: "ERR-10004",
		Category: "security",
		Severity: SeverityCritical,
		Message: "Agent locked due to suspicious activity. Contact the vault owner.",
		Description: "Agent triggered harvester defenses (two-strike lockdown)",
		AutoResolve: false,
		CreateTicket: true,
	},
	"ERR-10005": {
		Code: "ERR-10005",
		Category: "security",
		Severity: SeverityCritical,
		Message: "Request from unauthorized IP. Token may be compromised.",
		Description: "Agent request from IP not in whitelist",
		AutoResolve: false,
		CreateTicket: true,
	},
	// INPUT ERRORS (ERR-2xxxx) - malformed requests; never ticketed
	"ERR-20001": {
		Code: "ERR-20001",
		Category: "input",
		Severity: SeverityError,
		Message: "Invalid request. Please check your input and try again.",
		Description: "JSON parse failed or required field missing",
		AutoResolve: false,
		CreateTicket: false,
	},
	"ERR-20002": {
		Code: "ERR-20002",
		Category: "input",
		Severity: SeverityError,
		Message: "Invalid ID format. The requested item may not exist.",
		Description: "Entry/agent ID parsing failed",
		AutoResolve: false,
		CreateTicket: false,
	},
	// NOT FOUND ERRORS (ERR-3xxxx) - never ticketed
	"ERR-30001": {
		Code: "ERR-30001",
		Category: "not_found",
		Severity: SeverityError,
		Message: "Vault not found. Please register or check your vault path.",
		Description: "No vault exists at the requested path",
		AutoResolve: false,
		CreateTicket: false,
	},
	"ERR-30002": {
		Code: "ERR-30002",
		Category: "not_found",
		Severity: SeverityError,
		Message: "Entry not found. It may have been deleted.",
		Description: "Requested entry ID does not exist",
		AutoResolve: false,
		CreateTicket: false,
	},
	// SYSTEM ERRORS (ERR-5xxxx) - These POST to central (all set CreateTicket)
	"ERR-50001": {
		Code: "ERR-50001",
		Category: "database",
		Severity: SeverityCritical,
		Message: "Service temporarily unavailable. Our team has been alerted and is working on it. Reference: ERR-50001.",
		Description: "Vault database connection failed or file inaccessible",
		AutoResolve: false,
		CreateTicket: true,
	},
	"ERR-50002": {
		Code: "ERR-50002",
		Category: "database",
		Severity: SeverityError,
		Message: "Failed to save data. Please try again in a moment.",
		Description: "Database write operation failed",
		AutoResolve: true,
		CreateTicket: true,
	},
	"ERR-50003": {
		Code: "ERR-50003",
		Category: "storage",
		Severity: SeverityCritical,
		Message: "Service temporarily unavailable. Our team has been alerted. Reference: ERR-50003.",
		Description: "WL3 credential storage write failed",
		AutoResolve: false,
		CreateTicket: true,
	},
	"ERR-50004": {
		Code: "ERR-50004",
		Category: "network",
		Severity: SeverityError,
		Message: "Connection issue. Please try again.",
		Description: "Failed to connect to central admin for sync",
		AutoResolve: true,
		CreateTicket: true,
	},
	// RATE LIMITING (ERR-6xxxx)
	"ERR-60001": {
		Code: "ERR-60001",
		Category: "rate_limit",
		Severity: SeverityError,
		Message: "Too many requests. Please slow down and try again in a moment.",
		Description: "Global per-IP rate limit exceeded",
		AutoResolve: true,
		CreateTicket: false,
	},
	// NOTE(review): ERR-60002 carries an ERR- prefix but SeverityWarning;
	// confirm whether this should be a WRN- code or a higher severity.
	"ERR-60002": {
		Code: "ERR-60002",
		Category: "rate_limit",
		Severity: SeverityWarning,
		Message: "Agent rate limit warning. Reduce request frequency.",
		Description: "Per-agent unique-entry quota at 90%",
		AutoResolve: true,
		CreateTicket: false,
	},
	// INTERNAL ERRORS (ERR-9xxxx) - Invariant violations.
	// ERR-90001 is also the fallback LogEvent substitutes for unknown codes.
	"ERR-90001": {
		Code: "ERR-90001",
		Category: "invariant",
		Severity: SeverityCritical,
		Message: "An unexpected error occurred. Please try again or contact support. Reference: ERR-90001.",
		Description: "Condition assumed impossible was triggered",
		AutoResolve: false,
		CreateTicket: true,
	},
}
// ---------------------------------------------------------------------------
// Incident Tracking - Centralized Error Management
// ---------------------------------------------------------------------------
// CentralEvent is the JSON payload sent to central (clavitor.ai) for every
// ticket-creating event.
// No deduplication - "let it rain". Central handles aggregation if needed.
type CentralEvent struct {
	EventID string `json:"event_id"` // Locally generated identifier from generateEventID (an "EVT-..." string, not an RFC 4122 UUID)
	Code string `json:"code"` // ERR-50001, etc.
	Category string `json:"category"` // database, network, etc. (taken from the registry definition)
	Severity string `json:"severity"` // Severity name as produced by Severity.String
	Resource string `json:"resource"` // uk1, db-primary, etc.
	POP string `json:"pop"` // Which POP reported this
	Operation string `json:"operation"` // GetEntry, agent unlock, etc.
	ErrorDetail string `json:"error_detail"` // The actual error message
	Actor string `json:"actor"` // web, agent:abc123
	Timestamp int64 `json:"timestamp"` // Unix time in seconds at which the event was logged
	UserMessage string `json:"user_message"` // Shown to users
}
// CentralClient posts events to central (clavitor.ai).
// Only created in commercial edition; community edition logs locally only.
// A nil *CentralClient is valid: postEvent on a nil receiver is a no-op.
type CentralClient struct {
	centralURL string // Base URL of central; "/v1/events" is appended when posting
	popID string // Identifier of this POP, reported in every event
	apiKey string // Sent as a Bearer token in the Authorization header
}
// Process-wide central reporter state.
var (
	// globalCentralClient stays nil until InitCentralClient is called;
	// LogEvent skips central reporting while it is nil.
	globalCentralClient *CentralClient
	// onceCentral ensures globalCentralClient is assigned at most once.
	onceCentral sync.Once
)
// InitCentralClient creates the global central reporter.
// Called at startup from main.go.
func InitCentralClient(centralURL, popID, apiKey string) {
onceCentral.Do(func() {
globalCentralClient = &CentralClient{
centralURL: centralURL,
popID: popID,
apiKey: apiKey,
}
})
}
// postEvent delivers one event to central's /v1/events endpoint as JSON,
// authenticated with the client's API key as a Bearer token.
//
// The call itself is blocking (bounded by a 10 second client timeout);
// LogEvent makes it asynchronous by running it on its own goroutine.
// Every event is sent individually - no local deduplication.
//
// Delivery is best effort: failures are written to the operator log and
// otherwise dropped, there is no retry. Only 200 OK and 201 Created count
// as accepted. A nil receiver is a no-op so the community edition, which
// has no central client, can call this unconditionally.
func (c *CentralClient) postEvent(ev CentralEvent) {
	if c == nil {
		return // Community edition - no central reporting
	}

	payload, err := json.Marshal(ev)
	if err != nil {
		// Never send an empty body to central if encoding fails.
		log.Printf("[ERROR] Failed to encode event %s for central: %v", ev.EventID, err)
		return
	}

	req, err := http.NewRequest(http.MethodPost, c.centralURL+"/v1/events", bytes.NewReader(payload))
	if err != nil {
		log.Printf("[ERROR] Failed to create event request: %v", err)
		return
	}
	req.Header.Set("Authorization", "Bearer "+c.apiKey)
	req.Header.Set("Content-Type", "application/json")

	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		log.Printf("[ERROR] Failed to post event to central: %v", err)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusCreated {
		log.Printf("[ERROR] Central rejected event: %d", resp.StatusCode)
	}
}
// ---------------------------------------------------------------------------
// Event Logging - The main interface for handlers
// ---------------------------------------------------------------------------
// EventContext holds all the searchable fields for an event.
// It is the per-occurrence data passed to LogEvent; the static properties of
// the event class come from the EventRegistry entry for Code.
type EventContext struct {
	Code string // ERR-50001, WRN-10001, etc.; must be a key of EventRegistry
	Category string // auth, database, network, security (LogEvent reports the registry's category, not this field)
	Resource string // Which resource: uk1, db-primary, vault-file-123
	Operation string // What was being attempted: "GetEntry", "agent unlock"
	Actor string // web, agent:abc123, extension
	ErrorDetail string // The actual error: "connection refused", "disk full"
	IPAddr string // Client IP
	Severity Severity // LogEvent logs and reports the registry's severity, not this field
}
// LogEvent is the main entry point for all events (errors and warnings).
// This replaces direct AuditLog calls for error cases.
//
// For the event described by ec it:
//  1. resolves ec.Code in EventRegistry, falling back to ERR-90001 (and
//     logging the unknown code) when the code is not registered;
//  2. writes an audit record whose title carries greppable key=value
//     fields, e.g. grep "resource=uk1" audit.log;
//  3. writes a human-readable line to the operator log;
//  4. if the definition creates tickets and a central client is configured,
//     posts the event to central on a new goroutine ("let it rain" - every
//     event is sent individually, no deduplication).
//
// Category and severity are always taken from the registry definition;
// ec.Category and ec.Severity are not consulted. ctx is currently unused.
func LogEvent(db *DB, ctx context.Context, ec EventContext) {
	const fallbackCode = "ERR-90001"

	def, known := EventRegistry[ec.Code]
	if !known {
		log.Printf("[ERROR] Unknown event code: %s", ec.Code)
		ec.Code = fallbackCode
		ec.Severity = SeverityCritical
		def = EventRegistry[fallbackCode]
	}

	// Local audit trail: the title holds the structured, searchable fields.
	audit := &AuditEvent{
		Action: ec.Code,
		Actor:  ec.Actor,
		Title: fmt.Sprintf("op=%s | resource=%s | error=%s | %s",
			ec.Operation, ec.Resource, ec.ErrorDetail, def.Description),
		IPAddr: ec.IPAddr,
	}
	AuditLog(db, audit)

	// Operator log (human readable).
	severity := def.Severity.String()
	log.Printf("[%s] %s | resource=%s | op=%s | actor=%s | error=%s | severity=%s",
		ec.Code, def.Description, ec.Resource, ec.Operation, ec.Actor, ec.ErrorDetail, severity)

	// Central reporting applies only to ticket-creating events, and only when
	// a central client has been configured.
	client := globalCentralClient
	if client == nil || !def.CreateTicket {
		return
	}
	ev := CentralEvent{
		EventID:     generateEventID(),
		Code:        ec.Code,
		Category:    def.Category,
		Severity:    severity,
		Resource:    ec.Resource,
		POP:         client.popID,
		Operation:   ec.Operation,
		ErrorDetail: ec.ErrorDetail,
		Actor:       ec.Actor,
		Timestamp:   time.Now().Unix(),
		UserMessage: def.Message,
	}
	go client.postEvent(ev)
}
// Sequence state backing generateEventID. The mutex makes the counter safe
// for concurrent callers.
var (
	eventIDMu  sync.Mutex
	eventIDSeq uint64
)

// generateEventID returns an identifier of the form
// "EVT-<unix seconds>-<nanoseconds>-<sequence>".
//
// The sequence number is incremented on every call, so IDs never repeat
// within a process no matter how many events are logged in the same instant.
// The timestamp part keeps IDs from different process runs apart. The
// previous scheme had only 1000 possible suffixes per second and therefore
// collided under bursts.
func generateEventID() string {
	now := time.Now() // single clock read so both time fields agree

	eventIDMu.Lock()
	eventIDSeq++
	seq := eventIDSeq
	eventIDMu.Unlock()

	return fmt.Sprintf("EVT-%d-%09d-%d", now.Unix(), now.Nanosecond(), seq)
}
// LogWarning records a non-critical anomaly by forwarding to LogEvent.
//
// code selects the registry entry; resource, operation, detail, actor and ip
// are the per-occurrence fields. The "performance" category and warning
// severity set here are defaults only: LogEvent takes both from the registry
// definition for code.
func LogWarning(db *DB, ctx context.Context, code, resource, operation, detail, actor, ip string) {
	warning := EventContext{
		Code:     code,
		Category: "performance", // default, override via registry
		Severity: SeverityWarning,
	}
	warning.Resource = resource
	warning.Operation = operation
	warning.Actor = actor
	warning.ErrorDetail = detail
	warning.IPAddr = ip

	LogEvent(db, ctx, warning)
}
// LogError records a user-impacting issue by forwarding to LogEvent.
//
// code selects the registry entry; resource, operation, detail, actor and ip
// are the per-occurrence fields. The "system" category and error severity
// set here are defaults only: LogEvent takes both from the registry
// definition for code.
func LogError(db *DB, ctx context.Context, code, resource, operation, detail, actor, ip string) {
	failure := EventContext{
		Code:     code,
		Category: "system", // default, override via registry
		Severity: SeverityError,
	}
	failure.Resource = resource
	failure.Operation = operation
	failure.Actor = actor
	failure.ErrorDetail = detail
	failure.IPAddr = ip

	LogEvent(db, ctx, failure)
}
// ---------------------------------------------------------------------------
// HTTP Response Helpers
// ---------------------------------------------------------------------------
// ErrorResponse writes a user-facing JSON error body with the given HTTP
// status. Use this for API responses to clients.
//
// The body always contains:
//
//	"error":   the event code as passed in
//	"message": the registry's user-facing message for that code
//
// A generic message is used when the code is not registered or when its
// registry entry defines no message (as is the case for warnings), so the
// client never receives a blank message.
//
// For ticket-creating events of severity error or higher the body also
// carries "reference" (the code, for quoting to support) and "status".
//
// A failure to write the body is logged; it cannot be reported to the client
// because the status line has already been sent.
func ErrorResponse(w http.ResponseWriter, status int, code string) {
	const genericMessage = "An error occurred. Please try again."

	def, ok := EventRegistry[code]
	if !ok {
		def = EventDef{Code: code}
	}
	if def.Message == "" {
		def.Message = genericMessage
	}

	response := map[string]string{
		"error":   code,
		"message": def.Message,
	}
	// For ticket-creating errors, add an incident reference.
	if def.Severity >= SeverityError && def.CreateTicket {
		response["reference"] = code // User can reference this when contacting support
		response["status"] = "Our team has been alerted"
	}

	// Headers must be set before WriteHeader sends the status line.
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	if err := json.NewEncoder(w).Encode(response); err != nil {
		log.Printf("[ERROR] Failed to write error response for %s: %v", code, err)
	}
}
// HandleError is the full-flow helper for handlers.
// It logs the event through LogEvent (which also posts to central when the
// event creates tickets) and then writes the user-facing response with
// ErrorResponse.
//
// Parameters:
//   - w, r: the response writer and request being served.
//   - db: passed through to LogEvent for the audit record.
//   - code: registry event code (ERR-50001, ...).
//   - resource, operation: what was being accessed and what was attempted.
//   - internalErr: the underlying error; its text is recorded as the error
//     detail and never sent to the client. May be nil, in which case the
//     detail is left empty.
//   - httpStatus: HTTP status code for the response.
func HandleError(
	w http.ResponseWriter,
	r *http.Request,
	db *DB,
	code string,
	resource string,
	operation string,
	internalErr error,
	httpStatus int,
) {
	ctx := r.Context()

	// The actor is read from the request context under the string key
	// "actor" (expected to be set by middleware); fall back to "unknown"
	// when it is missing or not a string.
	actor := "unknown"
	if a, ok := ctx.Value("actor").(string); ok {
		actor = a
	}

	// Calling Error on a nil error would panic inside an error path, so
	// tolerate callers that have no underlying error to report.
	var detail string
	if internalErr != nil {
		detail = internalErr.Error()
	}

	// Log the full event
	LogEvent(db, ctx, EventContext{
		Code:        code,
		Resource:    resource,
		Operation:   operation,
		Actor:       actor,
		ErrorDetail: detail,
		IPAddr:      r.RemoteAddr,
	})

	// Return user-facing response
	ErrorResponse(w, httpStatus, code)
}
// LookupEvent returns the registry definition for code, e.g. for generating
// documentation. The boolean reports whether the code is registered; when it
// is not, the returned EventDef is the zero value.
func LookupEvent(code string) (EventDef, bool) {
	if def, found := EventRegistry[code]; found {
		return def, true
	}
	return EventDef{}, false
}
// ListEventsByCategory returns every registered event whose Category equals
// category exactly. The order of the result is unspecified because it follows
// map iteration order; the result is nil when nothing matches.
func ListEventsByCategory(category string) []EventDef {
	var matches []EventDef
	for code := range EventRegistry {
		if def := EventRegistry[code]; def.Category == category {
			matches = append(matches, def)
		}
	}
	return matches
}
// ---------------------------------------------------------------------------
// Status Page Integration (for central/clavitor.ai)
// ---------------------------------------------------------------------------
// StatusPageEntry describes the current status of one component.
// It is the shape returned by central's /status endpoint and by
// GetStatusForResource.
type StatusPageEntry struct {
	Component string `json:"component"` // uk1, db-primary, etc.
	Status string `json:"status"` // operational, degraded, down
	IncidentID string `json:"incident_id,omitempty"` // Omitted from JSON when empty
	UpdatedAt int64 `json:"updated_at"` // Unix time in seconds
	UserMessage string `json:"user_message,omitempty"` // Omitted from JSON when empty
}
// IsResourceAffected is meant to report whether central has recent events
// for the given resource, querying central rather than a local cache (there
// is no local deduplication).
//
// Not implemented yet: it currently ignores resource and always returns
// false.
func IsResourceAffected(resource string) bool {
	// TODO: Query central /v1/events?resource=uk1&since=5m
	// For now, returns false - central is source of truth
	return false
}
// GetStatusForResource is meant to return the current status of a resource
// as known to central.
//
// Not implemented yet: it returns a placeholder entry that names the
// resource, reports it as "operational" and is stamped with the current
// time. The boolean result is always false.
func GetStatusForResource(resource string) (StatusPageEntry, bool) {
	// TODO: Query central /v1/status?resource=uk1
	// For now, assumes operational - central drives status page
	placeholder := StatusPageEntry{
		Component: resource,
		Status:    "operational",
	}
	placeholder.UpdatedAt = time.Now().Unix()
	return placeholder, false
}
// ---------------------------------------------------------------------------
// Central Query Helpers (for clavitor.ai implementation)
// ---------------------------------------------------------------------------
// ActiveEventSummary is one aggregated row of what central's dashboard
// shows: unresolved events grouped by code, resource and POP.
// SQL equivalent:
//
//	SELECT code, resource, pop, COUNT(*) as count, MAX(timestamp) as last_seen
//	FROM events
//	WHERE status != 'resolved'
//	GROUP BY code, resource, pop
//	ORDER BY count DESC
//
// This gives you: "uk1 has 12 unresolved ERR-50001".
// NOTE(review): the query above does not select first_seen or status, which
// this struct carries - presumably MIN(timestamp) and the incident status;
// confirm against central's implementation.
type ActiveEventSummary struct {
	Code string `json:"code"` // ERR-50001
	Resource string `json:"resource"` // uk1
	POP string `json:"pop"` // zrh
	Count int `json:"count"` // How many events
	FirstSeen int64 `json:"first_seen"` // First event timestamp
	LastSeen int64 `json:"last_seen"` // Most recent event timestamp
	Status string `json:"status"` // investigating, identified, monitoring, resolved
}
// CentralQuery represents the query parameters for the central endpoint.
// Every field is optional and omitted from JSON when it has its zero value.
// The central API should support:
//
//	GET /v1/events?status=active&group_by=code,resource,pop
//	GET /v1/events?code=ERR-50001&resource=uk1&since=1h
//	POST /v1/events/bulk-resolve { "code": "ERR-50001", "resource": "uk1" }
type CentralQuery struct {
	Status string `json:"status,omitempty"` // active, resolved, all
	Code string `json:"code,omitempty"` // ERR-50001
	Resource string `json:"resource,omitempty"` // uk1
	POP string `json:"pop,omitempty"` // zrh
	Since string `json:"since,omitempty"` // 1h, 24h, 7d
	GroupBy []string `json:"group_by,omitempty"` // code, resource, pop
	Severity string `json:"severity,omitempty"` // error, critical
	CreateTicket bool `json:"create_ticket,omitempty"` // true = ticket-creating only; false is omitted from JSON
}
// BulkResolveRequest is the request body for marking events as resolved in
// bulk.
// Use case: uk1 fixed, resolve all ERR-50001 for uk1 at once.
type BulkResolveRequest struct {
	Code string `json:"code"` // Required
	Resource string `json:"resource,omitempty"` // Optional: resolve for specific resource
	POP string `json:"pop,omitempty"` // Optional: resolve for specific POP
	Since int64 `json:"since,omitempty"` // Optional: resolve events after this time
	Message string `json:"message"` // Resolution message: "Disk space freed, service restored"
	ResolvedBy string `json:"resolved_by"` // Who fixed it: "ops-johan"
}
// ---------------------------------------------------------------------------
// Environment-based initialization helper
// ---------------------------------------------------------------------------
// InitErrorsFromEnv configures central event reporting from the environment.
// Call this from main():
//
//	lib.InitErrorsFromEnv()
//
// It reads CLAVITOR_CENTRAL_URL, CLAVITOR_POP_ID and CLAVITOR_API_KEY.
// Reporting is enabled only when all three are non-empty; otherwise no
// central client is created and events are logged locally only. Either
// outcome is announced with an [INIT] log line.
func InitErrorsFromEnv() {
	var (
		centralURL = os.Getenv("CLAVITOR_CENTRAL_URL")
		popID      = os.Getenv("CLAVITOR_POP_ID")
		apiKey     = os.Getenv("CLAVITOR_API_KEY")
	)

	// Any missing setting disables central reporting entirely.
	if centralURL == "" || popID == "" || apiKey == "" {
		log.Printf("[INIT] Central event reporting disabled (community edition or missing config)")
		return
	}

	InitCentralClient(centralURL, popID, apiKey)
	log.Printf("[INIT] Central event reporting enabled: POP=%s -> %s", popID, centralURL)
}