package lib
|
|
|
|
import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"sort"
	"sync"
	"time"
)
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Severity Levels
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// Severity ranks events from least to most serious. The zero value is
// SeverityDebug, so an uninitialized Severity is the least severe.
type Severity int

const (
	SeverityDebug    Severity = iota // Development/debug only
	SeverityInfo                     // Informational, no action needed
	SeverityWarning                  // Anomaly detected, monitoring required
	SeverityError                    // User-impacting issue
	SeverityCritical                 // System down / security breach
)

// severityNames maps each Severity ordinal to its lowercase label.
// The order must match the const block above.
var severityNames = [...]string{"debug", "info", "warning", "error", "critical"}

// String returns the lowercase label for s ("debug" through "critical"),
// or "unknown" for any value outside the declared range.
func (s Severity) String() string {
	if s < SeverityDebug || s > SeverityCritical {
		return "unknown"
	}
	return severityNames[s]
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Error/Event Definition Registry
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// EventDef defines a class of events in the registry.
// This is the single source of truth for all errors and warnings:
// handlers reference events by Code and the registry supplies severity,
// user messaging, and ticketing behavior.
type EventDef struct {
	Code         string   // Unique code: ERR-12345 or WRN-12345
	Category     string   // auth, database, network, security, etc.
	Severity     Severity // How serious
	Message      string   // User-facing message (empty for warnings never shown to users)
	Description  string   // Internal description for operators
	AutoResolve  bool     // True if system can auto-resolve (e.g., retry succeeded)
	CreateTicket bool     // True if this creates a trackable incident (posted to central)
}
|
|
|
|
// EventRegistry is the canonical list of all events, keyed by Code
// (each entry's map key equals its Code field).
// Errors = user-impacting and create tickets.
// Warnings = anomalies that may indicate future issues.
//
// Code ranges:
//
//	WRN-1xxxx  anomalies, not yet user-impacting
//	ERR-1xxxx  auth / security
//	ERR-2xxxx  input validation
//	ERR-3xxxx  not found
//	ERR-5xxxx  system failures (these POST to central)
//	ERR-6xxxx  rate limiting
//	ERR-9xxxx  internal invariant violations
var EventRegistry = map[string]EventDef{
	// WARNINGS (WRN-1xxxx) - Anomalies, not yet user-impacting
	"WRN-10001": {
		Code:         "WRN-10001",
		Category:     "performance",
		Severity:     SeverityWarning,
		Description:  "Response time >500ms for credential lookup",
		AutoResolve:  true,
		CreateTicket: false, // Log only, no ticket unless sustained
	},
	"WRN-10002": {
		Code:         "WRN-10002",
		Category:     "security",
		Severity:     SeverityWarning,
		Description:  "Agent rate limit at 80% of threshold",
		AutoResolve:  true,
		CreateTicket: false,
	},
	"WRN-10003": {
		Code:         "WRN-10003",
		Category:     "storage",
		Severity:     SeverityWarning,
		Description:  "Disk usage >80% on vault storage",
		AutoResolve:  false,
		CreateTicket: true, // Creates ticket for ops to expand storage
	},
	"WRN-10004": {
		Code:         "WRN-10004",
		Category:     "network",
		Severity:     SeverityWarning,
		Description:  "TLS handshake latency elevated",
		AutoResolve:  true,
		CreateTicket: false,
	},

	// AUTH ERRORS (ERR-1xxxx)
	"ERR-10001": {
		Code:         "ERR-10001",
		Category:     "auth",
		Severity:     SeverityError,
		Message:      "Authentication failed. Please check your credentials and try again.",
		Description:  "WebAuthn challenge verification failed",
		AutoResolve:  false,
		CreateTicket: false, // User error, not system error
	},
	"ERR-10002": {
		Code:         "ERR-10002",
		Category:     "auth",
		Severity:     SeverityError,
		Message:      "Access denied. You don't have permission for this operation.",
		Description:  "Actor attempted operation outside their scope",
		AutoResolve:  false,
		CreateTicket: false,
	},
	"ERR-10003": {
		Code:         "ERR-10003",
		Category:     "auth",
		Severity:     SeverityError,
		Message:      "Invalid agent token. The agent may need to be re-enrolled.",
		Description:  "CVT token validation failed",
		AutoResolve:  false,
		CreateTicket: true,
	},
	"ERR-10004": {
		Code:         "ERR-10004",
		Category:     "security",
		Severity:     SeverityCritical,
		Message:      "Agent locked due to suspicious activity. Contact the vault owner.",
		Description:  "Agent triggered harvester defenses (two-strike lockdown)",
		AutoResolve:  false,
		CreateTicket: true,
	},
	"ERR-10005": {
		Code:         "ERR-10005",
		Category:     "security",
		Severity:     SeverityCritical,
		Message:      "Request from unauthorized IP. Token may be compromised.",
		Description:  "Agent request from IP not in whitelist",
		AutoResolve:  false,
		CreateTicket: true,
	},

	// INPUT ERRORS (ERR-2xxxx)
	"ERR-20001": {
		Code:         "ERR-20001",
		Category:     "input",
		Severity:     SeverityError,
		Message:      "Invalid request. Please check your input and try again.",
		Description:  "JSON parse failed or required field missing",
		AutoResolve:  false,
		CreateTicket: false,
	},
	"ERR-20002": {
		Code:         "ERR-20002",
		Category:     "input",
		Severity:     SeverityError,
		Message:      "Invalid ID format. The requested item may not exist.",
		Description:  "Entry/agent ID parsing failed",
		AutoResolve:  false,
		CreateTicket: false,
	},

	// NOT FOUND ERRORS (ERR-3xxxx)
	"ERR-30001": {
		Code:         "ERR-30001",
		Category:     "not_found",
		Severity:     SeverityError,
		Message:      "Vault not found. Please register or check your vault path.",
		Description:  "No vault exists at the requested path",
		AutoResolve:  false,
		CreateTicket: false,
	},
	"ERR-30002": {
		Code:         "ERR-30002",
		Category:     "not_found",
		Severity:     SeverityError,
		Message:      "Entry not found. It may have been deleted.",
		Description:  "Requested entry ID does not exist",
		AutoResolve:  false,
		CreateTicket: false,
	},

	// SYSTEM ERRORS (ERR-5xxxx) - These POST to central
	"ERR-50001": {
		Code:         "ERR-50001",
		Category:     "database",
		Severity:     SeverityCritical,
		Message:      "Service temporarily unavailable. Our team has been alerted and is working on it. Reference: ERR-50001.",
		Description:  "Vault database connection failed or file inaccessible",
		AutoResolve:  false,
		CreateTicket: true,
	},
	"ERR-50002": {
		Code:         "ERR-50002",
		Category:     "database",
		Severity:     SeverityError,
		Message:      "Failed to save data. Please try again in a moment.",
		Description:  "Database write operation failed",
		AutoResolve:  true,
		CreateTicket: true,
	},
	"ERR-50003": {
		Code:         "ERR-50003",
		Category:     "storage",
		Severity:     SeverityCritical,
		Message:      "Service temporarily unavailable. Our team has been alerted. Reference: ERR-50003.",
		Description:  "WL3 credential storage write failed",
		AutoResolve:  false,
		CreateTicket: true,
	},
	"ERR-50004": {
		Code:         "ERR-50004",
		Category:     "network",
		Severity:     SeverityError,
		Message:      "Connection issue. Please try again.",
		Description:  "Failed to connect to central admin for sync",
		AutoResolve:  true,
		CreateTicket: true,
	},

	// RATE LIMITING (ERR-6xxxx)
	"ERR-60001": {
		Code:         "ERR-60001",
		Category:     "rate_limit",
		Severity:     SeverityError,
		Message:      "Too many requests. Please slow down and try again in a moment.",
		Description:  "Global per-IP rate limit exceeded",
		AutoResolve:  true,
		CreateTicket: false,
	},
	"ERR-60002": {
		Code:     "ERR-60002",
		Category: "rate_limit",
		// NOTE(review): severity is Warning despite the ERR- prefix —
		// confirm whether this should be a WRN- code or SeverityError.
		Severity:     SeverityWarning,
		Message:      "Agent rate limit warning. Reduce request frequency.",
		Description:  "Per-agent unique-entry quota at 90%",
		AutoResolve:  true,
		CreateTicket: false,
	},

	// INTERNAL ERRORS (ERR-9xxxx) - Invariant violations
	// ERR-90001 is also the fallback LogEvent uses for unregistered codes.
	"ERR-90001": {
		Code:         "ERR-90001",
		Category:     "invariant",
		Severity:     SeverityCritical,
		Message:      "An unexpected error occurred. Please try again or contact support. Reference: ERR-90001.",
		Description:  "Condition assumed impossible was triggered",
		AutoResolve:  false,
		CreateTicket: true,
	},
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Incident Tracking - Centralized Error Management
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// CentralEvent is sent to clavitor.ai for every ticket-creating event.
// No deduplication - "let it rain". Central handles aggregation if needed.
// Fields combine caller context (EventContext) with registry data (EventDef).
type CentralEvent struct {
	EventID     string `json:"event_id"`     // UUID-like ID generated locally (see generateEventID)
	Code        string `json:"code"`         // ERR-50001, etc.
	Category    string `json:"category"`     // database, network, etc.
	Severity    string `json:"severity"`     // Lowercase name from Severity.String()
	Resource    string `json:"resource"`     // uk1, db-primary, etc.
	POP         string `json:"pop"`          // Which POP reported this
	Operation   string `json:"operation"`    // GetEntry, agent unlock, etc.
	ErrorDetail string `json:"error_detail"` // The actual error message
	Actor       string `json:"actor"`        // web, agent:abc123
	Timestamp   int64  `json:"timestamp"`    // Unix seconds at emit time
	UserMessage string `json:"user_message"` // Shown to users
}
|
|
|
|
// CentralClient posts events to central (clavitor.ai).
// Only created in commercial edition; community edition logs locally only.
// A nil *CentralClient is safe to use: postEvent treats nil as "reporting off".
type CentralClient struct {
	centralURL string // Base URL of central; "/v1/events" is appended per request
	popID      string // Identifier of the POP this instance runs in
	apiKey     string // Bearer token sent with every event
}
|
|
|
|
var (
	// globalCentralClient is the process-wide reporter. It stays nil in
	// community edition / when central config is missing, which disables
	// central reporting (postEvent is nil-safe).
	globalCentralClient *CentralClient

	// onceCentral guarantees InitCentralClient initializes the client
	// exactly once, even if called from multiple places.
	onceCentral sync.Once
)
|
|
|
|
// InitCentralClient creates the global central reporter.
|
|
// Called at startup from main.go.
|
|
func InitCentralClient(centralURL, popID, apiKey string) {
|
|
onceCentral.Do(func() {
|
|
globalCentralClient = &CentralClient{
|
|
centralURL: centralURL,
|
|
popID: popID,
|
|
apiKey: apiKey,
|
|
}
|
|
})
|
|
}
|
|
|
|
// postEvent sends an event to central asynchronously.
|
|
// Every event is sent individually - no local deduplication.
|
|
func (c *CentralClient) postEvent(ev CentralEvent) {
|
|
if c == nil {
|
|
return // Community edition - no central reporting
|
|
}
|
|
|
|
payload, _ := json.Marshal(ev)
|
|
req, err := http.NewRequest("POST", c.centralURL+"/v1/events", bytes.NewReader(payload))
|
|
if err != nil {
|
|
log.Printf("[ERROR] Failed to create event request: %v", err)
|
|
return
|
|
}
|
|
req.Header.Set("Authorization", "Bearer "+c.apiKey)
|
|
req.Header.Set("Content-Type", "application/json")
|
|
|
|
client := &http.Client{Timeout: 10 * time.Second}
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
log.Printf("[ERROR] Failed to post event to central: %v", err)
|
|
return
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusCreated {
|
|
log.Printf("[ERROR] Central rejected event: %d", resp.StatusCode)
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Event Logging - The main interface for handlers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// EventContext holds all the searchable fields for an event.
// It is the caller-supplied half of a log record; the registry's EventDef
// (looked up by Code) supplies message, category, and severity.
type EventContext struct {
	Code        string // ERR-50001, WRN-10001, etc.
	Category    string // auth, database, network, security
	Resource    string // Which resource: uk1, db-primary, vault-file-123
	Operation   string // What was being attempted: "GetEntry", "agent unlock"
	Actor       string // web, agent:abc123, extension
	ErrorDetail string // The actual error: "connection refused", "disk full"
	IPAddr      string // Client IP
	// Severity set by callers; note that LogEvent logs the registry's
	// severity (def.Severity), not this field.
	Severity Severity
}
|
|
|
|
// LogEvent is the main entry point for all events (errors and warnings).
|
|
// This replaces direct AuditLog calls for error cases.
|
|
func LogEvent(db *DB, ctx context.Context, ec EventContext) {
|
|
def, ok := EventRegistry[ec.Code]
|
|
if !ok {
|
|
log.Printf("[ERROR] Unknown event code: %s", ec.Code)
|
|
def = EventRegistry["ERR-90001"]
|
|
ec.Code = "ERR-90001"
|
|
ec.Severity = SeverityCritical
|
|
}
|
|
|
|
// Log to local audit (searchable fields)
|
|
// Title contains structured data: resource, error, operation
|
|
// Searchable: grep "resource=uk1" audit.log
|
|
auditTitle := fmt.Sprintf("op=%s | resource=%s | error=%s | %s",
|
|
ec.Operation, ec.Resource, ec.ErrorDetail, def.Description)
|
|
AuditLog(db, &AuditEvent{
|
|
Action: ec.Code,
|
|
Actor: ec.Actor,
|
|
Title: auditTitle,
|
|
IPAddr: ec.IPAddr,
|
|
})
|
|
|
|
// Log to operator logs (human readable)
|
|
log.Printf("[%s] %s | resource=%s | op=%s | actor=%s | error=%s | severity=%s",
|
|
ec.Code,
|
|
def.Description,
|
|
ec.Resource,
|
|
ec.Operation,
|
|
ec.Actor,
|
|
ec.ErrorDetail,
|
|
def.Severity.String(),
|
|
)
|
|
|
|
// Post to central if ticket-creating event
|
|
// "Let it rain" - every event is sent individually, no deduplication
|
|
if def.CreateTicket && globalCentralClient != nil {
|
|
go globalCentralClient.postEvent(CentralEvent{
|
|
EventID: generateEventID(),
|
|
Code: ec.Code,
|
|
Category: def.Category,
|
|
Severity: def.Severity.String(),
|
|
Resource: ec.Resource,
|
|
POP: globalCentralClient.popID,
|
|
Operation: ec.Operation,
|
|
ErrorDetail: ec.ErrorDetail,
|
|
Actor: ec.Actor,
|
|
Timestamp: time.Now().Unix(),
|
|
UserMessage: def.Message,
|
|
})
|
|
}
|
|
}
|
|
|
|
// generateEventID creates a simple UUID-like identifier of the form
// "EVT-<unix seconds>-<nanos mod 1000>".
//
// The clock is read exactly once so both components describe the same
// instant (two separate time.Now() calls could straddle a second
// boundary and produce inconsistent parts). Note this is only weakly
// collision-resistant; central treats events as append-only, so
// occasional collisions are tolerable.
func generateEventID() string {
	now := time.Now()
	return fmt.Sprintf("EVT-%d-%d", now.Unix(), now.UnixNano()%1000)
}
|
|
|
|
// LogWarning for non-critical anomalies.
|
|
func LogWarning(db *DB, ctx context.Context, code, resource, operation, detail, actor, ip string) {
|
|
LogEvent(db, ctx, EventContext{
|
|
Code: code,
|
|
Category: "performance", // default, override via registry
|
|
Resource: resource,
|
|
Operation: operation,
|
|
Actor: actor,
|
|
ErrorDetail: detail,
|
|
IPAddr: ip,
|
|
Severity: SeverityWarning,
|
|
})
|
|
}
|
|
|
|
// LogError for user-impacting issues.
|
|
func LogError(db *DB, ctx context.Context, code, resource, operation, detail, actor, ip string) {
|
|
LogEvent(db, ctx, EventContext{
|
|
Code: code,
|
|
Category: "system", // default, override via registry
|
|
Resource: resource,
|
|
Operation: operation,
|
|
Actor: actor,
|
|
ErrorDetail: detail,
|
|
IPAddr: ip,
|
|
Severity: SeverityError,
|
|
})
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// HTTP Response Helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// ErrorResponse sends a user-facing error with code.
|
|
// Use this for API responses to clients.
|
|
func ErrorResponse(w http.ResponseWriter, status int, code string) {
|
|
def, ok := EventRegistry[code]
|
|
if !ok {
|
|
def = EventDef{
|
|
Code: code,
|
|
Message: "An error occurred. Please try again.",
|
|
}
|
|
}
|
|
|
|
w.Header().Set("Content-Type", "application/json")
|
|
w.WriteHeader(status)
|
|
|
|
response := map[string]string{
|
|
"error": code,
|
|
"message": def.Message,
|
|
}
|
|
|
|
// For critical/system errors, add incident reference if available
|
|
if def.Severity >= SeverityError && def.CreateTicket {
|
|
response["reference"] = code // User can reference this when contacting support
|
|
response["status"] = "Our team has been alerted"
|
|
}
|
|
|
|
json.NewEncoder(w).Encode(response)
|
|
}
|
|
|
|
// HandleError is the full-flow helper for handlers.
|
|
// Logs event, posts to central if needed, and returns user response.
|
|
func HandleError(
|
|
w http.ResponseWriter,
|
|
r *http.Request,
|
|
db *DB,
|
|
code string,
|
|
resource string,
|
|
operation string,
|
|
internalErr error,
|
|
httpStatus int,
|
|
) {
|
|
ctx := r.Context()
|
|
|
|
// Build actor from context (set by middleware)
|
|
actor := "unknown"
|
|
if a, ok := ctx.Value("actor").(string); ok {
|
|
actor = a
|
|
}
|
|
|
|
// Log the full event
|
|
LogEvent(db, ctx, EventContext{
|
|
Code: code,
|
|
Resource: resource,
|
|
Operation: operation,
|
|
Actor: actor,
|
|
ErrorDetail: internalErr.Error(),
|
|
IPAddr: r.RemoteAddr,
|
|
})
|
|
|
|
// Return user-facing response
|
|
ErrorResponse(w, httpStatus, code)
|
|
}
|
|
|
|
// LookupEvent returns the event definition for documentation.
|
|
func LookupEvent(code string) (EventDef, bool) {
|
|
def, ok := EventRegistry[code]
|
|
return def, ok
|
|
}
|
|
|
|
// ListEventsByCategory returns all events in a category.
|
|
func ListEventsByCategory(category string) []EventDef {
|
|
var results []EventDef
|
|
for _, def := range EventRegistry {
|
|
if def.Category == category {
|
|
results = append(results, def)
|
|
}
|
|
}
|
|
return results
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Status Page Integration (for central/clavitor.ai)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// StatusPageEntry is returned by central's /status endpoint.
// It represents one component row on the public status page.
type StatusPageEntry struct {
	Component   string `json:"component"`              // uk1, db-primary, etc.
	Status      string `json:"status"`                 // operational, degraded, down
	IncidentID  string `json:"incident_id,omitempty"`  // Set only while an incident is open
	UpdatedAt   int64  `json:"updated_at"`             // Unix seconds of last status change
	UserMessage string `json:"user_message,omitempty"` // Optional public-facing note
}
|
|
|
|
// IsResourceAffected queries central for recent events on a resource.
// This queries central rather than local cache (no local deduplication).
//
// STUB: always returns false until the central query below is implemented;
// callers must not rely on this for incident detection yet.
func IsResourceAffected(resource string) bool {
	// TODO: Query central /v1/events?resource=uk1&since=5m
	// For now, returns false - central is source of truth
	return false
}
|
|
|
|
// GetStatusForResource returns current status by querying central.
|
|
func GetStatusForResource(resource string) (StatusPageEntry, bool) {
|
|
// TODO: Query central /v1/status?resource=uk1
|
|
// For now, assumes operational - central drives status page
|
|
return StatusPageEntry{
|
|
Component: resource,
|
|
Status: "operational",
|
|
UpdatedAt: time.Now().Unix(),
|
|
}, false
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Central Query Helpers (for clavitor.ai implementation)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// ActiveEventSummary is what central's dashboard shows.
// SQL equivalent:
//
//	SELECT code, resource, pop, COUNT(*) as count, MAX(timestamp) as last_seen
//	FROM events
//	WHERE status != 'resolved'
//	GROUP BY code, resource, pop
//	ORDER BY count DESC
//
// This gives you: "uk1 has 12 ERR-50001 in the last hour"
type ActiveEventSummary struct {
	Code      string `json:"code"`       // ERR-50001
	Resource  string `json:"resource"`   // uk1
	POP       string `json:"pop"`        // zrh
	Count     int    `json:"count"`      // How many events
	FirstSeen int64  `json:"first_seen"` // First event timestamp
	LastSeen  int64  `json:"last_seen"`  // Most recent event timestamp
	Status    string `json:"status"`     // investigating, identified, monitoring, resolved
}
|
|
|
|
// CentralQuery represents the query parameters for the central endpoint.
// The central API should support:
//
//	GET  /v1/events?status=active&group_by=code,resource,pop
//	GET  /v1/events?code=ERR-50001&resource=uk1&since=1h
//	POST /v1/events/bulk-resolve { "code": "ERR-50001", "resource": "uk1" }
//
// All fields are optional filters; zero values mean "no filter".
type CentralQuery struct {
	Status       string   `json:"status,omitempty"`        // active, resolved, all
	Code         string   `json:"code,omitempty"`          // ERR-50001
	Resource     string   `json:"resource,omitempty"`      // uk1
	POP          string   `json:"pop,omitempty"`           // zrh
	Since        string   `json:"since,omitempty"`         // 1h, 24h, 7d
	GroupBy      []string `json:"group_by,omitempty"`      // code, resource, pop
	Severity     string   `json:"severity,omitempty"`      // error, critical
	CreateTicket bool     `json:"create_ticket,omitempty"` // true = ticket-creating only
}
|
|
|
|
// BulkResolveRequest marks events as resolved in bulk.
// Use case: uk1 fixed, resolve all ERR-50001 for uk1 at once.
// Code is the only required filter; Resource/POP/Since narrow the match.
type BulkResolveRequest struct {
	Code       string `json:"code"`               // Required
	Resource   string `json:"resource,omitempty"` // Optional: resolve for specific resource
	POP        string `json:"pop,omitempty"`      // Optional: resolve for specific POP
	Since      int64  `json:"since,omitempty"`    // Optional: resolve events after this time (unix seconds)
	Message    string `json:"message"`            // Resolution message: "Disk space freed, service restored"
	ResolvedBy string `json:"resolved_by"`        // Who fixed it: "ops-johan"
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Environment-based initialization helper
|
|
// ---------------------------------------------------------------------------
|
|
|
|
// InitErrorsFromEnv sets up the central client from environment variables.
|
|
// Call this from main():
|
|
// lib.InitErrorsFromEnv()
|
|
func InitErrorsFromEnv() {
|
|
centralURL := os.Getenv("CLAVITOR_CENTRAL_URL")
|
|
popID := os.Getenv("CLAVITOR_POP_ID")
|
|
apiKey := os.Getenv("CLAVITOR_API_KEY")
|
|
|
|
if centralURL != "" && popID != "" && apiKey != "" {
|
|
InitCentralClient(centralURL, popID, apiKey)
|
|
log.Printf("[INIT] Central event reporting enabled: POP=%s -> %s", popID, centralURL)
|
|
} else {
|
|
log.Printf("[INIT] Central event reporting disabled (community edition or missing config)")
|
|
}
|
|
}
|