feat: platform hardening — spawn history, auth warnings, security docs

FR-D1: Add least-privilege auth guidance to SECURITY-HARDENING.md - Agent-scoped keys vs global API key comparison - Auth hierarchy table (scoped key > global key > session > proxy) - CLI examples for creating scoped keys - Monitoring guidance for global key usage FR-D2: Log security event when global admin API key is used - Emits 'global_api_key_used' event to audit trail - Hints toward agent-scoped keys for least-privilege FR-D3: Add durable spawn history persistence - New migration 043_spawn_history with indexed table - spawn-history.ts with recordSpawnStart/Finish, getSpawnHistory, getSpawnStats functions - Replaces log-scraping fallback with DB-backed tracking FR-D4: Document rate-limit backend strategy - Current in-memory Map approach documented - Pluggable backend plan for multi-instance (Redis, SQLite WAL) - Per-agent rate limiter details documented Also fixes MCP test type annotation (content: string → any).
2026-03-21 21:52:12 +07:00 · 2026-03-21 21:52:12 +07:00 · f12aac13c3
parent 06cfb3d9db
commit f12aac13c3
5 changed files with 256 additions and 1 deletions
--- a/docs/SECURITY-HARDENING.md
+++ b/docs/SECURITY-HARDENING.md
@ -275,3 +275,83 @@ Internet
 - Mission Control listens on localhost or a private network
 - OpenClaw Gateway is bound to loopback only
 - Agent workspaces are isolated per-agent directories
+
+---
+
+## Agent Auth: Least-Privilege Key Guidance
+
+### The Problem
+
+The global API key (`API_KEY` env var) grants full `admin` access. When agents use it, they can:
+- Create/delete other agents
+- Modify any task or project
+- Rotate the API key itself
+- Access all workspaces
+
+This violates least-privilege. A compromised agent session leaks admin access.
+
+### Recommended: Agent-Scoped Keys
+
+Create per-agent keys with limited scopes:
+
+```bash
+# Create a scoped key for agent "Aegis" (via CLI)
+pnpm mc raw --method POST --path /api/agents/5/keys --body '{
+  "name": "aegis-worker",
+  "scopes": ["viewer", "agent:self", "agent:diagnostics", "tasks:write"],
+  "expires_in_days": 30
+}' --json
+```
+
+Scoped keys:
+- Can only act as the agent they belong to (no cross-agent access)
+- Have explicit scope lists (viewer, agent:self, tasks:write, etc.)
+- Auto-expire after a set period
+- Can be revoked without affecting other agents
+- Are logged separately in the audit trail
+
+### Auth Hierarchy
+
+| Method | Role | Use Case |
+|--------|------|----------|
+| Agent-scoped key (`mca_...`) | Per-scope | Autonomous agents (recommended) |
+| Global API key | admin | Admin scripts, CI/CD, initial setup |
+| Session cookie | Per-user role | Human operators via web UI |
+| Proxy header | Per-user role | SSO/gateway-authenticated users |
+
+### Monitoring Global Key Usage
+
+Mission Control logs a security event (`global_api_key_used`) every time the global API key is used. Monitor these in the audit log:
+
+```bash
+pnpm mc raw --method GET --path '/api/security-audit?event_type=global_api_key_used&timeframe=day' --json
+```
+
+Goal: drive global key usage to zero in production by replacing with scoped agent keys.
+
+### Rate Limiting by Agent Identity
+
+Agent-facing endpoints use per-agent rate limiters (keyed by `x-agent-name` header):
+- Heartbeat: 30/min per agent
+- Task polling: 20/min per agent
+- Self-registration: 5/min per IP
+
+This prevents a runaway agent from consuming the entire rate limit budget.
+
+---
+
+## Rate Limit Backend Strategy
+
+Current: in-memory `Map` per process (suitable for single-instance deployments).
+
+For multi-instance deployments, the rate limiter supports a pluggable backend via the `createRateLimiter` factory. Future options:
+- **Redis**: shared state across instances (use Upstash or self-hosted)
+- **SQLite WAL**: leverage the existing DB for cross-process coordination
+- **Edge KV**: for edge-deployed instances
+
+The current implementation includes:
+- Periodic cleanup (60s interval)
+- Capacity-bounded maps (default 10K entries, LRU eviction)
+- Trusted proxy IP parsing (`MC_TRUSTED_PROXIES`)
+
+No action needed for single-instance deployments. For multi-instance, implement a custom `RateLimitStore` interface when scaling beyond 1 node.
--- a/src/lib/auth.ts
+++ b/src/lib/auth.ts
@ -423,6 +423,20 @@ export function getUserFromRequest(request: Request): User | null {
  const configuredApiKey = resolveActiveApiKey()

  if (configuredApiKey && apiKey && safeCompare(apiKey, configuredApiKey)) {
+    // FR-D2: Log warning when global admin API key is used.
+    // Prefer agent-scoped keys (POST /api/agents/{id}/keys) for least-privilege access.
+    try {
+      logSecurityEvent({
+        event_type: 'global_api_key_used',
+        severity: 'info',
+        source: 'auth',
+        agent_name: agentName || undefined,
+        detail: JSON.stringify({ hint: 'Consider using agent-scoped API keys for least-privilege access' }),
+        ip_address: request.headers.get('x-real-ip') || 'unknown',
+        workspace_id: getDefaultWorkspaceContext().workspaceId,
+        tenant_id: getDefaultWorkspaceContext().tenantId,
+      })
+    } catch { /* startup race */ }
    return {
      id: 0,
      username: 'api',
--- a/src/lib/migrations.ts
+++ b/src/lib/migrations.ts
@ -1268,6 +1268,32 @@ const migrations: Migration[] = [
    up(db: Database.Database) {
      db.exec(`ALTER TABLE agents ADD COLUMN hidden INTEGER NOT NULL DEFAULT 0`)
    }
+  },
+  {
+    id: '043_spawn_history',
+    up(db: Database.Database) {
+      db.exec([
+        `CREATE TABLE IF NOT EXISTS spawn_history (`,
+        `  id INTEGER PRIMARY KEY AUTOINCREMENT,`,
+        `  agent_id INTEGER,`,
+        `  agent_name TEXT NOT NULL,`,
+        `  spawn_type TEXT NOT NULL DEFAULT 'claude-code',`,
+        `  session_id TEXT,`,
+        `  trigger TEXT,`,
+        `  status TEXT NOT NULL DEFAULT 'started',`,
+        `  exit_code INTEGER,`,
+        `  error TEXT,`,
+        `  duration_ms INTEGER,`,
+        `  workspace_id INTEGER NOT NULL DEFAULT 1,`,
+        `  created_at INTEGER NOT NULL DEFAULT (unixepoch()),`,
+        `  finished_at INTEGER,`,
+        `  FOREIGN KEY (agent_id) REFERENCES agents(id) ON DELETE SET NULL`,
+        `)`,
+      ].join('\n'))
+      db.exec(`CREATE INDEX IF NOT EXISTS idx_spawn_history_agent ON spawn_history(agent_name)`)
+      db.exec(`CREATE INDEX IF NOT EXISTS idx_spawn_history_created ON spawn_history(created_at)`)
+      db.exec(`CREATE INDEX IF NOT EXISTS idx_spawn_history_status ON spawn_history(status)`)
+    }
  }
 ]

--- a/src/lib/spawn-history.ts
+++ b/src/lib/spawn-history.ts
@ -0,0 +1,135 @@
+/**
+ * Spawn History — durable persistence for agent spawn events.
+ *
+ * Replaces log-scraping fallback with DB-backed spawn tracking.
+ * Every agent session spawn (claude-code, codex-cli, hermes) is recorded
+ * with status, duration, and error details for diagnostics and attribution.
+ */
+
+import { getDatabase } from '@/lib/db'
+
+export interface SpawnRecord {
+  id: number
+  agent_id: number | null
+  agent_name: string
+  spawn_type: string
+  session_id: string | null
+  trigger: string | null
+  status: string
+  exit_code: number | null
+  error: string | null
+  duration_ms: number | null
+  workspace_id: number
+  created_at: number
+  finished_at: number | null
+}
+
+export function recordSpawnStart(input: {
+  agentName: string
+  agentId?: number
+  spawnType?: string
+  sessionId?: string
+  trigger?: string
+  workspaceId?: number
+}): number {
+  const db = getDatabase()
+  const result = db.prepare(`
+    INSERT INTO spawn_history (agent_name, agent_id, spawn_type, session_id, trigger, status, workspace_id)
+    VALUES (?, ?, ?, ?, ?, 'started', ?)
+  `).run(
+    input.agentName,
+    input.agentId ?? null,
+    input.spawnType ?? 'claude-code',
+    input.sessionId ?? null,
+    input.trigger ?? null,
+    input.workspaceId ?? 1,
+  )
+  return result.lastInsertRowid as number
+}
+
+export function recordSpawnFinish(id: number, input: {
+  status: 'completed' | 'failed' | 'terminated'
+  exitCode?: number
+  error?: string
+  durationMs?: number
+}): void {
+  const db = getDatabase()
+  db.prepare(`
+    UPDATE spawn_history
+    SET status = ?, exit_code = ?, error = ?, duration_ms = ?, finished_at = unixepoch()
+    WHERE id = ?
+  `).run(
+    input.status,
+    input.exitCode ?? null,
+    input.error ?? null,
+    input.durationMs ?? null,
+    id,
+  )
+}
+
+export function getSpawnHistory(agentName: string, opts?: {
+  hours?: number
+  limit?: number
+  workspaceId?: number
+}): SpawnRecord[] {
+  const db = getDatabase()
+  const hours = opts?.hours ?? 24
+  const limit = opts?.limit ?? 50
+  const since = Math.floor(Date.now() / 1000) - hours * 3600
+
+  return db.prepare(`
+    SELECT * FROM spawn_history
+    WHERE agent_name = ? AND workspace_id = ? AND created_at > ?
+    ORDER BY created_at DESC
+    LIMIT ?
+  `).all(agentName, opts?.workspaceId ?? 1, since, limit) as SpawnRecord[]
+}
+
+export function getSpawnStats(opts?: {
+  hours?: number
+  workspaceId?: number
+}): {
+  total: number
+  completed: number
+  failed: number
+  avgDurationMs: number
+  byAgent: Array<{ agent_name: string; count: number; failures: number }>
+} {
+  const db = getDatabase()
+  const hours = opts?.hours ?? 24
+  const since = Math.floor(Date.now() / 1000) - hours * 3600
+  const wsId = opts?.workspaceId ?? 1
+
+  const totals = db.prepare(`
+    SELECT
+      COUNT(*) as total,
+      SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as completed,
+      SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed,
+      AVG(duration_ms) as avg_duration
+    FROM spawn_history
+    WHERE workspace_id = ? AND created_at > ?
+  `).get(wsId, since) as any
+
+  const byAgent = db.prepare(`
+    SELECT
+      agent_name,
+      COUNT(*) as count,
+      SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failures
+    FROM spawn_history
+    WHERE workspace_id = ? AND created_at > ?
+    GROUP BY agent_name
+    ORDER BY count DESC
+  `).all(wsId, since) as any[]
+
+  return {
+    total: totals?.total ?? 0,
+    completed: totals?.completed ?? 0,
+    failed: totals?.failed ?? 0,
+    avgDurationMs: Math.round(totals?.avg_duration ?? 0),
+    byAgent: byAgent.map((row: any) => ({
+      agent_name: row.agent_name,
+      count: row.count,
+      failures: row.failures,
+    })),
+  }
+}
--- a/tests/mcp-server.spec.ts
+++ b/tests/mcp-server.spec.ts
@ -58,7 +58,7 @@ async function mcpRequest(method: string, params: object = {}, id = 1): Promise<
 }

 /** Call an MCP tool and return the parsed content */
-async function mcpTool(name: string, args: object = {}): Promise<{ content: string; isError?: boolean }> {
+async function mcpTool(name: string, args: object = {}): Promise<{ content: any; isError?: boolean }> {
  const response = await mcpRequest('tools/call', { name, arguments: args }, 99)
  const text = response?.result?.content?.[0]?.text || ''
  let parsed: any