diff --git a/README.md b/README.md index dc4ab10..7e5dfc8 100644 --- a/README.md +++ b/README.md @@ -418,6 +418,32 @@ pnpm test:e2e # Playwright E2E pnpm quality:gate # All checks ``` +## Workload Signals Contract + +`GET /api/workload` returns a workload snapshot and one recommendation: + +- `normal`: system healthy, submit freely +- `throttle`: reduce submission rate / defer non-critical work +- `shed`: submit only critical work +- `pause`: hold submissions until capacity returns + +Low-signal behavior: + +- `capacity.error_rate_5m` is clamped to `[0,1]` +- `queue.estimated_wait_confidence` is `calculated` or `unknown` +- queue breakdown maps include stable keys even when counts are zero + +Runtime-tunable thresholds: + +- `MC_WORKLOAD_QUEUE_DEPTH_NORMAL` +- `MC_WORKLOAD_QUEUE_DEPTH_THROTTLE` +- `MC_WORKLOAD_QUEUE_DEPTH_SHED` +- `MC_WORKLOAD_BUSY_RATIO_THROTTLE` +- `MC_WORKLOAD_BUSY_RATIO_SHED` +- `MC_WORKLOAD_ERROR_RATE_THROTTLE` +- `MC_WORKLOAD_ERROR_RATE_SHED` +- `MC_WORKLOAD_RECENT_WINDOW_SECONDS` + ## Roadmap See [open issues](https://github.com/builderz-labs/mission-control/issues) for planned work and the [v1.0.0 release notes](https://github.com/builderz-labs/mission-control/releases/tag/v1.0.0) for what shipped. diff --git a/openapi.json b/openapi.json index e66e163..094522f 100644 --- a/openapi.json +++ b/openapi.json @@ -4626,6 +4626,93 @@ } } }, + "/api/workload": { + "get": { + "tags": [ + "Monitoring" + ], + "summary": "Get real-time workload recommendation", + "description": "Returns system workload metrics and an actionable recommendation: `normal`, `throttle`, `shed`, or `pause`. Thresholds are runtime-configurable via `MC_WORKLOAD_*` environment variables.", + "operationId": "getWorkloadSignals", + "responses": { + "200": { + "description": "Workload snapshot and recommendation", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "timestamp": { "type": "integer" }, + "workspace_id": { "type": "integer" }, + "capacity": { + "type": "object", + "properties": { + "active_tasks": { "type": "integer" }, + "tasks_last_5m": { "type": "integer" }, + "errors_last_5m": { "type": "integer" }, + "error_rate_5m": { "type": "number", "minimum": 0, "maximum": 1 }, + "completions_last_hour": { "type": "integer" }, + "avg_completion_rate_per_hour": { "type": "number" } + } + }, + "queue": { + "type": "object", + "properties": { + "total_pending": { "type": "integer" }, + "by_status": { "type": "object", "additionalProperties": { "type": "integer" } }, + "by_priority": { "type": "object", "additionalProperties": { "type": "integer" } }, + "oldest_pending_age_seconds": { "type": ["integer", "null"] }, + "estimated_wait_seconds": { "type": ["integer", "null"] }, + "estimated_wait_confidence": { "type": "string", "enum": ["calculated", "unknown"] } + } + }, + "agents": { + "type": "object", + "properties": { + "total": { "type": "integer" }, + "online": { "type": "integer" }, + "busy": { "type": "integer" }, + "idle": { "type": "integer" }, + "offline": { "type": "integer" }, + "busy_ratio": { "type": "number", "minimum": 0, "maximum": 1 }, + "load_distribution": { + "type": "array", + "items": { + "type": "object", + "properties": { + "agent": { "type": "string" }, + "assigned": { "type": "integer" }, + "in_progress": { "type": "integer" } + } + } + } + } + }, + "recommendation": { + "type": "object", + "properties": { + "action": { "type": "string", "enum": ["normal", "throttle", "shed", "pause"] }, + "reason": { "type": "string" }, + "details": { "type": "array", "items": { "type": "string" } }, + "submit_ok": { "type": "boolean" }, + "suggested_delay_ms": { "type": "integer" } + } + }, + "thresholds": { + "type": "object", + "description": "Effective runtime thresholds after environment overrides." + } + } + } + } + } + }, + "401": { + "$ref": "#/components/responses/Unauthorized" + } + } + } + }, "/api/events": { "get": { "tags": [ diff --git a/playwright.config.ts b/playwright.config.ts index f50bcea..8f25959 100644 --- a/playwright.config.ts +++ b/playwright.config.ts @@ -18,9 +18,22 @@ export default defineConfig({ { name: 'chromium', use: { ...devices['Desktop Chrome'] } } ], webServer: { - command: 'pnpm start', + command: 'node .next/standalone/server.js', url: 'http://127.0.0.1:3005', reuseExistingServer: true, - timeout: 30_000, + timeout: 120_000, + env: { + ...process.env, + HOSTNAME: process.env.HOSTNAME || '127.0.0.1', + PORT: process.env.PORT || '3005', + MC_DISABLE_RATE_LIMIT: process.env.MC_DISABLE_RATE_LIMIT || '1', + MC_WORKLOAD_QUEUE_DEPTH_THROTTLE: process.env.MC_WORKLOAD_QUEUE_DEPTH_THROTTLE || '1000', + MC_WORKLOAD_QUEUE_DEPTH_SHED: process.env.MC_WORKLOAD_QUEUE_DEPTH_SHED || '2000', + MC_WORKLOAD_ERROR_RATE_THROTTLE: process.env.MC_WORKLOAD_ERROR_RATE_THROTTLE || '1', + MC_WORKLOAD_ERROR_RATE_SHED: process.env.MC_WORKLOAD_ERROR_RATE_SHED || '1', + API_KEY: process.env.API_KEY || 'test-api-key-e2e-12345', + AUTH_USER: process.env.AUTH_USER || 'testadmin', + AUTH_PASS: process.env.AUTH_PASS || 'testpass1234!', + }, } }) diff --git a/src/app/api/workload/route.ts b/src/app/api/workload/route.ts index 3298d67..516c71e 100644 --- a/src/app/api/workload/route.ts +++ b/src/app/api/workload/route.ts @@ -56,16 +56,27 @@ export async function GET(request: NextRequest) { } // Configurable thresholds for recommendation engine -const THRESHOLDS = { - queue_depth_normal: 20, - queue_depth_throttle: 50, - queue_depth_shed: 100, - busy_agent_ratio_throttle: 0.8, - busy_agent_ratio_shed: 0.95, - error_rate_throttle: 0.1, - error_rate_shed: 0.25, - recent_window_seconds: 300, // 5 minutes for recent activity -}; +function numEnv(name: string, fallback: number): number { + const raw = process.env[name]; + if (!raw || raw.trim().length === 0) return fallback; + const parsed = Number(raw); + return Number.isFinite(parsed) ? parsed : fallback; +} + +function buildThresholds() { + return { + queue_depth_normal: numEnv('MC_WORKLOAD_QUEUE_DEPTH_NORMAL', 20), + queue_depth_throttle: numEnv('MC_WORKLOAD_QUEUE_DEPTH_THROTTLE', 50), + queue_depth_shed: numEnv('MC_WORKLOAD_QUEUE_DEPTH_SHED', 100), + busy_agent_ratio_throttle: numEnv('MC_WORKLOAD_BUSY_RATIO_THROTTLE', 0.8), + busy_agent_ratio_shed: numEnv('MC_WORKLOAD_BUSY_RATIO_SHED', 0.95), + error_rate_throttle: numEnv('MC_WORKLOAD_ERROR_RATE_THROTTLE', 0.1), + error_rate_shed: numEnv('MC_WORKLOAD_ERROR_RATE_SHED', 0.25), + recent_window_seconds: Math.max(1, Math.floor(numEnv('MC_WORKLOAD_RECENT_WINDOW_SECONDS', 300))), + }; +} + +const THRESHOLDS = buildThresholds(); interface CapacityMetrics { active_tasks: number; @@ -82,6 +93,7 @@ interface QueueMetrics { by_priority: Record; oldest_pending_age_seconds: number | null; estimated_wait_seconds: number | null; + estimated_wait_confidence: 'calculated' | 'unknown'; } interface AgentMetrics { @@ -124,11 +136,13 @@ function buildCapacityMetrics(db: any, workspaceId: number, now: number): Capaci `SELECT COUNT(*) as c FROM tasks WHERE workspace_id = ? AND status = 'done' AND updated_at >= ?` ).get(workspaceId, dayAgo) as any).c; + const safeErrorRate = totalLast5m > 0 ? errorsLast5m / totalLast5m : 0; + return { active_tasks: activeTasks, tasks_last_5m: tasksLast5m, errors_last_5m: errorsLast5m, - error_rate_5m: totalLast5m > 0 ? Math.round((errorsLast5m / totalLast5m) * 10000) / 10000 : 0, + error_rate_5m: Math.max(0, Math.min(1, Math.round(safeErrorRate * 10000) / 10000)), completions_last_hour: completionsLastHour, avg_completion_rate_per_hour: Math.round((completionsLastDay / 24) * 100) / 100, }; @@ -165,12 +179,23 @@ function buildQueueMetrics(db: any, workspaceId: number): QueueMetrics { ? Math.round((totalPending / completionsLastHour) * 3600) : null; + const statusMap = Object.fromEntries(byStatus.map(r => [r.status, r.count])); + for (const status of pendingStatuses) { + if (typeof statusMap[status] !== 'number') statusMap[status] = 0; + } + + const priorityMap = Object.fromEntries(byPriority.map(r => [r.priority, r.count])); + for (const priority of ['low', 'medium', 'high', 'critical', 'urgent']) { + if (typeof priorityMap[priority] !== 'number') priorityMap[priority] = 0; + } + return { total_pending: totalPending, - by_status: Object.fromEntries(byStatus.map(r => [r.status, r.count])), - by_priority: Object.fromEntries(byPriority.map(r => [r.priority, r.count])), + by_status: statusMap, + by_priority: priorityMap, oldest_pending_age_seconds: oldestAge, estimated_wait_seconds: estimatedWait, + estimated_wait_confidence: estimatedWait === null ? 'unknown' : 'calculated', }; } @@ -260,9 +285,9 @@ function computeRecommendation( } // No online agents = pause - if (agents.online === 0 && agents.total > 0) { + if (agents.online === 0) { level = 'pause'; - reasons.push('No agents online'); + reasons.push(agents.total > 0 ? 'No agents online' : 'No agents registered'); } const delayMap: Record = { diff --git a/tests/workload-signals.spec.ts b/tests/workload-signals.spec.ts new file mode 100644 index 0000000..d259e60 --- /dev/null +++ b/tests/workload-signals.spec.ts @@ -0,0 +1,96 @@ +import { test, expect } from '@playwright/test' +import { API_KEY_HEADER, createTestAgent, deleteTestAgent, createTestTask, deleteTestTask } from './helpers' + +test.describe('Workload Signals API', () => { + const agentCleanup: number[] = [] + const taskCleanup: number[] = [] + + test.afterEach(async ({ request }) => { + for (const id of taskCleanup) { + await deleteTestTask(request, id).catch(() => {}) + } + taskCleanup.length = 0 + + for (const id of agentCleanup) { + await deleteTestAgent(request, id).catch(() => {}) + } + agentCleanup.length = 0 + }) + + test('returns normal recommendation under light load', async ({ request }) => { + const { id } = await createTestAgent(request, { status: 'idle' }) + agentCleanup.push(id) + + const res = await request.get('/api/workload', { headers: API_KEY_HEADER }) + expect(res.status()).toBe(200) + const body = await res.json() + + expect(body.recommendation.action).toBe('normal') + expect(body.recommendation.submit_ok).toBe(true) + }) + + test('returns throttle recommendation at high busy ratio', async ({ request }) => { + const idleAgent = await createTestAgent(request, { status: 'idle' }) + agentCleanup.push(idleAgent.id) + for (let i = 0; i < 4; i++) { + const busyAgent = await createTestAgent(request, { status: 'busy' }) + agentCleanup.push(busyAgent.id) + } + + const res = await request.get('/api/workload', { headers: API_KEY_HEADER }) + expect(res.status()).toBe(200) + const body = await res.json() + + expect(body.recommendation.action).toBe('throttle') + expect(body.recommendation.submit_ok).toBe(true) + }) + + test('returns shed recommendation at critical busy ratio', async ({ request }) => { + const idleAgent = await createTestAgent(request, { status: 'idle' }) + agentCleanup.push(idleAgent.id) + for (let i = 0; i < 19; i++) { + const busyAgent = await createTestAgent(request, { status: 'busy' }) + agentCleanup.push(busyAgent.id) + } + + const res = await request.get('/api/workload', { headers: API_KEY_HEADER }) + expect(res.status()).toBe(200) + const body = await res.json() + + expect(body.recommendation.action).toBe('shed') + expect(body.recommendation.submit_ok).toBe(false) + }) + + test('returns pause recommendation when no agents are online', async ({ request }) => { + const res = await request.get('/api/workload', { headers: API_KEY_HEADER }) + expect(res.status()).toBe(200) + const body = await res.json() + + expect(body.agents.online).toBe(0) + expect(body.recommendation.action).toBe('pause') + expect(body.recommendation.submit_ok).toBe(false) + }) + + test('returns consistent response for low-signal conditions', async ({ request }) => { + const { id } = await createTestAgent(request, { status: 'idle' }) + agentCleanup.push(id) + + const task = await createTestTask(request, { status: 'inbox' }) + taskCleanup.push(task.id) + + const res = await request.get('/api/workload', { headers: API_KEY_HEADER }) + expect(res.status()).toBe(200) + const body = await res.json() + + expect(body.capacity.error_rate_5m).toBeGreaterThanOrEqual(0) + expect(body.capacity.error_rate_5m).toBeLessThanOrEqual(1) + expect(body.queue.by_status).toHaveProperty('inbox') + expect(body.queue.by_status).toHaveProperty('assigned') + expect(body.queue.by_status).toHaveProperty('in_progress') + expect(body.queue.by_priority).toHaveProperty('critical') + expect(body.queue.by_priority).toHaveProperty('high') + expect(body.queue.by_priority).toHaveProperty('medium') + expect(body.queue.by_priority).toHaveProperty('low') + expect(['calculated', 'unknown']).toContain(body.queue.estimated_wait_confidence) + }) +})