fix(workload): harden signal recommendations and add route e2e coverage
This commit is contained in:
parent
b38ad43272
commit
4296943e05
26
README.md
26
README.md
|
|
@ -418,6 +418,32 @@ pnpm test:e2e # Playwright E2E
|
||||||
pnpm quality:gate # All checks
|
pnpm quality:gate # All checks
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Workload Signals Contract
|
||||||
|
|
||||||
|
`GET /api/workload` returns a workload snapshot and one recommendation:
|
||||||
|
|
||||||
|
- `normal`: system healthy, submit freely
|
||||||
|
- `throttle`: reduce submission rate / defer non-critical work
|
||||||
|
- `shed`: submit only critical work
|
||||||
|
- `pause`: hold submissions until capacity returns
|
||||||
|
|
||||||
|
Low-signal behavior:
|
||||||
|
|
||||||
|
- `capacity.error_rate_5m` is clamped to `[0,1]`
|
||||||
|
- `queue.estimated_wait_confidence` is `calculated` or `unknown` (`unknown` whenever `queue.estimated_wait_seconds` is `null`)
|
||||||
|
- `queue.by_status` and `queue.by_priority` always include their known keys, reporting `0` for keys with no matching tasks
|
||||||
|
|
||||||
|
Runtime-tunable thresholds (environment variables; unset, blank, or non-numeric values fall back to the built-in defaults):
|
||||||
|
|
||||||
|
- `MC_WORKLOAD_QUEUE_DEPTH_NORMAL`
|
||||||
|
- `MC_WORKLOAD_QUEUE_DEPTH_THROTTLE`
|
||||||
|
- `MC_WORKLOAD_QUEUE_DEPTH_SHED`
|
||||||
|
- `MC_WORKLOAD_BUSY_RATIO_THROTTLE`
|
||||||
|
- `MC_WORKLOAD_BUSY_RATIO_SHED`
|
||||||
|
- `MC_WORKLOAD_ERROR_RATE_THROTTLE`
|
||||||
|
- `MC_WORKLOAD_ERROR_RATE_SHED`
|
||||||
|
- `MC_WORKLOAD_RECENT_WINDOW_SECONDS`
|
||||||
|
|
||||||
## Roadmap
|
## Roadmap
|
||||||
|
|
||||||
See [open issues](https://github.com/builderz-labs/mission-control/issues) for planned work and the [v1.0.0 release notes](https://github.com/builderz-labs/mission-control/releases/tag/v1.0.0) for what shipped.
|
See [open issues](https://github.com/builderz-labs/mission-control/issues) for planned work and the [v1.0.0 release notes](https://github.com/builderz-labs/mission-control/releases/tag/v1.0.0) for what shipped.
|
||||||
|
|
|
||||||
87
openapi.json
87
openapi.json
|
|
@ -4626,6 +4626,93 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"/api/workload": {
|
||||||
|
"get": {
|
||||||
|
"tags": [
|
||||||
|
"Monitoring"
|
||||||
|
],
|
||||||
|
"summary": "Get real-time workload recommendation",
|
||||||
|
"description": "Returns system workload metrics and an actionable recommendation: `normal`, `throttle`, `shed`, or `pause`. Thresholds are runtime-configurable via `MC_WORKLOAD_*` environment variables.",
|
||||||
|
"operationId": "getWorkloadSignals",
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "Workload snapshot and recommendation",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"timestamp": { "type": "integer" },
|
||||||
|
"workspace_id": { "type": "integer" },
|
||||||
|
"capacity": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"active_tasks": { "type": "integer" },
|
||||||
|
"tasks_last_5m": { "type": "integer" },
|
||||||
|
"errors_last_5m": { "type": "integer" },
|
||||||
|
"error_rate_5m": { "type": "number", "minimum": 0, "maximum": 1 },
|
||||||
|
"completions_last_hour": { "type": "integer" },
|
||||||
|
"avg_completion_rate_per_hour": { "type": "number" }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"queue": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"total_pending": { "type": "integer" },
|
||||||
|
"by_status": { "type": "object", "additionalProperties": { "type": "integer" } },
|
||||||
|
"by_priority": { "type": "object", "additionalProperties": { "type": "integer" } },
|
||||||
|
"oldest_pending_age_seconds": { "type": ["integer", "null"] },
|
||||||
|
"estimated_wait_seconds": { "type": ["integer", "null"] },
|
||||||
|
"estimated_wait_confidence": { "type": "string", "enum": ["calculated", "unknown"] }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"agents": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"total": { "type": "integer" },
|
||||||
|
"online": { "type": "integer" },
|
||||||
|
"busy": { "type": "integer" },
|
||||||
|
"idle": { "type": "integer" },
|
||||||
|
"offline": { "type": "integer" },
|
||||||
|
"busy_ratio": { "type": "number", "minimum": 0, "maximum": 1 },
|
||||||
|
"load_distribution": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"agent": { "type": "string" },
|
||||||
|
"assigned": { "type": "integer" },
|
||||||
|
"in_progress": { "type": "integer" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"recommendation": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"action": { "type": "string", "enum": ["normal", "throttle", "shed", "pause"] },
|
||||||
|
"reason": { "type": "string" },
|
||||||
|
"details": { "type": "array", "items": { "type": "string" } },
|
||||||
|
"submit_ok": { "type": "boolean" },
|
||||||
|
"suggested_delay_ms": { "type": "integer" }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"thresholds": {
|
||||||
|
"type": "object",
|
||||||
|
"description": "Effective runtime thresholds after environment overrides."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"401": {
|
||||||
|
"$ref": "#/components/responses/Unauthorized"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
"/api/events": {
|
"/api/events": {
|
||||||
"get": {
|
"get": {
|
||||||
"tags": [
|
"tags": [
|
||||||
|
|
|
||||||
|
|
@ -18,9 +18,22 @@ export default defineConfig({
|
||||||
{ name: 'chromium', use: { ...devices['Desktop Chrome'] } }
|
{ name: 'chromium', use: { ...devices['Desktop Chrome'] } }
|
||||||
],
|
],
|
||||||
webServer: {
|
webServer: {
|
||||||
command: 'pnpm start',
|
command: 'node .next/standalone/server.js',
|
||||||
url: 'http://127.0.0.1:3005',
|
url: 'http://127.0.0.1:3005',
|
||||||
reuseExistingServer: true,
|
reuseExistingServer: true,
|
||||||
timeout: 30_000,
|
timeout: 120_000,
|
||||||
|
env: {
|
||||||
|
...process.env,
|
||||||
|
HOSTNAME: process.env.HOSTNAME || '127.0.0.1',
|
||||||
|
PORT: process.env.PORT || '3005',
|
||||||
|
MC_DISABLE_RATE_LIMIT: process.env.MC_DISABLE_RATE_LIMIT || '1',
|
||||||
|
MC_WORKLOAD_QUEUE_DEPTH_THROTTLE: process.env.MC_WORKLOAD_QUEUE_DEPTH_THROTTLE || '1000',
|
||||||
|
MC_WORKLOAD_QUEUE_DEPTH_SHED: process.env.MC_WORKLOAD_QUEUE_DEPTH_SHED || '2000',
|
||||||
|
MC_WORKLOAD_ERROR_RATE_THROTTLE: process.env.MC_WORKLOAD_ERROR_RATE_THROTTLE || '1',
|
||||||
|
MC_WORKLOAD_ERROR_RATE_SHED: process.env.MC_WORKLOAD_ERROR_RATE_SHED || '1',
|
||||||
|
API_KEY: process.env.API_KEY || 'test-api-key-e2e-12345',
|
||||||
|
AUTH_USER: process.env.AUTH_USER || 'testadmin',
|
||||||
|
AUTH_PASS: process.env.AUTH_PASS || 'testpass1234!',
|
||||||
|
},
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
|
||||||
|
|
@ -56,16 +56,27 @@ export async function GET(request: NextRequest) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Configurable thresholds for recommendation engine
|
// Configurable thresholds for recommendation engine
|
||||||
const THRESHOLDS = {
|
function numEnv(name: string, fallback: number): number {
|
||||||
queue_depth_normal: 20,
|
const raw = process.env[name];
|
||||||
queue_depth_throttle: 50,
|
if (!raw || raw.trim().length === 0) return fallback;
|
||||||
queue_depth_shed: 100,
|
const parsed = Number(raw);
|
||||||
busy_agent_ratio_throttle: 0.8,
|
return Number.isFinite(parsed) ? parsed : fallback;
|
||||||
busy_agent_ratio_shed: 0.95,
|
}
|
||||||
error_rate_throttle: 0.1,
|
|
||||||
error_rate_shed: 0.25,
|
function buildThresholds() {
|
||||||
recent_window_seconds: 300, // 5 minutes for recent activity
|
return {
|
||||||
|
queue_depth_normal: numEnv('MC_WORKLOAD_QUEUE_DEPTH_NORMAL', 20),
|
||||||
|
queue_depth_throttle: numEnv('MC_WORKLOAD_QUEUE_DEPTH_THROTTLE', 50),
|
||||||
|
queue_depth_shed: numEnv('MC_WORKLOAD_QUEUE_DEPTH_SHED', 100),
|
||||||
|
busy_agent_ratio_throttle: numEnv('MC_WORKLOAD_BUSY_RATIO_THROTTLE', 0.8),
|
||||||
|
busy_agent_ratio_shed: numEnv('MC_WORKLOAD_BUSY_RATIO_SHED', 0.95),
|
||||||
|
error_rate_throttle: numEnv('MC_WORKLOAD_ERROR_RATE_THROTTLE', 0.1),
|
||||||
|
error_rate_shed: numEnv('MC_WORKLOAD_ERROR_RATE_SHED', 0.25),
|
||||||
|
recent_window_seconds: Math.max(1, Math.floor(numEnv('MC_WORKLOAD_RECENT_WINDOW_SECONDS', 300))),
|
||||||
};
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
const THRESHOLDS = buildThresholds();
|
||||||
|
|
||||||
interface CapacityMetrics {
|
interface CapacityMetrics {
|
||||||
active_tasks: number;
|
active_tasks: number;
|
||||||
|
|
@ -82,6 +93,7 @@ interface QueueMetrics {
|
||||||
by_priority: Record<string, number>;
|
by_priority: Record<string, number>;
|
||||||
oldest_pending_age_seconds: number | null;
|
oldest_pending_age_seconds: number | null;
|
||||||
estimated_wait_seconds: number | null;
|
estimated_wait_seconds: number | null;
|
||||||
|
estimated_wait_confidence: 'calculated' | 'unknown';
|
||||||
}
|
}
|
||||||
|
|
||||||
interface AgentMetrics {
|
interface AgentMetrics {
|
||||||
|
|
@ -124,11 +136,13 @@ function buildCapacityMetrics(db: any, workspaceId: number, now: number): Capaci
|
||||||
`SELECT COUNT(*) as c FROM tasks WHERE workspace_id = ? AND status = 'done' AND updated_at >= ?`
|
`SELECT COUNT(*) as c FROM tasks WHERE workspace_id = ? AND status = 'done' AND updated_at >= ?`
|
||||||
).get(workspaceId, dayAgo) as any).c;
|
).get(workspaceId, dayAgo) as any).c;
|
||||||
|
|
||||||
|
const safeErrorRate = totalLast5m > 0 ? errorsLast5m / totalLast5m : 0;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
active_tasks: activeTasks,
|
active_tasks: activeTasks,
|
||||||
tasks_last_5m: tasksLast5m,
|
tasks_last_5m: tasksLast5m,
|
||||||
errors_last_5m: errorsLast5m,
|
errors_last_5m: errorsLast5m,
|
||||||
error_rate_5m: totalLast5m > 0 ? Math.round((errorsLast5m / totalLast5m) * 10000) / 10000 : 0,
|
error_rate_5m: Math.max(0, Math.min(1, Math.round(safeErrorRate * 10000) / 10000)),
|
||||||
completions_last_hour: completionsLastHour,
|
completions_last_hour: completionsLastHour,
|
||||||
avg_completion_rate_per_hour: Math.round((completionsLastDay / 24) * 100) / 100,
|
avg_completion_rate_per_hour: Math.round((completionsLastDay / 24) * 100) / 100,
|
||||||
};
|
};
|
||||||
|
|
@ -165,12 +179,23 @@ function buildQueueMetrics(db: any, workspaceId: number): QueueMetrics {
|
||||||
? Math.round((totalPending / completionsLastHour) * 3600)
|
? Math.round((totalPending / completionsLastHour) * 3600)
|
||||||
: null;
|
: null;
|
||||||
|
|
||||||
|
const statusMap = Object.fromEntries(byStatus.map(r => [r.status, r.count]));
|
||||||
|
for (const status of pendingStatuses) {
|
||||||
|
if (typeof statusMap[status] !== 'number') statusMap[status] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
const priorityMap = Object.fromEntries(byPriority.map(r => [r.priority, r.count]));
|
||||||
|
for (const priority of ['low', 'medium', 'high', 'critical', 'urgent']) {
|
||||||
|
if (typeof priorityMap[priority] !== 'number') priorityMap[priority] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
total_pending: totalPending,
|
total_pending: totalPending,
|
||||||
by_status: Object.fromEntries(byStatus.map(r => [r.status, r.count])),
|
by_status: statusMap,
|
||||||
by_priority: Object.fromEntries(byPriority.map(r => [r.priority, r.count])),
|
by_priority: priorityMap,
|
||||||
oldest_pending_age_seconds: oldestAge,
|
oldest_pending_age_seconds: oldestAge,
|
||||||
estimated_wait_seconds: estimatedWait,
|
estimated_wait_seconds: estimatedWait,
|
||||||
|
estimated_wait_confidence: estimatedWait === null ? 'unknown' : 'calculated',
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -260,9 +285,9 @@ function computeRecommendation(
|
||||||
}
|
}
|
||||||
|
|
||||||
// No online agents = pause
|
// No online agents = pause
|
||||||
if (agents.online === 0 && agents.total > 0) {
|
if (agents.online === 0) {
|
||||||
level = 'pause';
|
level = 'pause';
|
||||||
reasons.push('No agents online');
|
reasons.push(agents.total > 0 ? 'No agents online' : 'No agents registered');
|
||||||
}
|
}
|
||||||
|
|
||||||
const delayMap: Record<RecommendationLevel, number> = {
|
const delayMap: Record<RecommendationLevel, number> = {
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,96 @@
|
||||||
|
import { test, expect } from '@playwright/test'
|
||||||
|
import { API_KEY_HEADER, createTestAgent, deleteTestAgent, createTestTask, deleteTestTask } from './helpers'
|
||||||
|
|
||||||
|
test.describe('Workload Signals API', () => {
|
||||||
|
const agentCleanup: number[] = []
|
||||||
|
const taskCleanup: number[] = []
|
||||||
|
|
||||||
|
test.afterEach(async ({ request }) => {
|
||||||
|
for (const id of taskCleanup) {
|
||||||
|
await deleteTestTask(request, id).catch(() => {})
|
||||||
|
}
|
||||||
|
taskCleanup.length = 0
|
||||||
|
|
||||||
|
for (const id of agentCleanup) {
|
||||||
|
await deleteTestAgent(request, id).catch(() => {})
|
||||||
|
}
|
||||||
|
agentCleanup.length = 0
|
||||||
|
})
|
||||||
|
|
||||||
|
test('returns normal recommendation under light load', async ({ request }) => {
|
||||||
|
const { id } = await createTestAgent(request, { status: 'idle' })
|
||||||
|
agentCleanup.push(id)
|
||||||
|
|
||||||
|
const res = await request.get('/api/workload', { headers: API_KEY_HEADER })
|
||||||
|
expect(res.status()).toBe(200)
|
||||||
|
const body = await res.json()
|
||||||
|
|
||||||
|
expect(body.recommendation.action).toBe('normal')
|
||||||
|
expect(body.recommendation.submit_ok).toBe(true)
|
||||||
|
})
|
||||||
|
|
||||||
|
test('returns throttle recommendation at high busy ratio', async ({ request }) => {
|
||||||
|
const idleAgent = await createTestAgent(request, { status: 'idle' })
|
||||||
|
agentCleanup.push(idleAgent.id)
|
||||||
|
for (let i = 0; i < 4; i++) {
|
||||||
|
const busyAgent = await createTestAgent(request, { status: 'busy' })
|
||||||
|
agentCleanup.push(busyAgent.id)
|
||||||
|
}
|
||||||
|
|
||||||
|
const res = await request.get('/api/workload', { headers: API_KEY_HEADER })
|
||||||
|
expect(res.status()).toBe(200)
|
||||||
|
const body = await res.json()
|
||||||
|
|
||||||
|
expect(body.recommendation.action).toBe('throttle')
|
||||||
|
expect(body.recommendation.submit_ok).toBe(true)
|
||||||
|
})
|
||||||
|
|
||||||
|
test('returns shed recommendation at critical busy ratio', async ({ request }) => {
|
||||||
|
const idleAgent = await createTestAgent(request, { status: 'idle' })
|
||||||
|
agentCleanup.push(idleAgent.id)
|
||||||
|
for (let i = 0; i < 19; i++) {
|
||||||
|
const busyAgent = await createTestAgent(request, { status: 'busy' })
|
||||||
|
agentCleanup.push(busyAgent.id)
|
||||||
|
}
|
||||||
|
|
||||||
|
const res = await request.get('/api/workload', { headers: API_KEY_HEADER })
|
||||||
|
expect(res.status()).toBe(200)
|
||||||
|
const body = await res.json()
|
||||||
|
|
||||||
|
expect(body.recommendation.action).toBe('shed')
|
||||||
|
expect(body.recommendation.submit_ok).toBe(false)
|
||||||
|
})
|
||||||
|
|
||||||
|
test('returns pause recommendation when no agents are online', async ({ request }) => {
|
||||||
|
const res = await request.get('/api/workload', { headers: API_KEY_HEADER })
|
||||||
|
expect(res.status()).toBe(200)
|
||||||
|
const body = await res.json()
|
||||||
|
|
||||||
|
expect(body.agents.online).toBe(0)
|
||||||
|
expect(body.recommendation.action).toBe('pause')
|
||||||
|
expect(body.recommendation.submit_ok).toBe(false)
|
||||||
|
})
|
||||||
|
|
||||||
|
test('returns consistent response for low-signal conditions', async ({ request }) => {
|
||||||
|
const { id } = await createTestAgent(request, { status: 'idle' })
|
||||||
|
agentCleanup.push(id)
|
||||||
|
|
||||||
|
const task = await createTestTask(request, { status: 'inbox' })
|
||||||
|
taskCleanup.push(task.id)
|
||||||
|
|
||||||
|
const res = await request.get('/api/workload', { headers: API_KEY_HEADER })
|
||||||
|
expect(res.status()).toBe(200)
|
||||||
|
const body = await res.json()
|
||||||
|
|
||||||
|
expect(body.capacity.error_rate_5m).toBeGreaterThanOrEqual(0)
|
||||||
|
expect(body.capacity.error_rate_5m).toBeLessThanOrEqual(1)
|
||||||
|
expect(body.queue.by_status).toHaveProperty('inbox')
|
||||||
|
expect(body.queue.by_status).toHaveProperty('assigned')
|
||||||
|
expect(body.queue.by_status).toHaveProperty('in_progress')
|
||||||
|
expect(body.queue.by_priority).toHaveProperty('critical')
|
||||||
|
expect(body.queue.by_priority).toHaveProperty('high')
|
||||||
|
expect(body.queue.by_priority).toHaveProperty('medium')
|
||||||
|
expect(body.queue.by_priority).toHaveProperty('low')
|
||||||
|
expect(['calculated', 'unknown']).toContain(body.queue.estimated_wait_confidence)
|
||||||
|
})
|
||||||
|
})
|
||||||
Loading…
Reference in New Issue