feat(#160): add Real-Time Workload Signals API endpoint
New GET /api/workload endpoint providing system-wide capacity metrics and throttle recommendations for agent load awareness. Response sections: - capacity: active tasks, recent task/error rates, completion throughput - queue: pending task depth by status/priority, estimated wait time - agents: online/busy/idle counts, busy ratio, per-agent load distribution - recommendation: actionable signal (normal|throttle|shed|pause) with reasons, submit_ok flag, and suggested delay - thresholds: current threshold configuration for transparency Recommendation engine factors: - Queue depth (20/50/100 thresholds) - Agent saturation ratio (80%/95%) - Error rate in last 5 minutes (10%/25%) - Agent availability (pause if none online) Fixes #160
This commit is contained in:
parent
4cb86bc80b
commit
b38ad43272
|
|
@ -0,0 +1,294 @@
|
||||||
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
import { getDatabase } from '@/lib/db';
|
||||||
|
import { requireRole } from '@/lib/auth';
|
||||||
|
import { logger } from '@/lib/logger';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* GET /api/workload - Real-Time Workload Signals
|
||||||
|
*
|
||||||
|
* Provides system-wide capacity metrics and throttle recommendations
|
||||||
|
* so agents can make informed decisions about work submission.
|
||||||
|
*
|
||||||
|
* Response:
|
||||||
|
* capacity - Current system capacity metrics
|
||||||
|
* queue - Task queue depth and breakdown
|
||||||
|
* agents - Agent availability and load distribution
|
||||||
|
* recommendation - Actionable signal: normal | throttle | shed | pause
|
||||||
|
* thresholds - Current threshold configuration
|
||||||
|
*
|
||||||
|
* Agents should call this before submitting new work to avoid
|
||||||
|
* cascading failures and SLO breaches.
|
||||||
|
*/
|
||||||
|
export async function GET(request: NextRequest) {
|
||||||
|
const auth = requireRole(request, 'viewer');
|
||||||
|
if ('error' in auth) return NextResponse.json({ error: auth.error }, { status: auth.status });
|
||||||
|
|
||||||
|
try {
|
||||||
|
const db = getDatabase();
|
||||||
|
const workspaceId = auth.user.workspace_id ?? 1;
|
||||||
|
const now = Math.floor(Date.now() / 1000);
|
||||||
|
|
||||||
|
// --- Capacity metrics ---
|
||||||
|
const capacity = buildCapacityMetrics(db, workspaceId, now);
|
||||||
|
|
||||||
|
// --- Queue depth ---
|
||||||
|
const queue = buildQueueMetrics(db, workspaceId);
|
||||||
|
|
||||||
|
// --- Agent availability ---
|
||||||
|
const agents = buildAgentMetrics(db, workspaceId, now);
|
||||||
|
|
||||||
|
// --- Recommendation ---
|
||||||
|
const recommendation = computeRecommendation(capacity, queue, agents);
|
||||||
|
|
||||||
|
return NextResponse.json({
|
||||||
|
timestamp: now,
|
||||||
|
workspace_id: workspaceId,
|
||||||
|
capacity,
|
||||||
|
queue,
|
||||||
|
agents,
|
||||||
|
recommendation,
|
||||||
|
thresholds: THRESHOLDS,
|
||||||
|
});
|
||||||
|
} catch (error) {
|
||||||
|
logger.error({ err: error }, 'GET /api/workload error');
|
||||||
|
return NextResponse.json({ error: 'Failed to fetch workload signals' }, { status: 500 });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Configurable thresholds for recommendation engine
|
||||||
|
const THRESHOLDS = {
|
||||||
|
queue_depth_normal: 20,
|
||||||
|
queue_depth_throttle: 50,
|
||||||
|
queue_depth_shed: 100,
|
||||||
|
busy_agent_ratio_throttle: 0.8,
|
||||||
|
busy_agent_ratio_shed: 0.95,
|
||||||
|
error_rate_throttle: 0.1,
|
||||||
|
error_rate_shed: 0.25,
|
||||||
|
recent_window_seconds: 300, // 5 minutes for recent activity
|
||||||
|
};
|
||||||
|
|
||||||
|
interface CapacityMetrics {
|
||||||
|
active_tasks: number;
|
||||||
|
tasks_last_5m: number;
|
||||||
|
errors_last_5m: number;
|
||||||
|
error_rate_5m: number;
|
||||||
|
completions_last_hour: number;
|
||||||
|
avg_completion_rate_per_hour: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface QueueMetrics {
|
||||||
|
total_pending: number;
|
||||||
|
by_status: Record<string, number>;
|
||||||
|
by_priority: Record<string, number>;
|
||||||
|
oldest_pending_age_seconds: number | null;
|
||||||
|
estimated_wait_seconds: number | null;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface AgentMetrics {
|
||||||
|
total: number;
|
||||||
|
online: number;
|
||||||
|
busy: number;
|
||||||
|
idle: number;
|
||||||
|
offline: number;
|
||||||
|
busy_ratio: number;
|
||||||
|
load_distribution: Array<{ agent: string; assigned: number; in_progress: number }>;
|
||||||
|
}
|
||||||
|
|
||||||
|
function buildCapacityMetrics(db: any, workspaceId: number, now: number): CapacityMetrics {
|
||||||
|
const recentWindow = now - THRESHOLDS.recent_window_seconds;
|
||||||
|
const hourAgo = now - 3600;
|
||||||
|
|
||||||
|
const activeTasks = (db.prepare(
|
||||||
|
`SELECT COUNT(*) as c FROM tasks WHERE workspace_id = ? AND status IN ('assigned', 'in_progress', 'review', 'quality_review')`
|
||||||
|
).get(workspaceId) as any).c;
|
||||||
|
|
||||||
|
const tasksLast5m = (db.prepare(
|
||||||
|
`SELECT COUNT(*) as c FROM activities WHERE workspace_id = ? AND created_at >= ? AND type IN ('task_created', 'task_assigned')`
|
||||||
|
).get(workspaceId, recentWindow) as any).c;
|
||||||
|
|
||||||
|
const errorsLast5m = (db.prepare(
|
||||||
|
`SELECT COUNT(*) as c FROM activities WHERE workspace_id = ? AND created_at >= ? AND (type LIKE '%error%' OR type LIKE '%fail%')`
|
||||||
|
).get(workspaceId, recentWindow) as any).c;
|
||||||
|
|
||||||
|
const totalLast5m = (db.prepare(
|
||||||
|
`SELECT COUNT(*) as c FROM activities WHERE workspace_id = ? AND created_at >= ?`
|
||||||
|
).get(workspaceId, recentWindow) as any).c;
|
||||||
|
|
||||||
|
const completionsLastHour = (db.prepare(
|
||||||
|
`SELECT COUNT(*) as c FROM tasks WHERE workspace_id = ? AND status = 'done' AND updated_at >= ?`
|
||||||
|
).get(workspaceId, hourAgo) as any).c;
|
||||||
|
|
||||||
|
// Average completion rate over last 24h
|
||||||
|
const dayAgo = now - 86400;
|
||||||
|
const completionsLastDay = (db.prepare(
|
||||||
|
`SELECT COUNT(*) as c FROM tasks WHERE workspace_id = ? AND status = 'done' AND updated_at >= ?`
|
||||||
|
).get(workspaceId, dayAgo) as any).c;
|
||||||
|
|
||||||
|
return {
|
||||||
|
active_tasks: activeTasks,
|
||||||
|
tasks_last_5m: tasksLast5m,
|
||||||
|
errors_last_5m: errorsLast5m,
|
||||||
|
error_rate_5m: totalLast5m > 0 ? Math.round((errorsLast5m / totalLast5m) * 10000) / 10000 : 0,
|
||||||
|
completions_last_hour: completionsLastHour,
|
||||||
|
avg_completion_rate_per_hour: Math.round((completionsLastDay / 24) * 100) / 100,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function buildQueueMetrics(db: any, workspaceId: number): QueueMetrics {
|
||||||
|
const now = Math.floor(Date.now() / 1000);
|
||||||
|
|
||||||
|
const pendingStatuses = ['inbox', 'assigned', 'in_progress', 'review', 'quality_review'];
|
||||||
|
|
||||||
|
const byStatus = db.prepare(
|
||||||
|
`SELECT status, COUNT(*) as count FROM tasks WHERE workspace_id = ? AND status IN (${pendingStatuses.map(() => '?').join(',')}) GROUP BY status`
|
||||||
|
).all(workspaceId, ...pendingStatuses) as Array<{ status: string; count: number }>;
|
||||||
|
|
||||||
|
const byPriority = db.prepare(
|
||||||
|
`SELECT priority, COUNT(*) as count FROM tasks WHERE workspace_id = ? AND status IN (${pendingStatuses.map(() => '?').join(',')}) GROUP BY priority`
|
||||||
|
).all(workspaceId, ...pendingStatuses) as Array<{ priority: string; count: number }>;
|
||||||
|
|
||||||
|
const totalPending = byStatus.reduce((sum, r) => sum + r.count, 0);
|
||||||
|
|
||||||
|
const oldest = db.prepare(
|
||||||
|
`SELECT MIN(created_at) as oldest FROM tasks WHERE workspace_id = ? AND status IN ('inbox', 'assigned')`
|
||||||
|
).get(workspaceId) as any;
|
||||||
|
|
||||||
|
const oldestAge = oldest?.oldest ? now - oldest.oldest : null;
|
||||||
|
|
||||||
|
// Estimate wait: pending tasks / completion rate per hour * 3600
|
||||||
|
const hourAgo = now - 3600;
|
||||||
|
const completionsLastHour = (db.prepare(
|
||||||
|
`SELECT COUNT(*) as c FROM tasks WHERE workspace_id = ? AND status = 'done' AND updated_at >= ?`
|
||||||
|
).get(workspaceId, hourAgo) as any).c;
|
||||||
|
|
||||||
|
const estimatedWait = completionsLastHour > 0
|
||||||
|
? Math.round((totalPending / completionsLastHour) * 3600)
|
||||||
|
: null;
|
||||||
|
|
||||||
|
return {
|
||||||
|
total_pending: totalPending,
|
||||||
|
by_status: Object.fromEntries(byStatus.map(r => [r.status, r.count])),
|
||||||
|
by_priority: Object.fromEntries(byPriority.map(r => [r.priority, r.count])),
|
||||||
|
oldest_pending_age_seconds: oldestAge,
|
||||||
|
estimated_wait_seconds: estimatedWait,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function buildAgentMetrics(db: any, workspaceId: number, now: number): AgentMetrics {
|
||||||
|
const agentStatuses = db.prepare(
|
||||||
|
`SELECT status, COUNT(*) as count FROM agents WHERE workspace_id = ? GROUP BY status`
|
||||||
|
).all(workspaceId) as Array<{ status: string; count: number }>;
|
||||||
|
|
||||||
|
const statusMap: Record<string, number> = {};
|
||||||
|
let total = 0;
|
||||||
|
for (const row of agentStatuses) {
|
||||||
|
statusMap[row.status] = row.count;
|
||||||
|
total += row.count;
|
||||||
|
}
|
||||||
|
|
||||||
|
const online = (statusMap['idle'] || 0) + (statusMap['busy'] || 0);
|
||||||
|
const busy = statusMap['busy'] || 0;
|
||||||
|
const idle = statusMap['idle'] || 0;
|
||||||
|
const offline = statusMap['offline'] || 0;
|
||||||
|
|
||||||
|
// Load distribution per agent
|
||||||
|
const loadDist = db.prepare(`
|
||||||
|
SELECT a.name as agent,
|
||||||
|
SUM(CASE WHEN t.status = 'assigned' THEN 1 ELSE 0 END) as assigned,
|
||||||
|
SUM(CASE WHEN t.status = 'in_progress' THEN 1 ELSE 0 END) as in_progress
|
||||||
|
FROM agents a
|
||||||
|
LEFT JOIN tasks t ON t.assigned_to = a.name AND t.workspace_id = a.workspace_id AND t.status IN ('assigned', 'in_progress')
|
||||||
|
WHERE a.workspace_id = ? AND a.status != 'offline'
|
||||||
|
GROUP BY a.name
|
||||||
|
ORDER BY (assigned + in_progress) DESC
|
||||||
|
`).all(workspaceId) as Array<{ agent: string; assigned: number; in_progress: number }>;
|
||||||
|
|
||||||
|
return {
|
||||||
|
total,
|
||||||
|
online,
|
||||||
|
busy,
|
||||||
|
idle,
|
||||||
|
offline,
|
||||||
|
busy_ratio: online > 0 ? Math.round((busy / online) * 100) / 100 : 0,
|
||||||
|
load_distribution: loadDist,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
type RecommendationLevel = 'normal' | 'throttle' | 'shed' | 'pause';
|
||||||
|
|
||||||
|
interface Recommendation {
|
||||||
|
action: RecommendationLevel;
|
||||||
|
reason: string;
|
||||||
|
details: string[];
|
||||||
|
submit_ok: boolean;
|
||||||
|
suggested_delay_ms: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
function computeRecommendation(
|
||||||
|
capacity: CapacityMetrics,
|
||||||
|
queue: QueueMetrics,
|
||||||
|
agents: AgentMetrics
|
||||||
|
): Recommendation {
|
||||||
|
const reasons: string[] = [];
|
||||||
|
let level: RecommendationLevel = 'normal';
|
||||||
|
|
||||||
|
// Check error rate
|
||||||
|
if (capacity.error_rate_5m >= THRESHOLDS.error_rate_shed) {
|
||||||
|
level = escalate(level, 'shed');
|
||||||
|
reasons.push(`High error rate: ${(capacity.error_rate_5m * 100).toFixed(1)}%`);
|
||||||
|
} else if (capacity.error_rate_5m >= THRESHOLDS.error_rate_throttle) {
|
||||||
|
level = escalate(level, 'throttle');
|
||||||
|
reasons.push(`Elevated error rate: ${(capacity.error_rate_5m * 100).toFixed(1)}%`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check queue depth
|
||||||
|
if (queue.total_pending >= THRESHOLDS.queue_depth_shed) {
|
||||||
|
level = escalate(level, 'shed');
|
||||||
|
reasons.push(`Queue depth critical: ${queue.total_pending} pending tasks`);
|
||||||
|
} else if (queue.total_pending >= THRESHOLDS.queue_depth_throttle) {
|
||||||
|
level = escalate(level, 'throttle');
|
||||||
|
reasons.push(`Queue depth high: ${queue.total_pending} pending tasks`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check agent saturation
|
||||||
|
if (agents.busy_ratio >= THRESHOLDS.busy_agent_ratio_shed) {
|
||||||
|
level = escalate(level, 'shed');
|
||||||
|
reasons.push(`Agent saturation critical: ${(agents.busy_ratio * 100).toFixed(0)}% busy`);
|
||||||
|
} else if (agents.busy_ratio >= THRESHOLDS.busy_agent_ratio_throttle) {
|
||||||
|
level = escalate(level, 'throttle');
|
||||||
|
reasons.push(`Agent saturation high: ${(agents.busy_ratio * 100).toFixed(0)}% busy`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// No online agents = pause
|
||||||
|
if (agents.online === 0 && agents.total > 0) {
|
||||||
|
level = 'pause';
|
||||||
|
reasons.push('No agents online');
|
||||||
|
}
|
||||||
|
|
||||||
|
const delayMap: Record<RecommendationLevel, number> = {
|
||||||
|
normal: 0,
|
||||||
|
throttle: 2000,
|
||||||
|
shed: 10000,
|
||||||
|
pause: 30000,
|
||||||
|
};
|
||||||
|
|
||||||
|
const actionDescriptions: Record<RecommendationLevel, string> = {
|
||||||
|
normal: 'System healthy — submit work freely',
|
||||||
|
throttle: 'System under load — reduce submission rate and defer non-critical work',
|
||||||
|
shed: 'System overloaded — submit only critical/high-priority work, defer everything else',
|
||||||
|
pause: 'System unavailable — hold all submissions until capacity returns',
|
||||||
|
};
|
||||||
|
|
||||||
|
return {
|
||||||
|
action: level,
|
||||||
|
reason: actionDescriptions[level],
|
||||||
|
details: reasons.length > 0 ? reasons : ['All metrics within normal bounds'],
|
||||||
|
submit_ok: level === 'normal' || level === 'throttle',
|
||||||
|
suggested_delay_ms: delayMap[level],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function escalate(current: RecommendationLevel, proposed: RecommendationLevel): RecommendationLevel {
|
||||||
|
const order: RecommendationLevel[] = ['normal', 'throttle', 'shed', 'pause'];
|
||||||
|
return order.indexOf(proposed) > order.indexOf(current) ? proposed : current;
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue