feat(#160): add Real-Time Workload Signals API endpoint

New GET /api/workload endpoint providing system-wide capacity metrics and throttle recommendations for agent load awareness.

Response sections:
- capacity: active tasks, recent task/error rates, completion throughput
- queue: pending task depth by status/priority, estimated wait time
- agents: online/busy/idle counts, busy ratio, per-agent load distribution
- recommendation: actionable signal (normal|throttle|shed|pause) with reasons, submit_ok flag, and suggested delay
- thresholds: current threshold configuration for transparency

Recommendation engine factors:
- Queue depth (20/50/100 thresholds)
- Agent saturation ratio (80%/95%)
- Error rate in last 5 minutes (10%/25%)
- Agent availability (pause if none online)

Fixes #160
2026-03-04 23:12:45 +04:00 · 2026-03-04 23:12:45 +04:00 · b38ad43272
parent 4cb86bc80b
commit b38ad43272
1 changed files with 294 additions and 0 deletions
--- a/src/app/api/workload/route.ts
+++ b/src/app/api/workload/route.ts
@ -0,0 +1,294 @@
+import { NextRequest, NextResponse } from 'next/server';
+import { getDatabase } from '@/lib/db';
+import { requireRole } from '@/lib/auth';
+import { logger } from '@/lib/logger';
+
+/**
+ * GET /api/workload — real-time workload signals for one workspace.
+ *
+ * The caller is checked with `requireRole(request, 'viewer')`; if that check
+ * yields an error, its message and status are returned as-is.
+ *
+ * On success the JSON body contains:
+ *   timestamp      - Unix time in whole seconds at which the snapshot was taken
+ *   workspace_id   - Workspace the numbers refer to (falls back to 1 when the
+ *                    authenticated user carries no workspace_id)
+ *   capacity       - Output of buildCapacityMetrics
+ *   queue          - Output of buildQueueMetrics
+ *   agents         - Output of buildAgentMetrics
+ *   recommendation - Output of computeRecommendation: normal | throttle | shed | pause
+ *   thresholds     - The THRESHOLDS object, echoed for transparency
+ *
+ * Any exception thrown while gathering the metrics is logged and turned into
+ * a 500 response with a generic error message.
+ *
+ * Intended use: agents poll this before submitting new work so they can back
+ * off instead of contributing to cascading failures and SLO breaches.
+ */
+export async function GET(request: NextRequest) {
+  const auth = requireRole(request, 'viewer');
+  if ('error' in auth) {
+    return NextResponse.json({ error: auth.error }, { status: auth.status });
+  }
+
+  try {
+    const database = getDatabase();
+    const workspace = auth.user.workspace_id ?? 1;
+    const timestamp = Math.floor(Date.now() / 1000);
+
+    // Gather the three metric groups first; the recommendation is derived from them.
+    const capacity = buildCapacityMetrics(database, workspace, timestamp);
+    const queue = buildQueueMetrics(database, workspace);
+    const agents = buildAgentMetrics(database, workspace, timestamp);
+
+    return NextResponse.json({
+      timestamp,
+      workspace_id: workspace,
+      capacity,
+      queue,
+      agents,
+      recommendation: computeRecommendation(capacity, queue, agents),
+      thresholds: THRESHOLDS,
+    });
+  } catch (err) {
+    logger.error({ err }, 'GET /api/workload error');
+    return NextResponse.json({ error: 'Failed to fetch workload signals' }, { status: 500 });
+  }
+}
+
+// Configurable thresholds for recommendation engine
+//
+// computeRecommendation compares metrics against these values with `>=`,
+// buildCapacityMetrics uses `recent_window_seconds` for its activity window,
+// and the GET handler returns the whole object under `thresholds`.
+const THRESHOLDS = {
+  // Not used in any comparison in this file; only echoed in the response.
+  queue_depth_normal: 20,
+  // queue.total_pending at or above this => 'throttle'.
+  queue_depth_throttle: 50,
+  // queue.total_pending at or above this => 'shed'.
+  queue_depth_shed: 100,
+  // agents.busy_ratio (busy / online) at or above this => 'throttle'.
+  busy_agent_ratio_throttle: 0.8,
+  // agents.busy_ratio at or above this => 'shed'.
+  busy_agent_ratio_shed: 0.95,
+  // capacity.error_rate_5m at or above this => 'throttle'.
+  error_rate_throttle: 0.1,
+  // capacity.error_rate_5m at or above this => 'shed'.
+  error_rate_shed: 0.25,
+  recent_window_seconds: 300, // 5 minutes for recent activity
+};
+
+/** Throughput and error figures produced by buildCapacityMetrics. */
+interface CapacityMetrics {
+  // Tasks whose status is assigned, in_progress, review or quality_review.
+  active_tasks: number;
+  // Activities of type task_created / task_assigned inside the recent window.
+  tasks_last_5m: number;
+  // Activities whose type contains "error" or "fail" inside the recent window.
+  errors_last_5m: number;
+  // errors_last_5m divided by ALL activities in the window, rounded to
+  // 4 decimals; 0 when the window holds no activities.
+  error_rate_5m: number;
+  // Tasks with status 'done' whose updated_at falls within the last 3600 s.
+  completions_last_hour: number;
+  // 'done' tasks updated within the last 86400 s, divided by 24, rounded to 2 decimals.
+  avg_completion_rate_per_hour: number;
+}
+
+/** Backlog figures produced by buildQueueMetrics. */
+interface QueueMetrics {
+  // Sum of the by_status counts (inbox, assigned, in_progress, review, quality_review).
+  total_pending: number;
+  // Pending task count keyed by task status; statuses with no tasks are absent.
+  by_status: Record<string, number>;
+  // Pending task count keyed by task priority.
+  by_priority: Record<string, number>;
+  // Seconds since the earliest created_at among 'inbox'/'assigned' tasks;
+  // null when there is no such task.
+  oldest_pending_age_seconds: number | null;
+  // total_pending / (completions in the last hour) * 3600, rounded;
+  // null when nothing was completed in the last hour.
+  estimated_wait_seconds: number | null;
+}
+
+/** Agent availability figures produced by buildAgentMetrics. */
+interface AgentMetrics {
+  // All agents in the workspace, whatever their status.
+  total: number;
+  // idle + busy. Agents with any other status count toward `total` only.
+  online: number;
+  // Agents with status 'busy'.
+  busy: number;
+  // Agents with status 'idle'.
+  idle: number;
+  // Agents with status 'offline'.
+  offline: number;
+  // busy / online rounded to 2 decimals; 0 when no agent is online.
+  busy_ratio: number;
+  // One entry per non-offline agent with its counts of 'assigned' and
+  // 'in_progress' tasks, ordered by their sum, highest first.
+  load_distribution: Array<{ agent: string; assigned: number; in_progress: number }>;
+}
+
+/**
+ * Gathers task/activity counters for a workspace and derives error and
+ * completion rates from them.
+ *
+ * @param db          Database handle; only `prepare(sql).get(...params)` is used,
+ *                    and every query is expected to yield a row shaped `{ c }`.
+ * @param workspaceId Workspace whose rows are counted.
+ * @param now         Reference time in Unix seconds; all windows end here.
+ * @returns Counters plus `error_rate_5m` (4-decimal rounding, 0 when the
+ *          window is empty) and `avg_completion_rate_per_hour` (24h average,
+ *          2-decimal rounding).
+ */
+function buildCapacityMetrics(db: any, workspaceId: number, now: number): CapacityMetrics {
+  // Runs one COUNT(*) query and unwraps its single `c` column.
+  const count = (sql: string, ...params: number[]): number =>
+    (db.prepare(sql).get(...params) as any).c;
+
+  const windowStart = now - THRESHOLDS.recent_window_seconds;
+  const oneHourAgo = now - 3600;
+  const oneDayAgo = now - 86400;
+
+  const active = count(
+    `SELECT COUNT(*) as c FROM tasks WHERE workspace_id = ? AND status IN ('assigned', 'in_progress', 'review', 'quality_review')`,
+    workspaceId
+  );
+
+  const recentTasks = count(
+    `SELECT COUNT(*) as c FROM activities WHERE workspace_id = ? AND created_at >= ? AND type IN ('task_created', 'task_assigned')`,
+    workspaceId,
+    windowStart
+  );
+
+  const recentErrors = count(
+    `SELECT COUNT(*) as c FROM activities WHERE workspace_id = ? AND created_at >= ? AND (type LIKE '%error%' OR type LIKE '%fail%')`,
+    workspaceId,
+    windowStart
+  );
+
+  // Denominator for the error rate: every activity in the window, of any type.
+  const recentActivities = count(
+    `SELECT COUNT(*) as c FROM activities WHERE workspace_id = ? AND created_at >= ?`,
+    workspaceId,
+    windowStart
+  );
+
+  const doneLastHour = count(
+    `SELECT COUNT(*) as c FROM tasks WHERE workspace_id = ? AND status = 'done' AND updated_at >= ?`,
+    workspaceId,
+    oneHourAgo
+  );
+
+  // Same query over a 24h span, used for the hourly average.
+  const doneLastDay = count(
+    `SELECT COUNT(*) as c FROM tasks WHERE workspace_id = ? AND status = 'done' AND updated_at >= ?`,
+    workspaceId,
+    oneDayAgo
+  );
+
+  let errorRate = 0;
+  if (recentActivities > 0) {
+    errorRate = Math.round((recentErrors / recentActivities) * 10000) / 10000;
+  }
+
+  return {
+    active_tasks: active,
+    tasks_last_5m: recentTasks,
+    errors_last_5m: recentErrors,
+    error_rate_5m: errorRate,
+    completions_last_hour: doneLastHour,
+    avg_completion_rate_per_hour: Math.round((doneLastDay / 24) * 100) / 100,
+  };
+}
+
+/**
+ * Summarises the backlog of unfinished tasks for a workspace.
+ *
+ * @param db          Database handle; only `prepare(sql).all(...)` and
+ *                    `prepare(sql).get(...)` are used.
+ * @param workspaceId Workspace whose tasks are counted.
+ * @param now         Reference time in Unix seconds. Optional: defaults to the
+ *                    current clock so existing two-argument callers keep
+ *                    working, but callers that already hold a snapshot time can
+ *                    pass it so ages line up with the other metric builders.
+ * @returns Pending counts by status and priority, the age of the oldest
+ *          'inbox'/'assigned' task (null if none), and an estimated wait derived
+ *          from last-hour completions (null if there were none).
+ */
+function buildQueueMetrics(
+  db: any,
+  workspaceId: number,
+  now: number = Math.floor(Date.now() / 1000)
+): QueueMetrics {
+  const pendingStatuses = ['inbox', 'assigned', 'in_progress', 'review', 'quality_review'];
+  // One `?` per status; values are always bound, never interpolated.
+  const placeholders = pendingStatuses.map(() => '?').join(',');
+
+  const byStatus = db.prepare(
+    `SELECT status, COUNT(*) as count FROM tasks WHERE workspace_id = ? AND status IN (${placeholders}) GROUP BY status`
+  ).all(workspaceId, ...pendingStatuses) as Array<{ status: string; count: number }>;
+
+  const byPriority = db.prepare(
+    `SELECT priority, COUNT(*) as count FROM tasks WHERE workspace_id = ? AND status IN (${placeholders}) GROUP BY priority`
+  ).all(workspaceId, ...pendingStatuses) as Array<{ priority: string; count: number }>;
+
+  const totalPending = byStatus.reduce((sum, r) => sum + r.count, 0);
+
+  // Only tasks that have not been started yet count toward "oldest pending".
+  const oldest = db.prepare(
+    `SELECT MIN(created_at) as oldest FROM tasks WHERE workspace_id = ? AND status IN ('inbox', 'assigned')`
+  ).get(workspaceId) as any;
+
+  // MIN() yields NULL when no row matches; check for that explicitly so a
+  // legitimate timestamp of 0 is not mistaken for "no tasks".
+  const oldestAge = oldest?.oldest != null ? now - oldest.oldest : null;
+
+  // Estimate wait: pending tasks / completion rate per hour * 3600
+  const hourAgo = now - 3600;
+  const completionsLastHour = (db.prepare(
+    `SELECT COUNT(*) as c FROM tasks WHERE workspace_id = ? AND status = 'done' AND updated_at >= ?`
+  ).get(workspaceId, hourAgo) as any).c;
+
+  // Without any recent completion there is no rate to extrapolate from.
+  const estimatedWait = completionsLastHour > 0
+    ? Math.round((totalPending / completionsLastHour) * 3600)
+    : null;
+
+  return {
+    total_pending: totalPending,
+    by_status: Object.fromEntries(byStatus.map(r => [r.status, r.count])),
+    by_priority: Object.fromEntries(byPriority.map(r => [r.priority, r.count])),
+    oldest_pending_age_seconds: oldestAge,
+    estimated_wait_seconds: estimatedWait,
+  };
+}
+
+/**
+ * Counts agents by status and reports how assigned work is spread across
+ * the agents that are not offline.
+ *
+ * @param db          Database handle; only `prepare(sql).all(...)` is used.
+ * @param workspaceId Workspace whose agents are inspected.
+ * @param now         Accepted for signature parity with the other builders;
+ *                    not read by this function.
+ * @returns Status counts, `busy_ratio` (busy / online, 2-decimal rounding,
+ *          0 when nobody is online) and the per-agent load list.
+ */
+function buildAgentMetrics(db: any, workspaceId: number, now: number): AgentMetrics {
+  const statusRows = db.prepare(
+    `SELECT status, COUNT(*) as count FROM agents WHERE workspace_id = ? GROUP BY status`
+  ).all(workspaceId) as Array<{ status: string; count: number }>;
+
+  const countByStatus = new Map<string, number>();
+  statusRows.forEach(({ status, count }) => countByStatus.set(status, count));
+  const total = statusRows.reduce((sum, { count }) => sum + count, 0);
+
+  // Missing statuses simply count as zero.
+  const countFor = (status: string): number => countByStatus.get(status) || 0;
+  const idle = countFor('idle');
+  const busy = countFor('busy');
+  const offline = countFor('offline');
+  // "Online" means idle or busy; any other status is only part of `total`.
+  const online = idle + busy;
+
+  // Per-agent workload; the LEFT JOIN keeps agents that currently have no tasks.
+  const perAgentLoad = db.prepare(`
+    SELECT a.name as agent,
+      SUM(CASE WHEN t.status = 'assigned' THEN 1 ELSE 0 END) as assigned,
+      SUM(CASE WHEN t.status = 'in_progress' THEN 1 ELSE 0 END) as in_progress
+    FROM agents a
+    LEFT JOIN tasks t ON t.assigned_to = a.name AND t.workspace_id = a.workspace_id AND t.status IN ('assigned', 'in_progress')
+    WHERE a.workspace_id = ? AND a.status != 'offline'
+    GROUP BY a.name
+    ORDER BY (assigned + in_progress) DESC
+  `).all(workspaceId) as Array<{ agent: string; assigned: number; in_progress: number }>;
+
+  let busyRatio = 0;
+  if (online > 0) {
+    busyRatio = Math.round((busy / online) * 100) / 100;
+  }
+
+  return {
+    total,
+    online,
+    busy,
+    idle,
+    offline,
+    busy_ratio: busyRatio,
+    load_distribution: perAgentLoad,
+  };
+}
+
+// Severity levels, listed from least to most severe (the same order escalate uses).
+type RecommendationLevel = 'normal' | 'throttle' | 'shed' | 'pause';
+
+/** Result of computeRecommendation, returned to clients under `recommendation`. */
+interface Recommendation {
+  // The chosen severity level.
+  action: RecommendationLevel;
+  // Fixed human-readable description of `action`.
+  reason: string;
+  // One message per threshold that was hit, or a single
+  // "All metrics within normal bounds" entry when none were.
+  details: string[];
+  // true when `action` is 'normal' or 'throttle'.
+  submit_ok: boolean;
+  // Fixed per level: normal 0, throttle 2000, shed 10000, pause 30000.
+  suggested_delay_ms: number;
+}
+
+/**
+ * Turns the collected metrics into a single submit/back-off signal.
+ *
+ * Error rate, queue depth and agent saturation are each compared (with `>=`)
+ * against their 'shed' threshold first and their 'throttle' threshold second;
+ * the most severe level hit wins. Independently of those, having agents
+ * registered but none online forces 'pause'.
+ *
+ * @param capacity Supplies `error_rate_5m`.
+ * @param queue    Supplies `total_pending`.
+ * @param agents   Supplies `busy_ratio`, `online` and `total`.
+ * @returns The level, its fixed description, the list of triggered reasons
+ *          (in check order), whether submitting is still acceptable, and the
+ *          fixed delay for the level.
+ */
+function computeRecommendation(
+  capacity: CapacityMetrics,
+  queue: QueueMetrics,
+  agents: AgentMetrics
+): Recommendation {
+  const findings: Array<{ level: RecommendationLevel; message: string }> = [];
+
+  // 1. Error rate over the recent window.
+  const errorRate = capacity.error_rate_5m;
+  if (errorRate >= THRESHOLDS.error_rate_shed) {
+    findings.push({ level: 'shed', message: `High error rate: ${(errorRate * 100).toFixed(1)}%` });
+  } else if (errorRate >= THRESHOLDS.error_rate_throttle) {
+    findings.push({ level: 'throttle', message: `Elevated error rate: ${(errorRate * 100).toFixed(1)}%` });
+  }
+
+  // 2. Backlog size.
+  const pending = queue.total_pending;
+  if (pending >= THRESHOLDS.queue_depth_shed) {
+    findings.push({ level: 'shed', message: `Queue depth critical: ${pending} pending tasks` });
+  } else if (pending >= THRESHOLDS.queue_depth_throttle) {
+    findings.push({ level: 'throttle', message: `Queue depth high: ${pending} pending tasks` });
+  }
+
+  // 3. Share of online agents that are busy.
+  const busyRatio = agents.busy_ratio;
+  if (busyRatio >= THRESHOLDS.busy_agent_ratio_shed) {
+    findings.push({ level: 'shed', message: `Agent saturation critical: ${(busyRatio * 100).toFixed(0)}% busy` });
+  } else if (busyRatio >= THRESHOLDS.busy_agent_ratio_throttle) {
+    findings.push({ level: 'throttle', message: `Agent saturation high: ${(busyRatio * 100).toFixed(0)}% busy` });
+  }
+
+  const worstFinding = findings.reduce<RecommendationLevel>(
+    (worst, finding) => escalate(worst, finding.level),
+    'normal'
+  );
+
+  // 4. Agents exist but none is online: overrides everything above.
+  //    A workspace with zero registered agents does NOT trigger this.
+  const nobodyOnline = agents.online === 0 && agents.total > 0;
+  const action: RecommendationLevel = nobodyOnline ? 'pause' : worstFinding;
+
+  const details = findings.map((finding) => finding.message);
+  if (nobodyOnline) {
+    details.push('No agents online');
+  }
+  if (details.length === 0) {
+    details.push('All metrics within normal bounds');
+  }
+
+  const descriptionByLevel: Record<RecommendationLevel, string> = {
+    normal: 'System healthy — submit work freely',
+    throttle: 'System under load — reduce submission rate and defer non-critical work',
+    shed: 'System overloaded — submit only critical/high-priority work, defer everything else',
+    pause: 'System unavailable — hold all submissions until capacity returns',
+  };
+
+  const delayMsByLevel: Record<RecommendationLevel, number> = {
+    normal: 0,
+    throttle: 2000,
+    shed: 10000,
+    pause: 30000,
+  };
+
+  return {
+    action,
+    reason: descriptionByLevel[action],
+    details,
+    submit_ok: action !== 'shed' && action !== 'pause',
+    suggested_delay_ms: delayMsByLevel[action],
+  };
+}
+
+/**
+ * Returns whichever of the two levels is more severe, preferring `current`
+ * on a tie. Severity order: normal < throttle < shed < pause.
+ *
+ * @param current  Level reached so far.
+ * @param proposed Level suggested by a newly evaluated check.
+ * @returns `proposed` only if it ranks strictly above `current`.
+ */
+function escalate(current: RecommendationLevel, proposed: RecommendationLevel): RecommendationLevel {
+  const severityOrder: RecommendationLevel[] = ['normal', 'throttle', 'shed', 'pause'];
+  const currentRank = severityOrder.indexOf(current);
+  const proposedRank = severityOrder.indexOf(proposed);
+
+  if (proposedRank > currentRank) {
+    return proposed;
+  }
+  return current;
+}