fix: task routing stuck issues + k8s agent visibility
- Add stale task watchdog (requeueStaleTasks) to scheduler — detects in_progress tasks with offline agents and requeues or fails them
- Fix Aegis rejection loop: rejected tasks now requeue to 'assigned' instead of staying in 'in_progress', with max 3 retries before failing
- Track dispatch_attempts on tasks (migration 044) to prevent infinite retry loops — tasks fail after 5 dispatch attempts
- Include error_message and reason in SSE event broadcasts so UI can show why a task reverted
- Atomic task queue claim: replace SELECT-then-UPDATE race with single UPDATE...RETURNING statement
- Gateway agent auto-registration: POST/PUT /api/gateways accepts optional 'agents' array to upsert agents (k8s sidecar support)
- Document k8s sidecar deployment in docs/deployment.md

Fixes: tasks stuck in assigned, Aegis rejection loops, agents invisible in k8s sidecar deployments
This commit is contained in:
parent
dd7d663a36
commit
2d171ad464
|
|
@ -115,6 +115,82 @@ See `.env.example` for the full list. Key variables:
|
||||||
| `OPENCLAW_HOME` | No | - | Path to OpenClaw installation |
|
| `OPENCLAW_HOME` | No | - | Path to OpenClaw installation |
|
||||||
| `MC_ALLOWED_HOSTS` | No | `localhost,127.0.0.1` | Allowed hosts in production |
|
| `MC_ALLOWED_HOSTS` | No | `localhost,127.0.0.1` | Allowed hosts in production |
|
||||||
|
|
||||||
|
## Kubernetes Sidecar Deployment
|
||||||
|
|
||||||
|
When running Mission Control alongside a gateway as containers in the same pod (sidecar pattern), agents are not discovered via the filesystem. Instead, the gateway must register its agents with Mission Control through the `/api/gateways` registration API.
|
||||||
|
|
||||||
|
### Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌──────────────── Pod ────────────────┐
|
||||||
|
│ ┌─────────┐ ┌───────────────┐ │
|
||||||
|
│ │ MC │◄───►│ Gateway │ │
|
||||||
|
│ │ :3000 │ │ :18789 │ │
|
||||||
|
│ └─────────┘ └───────────────┘ │
|
||||||
|
│ ▲ ▲ │
|
||||||
|
│ │ localhost │ │
|
||||||
|
│ └──────────────────┘ │
|
||||||
|
└─────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
### Required Configuration
|
||||||
|
|
||||||
|
**Environment variables** for the MC container:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
AUTH_USER=admin
|
||||||
|
AUTH_PASS=<secure-password>
|
||||||
|
API_KEY=<your-api-key>
|
||||||
|
OPENCLAW_GATEWAY_HOST=127.0.0.1
|
||||||
|
NEXT_PUBLIC_GATEWAY_PORT=18789
|
||||||
|
```
|
||||||
|
|
||||||
|
### Agent Registration
|
||||||
|
|
||||||
|
The gateway must register its agents with MC on startup. Include the `agents` array in the gateway registration request:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:3000/api/gateways \
|
||||||
|
-H "Authorization: Bearer <API_KEY>" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"name": "sidecar-gateway",
|
||||||
|
"host": "127.0.0.1",
|
||||||
|
"port": 18789,
|
||||||
|
"is_primary": true,
|
||||||
|
"agents": [
|
||||||
|
{ "name": "developer-1", "role": "developer" },
|
||||||
|
{ "name": "researcher-1", "role": "researcher" }
|
||||||
|
]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
To update the agent list on reconnect, use `PUT /api/gateways` with the same `agents` field.
|
||||||
|
|
||||||
|
Alternatively, each agent can register itself via the direct connection endpoint:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:3000/api/connect \
|
||||||
|
-H "Authorization: Bearer <API_KEY>" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"tool_name": "openclaw-gateway",
|
||||||
|
"agent_name": "developer-1",
|
||||||
|
"agent_role": "developer"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
### Health Checks
|
||||||
|
|
||||||
|
Agents must send heartbeats to stay visible:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl http://localhost:3000/api/agents/<agent-id>/heartbeat \
|
||||||
|
-H "Authorization: Bearer <API_KEY>"
|
||||||
|
```
|
||||||
|
|
||||||
|
Without heartbeats, agents will be marked offline after 10 minutes (configurable via the `general.agent_timeout_minutes` setting).
|
||||||
|
|
||||||
## Troubleshooting
|
## Troubleshooting
|
||||||
|
|
||||||
### "Module not found: better-sqlite3"
|
### "Module not found: better-sqlite3"
|
||||||
|
|
|
||||||
|
|
@ -80,7 +80,7 @@ export async function POST(request: NextRequest) {
|
||||||
ensureTable(db)
|
ensureTable(db)
|
||||||
const body = await request.json()
|
const body = await request.json()
|
||||||
|
|
||||||
const { name, host, port, token, is_primary } = body
|
const { name, host, port, token, is_primary, agents } = body
|
||||||
|
|
||||||
if (!name || !host || !port) {
|
if (!name || !host || !port) {
|
||||||
return NextResponse.json({ error: 'name, host, and port are required' }, { status: 400 })
|
return NextResponse.json({ error: 'name, host, and port are required' }, { status: 400 })
|
||||||
|
|
@ -96,14 +96,37 @@ export async function POST(request: NextRequest) {
|
||||||
INSERT INTO gateways (name, host, port, token, is_primary) VALUES (?, ?, ?, ?, ?)
|
INSERT INTO gateways (name, host, port, token, is_primary) VALUES (?, ?, ?, ?, ?)
|
||||||
`).run(name, host, port, token || '', is_primary ? 1 : 0)
|
`).run(name, host, port, token || '', is_primary ? 1 : 0)
|
||||||
|
|
||||||
|
// Auto-register agents reported by the gateway (k8s sidecar support)
|
||||||
|
let agentsRegistered = 0
|
||||||
|
if (Array.isArray(agents) && agents.length > 0) {
|
||||||
|
const workspaceId = auth.user?.workspace_id ?? 1
|
||||||
|
const now = Math.floor(Date.now() / 1000)
|
||||||
|
const upsertAgent = db.prepare(`
|
||||||
|
INSERT INTO agents (name, role, status, last_seen, source, workspace_id, updated_at)
|
||||||
|
VALUES (?, ?, 'idle', ?, 'gateway', ?, ?)
|
||||||
|
ON CONFLICT(name) DO UPDATE SET
|
||||||
|
status = 'idle',
|
||||||
|
last_seen = excluded.last_seen,
|
||||||
|
source = 'gateway',
|
||||||
|
updated_at = excluded.updated_at
|
||||||
|
`)
|
||||||
|
for (const agent of agents.slice(0, 50)) {
|
||||||
|
if (typeof agent?.name !== 'string' || !agent.name.trim()) continue
|
||||||
|
const agentName = agent.name.trim().substring(0, 100)
|
||||||
|
const agentRole = typeof agent?.role === 'string' ? agent.role.trim().substring(0, 100) : 'agent'
|
||||||
|
upsertAgent.run(agentName, agentRole, now, workspaceId, now)
|
||||||
|
agentsRegistered++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
db.prepare('INSERT INTO audit_log (action, actor, detail) VALUES (?, ?, ?)').run(
|
db.prepare('INSERT INTO audit_log (action, actor, detail) VALUES (?, ?, ?)').run(
|
||||||
'gateway_added', auth.user?.username || 'system', `Added gateway: ${name} (${host}:${port})`
|
'gateway_added', auth.user?.username || 'system', `Added gateway: ${name} (${host}:${port})${agentsRegistered ? `, registered ${agentsRegistered} agent(s)` : ''}`
|
||||||
)
|
)
|
||||||
} catch { /* audit might not exist */ }
|
} catch { /* audit might not exist */ }
|
||||||
|
|
||||||
const gw = db.prepare('SELECT * FROM gateways WHERE id = ?').get(result.lastInsertRowid) as GatewayEntry
|
const gw = db.prepare('SELECT * FROM gateways WHERE id = ?').get(result.lastInsertRowid) as GatewayEntry
|
||||||
return NextResponse.json({ gateway: redactToken(gw) }, { status: 201 })
|
return NextResponse.json({ gateway: redactToken(gw), agents_registered: agentsRegistered }, { status: 201 })
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
if (err.message?.includes('UNIQUE')) {
|
if (err.message?.includes('UNIQUE')) {
|
||||||
return NextResponse.json({ error: 'A gateway with that name already exists' }, { status: 409 })
|
return NextResponse.json({ error: 'A gateway with that name already exists' }, { status: 409 })
|
||||||
|
|
@ -145,15 +168,39 @@ export async function PUT(request: NextRequest) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sets.length === 0) return NextResponse.json({ error: 'No valid fields to update' }, { status: 400 })
|
if (sets.length === 0 && !Array.isArray(updates.agents)) return NextResponse.json({ error: 'No valid fields to update' }, { status: 400 })
|
||||||
|
|
||||||
sets.push('updated_at = (unixepoch())')
|
if (sets.length > 0) {
|
||||||
values.push(id)
|
sets.push('updated_at = (unixepoch())')
|
||||||
|
values.push(id)
|
||||||
|
db.prepare(`UPDATE gateways SET ${sets.join(', ')} WHERE id = ?`).run(...values)
|
||||||
|
}
|
||||||
|
|
||||||
db.prepare(`UPDATE gateways SET ${sets.join(', ')} WHERE id = ?`).run(...values)
|
// Auto-register agents reported by the gateway (k8s sidecar support)
|
||||||
|
let agentsRegistered = 0
|
||||||
|
if (Array.isArray(updates.agents) && updates.agents.length > 0) {
|
||||||
|
const workspaceId = auth.user?.workspace_id ?? 1
|
||||||
|
const now = Math.floor(Date.now() / 1000)
|
||||||
|
const upsertAgent = db.prepare(`
|
||||||
|
INSERT INTO agents (name, role, status, last_seen, source, workspace_id, updated_at)
|
||||||
|
VALUES (?, ?, 'idle', ?, 'gateway', ?, ?)
|
||||||
|
ON CONFLICT(name, workspace_id) DO UPDATE SET
|
||||||
|
status = 'idle',
|
||||||
|
last_seen = excluded.last_seen,
|
||||||
|
source = 'gateway',
|
||||||
|
updated_at = excluded.updated_at
|
||||||
|
`)
|
||||||
|
for (const agent of updates.agents.slice(0, 50)) {
|
||||||
|
if (typeof agent?.name !== 'string' || !agent.name.trim()) continue
|
||||||
|
const agentName = agent.name.trim().substring(0, 100)
|
||||||
|
const agentRole = typeof agent?.role === 'string' ? agent.role.trim().substring(0, 100) : 'agent'
|
||||||
|
upsertAgent.run(agentName, agentRole, now, workspaceId, now)
|
||||||
|
agentsRegistered++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const updated = db.prepare('SELECT * FROM gateways WHERE id = ?').get(id) as GatewayEntry
|
const updated = db.prepare('SELECT * FROM gateways WHERE id = ?').get(id) as GatewayEntry
|
||||||
return NextResponse.json({ gateway: redactToken(updated) })
|
return NextResponse.json({ gateway: redactToken(updated), agents_registered: agentsRegistered })
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -105,37 +105,28 @@ export async function GET(request: NextRequest) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// Best-effort atomic pickup loop for race safety.
|
// Atomic claim: single UPDATE with subquery to eliminate SELECT-UPDATE race condition.
|
||||||
for (let attempt = 0; attempt < 5; attempt += 1) {
|
const claimed = db.prepare(`
|
||||||
const candidate = db.prepare(`
|
UPDATE tasks
|
||||||
SELECT *
|
SET status = 'in_progress', assigned_to = ?, updated_at = ?
|
||||||
FROM tasks
|
WHERE id = (
|
||||||
|
SELECT id FROM tasks
|
||||||
WHERE workspace_id = ?
|
WHERE workspace_id = ?
|
||||||
AND status IN ('assigned', 'inbox')
|
AND status IN ('assigned', 'inbox')
|
||||||
AND (assigned_to IS NULL OR assigned_to = ?)
|
AND (assigned_to IS NULL OR assigned_to = ?)
|
||||||
ORDER BY ${priorityRankSql()} ASC, due_date ASC NULLS LAST, created_at ASC
|
ORDER BY ${priorityRankSql()} ASC, due_date ASC NULLS LAST, created_at ASC
|
||||||
LIMIT 1
|
LIMIT 1
|
||||||
`).get(workspaceId, agent) as any | undefined
|
)
|
||||||
|
RETURNING *
|
||||||
|
`).get(agent, now, workspaceId, agent) as any | undefined
|
||||||
|
|
||||||
if (!candidate) break
|
if (claimed) {
|
||||||
|
return NextResponse.json({
|
||||||
const claimed = db.prepare(`
|
task: mapTaskRow(claimed),
|
||||||
UPDATE tasks
|
reason: 'assigned' as QueueReason,
|
||||||
SET status = 'in_progress', assigned_to = ?, updated_at = ?
|
agent,
|
||||||
WHERE id = ? AND workspace_id = ?
|
timestamp: now,
|
||||||
AND status IN ('assigned', 'inbox')
|
})
|
||||||
AND (assigned_to IS NULL OR assigned_to = ?)
|
|
||||||
`).run(agent, now, candidate.id, workspaceId, agent)
|
|
||||||
|
|
||||||
if (claimed.changes > 0) {
|
|
||||||
const task = db.prepare('SELECT * FROM tasks WHERE id = ? AND workspace_id = ?').get(candidate.id, workspaceId) as any
|
|
||||||
return NextResponse.json({
|
|
||||||
task: mapTaskRow(task),
|
|
||||||
reason: 'assigned' as QueueReason,
|
|
||||||
agent,
|
|
||||||
timestamp: now,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return NextResponse.json({
|
return NextResponse.json({
|
||||||
|
|
|
||||||
|
|
@ -1294,6 +1294,16 @@ const migrations: Migration[] = [
|
||||||
db.exec(`CREATE INDEX IF NOT EXISTS idx_spawn_history_created ON spawn_history(created_at)`)
|
db.exec(`CREATE INDEX IF NOT EXISTS idx_spawn_history_created ON spawn_history(created_at)`)
|
||||||
db.exec(`CREATE INDEX IF NOT EXISTS idx_spawn_history_status ON spawn_history(status)`)
|
db.exec(`CREATE INDEX IF NOT EXISTS idx_spawn_history_status ON spawn_history(status)`)
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: '044_task_dispatch_attempts',
|
||||||
|
up(db: Database.Database) {
|
||||||
|
const cols = db.prepare(`PRAGMA table_info(tasks)`).all() as Array<{ name: string }>
|
||||||
|
if (!cols.some(c => c.name === 'dispatch_attempts')) {
|
||||||
|
db.exec(`ALTER TABLE tasks ADD COLUMN dispatch_attempts INTEGER NOT NULL DEFAULT 0`)
|
||||||
|
}
|
||||||
|
db.exec(`CREATE INDEX IF NOT EXISTS idx_tasks_stale_inprogress ON tasks(status, updated_at) WHERE status = 'in_progress'`)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ import { pruneGatewaySessionsOlderThan, getAgentLiveStatuses } from './sessions'
|
||||||
import { eventBus } from './event-bus'
|
import { eventBus } from './event-bus'
|
||||||
import { syncSkillsFromDisk } from './skill-sync'
|
import { syncSkillsFromDisk } from './skill-sync'
|
||||||
import { syncLocalAgents } from './local-agent-sync'
|
import { syncLocalAgents } from './local-agent-sync'
|
||||||
import { dispatchAssignedTasks, runAegisReviews } from './task-dispatch'
|
import { dispatchAssignedTasks, runAegisReviews, requeueStaleTasks } from './task-dispatch'
|
||||||
import { spawnRecurringTasks } from './recurring-tasks'
|
import { spawnRecurringTasks } from './recurring-tasks'
|
||||||
|
|
||||||
const BACKUP_DIR = join(dirname(config.dbPath), 'backups')
|
const BACKUP_DIR = join(dirname(config.dbPath), 'backups')
|
||||||
|
|
@ -389,6 +389,15 @@ export function initScheduler() {
|
||||||
running: false,
|
running: false,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
tasks.set('stale_task_requeue', {
|
||||||
|
name: 'Stale Task Requeue',
|
||||||
|
intervalMs: TICK_MS, // Every 60s — check for stale in_progress tasks
|
||||||
|
lastRun: null,
|
||||||
|
nextRun: now + 25_000, // First check 25s after startup
|
||||||
|
enabled: true,
|
||||||
|
running: false,
|
||||||
|
})
|
||||||
|
|
||||||
// Start the tick loop
|
// Start the tick loop
|
||||||
tickInterval = setInterval(tick, TICK_MS)
|
tickInterval = setInterval(tick, TICK_MS)
|
||||||
logger.info('Scheduler initialized - backup at ~3AM, cleanup at ~4AM, heartbeat every 5m, webhook/claude/skill/local-agent/gateway-agent sync every 60s')
|
logger.info('Scheduler initialized - backup at ~3AM, cleanup at ~4AM, heartbeat every 5m, webhook/claude/skill/local-agent/gateway-agent sync every 60s')
|
||||||
|
|
@ -423,8 +432,9 @@ async function tick() {
|
||||||
: id === 'task_dispatch' ? 'general.task_dispatch'
|
: id === 'task_dispatch' ? 'general.task_dispatch'
|
||||||
: id === 'aegis_review' ? 'general.aegis_review'
|
: id === 'aegis_review' ? 'general.aegis_review'
|
||||||
: id === 'recurring_task_spawn' ? 'general.recurring_task_spawn'
|
: id === 'recurring_task_spawn' ? 'general.recurring_task_spawn'
|
||||||
|
: id === 'stale_task_requeue' ? 'general.stale_task_requeue'
|
||||||
: 'general.agent_heartbeat'
|
: 'general.agent_heartbeat'
|
||||||
const defaultEnabled = id === 'agent_heartbeat' || id === 'webhook_retry' || id === 'claude_session_scan' || id === 'skill_sync' || id === 'local_agent_sync' || id === 'gateway_agent_sync' || id === 'task_dispatch' || id === 'aegis_review' || id === 'recurring_task_spawn'
|
const defaultEnabled = id === 'agent_heartbeat' || id === 'webhook_retry' || id === 'claude_session_scan' || id === 'skill_sync' || id === 'local_agent_sync' || id === 'gateway_agent_sync' || id === 'task_dispatch' || id === 'aegis_review' || id === 'recurring_task_spawn' || id === 'stale_task_requeue'
|
||||||
if (!isSettingEnabled(settingKey, defaultEnabled)) continue
|
if (!isSettingEnabled(settingKey, defaultEnabled)) continue
|
||||||
|
|
||||||
task.running = true
|
task.running = true
|
||||||
|
|
@ -442,6 +452,7 @@ async function tick() {
|
||||||
: id === 'task_dispatch' ? await dispatchAssignedTasks()
|
: id === 'task_dispatch' ? await dispatchAssignedTasks()
|
||||||
: id === 'aegis_review' ? await runAegisReviews()
|
: id === 'aegis_review' ? await runAegisReviews()
|
||||||
: id === 'recurring_task_spawn' ? await spawnRecurringTasks()
|
: id === 'recurring_task_spawn' ? await spawnRecurringTasks()
|
||||||
|
: id === 'stale_task_requeue' ? await requeueStaleTasks()
|
||||||
: await runCleanup()
|
: await runCleanup()
|
||||||
task.lastResult = { ...result, timestamp: now }
|
task.lastResult = { ...result, timestamp: now }
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
|
|
@ -477,8 +488,9 @@ export function getSchedulerStatus() {
|
||||||
: id === 'task_dispatch' ? 'general.task_dispatch'
|
: id === 'task_dispatch' ? 'general.task_dispatch'
|
||||||
: id === 'aegis_review' ? 'general.aegis_review'
|
: id === 'aegis_review' ? 'general.aegis_review'
|
||||||
: id === 'recurring_task_spawn' ? 'general.recurring_task_spawn'
|
: id === 'recurring_task_spawn' ? 'general.recurring_task_spawn'
|
||||||
|
: id === 'stale_task_requeue' ? 'general.stale_task_requeue'
|
||||||
: 'general.agent_heartbeat'
|
: 'general.agent_heartbeat'
|
||||||
const defaultEnabled = id === 'agent_heartbeat' || id === 'webhook_retry' || id === 'claude_session_scan' || id === 'skill_sync' || id === 'local_agent_sync' || id === 'gateway_agent_sync' || id === 'task_dispatch' || id === 'aegis_review' || id === 'recurring_task_spawn'
|
const defaultEnabled = id === 'agent_heartbeat' || id === 'webhook_retry' || id === 'claude_session_scan' || id === 'skill_sync' || id === 'local_agent_sync' || id === 'gateway_agent_sync' || id === 'task_dispatch' || id === 'aegis_review' || id === 'recurring_task_spawn' || id === 'stale_task_requeue'
|
||||||
result.push({
|
result.push({
|
||||||
id,
|
id,
|
||||||
name: task.name,
|
name: task.name,
|
||||||
|
|
@ -506,6 +518,7 @@ export async function triggerTask(taskId: string): Promise<{ ok: boolean; messag
|
||||||
if (taskId === 'task_dispatch') return dispatchAssignedTasks()
|
if (taskId === 'task_dispatch') return dispatchAssignedTasks()
|
||||||
if (taskId === 'aegis_review') return runAegisReviews()
|
if (taskId === 'aegis_review') return runAegisReviews()
|
||||||
if (taskId === 'recurring_task_spawn') return spawnRecurringTasks()
|
if (taskId === 'recurring_task_spawn') return spawnRecurringTasks()
|
||||||
|
if (taskId === 'stale_task_requeue') return requeueStaleTasks()
|
||||||
return { ok: false, message: `Unknown task: ${taskId}` }
|
return { ok: false, message: `Unknown task: ${taskId}` }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -306,21 +306,43 @@ export async function runAegisReviews(): Promise<{ ok: boolean; message: string
|
||||||
previous_status: 'quality_review',
|
previous_status: 'quality_review',
|
||||||
})
|
})
|
||||||
} else {
|
} else {
|
||||||
// Rejected: push back to in_progress with feedback
|
// Rejected: check dispatch_attempts to decide next status
|
||||||
db.prepare('UPDATE tasks SET status = ?, error_message = ?, updated_at = ? WHERE id = ?')
|
const now = Math.floor(Date.now() / 1000)
|
||||||
.run('in_progress', `Aegis rejected: ${verdict.notes}`, Math.floor(Date.now() / 1000), task.id)
|
const currentAttempts = (db.prepare('SELECT dispatch_attempts FROM tasks WHERE id = ?').get(task.id) as { dispatch_attempts: number } | undefined)?.dispatch_attempts ?? 0
|
||||||
|
const newAttempts = currentAttempts + 1
|
||||||
|
const maxAegisRetries = 3
|
||||||
|
|
||||||
eventBus.broadcast('task.status_changed', {
|
if (newAttempts >= maxAegisRetries) {
|
||||||
id: task.id,
|
// Too many rejections — move to failed
|
||||||
status: 'in_progress',
|
db.prepare('UPDATE tasks SET status = ?, error_message = ?, dispatch_attempts = ?, updated_at = ? WHERE id = ?')
|
||||||
previous_status: 'quality_review',
|
.run('failed', `Aegis rejected ${newAttempts} times. Last: ${verdict.notes}`, newAttempts, now, task.id)
|
||||||
})
|
|
||||||
|
eventBus.broadcast('task.status_changed', {
|
||||||
|
id: task.id,
|
||||||
|
status: 'failed',
|
||||||
|
previous_status: 'quality_review',
|
||||||
|
error_message: `Aegis rejected ${newAttempts} times`,
|
||||||
|
reason: 'max_aegis_retries_exceeded',
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
// Requeue to assigned for re-dispatch with feedback
|
||||||
|
db.prepare('UPDATE tasks SET status = ?, error_message = ?, dispatch_attempts = ?, updated_at = ? WHERE id = ?')
|
||||||
|
.run('assigned', `Aegis rejected: ${verdict.notes}`, newAttempts, now, task.id)
|
||||||
|
|
||||||
|
eventBus.broadcast('task.status_changed', {
|
||||||
|
id: task.id,
|
||||||
|
status: 'assigned',
|
||||||
|
previous_status: 'quality_review',
|
||||||
|
error_message: `Aegis rejected: ${verdict.notes}`,
|
||||||
|
reason: 'aegis_rejection',
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
// Add rejection as a comment so the agent sees it on next dispatch
|
// Add rejection as a comment so the agent sees it on next dispatch
|
||||||
db.prepare(`
|
db.prepare(`
|
||||||
INSERT INTO comments (task_id, author, content, created_at, workspace_id)
|
INSERT INTO comments (task_id, author, content, created_at, workspace_id)
|
||||||
VALUES (?, 'aegis', ?, ?, ?)
|
VALUES (?, 'aegis', ?, ?, ?)
|
||||||
`).run(task.id, `Quality Review Rejected:\n${verdict.notes}`, Math.floor(Date.now() / 1000), task.workspace_id)
|
`).run(task.id, `Quality Review Rejected (attempt ${newAttempts}/${maxAegisRetries}):\n${verdict.notes}`, now, task.workspace_id)
|
||||||
}
|
}
|
||||||
|
|
||||||
db_helpers.logActivity(
|
db_helpers.logActivity(
|
||||||
|
|
@ -363,6 +385,86 @@ export async function runAegisReviews(): Promise<{ ok: boolean; message: string
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Requeue stale tasks stuck in 'in_progress' whose assigned agent is offline.
 * Prevents tasks from being permanently stuck when agents crash or disconnect.
 *
 * A task is considered stale when it is 'in_progress' and its `updated_at` is
 * more than 10 minutes old. For each stale task whose agent row is missing or
 * has status 'offline', `dispatch_attempts` is incremented and the task is
 * either moved back to 'assigned' (with an explanatory comment) or, once the
 * counter reaches 5, moved to 'failed'. A `task.status_changed` event is
 * broadcast for every task that actually transitions.
 *
 * @returns `ok: true` plus a human-readable summary of how many tasks were
 *          requeued or failed.
 */
export async function requeueStaleTasks(): Promise<{ ok: boolean; message: string }> {
  const db = getDatabase()
  const now = Math.floor(Date.now() / 1000)
  const staleThreshold = now - 10 * 60 // 10 minutes
  const maxDispatchRetries = 5

  // NOTE(review): agent_last_seen is selected but not used by the offline check below.
  const staleTasks = db.prepare(`
    SELECT t.id, t.title, t.assigned_to, t.dispatch_attempts, t.workspace_id,
           a.status as agent_status, a.last_seen as agent_last_seen
    FROM tasks t
    LEFT JOIN agents a ON a.name = t.assigned_to AND a.workspace_id = t.workspace_id
    WHERE t.status = 'in_progress'
      AND t.updated_at < ?
  `).all(staleThreshold) as Array<{
    id: number; title: string; assigned_to: string | null; dispatch_attempts: number
    workspace_id: number; agent_status: string | null; agent_last_seen: number | null
  }>

  if (staleTasks.length === 0) {
    return { ok: true, message: 'No stale tasks found' }
  }

  // Prepared once and reused for every task. The extra WHERE conditions make the
  // transition a no-op if the task left 'in_progress' or was touched after the
  // SELECT above, so a task that progressed in the meantime is never clobbered.
  const transitionTask = db.prepare(`
    UPDATE tasks
    SET status = ?, error_message = ?, dispatch_attempts = ?, updated_at = ?
    WHERE id = ? AND status = 'in_progress' AND updated_at < ?
  `)
  const insertComment = db.prepare(`
    INSERT INTO comments (task_id, author, content, created_at, workspace_id)
    VALUES (?, 'scheduler', ?, ?, ?)
  `)

  let requeued = 0
  let failed = 0

  for (const task of staleTasks) {
    // Only requeue if the agent is offline or unknown (no matching agent row).
    const agentOffline = !task.agent_status || task.agent_status === 'offline'
    if (!agentOffline) continue

    const newAttempts = (task.dispatch_attempts ?? 0) + 1

    if (newAttempts >= maxDispatchRetries) {
      // Retry budget exhausted — park the task in 'failed' instead of looping forever.
      const outcome = transitionTask.run(
        'failed',
        `Task stuck in_progress ${newAttempts} times — agent "${task.assigned_to}" offline. Moved to failed.`,
        newAttempts,
        now,
        task.id,
        staleThreshold,
      )
      if (outcome.changes === 0) continue // task changed state since the SELECT

      eventBus.broadcast('task.status_changed', {
        id: task.id,
        status: 'failed',
        previous_status: 'in_progress',
        error_message: `Stale task — agent offline after ${newAttempts} attempts`,
        reason: 'stale_task_max_retries',
      })

      failed++
    } else {
      const outcome = transitionTask.run(
        'assigned',
        `Requeued: agent "${task.assigned_to}" went offline while task was in_progress`,
        newAttempts,
        now,
        task.id,
        staleThreshold,
      )
      if (outcome.changes === 0) continue // task changed state since the SELECT

      // Add a comment explaining the requeue
      insertComment.run(
        task.id,
        `Task requeued (attempt ${newAttempts}/${maxDispatchRetries}): agent "${task.assigned_to}" went offline while task was in_progress.`,
        now,
        task.workspace_id,
      )

      eventBus.broadcast('task.status_changed', {
        id: task.id,
        status: 'assigned',
        previous_status: 'in_progress',
        error_message: `Agent "${task.assigned_to}" went offline`,
        reason: 'stale_task_requeue',
      })

      requeued++
    }
  }

  const total = requeued + failed
  return {
    ok: true,
    message: total === 0
      ? `Found ${staleTasks.length} stale task(s) but agents still online`
      : `Requeued ${requeued}, failed ${failed} of ${staleTasks.length} stale task(s)`,
  }
}
|
||||||
|
|
||||||
export async function dispatchAssignedTasks(): Promise<{ ok: boolean; message: string }> {
|
export async function dispatchAssignedTasks(): Promise<{ ok: boolean; message: string }> {
|
||||||
const db = getDatabase()
|
const db = getDatabase()
|
||||||
|
|
||||||
|
|
@ -559,15 +661,36 @@ export async function dispatchAssignedTasks(): Promise<{ ok: boolean; message: s
|
||||||
const errorMsg = err.message || 'Unknown error'
|
const errorMsg = err.message || 'Unknown error'
|
||||||
logger.error({ taskId: task.id, agent: task.agent_name, err }, 'Task dispatch failed')
|
logger.error({ taskId: task.id, agent: task.agent_name, err }, 'Task dispatch failed')
|
||||||
|
|
||||||
// Revert to assigned so it can be retried on the next tick
|
// Increment dispatch_attempts and decide next status
|
||||||
db.prepare('UPDATE tasks SET status = ?, error_message = ?, updated_at = ? WHERE id = ?')
|
const currentAttempts = (db.prepare('SELECT dispatch_attempts FROM tasks WHERE id = ?').get(task.id) as { dispatch_attempts: number } | undefined)?.dispatch_attempts ?? 0
|
||||||
.run('assigned', errorMsg.substring(0, 5000), Math.floor(Date.now() / 1000), task.id)
|
const newAttempts = currentAttempts + 1
|
||||||
|
const maxDispatchRetries = 5
|
||||||
|
|
||||||
eventBus.broadcast('task.status_changed', {
|
if (newAttempts >= maxDispatchRetries) {
|
||||||
id: task.id,
|
// Too many failures — move to failed
|
||||||
status: 'assigned',
|
db.prepare('UPDATE tasks SET status = ?, error_message = ?, dispatch_attempts = ?, updated_at = ? WHERE id = ?')
|
||||||
previous_status: 'in_progress',
|
.run('failed', `Dispatch failed ${newAttempts} times. Last: ${errorMsg.substring(0, 5000)}`, newAttempts, Math.floor(Date.now() / 1000), task.id)
|
||||||
})
|
|
||||||
|
eventBus.broadcast('task.status_changed', {
|
||||||
|
id: task.id,
|
||||||
|
status: 'failed',
|
||||||
|
previous_status: 'in_progress',
|
||||||
|
error_message: `Dispatch failed ${newAttempts} times`,
|
||||||
|
reason: 'max_dispatch_retries_exceeded',
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
// Revert to assigned so it can be retried on the next tick
|
||||||
|
db.prepare('UPDATE tasks SET status = ?, error_message = ?, dispatch_attempts = ?, updated_at = ? WHERE id = ?')
|
||||||
|
.run('assigned', errorMsg.substring(0, 5000), newAttempts, Math.floor(Date.now() / 1000), task.id)
|
||||||
|
|
||||||
|
eventBus.broadcast('task.status_changed', {
|
||||||
|
id: task.id,
|
||||||
|
status: 'assigned',
|
||||||
|
previous_status: 'in_progress',
|
||||||
|
error_message: errorMsg.substring(0, 500),
|
||||||
|
reason: 'dispatch_failed',
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
db_helpers.logActivity(
|
db_helpers.logActivity(
|
||||||
'task_dispatch_failed',
|
'task_dispatch_failed',
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue