fix: resolve WebSocket disconnect bugs and add SSE reconnect backoff (#97)
- Fix stale closure: onclose now calls connectRef.current instead of capturing connect by value, so reconnect always uses the latest version
- Fix disconnect-reconnect race: manualDisconnectRef prevents onclose from scheduling a new reconnect after explicit disconnect()
- Fix double-connect guard: check both OPEN and CONNECTING states
- Add SSE exponential backoff with 20-attempt cap (was flat 3s infinite)
- Add SSE error logging (was silently swallowed)
- Update README: fix stale counts (28 panels, 66 routes, 21 migrations, 148 E2E tests), add missing features (SOUL system, Ed25519, agent messaging, update checker), document NEXT_PUBLIC_GATEWAY_TOKEN
This commit is contained in:
parent
e6bae7ad88
commit
f23c78f43a
44
README.md
44
README.md
|
|
@ -24,7 +24,7 @@ Manage agent fleets, track tasks, monitor costs, and orchestrate workflows — a
|
|||
|
||||
Running AI agents at scale means juggling sessions, tasks, costs, and reliability across multiple models and channels. Mission Control gives you:
|
||||
|
||||
- **26 panels** — Tasks, agents, logs, tokens, memory, cron, alerts, webhooks, pipelines, and more
|
||||
- **28 panels** — Tasks, agents, logs, tokens, memory, cron, alerts, webhooks, pipelines, and more
|
||||
- **Real-time everything** — WebSocket + SSE push updates, smart polling that pauses when you're away
|
||||
- **Zero external dependencies** — SQLite database, single `pnpm start` to run, no Redis/Postgres/Docker required
|
||||
- **Role-based access** — Viewer, operator, and admin roles with session + API key auth
|
||||
|
|
@ -61,6 +61,10 @@ Initial login is seeded from `AUTH_USER` / `AUTH_PASS` on first run.
|
|||
- Local Claude Code session tracking (auto-discovers from `~/.claude/projects/`)
|
||||
- Quality review gates for task sign-off
|
||||
- Pipeline orchestration with workflow templates
|
||||
- Ed25519 device identity for secure gateway handshake
|
||||
- Agent SOUL system with workspace file sync and templates
|
||||
- Inter-agent messaging and comms
|
||||
- Update available banner with GitHub release check
|
||||
|
||||
### Known Limitations
|
||||
|
||||
|
|
@ -99,9 +103,18 @@ Automatically discovers and tracks local Claude Code sessions by scanning `~/.cl
|
|||
### GitHub Issues Sync
|
||||
Inbound sync from GitHub repositories with label and assignee mapping. Synced issues appear on the task board alongside agent-created tasks.
|
||||
|
||||
### Agent SOUL System
|
||||
Define agent personality, capabilities, and behavioral guidelines via SOUL markdown files. Edit in the UI or directly in workspace `soul.md` files — changes sync bidirectionally between disk and database.
|
||||
|
||||
### Agent Messaging
|
||||
Inter-agent communication via the comms API. Agents can send messages to each other, enabling coordinated multi-agent workflows.
|
||||
|
||||
### Integrations
|
||||
Outbound webhooks with delivery history, configurable alert rules with cooldowns, and multi-gateway connection management. Optional 1Password CLI integration for secret management.
|
||||
|
||||
### Update Checker
|
||||
Automatic GitHub release check notifies you when a new version is available, displayed as a banner in the dashboard.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
|
|
@ -111,20 +124,22 @@ mission-control/
|
|||
│ ├── app/
|
||||
│ │ ├── page.tsx # SPA shell — routes all panels
|
||||
│ │ ├── login/page.tsx # Login page
|
||||
│ │ └── api/ # 64 REST API routes
|
||||
│ │ └── api/ # 66 REST API routes
|
||||
│ ├── components/
|
||||
│ │ ├── layout/ # NavRail, HeaderBar, LiveFeed
|
||||
│ │ ├── dashboard/ # Overview dashboard
|
||||
│ │ ├── panels/ # 26 feature panels
|
||||
│ │ ├── panels/ # 28 feature panels
|
||||
│ │ └── chat/ # Agent chat UI
|
||||
│ ├── lib/
|
||||
│ │ ├── auth.ts # Session + API key auth, RBAC
|
||||
│ │ ├── db.ts # SQLite (better-sqlite3, WAL mode)
|
||||
│ │ ├── claude-sessions.ts # Local Claude Code session scanner
|
||||
│ │ ├── migrations.ts # 20 schema migrations
|
||||
│ │ ├── migrations.ts # 21 schema migrations
|
||||
│ │ ├── scheduler.ts # Background task scheduler
|
||||
│ │ ├── webhooks.ts # Outbound webhook delivery
|
||||
│ │ └── websocket.ts # Gateway WebSocket client
|
||||
│ │ ├── websocket.ts # Gateway WebSocket client
|
||||
│ │ ├── device-identity.ts # Ed25519 device identity for gateway auth
|
||||
│ │ └── agent-sync.ts # OpenClaw config → MC database sync
|
||||
│ └── store/index.ts # Zustand state management
|
||||
└── .data/ # Runtime data (SQLite DB, token logs)
|
||||
```
|
||||
|
|
@ -141,7 +156,8 @@ mission-control/
|
|||
| Charts | Recharts 3 |
|
||||
| Real-time | WebSocket + Server-Sent Events |
|
||||
| Auth | scrypt hashing, session tokens, RBAC |
|
||||
| Testing | Vitest + Playwright (165 E2E tests) |
|
||||
| Validation | Zod 4 |
|
||||
| Testing | Vitest + Playwright (148 E2E tests) |
|
||||
|
||||
## Authentication
|
||||
|
||||
|
|
@ -184,6 +200,11 @@ All endpoints require authentication unless noted. Full reference below.
|
|||
|--------|------|------|-------------|
|
||||
| `GET` | `/api/agents` | viewer | List agents with task stats |
|
||||
| `POST` | `/api/agents` | operator | Register/update agent |
|
||||
| `GET` | `/api/agents/[id]` | viewer | Agent details |
|
||||
| `POST` | `/api/agents/sync` | operator | Sync agents from openclaw.json |
|
||||
| `GET/PUT` | `/api/agents/[id]/soul` | operator | Agent SOUL content (reads from workspace, writes to both workspace and database) |
|
||||
| `GET/POST` | `/api/agents/comms` | operator | Inter-agent communication |
|
||||
| `POST` | `/api/agents/message` | operator | Send message to agent |
|
||||
| `GET` | `/api/tasks` | viewer | List tasks (filter: `?status=`, `?assigned_to=`, `?priority=`) |
|
||||
| `POST` | `/api/tasks` | operator | Create task |
|
||||
| `GET` | `/api/tasks/[id]` | viewer | Task details |
|
||||
|
|
@ -207,6 +228,7 @@ All endpoints require authentication unless noted. Full reference below.
|
|||
| `GET` | `/api/tokens` | viewer | Token usage and cost data |
|
||||
| `GET` | `/api/standup` | viewer | Standup report history |
|
||||
| `POST` | `/api/standup` | operator | Generate standup |
|
||||
| `GET` | `/api/releases/check` | viewer | Check for new GitHub releases |
|
||||
|
||||
</details>
|
||||
|
||||
|
|
@ -232,6 +254,8 @@ All endpoints require authentication unless noted. Full reference below.
|
|||
| `GET` | `/api/memory` | viewer | Memory file browser/search |
|
||||
| `GET` | `/api/search` | viewer | Global search |
|
||||
| `GET` | `/api/export` | admin | CSV export |
|
||||
| `POST` | `/api/backup` | admin | Database backup |
|
||||
| `POST` | `/api/cleanup` | admin | Stale data cleanup |
|
||||
|
||||
</details>
|
||||
|
||||
|
|
@ -319,6 +343,8 @@ See [`.env.example`](.env.example) for the complete list. Key variables:
|
|||
| `OPENCLAW_HOME` | Yes* | Path to `.openclaw` directory |
|
||||
| `OPENCLAW_GATEWAY_HOST` | No | Gateway host (default: `127.0.0.1`) |
|
||||
| `OPENCLAW_GATEWAY_PORT` | No | Gateway WebSocket port (default: `18789`) |
|
||||
| `OPENCLAW_GATEWAY_TOKEN` | No | Server-side gateway auth token |
|
||||
| `NEXT_PUBLIC_GATEWAY_TOKEN` | No | Browser-side gateway auth token (must use `NEXT_PUBLIC_` prefix) |
|
||||
| `OPENCLAW_MEMORY_DIR` | No | Memory browser root (see note below) |
|
||||
| `MC_CLAUDE_HOME` | No | Path to `~/.claude` directory (default: `~/.claude`) |
|
||||
| `MC_TRUSTED_PROXIES` | No | Comma-separated trusted proxy IPs for XFF parsing |
|
||||
|
|
@ -386,10 +412,16 @@ See [open issues](https://github.com/builderz-labs/mission-control/issues) for p
|
|||
- [x] Webhook signature verification (HMAC-SHA256 with constant-time comparison)
|
||||
- [x] Local Claude Code session tracking — auto-discover sessions from `~/.claude/projects/`
|
||||
- [x] Rate limiter IP extraction hardening with trusted proxy support
|
||||
- [x] Ed25519 device identity for WebSocket challenge-response handshake ([#85](https://github.com/builderz-labs/mission-control/pull/85))
|
||||
- [x] Agent SOUL workspace sync — bidirectional sync between `soul.md` files and database ([#95](https://github.com/builderz-labs/mission-control/pull/95))
|
||||
- [x] Update available banner with GitHub release check ([#94](https://github.com/builderz-labs/mission-control/pull/94))
|
||||
- [x] Side panel navigation synced with URL routes ([#87](https://github.com/builderz-labs/mission-control/pull/87))
|
||||
- [x] Task board SSE wiring, priority enum, and auto-advance ([#89](https://github.com/builderz-labs/mission-control/pull/89))
|
||||
|
||||
**Up next:**
|
||||
|
||||
- [ ] Agent-agnostic gateway support — connect any orchestration framework (OpenClaw, ZeroClaw, OpenFang, NeoBot, IronClaw, etc.), not just OpenClaw
|
||||
- [ ] Workspace isolation for multi-team usage ([#75](https://github.com/builderz-labs/mission-control/issues/75))
|
||||
- [ ] Native macOS app (Electron or Tauri)
|
||||
- [ ] First-class per-agent cost breakdowns — dedicated panel with per-agent token usage and spend (currently derivable from per-session data)
|
||||
- [ ] OAuth approval UI improvements
|
||||
|
|
|
|||
|
|
@ -16,9 +16,14 @@ interface ServerEvent {
|
|||
* SSE provides instant updates for all local-DB data (tasks, agents,
|
||||
* chat, activities, notifications), making REST polling a fallback.
|
||||
*/
|
||||
// Upper bound on SSE reconnect attempts; once the attempt counter reaches
// this value the error handler logs and stops scheduling reconnects.
const SSE_MAX_RECONNECT_ATTEMPTS = 20
|
||||
// Base delay in milliseconds for the exponential backoff
// (multiplied by 2^attempts).
const SSE_BASE_DELAY_MS = 1000
|
||||
// Ceiling in milliseconds applied to the backoff delay before jitter is
// added, so the final delay can exceed this value by up to 50%.
const SSE_MAX_DELAY_MS = 30000
|
||||
|
||||
export function useServerEvents() {
|
||||
const eventSourceRef = useRef<EventSource | null>(null)
|
||||
const reconnectTimeoutRef = useRef<NodeJS.Timeout | undefined>(undefined)
|
||||
const sseReconnectAttemptsRef = useRef<number>(0)
|
||||
|
||||
const {
|
||||
setConnection,
|
||||
|
|
@ -46,6 +51,7 @@ export function useServerEvents() {
|
|||
|
||||
es.onopen = () => {
|
||||
if (!mounted) return
|
||||
sseReconnectAttemptsRef.current = 0
|
||||
setConnection({ sseConnected: true })
|
||||
}
|
||||
|
||||
|
|
@ -65,11 +71,21 @@ export function useServerEvents() {
|
|||
es.close()
|
||||
eventSourceRef.current = null
|
||||
|
||||
// Reconnect after 3s (EventSource auto-reconnects, but we handle
|
||||
// it explicitly to control the sseConnected state)
|
||||
const attempts = sseReconnectAttemptsRef.current
|
||||
if (attempts >= SSE_MAX_RECONNECT_ATTEMPTS) {
|
||||
console.error(`SSE: max reconnect attempts (${SSE_MAX_RECONNECT_ATTEMPTS}) reached`)
|
||||
return
|
||||
}
|
||||
|
||||
// Exponential backoff with jitter
|
||||
const base = Math.min(Math.pow(2, attempts) * SSE_BASE_DELAY_MS, SSE_MAX_DELAY_MS)
|
||||
const delay = Math.round(base + Math.random() * base * 0.5)
|
||||
sseReconnectAttemptsRef.current = attempts + 1
|
||||
|
||||
console.warn(`SSE: reconnecting in ${delay}ms (attempt ${attempts + 1}/${SSE_MAX_RECONNECT_ATTEMPTS})`)
|
||||
reconnectTimeoutRef.current = setTimeout(() => {
|
||||
if (mounted) connect()
|
||||
}, 3000)
|
||||
}, delay)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -47,6 +47,8 @@ export function useWebSocket() {
|
|||
const requestIdRef = useRef<number>(0)
|
||||
const handshakeCompleteRef = useRef<boolean>(false)
|
||||
const reconnectAttemptsRef = useRef<number>(0)
|
||||
const manualDisconnectRef = useRef<boolean>(false)
|
||||
const connectRef = useRef<(url: string, token?: string) => void>(() => {})
|
||||
|
||||
// Heartbeat tracking
|
||||
const pingCounterRef = useRef<number>(0)
|
||||
|
|
@ -400,8 +402,9 @@ export function useWebSocket() {
|
|||
}, [sendConnectHandshake, setConnection, setSessions, addLog, startHeartbeat, handlePong, addChatMessage, addNotification, updateAgent])
|
||||
|
||||
const connect = useCallback((url: string, token?: string) => {
|
||||
if (wsRef.current?.readyState === WebSocket.OPEN) {
|
||||
return // Already connected
|
||||
const state = wsRef.current?.readyState
|
||||
if (state === WebSocket.OPEN || state === WebSocket.CONNECTING) {
|
||||
return // Already connected or connecting
|
||||
}
|
||||
|
||||
// Extract token from URL if present
|
||||
|
|
@ -414,6 +417,7 @@ export function useWebSocket() {
|
|||
|
||||
reconnectUrl.current = url
|
||||
handshakeCompleteRef.current = false
|
||||
manualDisconnectRef.current = false
|
||||
|
||||
try {
|
||||
const ws = new WebSocket(url.split('?')[0]) // Connect without query params
|
||||
|
|
@ -452,7 +456,10 @@ export function useWebSocket() {
|
|||
handshakeCompleteRef.current = false
|
||||
stopHeartbeat()
|
||||
|
||||
// Auto-reconnect logic with exponential backoff (uses ref to avoid stale closure)
|
||||
// Skip auto-reconnect if this was a manual disconnect
|
||||
if (manualDisconnectRef.current) return
|
||||
|
||||
// Auto-reconnect with exponential backoff (uses connectRef to avoid stale closure)
|
||||
const attempts = reconnectAttemptsRef.current
|
||||
if (attempts < maxReconnectAttempts) {
|
||||
const base = Math.min(Math.pow(2, attempts) * 1000, 30000)
|
||||
|
|
@ -462,7 +469,7 @@ export function useWebSocket() {
|
|||
reconnectAttemptsRef.current = attempts + 1
|
||||
setConnection({ reconnectAttempts: attempts + 1 })
|
||||
reconnectTimeoutRef.current = setTimeout(() => {
|
||||
connect(url, authTokenRef.current)
|
||||
connectRef.current(reconnectUrl.current, authTokenRef.current)
|
||||
}, timeout)
|
||||
} else {
|
||||
console.error('Max reconnection attempts reached.')
|
||||
|
|
@ -493,9 +500,19 @@ export function useWebSocket() {
|
|||
}
|
||||
}, [setConnection, handleGatewayFrame, addLog, stopHeartbeat])
|
||||
|
||||
// Keep ref in sync so onclose always calls the latest version of connect
|
||||
useEffect(() => {
|
||||
connectRef.current = connect
|
||||
}, [connect])
|
||||
|
||||
const disconnect = useCallback(() => {
|
||||
// Signal manual disconnect before closing so onclose skips auto-reconnect
|
||||
manualDisconnectRef.current = true
|
||||
reconnectAttemptsRef.current = 0
|
||||
|
||||
if (reconnectTimeoutRef.current) {
|
||||
clearTimeout(reconnectTimeoutRef.current)
|
||||
reconnectTimeoutRef.current = undefined
|
||||
}
|
||||
|
||||
stopHeartbeat()
|
||||
|
|
|
|||
Loading…
Reference in New Issue