From 5ff84d2c71aa534381b8dacace82874260c7f171 Mon Sep 17 00:00:00 2001
From: Alessandro Pogliaghi
Date: Tue, 2 Jun 2026 11:13:21 +0100
Subject: [PATCH 1/2] chore(cloud-agent): track stream disconnects and report agent-server crashes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make two previously invisible cloud-run failures observable, without
changing any UX.

- Desktop client: emit a "Cloud stream disconnected" PostHog event
  whenever a cloud-run watcher gives up (failWatcher). The event carries
  the error title and the three reconnect-budget counts, so an idle
  Envoy cut can be told apart from a real outage, and the event can be
  joined to run outcomes to see whether the underlying run survived.

- Agent server: install uncaughtException / unhandledRejection handlers
  that mark the run failed (reportFatalError) before exiting. Previously,
  a hard crash was silent — the run stayed non-terminal and the desktop
  showed only a generic disconnect until the multi-hour inactivity
  timeout.
--- .../src/main/services/cloud-task/service.ts | 17 +++++++++ apps/code/src/shared/types/analytics.ts | 18 +++++++++ packages/agent/src/server/agent-server.ts | 38 +++++++++++++++++++ packages/agent/src/server/bin.ts | 13 +++++++ 4 files changed, 86 insertions(+) diff --git a/apps/code/src/main/services/cloud-task/service.ts b/apps/code/src/main/services/cloud-task/service.ts index 59716b068c..db99fc923f 100644 --- a/apps/code/src/main/services/cloud-task/service.ts +++ b/apps/code/src/main/services/cloud-task/service.ts @@ -1,10 +1,12 @@ import type { CloudTaskPermissionRequestUpdate } from "@shared/types"; +import { ANALYTICS_EVENTS } from "@shared/types/analytics"; import type { StoredLogEntry } from "@shared/types/session-events"; import { inject, injectable, preDestroy } from "inversify"; import { MAIN_TOKENS } from "../../di/tokens"; import { logger } from "../../utils/logger"; import { TypedEventEmitter } from "../../utils/typed-event-emitter"; import type { AuthService } from "../auth/service"; +import { trackAppEvent } from "../posthog-analytics"; import { CloudTaskEvent, type CloudTaskEvents, @@ -979,6 +981,21 @@ export class CloudTaskService extends TypedEventEmitter { const watcher = this.watchers.get(key); if (!watcher) return; + // Track every terminal give-up so cloud-run stream failures are visible in + // PostHog. error_title distinguishes the cause; the budget counts separate an + // idle Envoy cut from a genuine outage. Best-effort — never block teardown. 
+ trackAppEvent(ANALYTICS_EVENTS.CLOUD_STREAM_DISCONNECTED, { + task_id: watcher.taskId, + run_id: watcher.runId, + team_id: watcher.teamId, + error_title: error.title, + retryable: error.retryable, + reconnect_attempts: watcher.reconnectAttempts, + stream_error_attempts: watcher.streamErrorAttempts, + cumulative_reconnect_attempts: watcher.cumulativeReconnectAttempts, + was_bootstrapping: watcher.isBootstrapping, + }); + watcher.failed = true; watcher.isBootstrapping = false; watcher.pendingLogEntries = []; diff --git a/apps/code/src/shared/types/analytics.ts b/apps/code/src/shared/types/analytics.ts index 23053a9b8b..a5e701366b 100644 --- a/apps/code/src/shared/types/analytics.ts +++ b/apps/code/src/shared/types/analytics.ts @@ -229,6 +229,22 @@ export interface AgentSessionErrorProperties { error_type: string; } +export interface CloudStreamDisconnectedProperties { + task_id: string; + run_id: string; + team_id: number; + // The error surfaced to the user (e.g. "Cloud stream disconnected", + // "Cloud run unreachable"), so give-up causes can be told apart. + error_title: string; + retryable: boolean; + // Which reconnect budget was exhausted, to separate idle Envoy cuts from + // genuine outages: transport reconnects, backend error frames, cumulative. 
+ reconnect_attempts: number; + stream_error_attempts: number; + cumulative_reconnect_attempts: number; + was_bootstrapping: boolean; +} + // Permission events export interface PermissionRespondedProperties { task_id: string; @@ -744,6 +760,7 @@ export const ANALYTICS_EVENTS = { // Error events TASK_CREATION_FAILED: "Task creation failed", AGENT_SESSION_ERROR: "Agent session error", + CLOUD_STREAM_DISCONNECTED: "Cloud stream disconnected", // Inbox events INBOX_INTEREST_REGISTERED: "Inbox interest registered", @@ -864,6 +881,7 @@ export type EventPropertyMap = { // Error events [ANALYTICS_EVENTS.TASK_CREATION_FAILED]: TaskCreationFailedProperties; [ANALYTICS_EVENTS.AGENT_SESSION_ERROR]: AgentSessionErrorProperties; + [ANALYTICS_EVENTS.CLOUD_STREAM_DISCONNECTED]: CloudStreamDisconnectedProperties; // Inbox events [ANALYTICS_EVENTS.INBOX_INTEREST_REGISTERED]: never; diff --git a/packages/agent/src/server/agent-server.ts b/packages/agent/src/server/agent-server.ts index 8c0f4aa72b..3c3bde54eb 100644 --- a/packages/agent/src/server/agent-server.ts +++ b/packages/agent/src/server/agent-server.ts @@ -587,6 +587,44 @@ export class AgentServer { this.logger.debug("Agent server stopped"); } + /** + * Mark the run failed after an unrecoverable crash (uncaught exception / + * unhandled rejection). Without this a hard death is silent: the run row + * stays non-terminal, the desktop client just sees the stream stop and shows + * a generic "Cloud stream disconnected", and the workflow only gives up after + * the multi-hour inactivity timeout. Best-effort and self-contained so it can + * run from a process-level handler with no session context. + */ + async reportFatalError(error: unknown): Promise { + const errorMessage = error instanceof Error ? 
error.message : String(error); + this.logger.error("Fatal agent-server error; marking run failed", error); + + try { + await this.posthogAPI.updateTaskRun( + this.config.taskId, + this.config.runId, + { + status: "failed", + error_message: `Agent server crashed: ${errorMessage}`, + }, + ); + } catch (updateError) { + this.logger.error( + "Failed to mark run failed after fatal error", + updateError, + ); + } + + try { + await this.eventStreamSender?.stop(); + } catch (stopError) { + this.logger.error( + "Failed to flush event stream after fatal error", + stopError, + ); + } + } + private authenticateRequest( getHeader: (name: string) => string | undefined, ): JwtPayload { diff --git a/packages/agent/src/server/bin.ts b/packages/agent/src/server/bin.ts index 36bfe7a0e9..8751246b51 100644 --- a/packages/agent/src/server/bin.ts +++ b/packages/agent/src/server/bin.ts @@ -187,6 +187,19 @@ program process.exit(0); }); + // A hard crash would otherwise leave the run non-terminal and the user staring + // at a generic "Cloud stream disconnected". Mark the run failed before exiting + // so the desktop surfaces a real error instead of a silent stall. + const handleFatalError = async (error: unknown) => { + try { + await server.reportFatalError(error); + } finally { + process.exit(1); + } + }; + process.on("uncaughtException", handleFatalError); + process.on("unhandledRejection", handleFatalError); + await server.start(); }); From b98cbbca2467b364528927b34a94f3c12317f1b7 Mon Sep 17 00:00:00 2001 From: Alessandro Pogliaghi Date: Tue, 2 Jun 2026 12:35:46 +0100 Subject: [PATCH 2/2] chore(cloud-agent): bound fatal-error reporting with a deadline reportFatalError makes two unbounded network calls; if the API is slow or unreachable at crash time, process.exit in the handler's finally would never run and the container would block pod shutdown. Race it against a 5s deadline so the process always exits promptly after a fatal error. 
--- packages/agent/src/server/bin.ts | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/packages/agent/src/server/bin.ts b/packages/agent/src/server/bin.ts index 8751246b51..c70368ae9a 100644 --- a/packages/agent/src/server/bin.ts +++ b/packages/agent/src/server/bin.ts @@ -189,10 +189,18 @@ program // A hard crash would otherwise leave the run non-terminal and the user staring // at a generic "Cloud stream disconnected". Mark the run failed before exiting - // so the desktop surfaces a real error instead of a silent stall. + // so the desktop surfaces a real error instead of a silent stall. The deadline + // guarantees we exit even if reportFatalError's network calls hang at crash time + // (e.g. API unreachable during a restart), so we never block pod shutdown. + const FATAL_ERROR_REPORT_DEADLINE_MS = 5_000; const handleFatalError = async (error: unknown) => { try { - await server.reportFatalError(error); + await Promise.race([ + server.reportFatalError(error), + new Promise((resolve) => + setTimeout(resolve, FATAL_ERROR_REPORT_DEADLINE_MS).unref(), + ), + ]); } finally { process.exit(1); }