diff --git a/apps/code/src/main/services/cloud-task/service.ts b/apps/code/src/main/services/cloud-task/service.ts
index 59716b068c..db99fc923f 100644
--- a/apps/code/src/main/services/cloud-task/service.ts
+++ b/apps/code/src/main/services/cloud-task/service.ts
@@ -1,10 +1,12 @@
 import type { CloudTaskPermissionRequestUpdate } from "@shared/types";
+import { ANALYTICS_EVENTS } from "@shared/types/analytics";
 import type { StoredLogEntry } from "@shared/types/session-events";
 import { inject, injectable, preDestroy } from "inversify";
 import { MAIN_TOKENS } from "../../di/tokens";
 import { logger } from "../../utils/logger";
 import { TypedEventEmitter } from "../../utils/typed-event-emitter";
 import type { AuthService } from "../auth/service";
+import { trackAppEvent } from "../posthog-analytics";
 import {
   CloudTaskEvent,
   type CloudTaskEvents,
@@ -979,6 +981,21 @@ export class CloudTaskService extends TypedEventEmitter {
     const watcher = this.watchers.get(key);
     if (!watcher) return;
 
+    // Track every terminal give-up so cloud-run stream failures are visible in
+    // PostHog. error_title distinguishes the cause; the budget counts separate an
+    // idle Envoy cut from a genuine outage. Best-effort — never block teardown.
+    trackAppEvent(ANALYTICS_EVENTS.CLOUD_STREAM_DISCONNECTED, {
+      task_id: watcher.taskId,
+      run_id: watcher.runId,
+      team_id: watcher.teamId,
+      error_title: error.title,
+      retryable: error.retryable,
+      reconnect_attempts: watcher.reconnectAttempts,
+      stream_error_attempts: watcher.streamErrorAttempts,
+      cumulative_reconnect_attempts: watcher.cumulativeReconnectAttempts,
+      was_bootstrapping: watcher.isBootstrapping,
+    });
+
     watcher.failed = true;
     watcher.isBootstrapping = false;
     watcher.pendingLogEntries = [];
diff --git a/apps/code/src/shared/types/analytics.ts b/apps/code/src/shared/types/analytics.ts
index 23053a9b8b..a5e701366b 100644
--- a/apps/code/src/shared/types/analytics.ts
+++ b/apps/code/src/shared/types/analytics.ts
@@ -229,6 +229,22 @@ export interface AgentSessionErrorProperties {
   error_type: string;
 }
 
+export interface CloudStreamDisconnectedProperties {
+  task_id: string;
+  run_id: string;
+  team_id: number;
+  // The error surfaced to the user (e.g. "Cloud stream disconnected",
+  // "Cloud run unreachable"), so give-up causes can be told apart.
+  error_title: string;
+  retryable: boolean;
+  // Which reconnect budget was exhausted, to separate idle Envoy cuts from
+  // genuine outages: transport reconnects, backend error frames, cumulative.
+  reconnect_attempts: number;
+  stream_error_attempts: number;
+  cumulative_reconnect_attempts: number;
+  was_bootstrapping: boolean;
+}
+
 // Permission events
 export interface PermissionRespondedProperties {
   task_id: string;
@@ -744,6 +760,7 @@ export const ANALYTICS_EVENTS = {
   // Error events
   TASK_CREATION_FAILED: "Task creation failed",
   AGENT_SESSION_ERROR: "Agent session error",
+  CLOUD_STREAM_DISCONNECTED: "Cloud stream disconnected",
 
   // Inbox events
   INBOX_INTEREST_REGISTERED: "Inbox interest registered",
@@ -864,6 +881,7 @@ export type EventPropertyMap = {
   // Error events
   [ANALYTICS_EVENTS.TASK_CREATION_FAILED]: TaskCreationFailedProperties;
   [ANALYTICS_EVENTS.AGENT_SESSION_ERROR]: AgentSessionErrorProperties;
+  [ANALYTICS_EVENTS.CLOUD_STREAM_DISCONNECTED]: CloudStreamDisconnectedProperties;
 
   // Inbox events
   [ANALYTICS_EVENTS.INBOX_INTEREST_REGISTERED]: never;
diff --git a/packages/agent/src/server/agent-server.ts b/packages/agent/src/server/agent-server.ts
index 8c0f4aa72b..3c3bde54eb 100644
--- a/packages/agent/src/server/agent-server.ts
+++ b/packages/agent/src/server/agent-server.ts
@@ -587,6 +587,63 @@ export class AgentServer {
     this.logger.debug("Agent server stopped");
   }
 
+  /**
+   * Mark the run failed after an unrecoverable crash (uncaught exception /
+   * unhandled rejection). Without this a hard death is silent: the run row
+   * stays non-terminal, the desktop client just sees the stream stop and shows
+   * a generic "Cloud stream disconnected", and the workflow only gives up after
+   * the multi-hour inactivity timeout. Best-effort and self-contained so it can
+   * run from a process-level handler with no session context.
+   *
+   * Failures of the status update or the stream flush are caught and logged, so
+   * one failing does not prevent the other from being attempted.
+   *
+   * @param error - The thrown value or rejection reason; may be a non-Error.
+   */
+  async reportFatalError(error: unknown): Promise<void> {
+    // String() on a plain object yields "[object Object]", which would make
+    // the persisted error_message useless; serialize non-Error reasons instead.
+    let errorMessage: string;
+    if (error instanceof Error) {
+      errorMessage = error.message;
+    } else if (typeof error === "string") {
+      errorMessage = error;
+    } else {
+      try {
+        errorMessage = JSON.stringify(error) ?? String(error);
+      } catch {
+        // Circular structures, BigInt values, etc. cannot be serialized.
+        errorMessage = String(error);
+      }
+    }
+    this.logger.error("Fatal agent-server error; marking run failed", error);
+
+    try {
+      await this.posthogAPI.updateTaskRun(
+        this.config.taskId,
+        this.config.runId,
+        {
+          status: "failed",
+          error_message: `Agent server crashed: ${errorMessage}`,
+        },
+      );
+    } catch (updateError) {
+      this.logger.error(
+        "Failed to mark run failed after fatal error",
+        updateError,
+      );
+    }
+
+    try {
+      await this.eventStreamSender?.stop();
+    } catch (stopError) {
+      this.logger.error(
+        "Failed to flush event stream after fatal error",
+        stopError,
+      );
+    }
+  }
+
   private authenticateRequest(
     getHeader: (name: string) => string | undefined,
   ): JwtPayload {
diff --git a/packages/agent/src/server/bin.ts b/packages/agent/src/server/bin.ts
index 36bfe7a0e9..c70368ae9a 100644
--- a/packages/agent/src/server/bin.ts
+++ b/packages/agent/src/server/bin.ts
@@ -187,5 +187,34 @@ program
       process.exit(0);
     });
 
+    // A hard crash would otherwise leave the run non-terminal and the user staring
+    // at a generic "Cloud stream disconnected". Mark the run failed before exiting
+    // so the desktop surfaces a real error instead of a silent stall. The deadline
+    // guarantees we exit even if reportFatalError's network calls hang at crash time
+    // (e.g. API unreachable during a restart), so we never block pod shutdown.
+    const FATAL_ERROR_REPORT_DEADLINE_MS = 5_000;
+    let fatalErrorReported = false;
+    const handleFatalError = async (error: unknown) => {
+      // A crashing process can raise further uncaught errors while we report;
+      // only the first is reported so its message is not overwritten.
+      if (fatalErrorReported) return;
+      fatalErrorReported = true;
+      try {
+        await Promise.race([
+          server.reportFatalError(error),
+          // Keep the deadline timer ref'd: an unref'd timer would not hold the
+          // event loop open, so the process could drain and exit by itself
+          // before process.exit(1) runs. The finally block exits regardless.
+          new Promise<void>((resolve) => {
+            setTimeout(resolve, FATAL_ERROR_REPORT_DEADLINE_MS);
+          }),
+        ]);
+      } finally {
+        process.exit(1);
+      }
+    };
+    process.on("uncaughtException", handleFatalError);
+    process.on("unhandledRejection", handleFatalError);
+
     await server.start();
   });