Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions apps/code/src/main/services/cloud-task/service.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import type { CloudTaskPermissionRequestUpdate } from "@shared/types";
import { ANALYTICS_EVENTS } from "@shared/types/analytics";
import type { StoredLogEntry } from "@shared/types/session-events";
import { inject, injectable, preDestroy } from "inversify";
import { MAIN_TOKENS } from "../../di/tokens";
import { logger } from "../../utils/logger";
import { TypedEventEmitter } from "../../utils/typed-event-emitter";
import type { AuthService } from "../auth/service";
import { trackAppEvent } from "../posthog-analytics";
import {
CloudTaskEvent,
type CloudTaskEvents,
Expand Down Expand Up @@ -979,6 +981,21 @@ export class CloudTaskService extends TypedEventEmitter<CloudTaskEvents> {
const watcher = this.watchers.get(key);
if (!watcher) return;

// Track every terminal give-up so cloud-run stream failures are visible in
// PostHog. error_title distinguishes the cause; the budget counts separate an
// idle Envoy cut from a genuine outage. Best-effort — never block teardown.
trackAppEvent(ANALYTICS_EVENTS.CLOUD_STREAM_DISCONNECTED, {
task_id: watcher.taskId,
run_id: watcher.runId,
team_id: watcher.teamId,
error_title: error.title,
retryable: error.retryable,
reconnect_attempts: watcher.reconnectAttempts,
stream_error_attempts: watcher.streamErrorAttempts,
cumulative_reconnect_attempts: watcher.cumulativeReconnectAttempts,
was_bootstrapping: watcher.isBootstrapping,
});

watcher.failed = true;
watcher.isBootstrapping = false;
watcher.pendingLogEntries = [];
Expand Down
18 changes: 18 additions & 0 deletions apps/code/src/shared/types/analytics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,22 @@ export interface AgentSessionErrorProperties {
error_type: string;
}

/**
 * Payload of the "Cloud stream disconnected" analytics event, which is sent
 * when a cloud-run stream watcher gives up for good.
 */
export interface CloudStreamDisconnectedProperties {
  task_id: string;
  run_id: string;
  team_id: number;
  /**
   * Title of the error surfaced to the user (e.g. "Cloud stream disconnected",
   * "Cloud run unreachable"). Lets the different give-up causes be told apart.
   */
  error_title: string;
  retryable: boolean;
  /**
   * The three counters below show which reconnect budget was exhausted, to
   * separate idle Envoy cuts from genuine outages. This one counts transport
   * reconnects.
   */
  reconnect_attempts: number;
  /** Backend error frames. */
  stream_error_attempts: number;
  /** Cumulative reconnects. */
  cumulative_reconnect_attempts: number;
  was_bootstrapping: boolean;
}

// Permission events
export interface PermissionRespondedProperties {
task_id: string;
Expand Down Expand Up @@ -744,6 +760,7 @@ export const ANALYTICS_EVENTS = {
// Error events
TASK_CREATION_FAILED: "Task creation failed",
AGENT_SESSION_ERROR: "Agent session error",
CLOUD_STREAM_DISCONNECTED: "Cloud stream disconnected",

// Inbox events
INBOX_INTEREST_REGISTERED: "Inbox interest registered",
Expand Down Expand Up @@ -864,6 +881,7 @@ export type EventPropertyMap = {
// Error events
[ANALYTICS_EVENTS.TASK_CREATION_FAILED]: TaskCreationFailedProperties;
[ANALYTICS_EVENTS.AGENT_SESSION_ERROR]: AgentSessionErrorProperties;
[ANALYTICS_EVENTS.CLOUD_STREAM_DISCONNECTED]: CloudStreamDisconnectedProperties;

// Inbox events
[ANALYTICS_EVENTS.INBOX_INTEREST_REGISTERED]: never;
Expand Down
38 changes: 38 additions & 0 deletions packages/agent/src/server/agent-server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,44 @@ export class AgentServer {
this.logger.debug("Agent server stopped");
}

/**
 * Mark the run failed after an unrecoverable crash (uncaught exception /
 * unhandled rejection). Without this a hard death is silent: the run row
 * stays non-terminal, the desktop client just sees the stream stop and shows
 * a generic "Cloud stream disconnected", and the workflow only gives up after
 * the multi-hour inactivity timeout.
 *
 * Two best-effort steps run in order: update the task run to `failed` with
 * the crash message, then stop the event stream sender if one exists. A
 * failure in either step is logged and swallowed so the other still runs.
 * The method needs no session context, so it can be called from a
 * process-level handler.
 *
 * @param error - The thrown value or rejection reason. It may be any value,
 *   not only an `Error`.
 */
async reportFatalError(error: unknown): Promise<void> {
  // Rejection reasons are not always Errors. String() would collapse a plain
  // object into "[object Object]" and hide the cause, so serialize it instead.
  let errorMessage: string;
  if (error instanceof Error) {
    errorMessage = error.message;
  } else if (typeof error === "string") {
    errorMessage = error;
  } else {
    try {
      // JSON.stringify returns undefined for undefined/functions/symbols.
      errorMessage = JSON.stringify(error) ?? String(error);
    } catch {
      // Circular structures and BigInt values make JSON.stringify throw.
      errorMessage = String(error);
    }
  }
  this.logger.error("Fatal agent-server error; marking run failed", error);

  try {
    await this.posthogAPI.updateTaskRun(
      this.config.taskId,
      this.config.runId,
      {
        status: "failed",
        error_message: `Agent server crashed: ${errorMessage}`,
      },
    );
  } catch (updateError) {
    this.logger.error(
      "Failed to mark run failed after fatal error",
      updateError,
    );
  }

  try {
    // The sender is optional; when it is absent this is a no-op.
    await this.eventStreamSender?.stop();
  } catch (stopError) {
    this.logger.error(
      "Failed to flush event stream after fatal error",
      stopError,
    );
  }
}

private authenticateRequest(
getHeader: (name: string) => string | undefined,
): JwtPayload {
Expand Down
21 changes: 21 additions & 0 deletions packages/agent/src/server/bin.ts
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,27 @@ program
process.exit(0);
});

// A hard crash would otherwise leave the run non-terminal and the user staring
// at a generic "Cloud stream disconnected". Mark the run failed before exiting
// so the desktop surfaces a real error instead of a silent stall. The deadline
// guarantees we exit even if reportFatalError's network calls hang at crash time
// (e.g. API unreachable during a restart), so we never block pod shutdown.
const FATAL_ERROR_REPORT_DEADLINE_MS = 5_000;
let fatalErrorInFlight = false;
const handleFatalError = async (error: unknown): Promise<void> => {
  // A second uncaught exception or rejection can fire while the first report
  // is still in flight. Ignore it: the first invocation owns the report and
  // the exit, and a second report could overwrite the root-cause message.
  if (fatalErrorInFlight) return;
  fatalErrorInFlight = true;

  // The deadline timer is unref'd, so the event loop may drain before the race
  // settles. Setting the exit code first means that path still exits non-zero.
  process.exitCode = 1;

  try {
    await Promise.race([
      server.reportFatalError(error),
      new Promise<void>((resolve) => {
        setTimeout(resolve, FATAL_ERROR_REPORT_DEADLINE_MS).unref();
      }),
    ]);
  } finally {
    process.exit(1);
  }
};
process.on("uncaughtException", handleFatalError);
process.on("unhandledRejection", handleFatalError);
Comment thread
tatoalo marked this conversation as resolved.

await server.start();
});

Expand Down
Loading