diff --git a/apps/web/src/app/(app)/claw/components/ClawOnboardingFlow.state.test.ts b/apps/web/src/app/(app)/claw/components/ClawOnboardingFlow.state.test.ts index 17682113fd..c0b520297d 100644 --- a/apps/web/src/app/(app)/claw/components/ClawOnboardingFlow.state.test.ts +++ b/apps/web/src/app/(app)/claw/components/ClawOnboardingFlow.state.test.ts @@ -197,6 +197,52 @@ describe('ClawOnboardingFlow state machine', () => { } ); + test('renders an error when the setup request failed', () => { + expect( + getClawOnboardingFlowState( + createInput({ + createSetupStarted: true, + setupFailed: true, + onboardingStep: 'provisioning', + hasBotIdentity: true, + selectedPreset: 'always-ask', + status: undefined, + }) + ).renderStep + ).toBe('error'); + expect( + getClawOnboardingFlowState( + createInput({ + mode: 'post-provisioning', + setupFailed: true, + status: createStatus(null), + }) + ).renderStep + ).toBe('error'); + expect( + getClawOnboardingFlowState( + createInput({ + mode: 'post-provisioning', + setupFailed: true, + status: createStatus('starting'), + }) + ).renderStep + ).toBe('error'); + }); + + test('does not let an old setup failure override a running instance', () => { + const state = getClawOnboardingFlowState( + createInput({ + mode: 'post-provisioning', + setupFailed: true, + status: createStatus('running'), + }) + ); + + expect(state.renderStep).toBe('complete'); + expect(state.postProvisioningReady).toBe(true); + }); + test.each(CLAW_ONBOARDING_ERROR_STATUSES)( 'renders an error when machine status is %s', status => { diff --git a/apps/web/src/app/(app)/claw/components/ClawOnboardingFlow.state.ts b/apps/web/src/app/(app)/claw/components/ClawOnboardingFlow.state.ts index c1485fcea6..ab1abf52c0 100644 --- a/apps/web/src/app/(app)/claw/components/ClawOnboardingFlow.state.ts +++ b/apps/web/src/app/(app)/claw/components/ClawOnboardingFlow.state.ts @@ -62,6 +62,7 @@ export type ClawOnboardingFlowStateInput = { status: KiloClawDashboardStatus | undefined; mode: ClawOnboardingMode; createSetupStarted: boolean; + setupFailed?: boolean; onboardingStep: OnboardingStep; selectedPreset: ExecPreset | null; hasBotIdentity: boolean; @@ -103,6 +104,7 @@ export function getClawOnboardingFlowState({ status, mode, createSetupStarted, + setupFailed = false, onboardingStep, selectedPreset, hasBotIdentity, @@ -122,6 +124,7 @@ export function getClawOnboardingFlowState({ const renderStepDecision = getRenderStepDecision({ mode, createSetupStarted, + setupFailed, instanceStatus, postProvisioningReady, onboardingStep, @@ -145,6 +148,7 @@ export function getClawOnboardingFlowState({ status, mode, createSetupStarted, + setupFailed, onboardingStep, selectedPreset, hasBotIdentity, @@ -167,7 +171,12 @@ export function getClawOnboardingFlowState({ type RenderStepInput = Pick< ClawOnboardingFlowStateInput, - 'mode' | 'createSetupStarted' | 'onboardingStep' | 'selectedPreset' | 'hasBotIdentity' + | 'mode' + | 'createSetupStarted' + | 'setupFailed' + | 'onboardingStep' + | 'selectedPreset' + | 'hasBotIdentity' > & { instanceStatus: PopulatedClawStatus | null; postProvisioningReady: boolean; @@ -204,6 +213,7 @@ const clawOnboardingFlowDebugSnapshots = new Map void; onCreateFlowFailed?: () => void; }) { @@ -71,6 +73,7 @@ export function ClawOnboardingFlow({ status={status} mode={mode} createFlowStarted={createFlowStarted} + setupFailed={setupFailed} onCreateFlowStarted={onCreateFlowStarted} onCreateFlowFailed={onCreateFlowFailed} /> @@ -82,12 +85,14 @@ function ClawOnboardingFlowInner({ status, mode, createFlowStarted, + setupFailed, onCreateFlowStarted, onCreateFlowFailed, }: { status: KiloClawDashboardStatus | undefined; mode: ClawOnboardingMode; createFlowStarted: boolean; + setupFailed: boolean; onCreateFlowStarted?: () => void; onCreateFlowFailed?: () => void; }) { @@ -113,6 +118,7 @@ function ClawOnboardingFlowInner({ status, mode, createSetupStarted, + setupFailed, onboardingStep, selectedPreset, hasBotIdentity: botIdentity !== null, @@ -394,8 +400,8 @@ export function ClawSetupErrorStep({ basePath }: { basePath: string }) {

Something went wrong

- Your KiloClaw instance stopped during setup. Please reach out to support for help - getting it back online. + Your KiloClaw instance stopped or failed during setup. Please reach out to support for + help getting it back online.

diff --git a/apps/web/src/app/(app)/claw/components/CreateInstanceCard.tsx b/apps/web/src/app/(app)/claw/components/CreateInstanceCard.tsx index c5eb7f123b..7e93fe0ac6 100644 --- a/apps/web/src/app/(app)/claw/components/CreateInstanceCard.tsx +++ b/apps/web/src/app/(app)/claw/components/CreateInstanceCard.tsx @@ -148,6 +148,10 @@ export function CreateInstanceCard({ }, { onError: err => { + posthog?.capture('claw_setup_provision_failed', { + selected_model: selectedModel, + reason: 'provision_request_failed', + }); onProvisionFailed?.(); toast.error(`Failed to create: ${err.message}`); }, diff --git a/apps/web/src/app/(app)/claw/new/ClawNewClient.tsx b/apps/web/src/app/(app)/claw/new/ClawNewClient.tsx index 882309fad7..e73fd7f16e 100644 --- a/apps/web/src/app/(app)/claw/new/ClawNewClient.tsx +++ b/apps/web/src/app/(app)/claw/new/ClawNewClient.tsx @@ -32,10 +32,12 @@ function ClawNewLoader({ createFlowStartedAt, billingUpdatedAt, onCreateFlowStarted, + setupFailed, onCreateFlowFailed, }: { mode: ClawOnboardingMode; createFlowStartedAt: number | null; + setupFailed: boolean; billingUpdatedAt: number; onCreateFlowStarted: () => void; onCreateFlowFailed: () => void; @@ -53,14 +55,20 @@ function ClawNewLoader({ status={status} mode={mode} createFlowStarted={createFlowStartedAt !== null} + setupFailed={setupFailed} onCreateFlowStarted={onCreateFlowStarted} onCreateFlowFailed={onCreateFlowFailed} /> ); } - const statusQueryForBoundary = - statusQuery.error || statusQuery.dataUpdatedAt >= billingUpdatedAt + const statusQueryForBoundary = setupFailed + ? { + data: statusQuery.data, + isLoading: false, + error: null, + } + : statusQuery.error || statusQuery.dataUpdatedAt >= billingUpdatedAt ? statusQuery : { data: undefined, @@ -73,6 +81,7 @@ function ClawNewLoader({ statusQuery={statusQueryForBoundary} mode={mode} createFlowStarted={createFlowStartedAt !== null} + setupFailed={setupFailed} onCreateFlowStarted={onCreateFlowStarted} onCreateFlowFailed={onCreateFlowFailed} /> @@ -95,8 +104,15 @@ function ClawNewLiveClient() { const trpc = useTRPC(); const billingQuery = useQuery(trpc.kiloclaw.getBillingStatus.queryOptions()); const [createFlowStartedAt, setCreateFlowStartedAt] = useState(null); - const onCreateFlowStarted = useCallback(() => setCreateFlowStartedAt(Date.now()), []); - const onCreateFlowFailed = useCallback(() => setCreateFlowStartedAt(null), []); + const [setupFailed, setSetupFailed] = useState(false); + const onCreateFlowStarted = useCallback(() => { + setSetupFailed(false); + setCreateFlowStartedAt(Date.now()); + }, []); + const onCreateFlowFailed = useCallback(() => { + setSetupFailed(true); + setCreateFlowStartedAt(null); + }, []); if (billingQuery.isLoading) { return ; @@ -115,7 +131,7 @@ function ClawNewLiveClient() { ); } - if (createFlowStartedAt === null && billingQuery.isFetching) { + if (!setupFailed && createFlowStartedAt === null && billingQuery.isFetching) { return ; } @@ -144,6 +160,7 @@ function ClawNewLiveClient() { (null); + const [setupFailed, setSetupFailed] = useState(false); const [hasSettledStatus, setHasSettledStatus] = useState(false); - const onCreateFlowStarted = useCallback(() => setCreateFlowStartedAt(Date.now()), []); - const onCreateFlowFailed = useCallback(() => setCreateFlowStartedAt(null), []); + const onCreateFlowStarted = useCallback(() => { + setSetupFailed(false); + setCreateFlowStartedAt(Date.now()); + }, []); + const onCreateFlowFailed = useCallback(() => { + setSetupFailed(true); + setCreateFlowStartedAt(null); + }, []); useEffect(() => { if (!statusQuery.isFetching && (statusQuery.data !== undefined || statusQuery.error)) { @@ -54,19 +61,21 @@ function OrgClawNewLiveClient({ organizationId }: { organizationId: string }) { mode="create-first" organizationId={organizationId} createFlowStarted + setupFailed={setupFailed} onCreateFlowStarted={onCreateFlowStarted} onCreateFlowFailed={onCreateFlowFailed} /> ); } - if (statusQuery.error) { + if (!setupFailed && statusQuery.error) { return ( @@ -75,13 +84,14 @@ function OrgClawNewLiveClient({ organizationId }: { organizationId: string }) { const isFetchingEmptyStatus = statusQuery.isFetching && statusQuery.data?.status === null; - if (statusQuery.isLoading || !hasSettledStatus || isFetchingEmptyStatus) { + if (!setupFailed && (statusQuery.isLoading || !hasSettledStatus || isFetchingEmptyStatus)) { return ( @@ -94,10 +104,15 @@ function OrgClawNewLiveClient({ organizationId }: { organizationId: string }) { return ( diff --git a/services/kiloclaw/src/routes/platform-sanitize-error.test.ts b/services/kiloclaw/src/routes/platform-sanitize-error.test.ts index 0744f0df0a..f86696f003 100644 --- a/services/kiloclaw/src/routes/platform-sanitize-error.test.ts +++ b/services/kiloclaw/src/routes/platform-sanitize-error.test.ts @@ -17,7 +17,7 @@ afterEach(() => { }); /** Minimal env whose DO stub rejects with the given error (simulates RPC boundary). */ -function envWithDOError(error: Error) { +function envWithDOError(error: Error, writeDataPoint = vi.fn()) { return { KILOCLAW_INSTANCE: { idFromName: (id: string) => id, @@ -29,7 +29,7 @@ function envWithDOError(error: Error) { } ), }, - KILOCLAW_AE: { writeDataPoint: vi.fn() }, + KILOCLAW_AE: { writeDataPoint }, KV_CLAW_CACHE: { get: vi.fn().mockResolvedValue(null), put: vi.fn().mockResolvedValue(undefined), @@ -146,6 +146,40 @@ describe('sanitizeError: Instance-not-* status correction', () => { const body = await jsonBody(resp); expect(body.error).toBe('status failed'); }); + + it('logs the full provision error object while returning a sanitized response', async () => { + const err = new Error('Fly API allocateIP failed (500): upstream'); + const writeDataPoint = vi.fn(); + const env = envWithDOError(err, writeDataPoint); + const consoleSpy = vi.spyOn(console, 'error').mockImplementation(() => {}); + + const resp = await platform.request( + '/provision', + { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ userId: 'user-1' }), + }, + env + ); + + expect(resp.status).toBe(500); + const body = await jsonBody(resp); + expect(body.error).toBe('provision failed'); + expect(consoleSpy).toHaveBeenCalledWith('[platform] provision failed:', err); + const provisioningFailureCall = writeDataPoint.mock.calls.find(call => + JSON.stringify(call[0]).includes('instance.provisioning_failed') + ); + expect(provisioningFailureCall).toBeDefined(); + expect(provisioningFailureCall?.[0]).toMatchObject({ + indexes: ['instance.provisioning_failed'], + }); + const serializedDataPoint = JSON.stringify(provisioningFailureCall?.[0]); + expect(serializedDataPoint).toContain('fly_api_allocateIP_500'); + expect(serializedDataPoint).toContain('provision failed'); + expect(serializedDataPoint).not.toContain(''); + expect(serializedDataPoint).not.toContain('upstream'); + }); }); describe('kilo-cli-run/start: conflict response handling', () => { diff --git a/services/kiloclaw/src/routes/platform.ts b/services/kiloclaw/src/routes/platform.ts index c829df314e..d856098809 100644 --- a/services/kiloclaw/src/routes/platform.ts +++ b/services/kiloclaw/src/routes/platform.ts @@ -726,7 +726,7 @@ function sanitizeError(err: unknown, operation: string): { message: string; stat const normalized = raw.replace(/^(?:[A-Za-z]+Error:\s*)+/, ''); // Log the full error for Sentry/debugging — this never reaches the caller - console.error(`[platform] ${operation} failed:`, raw); + console.error(`[platform] ${operation} failed:`, err); // Allow known-safe messages through if (SAFE_ERROR_PREFIXES.some(prefix => normalized.startsWith(prefix))) { @@ -736,6 +736,15 @@ function sanitizeError(err: unknown, operation: string): { message: string; stat return { message: `${operation} failed`, status }; } +function classifyProvisionFailure(err: unknown, status: number): string { + const raw = err instanceof Error ? err.message : String(err); + const flyApiMatch = raw.match(/Fly API ([A-Za-z0-9_-]+) failed \((\d{3})\)/); + const flyOperation = flyApiMatch?.[1]; + const flyStatus = flyApiMatch?.[2]; + if (flyOperation && flyStatus) return `fly_api_${flyOperation}_${flyStatus}`; + return `provision_${status}`; +} + /** * DO lifecycle methods throw `Object.assign(new Error('Instance not provisioned'), { status: 404 })` * but `.status` is lost crossing the DO RPC boundary, so `statusCodeFromError` @@ -856,6 +865,7 @@ platform.post('/provision', async c => { const shouldBootstrapSubscription = !instanceId || bootstrapSubscription === true; const provisionDoKey = await resolveInstanceDoKey(c.env, userId, provisionedInstanceId); const provisionRoute = '/api/platform/provision'; + const provisionStartedAt = performance.now(); let selectedProvider = provider; if (!selectedProvider && shouldInsertInstanceRecord) { @@ -903,6 +913,18 @@ platform.post('/provision', async c => { return c.json({ error: 'User already has an active instance' }, 409); } const { message, status } = sanitizeError(err, 'provision'); + writeEvent(c.env, { + event: 'instance.provisioning_failed', + delivery: 'http', + route: provisionRoute, + userId, + instanceId: provisionedInstanceId, + orgId: orgId ?? undefined, + error: message, + label: classifyProvisionFailure(err, status), + durationMs: performance.now() - provisionStartedAt, + value: status, + }); return jsonError(message, status); }