Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion .github/workflows/push-dev-kiloclaw.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ on:

permissions:
contents: read
packages: write

jobs:
push-dev:
Expand All @@ -28,6 +29,13 @@ jobs:
username: x
password: ${{ secrets.FLY_DEV_API_TOKEN }}

- name: Login to GHCR
uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Generate image tag and cache bust value
id: tag
run: |
Expand Down Expand Up @@ -72,6 +80,8 @@ jobs:
push: true
tags: |
registry.fly.io/kiloclaw-registry-dev:${{ steps.tag.outputs.tag }}
ghcr.io/kilo-org/kiloclaw:${{ steps.tag.outputs.tag }}
ghcr.io/kilo-org/kiloclaw:sha-${{ github.sha }}
build-args: |
CONTROLLER_COMMIT=${{ github.sha }}
CONTROLLER_CACHE_BUST=${{ steps.tag.outputs.cache_bust }}
Expand Down Expand Up @@ -115,7 +125,16 @@ jobs:
echo "FLY_IMAGE_CONTENT_HASH=${CONTENT}"
echo '```'
echo ""
echo "**Image pushed to:** \`registry.fly.io/kiloclaw-registry-dev:${TAG}\`"
echo "**Images pushed to:**"
echo "- \`registry.fly.io/kiloclaw-registry-dev:${TAG}\`"
echo "- \`ghcr.io/kilo-org/kiloclaw:${TAG}\`"
echo "- \`ghcr.io/kilo-org/kiloclaw:sha-${{ github.sha }}\`"
echo ""
echo "For Northflank, use:"
echo ""
echo '```'
echo "NF_IMAGE_PATH_TEMPLATE=ghcr.io/kilo-org/kiloclaw:{tag}"
echo '```'
echo ""
} >> "$GITHUB_STEP_SUMMARY"

Expand All @@ -127,4 +146,7 @@ jobs:
fi
echo "OPENCLAW_VERSION=${OPENCLAW}"
echo "FLY_IMAGE_CONTENT_HASH=${CONTENT}"
echo "NF_IMAGE_PATH_TEMPLATE=ghcr.io/kilo-org/kiloclaw:{tag}"
echo "GHCR_IMAGE=ghcr.io/kilo-org/kiloclaw:${TAG}"
echo "GHCR_SHA_IMAGE=ghcr.io/kilo-org/kiloclaw:sha-${{ github.sha }}"
echo "=========================="
1,278 changes: 710 additions & 568 deletions pnpm-lock.yaml

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions services/kiloclaw/.dev.vars.example
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,22 @@ DOCKER_LOCAL_API_BASE=http://127.0.0.1:23750
DOCKER_LOCAL_IMAGE=kiloclaw:local
DOCKER_LOCAL_PORT_RANGE=45000-45999

# Northflank provider (disabled for rollout until Worker runtime support is smoke-tested).
# NF_API_TOKEN is always a secret; set production values with wrangler secret put.
NF_API_TOKEN=
NF_API_BASE=https://api.northflank.com/v1
NF_TEAM_ID=kilo-dev
NF_REGION=us-central
NF_DEPLOYMENT_PLAN=nf-compute-200
NF_STORAGE_CLASS_NAME=nf-multi-rw
NF_STORAGE_ACCESS_MODE=ReadWriteMany
NF_VOLUME_SIZE_MB=10240
NF_EPHEMERAL_STORAGE_MB=2048
NF_EDGE_HEADER_NAME=x-kiloclaw-northflank-edge-dev
NF_EDGE_HEADER_VALUE=dev-local-northflank-edge-secret
NF_IMAGE_PATH_TEMPLATE=ghcr.io/kilo-org/kiloclaw:{tag}
NF_IMAGE_CREDENTIALS_ID=

# Auto-populated by dev-start from `fly auth whoami`.
# Tags Fly machines with developer identity for cleanup.
# @exec fly auth whoami
Expand Down
1 change: 1 addition & 0 deletions services/kiloclaw/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"@kilocode/encryption": "workspace:*",
"@kilocode/kiloclaw-secret-catalog": "workspace:*",
"@kilocode/worker-utils": "workspace:*",
"@northflank/js-client": "^0.9.3",
"drizzle-orm": "catalog:",
"hono": "catalog:",
"jose": "^6.2.1",
Expand Down
4 changes: 4 additions & 0 deletions services/kiloclaw/src/db/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ export async function getInstanceBySandboxId(db: WorkerDb, sandboxId: string) {
sandbox_id: kiloclaw_instances.sandbox_id,
user_id: kiloclaw_instances.user_id,
organization_id: kiloclaw_instances.organization_id,
provider: kiloclaw_instances.provider,
})
.from(kiloclaw_instances)
.where(
Expand All @@ -109,6 +110,7 @@ export async function getInstanceBySandboxId(db: WorkerDb, sandboxId: string) {
sandboxId: row.sandbox_id,
userId: row.user_id,
orgId: row.organization_id,
provider: row.provider,
};
}

Expand Down Expand Up @@ -136,6 +138,7 @@ export async function getInstanceByIdIncludingDestroyed(
user_id: kiloclaw_instances.user_id,
organization_id: kiloclaw_instances.organization_id,
inbound_email_enabled: kiloclaw_instances.inbound_email_enabled,
provider: kiloclaw_instances.provider,
})
.from(kiloclaw_instances)
.where(where)
Expand All @@ -149,6 +152,7 @@ export async function getInstanceByIdIncludingDestroyed(
userId: row.user_id,
orgId: row.organization_id,
inboundEmailEnabled: row.inbound_email_enabled,
provider: row.provider,
};
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ export function resolveRuntimeImageRef(state: InstanceMutableState, env: KiloCla
if (state.provider === 'docker-local') {
return env.DOCKER_LOCAL_IMAGE ?? 'kiloclaw:local';
}
if (state.provider === 'northflank') {
const template = env.NF_IMAGE_PATH_TEMPLATE;
if (!template) {
throw new Error('NF_IMAGE_PATH_TEMPLATE is not configured');
}
return template.replaceAll('{tag}', resolveImageTag(state, env));
}
return resolveImageRef(state, env);
}

Expand Down
155 changes: 117 additions & 38 deletions services/kiloclaw/src/durable-objects/kiloclaw-instance/postgres.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import type { KiloClawEnv } from '../../types';
import type { EncryptedEnvelope } from '../../schemas/instance-config';
import type {
EncryptedEnvelope,
FlyProviderState,
NorthflankProviderState,
} from '../../schemas/instance-config';
import {
getWorkerDb,
getActivePersonalInstance,
getInstanceById,
getInstanceBySandboxId,
markInstanceDestroyed,
} from '../../db';
Expand All @@ -12,13 +17,76 @@ import { getAppKey, getFlyConfig } from './types';
import { applyProviderState, storageUpdate } from './state';
import { attemptMetadataRecovery } from './reconcile';
import { doError, doWarn, toLoggable, createReconcileContext } from './log';
import { isInstanceKeyedSandboxId } from '@kilocode/worker-utils/instance-id';
import {
isInstanceKeyedSandboxId,
instanceIdFromSandboxId,
} from '@kilocode/worker-utils/instance-id';
import { getNorthflankConfig } from '../../northflank/config';
import {
findProjectByName,
findProjectSecretByName,
findServiceByName,
findVolumeByName,
} from '../../northflank/client';
import { northflankResourceNames } from '../../providers/northflank/names';

/** Optional hints that steer which Postgres row is used when restoring DO state. */
type RestoreOpts = {
/** If the DO has a stored sandboxId, use it for precise lookup. */
sandboxId?: string | null;
};

/** The non-null row shape that `getInstanceBySandboxId` resolves to. */
type RestoredInstance = NonNullable<Awaited<ReturnType<typeof getInstanceBySandboxId>>>;

/**
 * Picks the ingress host for a Northflank service from its port list.
 *
 * @param service - Object with an optional `ports` array; each port may carry a `dns` value.
 * @returns The `dns` of the first port (in array order) whose `dns` is truthy,
 *   or `null` when `ports` is absent or no port has a truthy `dns`.
 */
function firstNorthflankIngressHost(service: {
  ports?: Array<{ dns?: string | null }>;
}): string | null {
  for (const candidate of service.ports ?? []) {
    // Empty strings and null/undefined are all treated as "no DNS on this port".
    if (candidate.dns) {
      return candidate.dns;
    }
  }
  return null;
}

/**
 * Finds the Postgres instance row to restore a Durable Object from.
 *
 * Lookup order:
 * 1. No (or empty) `opts.sandboxId`: resolve the user's active personal
 *    instance, then re-read it by its sandboxId.
 * 2. Instance-keyed sandboxId: try `getInstanceById` with the id derived from
 *    the sandboxId, and use that row when it exists.
 * 3. Otherwise (or when step 2 found nothing): look up by sandboxId.
 *
 * @param db - Worker database handle.
 * @param userId - User whose personal instance is used when no sandboxId is given.
 * @param opts - Optional restore hints; see `RestoreOpts`.
 * @returns The matching instance row, or `null` when none is found.
 */
async function getRestoreInstance(
  db: ReturnType<typeof getWorkerDb>,
  userId: string,
  opts?: RestoreOpts
): Promise<RestoredInstance | null> {
  const storedSandboxId = opts?.sandboxId;

  if (!storedSandboxId) {
    const personalInstance = await getActivePersonalInstance(db, userId);
    if (!personalInstance) {
      return null;
    }
    return await getInstanceBySandboxId(db, personalInstance.sandboxId);
  }

  if (isInstanceKeyedSandboxId(storedSandboxId)) {
    const instanceId = instanceIdFromSandboxId(storedSandboxId);
    const matchById = await getInstanceById(db, instanceId);
    if (matchById) {
      return matchById;
    }
  }

  return await getInstanceBySandboxId(db, storedSandboxId);
}

/**
 * Rebuilds Northflank provider state for a sandbox by looking its resources up by name.
 *
 * Resource names are derived from `northflankResourceNames(sandboxId)`. The
 * project is resolved first because the volume, service and secret lookups all
 * need its id; when the project is not found those lookups are skipped. Any
 * resource that is not found gets a `null` id and keeps the derived name.
 *
 * @param env - Worker environment used to build the Northflank config.
 * @param sandboxId - Sandbox id the resource names are derived from.
 * @returns Provider state with the ids/names that were found, the ingress host
 *   taken from the service's ports (or `null`), and the configured region.
 */
async function recoverNorthflankProviderState(
  env: KiloClawEnv,
  sandboxId: string
): Promise<NorthflankProviderState> {
  const config = getNorthflankConfig(env);
  const names = await northflankResourceNames(sandboxId);
  const project = await findProjectByName(config, names.projectName);

  // The three lookups depend only on the project id, not on each other, so
  // issue them concurrently instead of paying three sequential round trips.
  const [volume, service, secret] = project
    ? await Promise.all([
        findVolumeByName(config, project.id, names.volumeName),
        findServiceByName(config, project.id, names.serviceName),
        findProjectSecretByName(config, project.id, names.secretName),
      ])
    : ([null, null, null] as const);

  return {
    provider: 'northflank',
    projectId: project?.id ?? null,
    projectName: project?.name ?? names.projectName,
    serviceId: service?.id ?? null,
    serviceName: service?.name ?? names.serviceName,
    volumeId: volume?.id ?? null,
    volumeName: volume?.name ?? names.volumeName,
    secretId: secret?.id ?? null,
    secretName: secret?.name ?? names.secretName,
    ingressHost: service ? firstNorthflankIngressHost(service) : null,
    region: config.region,
  };
}

export async function fallbackAppNameForRestore(
userId: string,
sandboxId: string,
Expand All @@ -30,6 +98,25 @@ export async function fallbackAppNameForRestore(
: appNameFromUserId(appKey, prefix);
}

/**
 * Rebuilds Fly provider state for an instance being restored.
 *
 * The app name is read from the App Durable Object keyed by
 * `getAppKey({ userId, sandboxId })`; when that returns null/undefined the
 * name from `fallbackAppNameForRestore` is used instead (with a `dev` prefix
 * when `WORKER_ENV` is `development`). Machine, volume and region are left
 * `null` here.
 *
 * @param env - Worker environment providing `KILOCLAW_APP` and `WORKER_ENV`.
 * @param userId - Owner of the instance.
 * @param sandboxId - Sandbox id of the instance.
 * @returns Fly provider state carrying only the recovered app name.
 */
async function recoverFlyProviderState(
  env: KiloClawEnv,
  userId: string,
  sandboxId: string
): Promise<FlyProviderState> {
  const namePrefix = env.WORKER_ENV === 'development' ? 'dev' : undefined;
  const appDoId = env.KILOCLAW_APP.idFromName(getAppKey({ userId, sandboxId }));
  const appDo = env.KILOCLAW_APP.get(appDoId);

  // The fallback name is always computed before the App DO is queried.
  const derivedAppName = await fallbackAppNameForRestore(userId, sandboxId, namePrefix);
  const storedAppName = await appDo.getAppName();

  const restored: FlyProviderState = {
    provider: 'fly',
    appName: storedAppName ?? derivedAppName,
    machineId: null,
    volumeId: null,
    region: null,
  };
  return restored;
}

/**
* Restore DO state from Postgres backup if SQLite was wiped.
*
Expand All @@ -53,44 +140,35 @@ export async function restoreFromPostgres(
try {
const db = getWorkerDb(connectionString);

// Prefer sandboxId lookup (multi-instance safe) over userId lookup (ambiguous).
const instance = opts?.sandboxId
? await getInstanceBySandboxId(db, opts.sandboxId)
: await getActivePersonalInstance(db, userId);
const instance = await getRestoreInstance(db, userId, opts);

if (!instance) {
doWarn(state, 'No active instance found in Postgres', { userId });
return;
}

console.log('[DO] Restoring state from Postgres backup for', userId);
const restoredUserId = instance.userId ?? userId;
console.log('[DO] Restoring state from Postgres backup for', restoredUserId);

const envVars: Record<string, string> | null = null;
const encryptedSecrets: Record<string, EncryptedEnvelope> | null = null;
const channels = null;

// Recover flyAppName from the App DO or derive deterministically.
// Instance-keyed DOs (ki_ sandboxId) have per-instance apps (inst-{hash}),
// legacy DOs have per-user apps (acct-{hash}).
const appKey = getAppKey({ userId, sandboxId: instance.sandboxId });
const appStub = env.KILOCLAW_APP.get(env.KILOCLAW_APP.idFromName(appKey));
const prefix = env.WORKER_ENV === 'development' ? 'dev' : undefined;
const fallbackAppName = await fallbackAppNameForRestore(userId, instance.sandboxId, prefix);
const recoveredAppName = (await appStub.getAppName()) ?? fallbackAppName;
const providerState = {
provider: 'fly',
appName: recoveredAppName,
machineId: null,
volumeId: null,
region: null,
} as const;
// docker-local is development-only and should not be restored from Postgres.
// Treat any non-Northflank persisted provider as Fly for legacy safety.
const provider = instance.provider === 'northflank' ? 'northflank' : 'fly';
const providerState =
provider === 'northflank'
? await recoverNorthflankProviderState(env, instance.sandboxId)
: await recoverFlyProviderState(env, restoredUserId, instance.sandboxId);
const recoveredAppName = providerState.provider === 'fly' ? providerState.appName : null;

await ctx.storage.put(
storageUpdate({
userId,
userId: restoredUserId,
sandboxId: instance.sandboxId,
orgId: instance.orgId ?? null,
provider: 'fly',
provider,
providerState,
status: 'provisioned',
envVars,
Expand All @@ -116,7 +194,7 @@ export async function restoreFromPostgres(
})
);

state.userId = userId;
state.userId = restoredUserId;
state.sandboxId = instance.sandboxId;
state.orgId = instance.orgId ?? null;
applyProviderState(state, providerState);
Expand All @@ -143,19 +221,20 @@ export async function restoreFromPostgres(

console.log('[DO] Restored from Postgres: sandboxId =', instance.sandboxId);

// Attempt to recover machine/volume IDs via Fly metadata.
try {
const flyConfig = getFlyConfig(env, state);
await attemptMetadataRecovery(
flyConfig,
ctx,
state,
createReconcileContext(state, env, 'postgres_restore')
);
} catch (err) {
doWarn(state, 'Metadata recovery after Postgres restore failed', {
error: toLoggable(err),
});
if (provider === 'fly') {
try {
const flyConfig = getFlyConfig(env, state);
await attemptMetadataRecovery(
flyConfig,
ctx,
state,
createReconcileContext(state, env, 'postgres_restore')
);
} catch (err) {
doWarn(state, 'Metadata recovery after Postgres restore failed', {
error: toLoggable(err),
});
}
}
} catch (err) {
doError(state, 'Postgres restore failed', { error: toLoggable(err) });
Expand Down
Loading
Loading