feat: self-recreate handoff for kua-deploy + startup completeSelfRecreate()

When kua-deploy is recreating itself (target appName == kua-deploy on same host), the OLD process is about to be killed by the docker daemon mid-flight. Without a handoff, progress would be stuck at deploy:running forever and release-app would poll until timeout.

Self-recreate path: (1) pre-mark progress phase=self_recreate_pending with the freshly-built image SHA + deployStartTs + stateless services list; (2) fire-and-forget recreateService (do not await its return — the OLD process is dying anyway); (3) sleep 90s as a ceiling — if we're still alive after that, the recreate failed and we throw.

On startup, completeSelfRecreate() reads progress-kua-deploy.json; if phase is self_recreate_pending, queries its own container via docker inspect, compares running image SHA to the pre-recreate expected SHA, checks StartedAt > recreate_started_at + state=running, then writes phase=succeeded (or failed) plus a verify struct on the deploy step. Idempotent — no-op if no marker is found.
This commit is contained in:
kua-deploy-split 2026-05-21 18:31:45 -04:00
parent 9169c84381
commit 06852c227c
1 changed files with 124 additions and 0 deletions

124
server.js
View File

@ -9,6 +9,10 @@ import { promisify } from 'util';
// Promise-returning versions of execCb / execFileCb (wrapped with util.promisify),
// so callers can `await` them instead of passing callbacks.
const exec = promisify(execCb);
const execFile = promisify(execFileCb);
// The app name this kua-deploy reports itself as in deploy-registry.json.
// Used to detect "I am being deployed" cases so we can do the self-recreate handoff.
const SELF_APP_NAME = 'kua-deploy';
// Input validation
// Allow-list for message strings: the whole string must be 1–500 characters
// drawn from ASCII letters, digits, space and the punctuation listed in the
// character class (no newlines, no backslash).
// NOTE(review): the class includes `$`, backtick, `|`, `;`, `&` and both quote
// characters, which are shell metacharacters — matching this pattern does not
// by itself make a value safe to interpolate into a shell command line.
// Confirm how validated messages are consumed (execFile args vs. exec string).
const SAFE_MESSAGE_RE = /^[a-zA-Z0-9 _.,!?:;@#/()[\]{}<>='"+*&^%$~`|-]{1,500}$/;
function validateMessage(msg) {
@ -696,6 +700,49 @@ ${detail}`);
// apps (not just kua-deploy) and replaces the old runOnServer + kua-vault-run
// shell prefix approach.
const composeProject = path.basename(deployDir);
// SELF-RECREATE HANDOFF — when the target IS kua-deploy on the same host,
// this process is about to be killed. We pre-mark progress with a
// self-recreate-pending sentinel so the NEW kua-deploy can pick up the
// verification on startup. See completeSelfRecreate() near init.
const selfRecreate = appName === SELF_APP_NAME && isLocal(server) && stateless.includes(SELF_APP_NAME);
if (selfRecreate) {
// Capture the freshly-built image SHA for post-restart verification.
const builtSha = (await run(`docker compose -p ${composeProject} -f ${deployDir}/docker-compose.yml images --quiet ${SELF_APP_NAME} 2>/dev/null | head -1`)).stdout.trim() || null;
steps[steps.length - 1] = {
step: 'deploy',
status: 'running',
self_recreate: true,
note: 'self-recreate handoff — NEW kua-deploy will verify on startup',
};
await markProgressPhase(appName, 'self_recreate_pending', {
action,
triggered_by: opts.triggered_by || 'api',
steps,
commit: deployCommit,
self_recreate_expected_image: builtSha,
self_recreate_started_at: deployStartTs.toISOString(),
self_recreate_stateless: stateless,
});
// Fire-and-forget recreate. The OLD process is about to die; spawn close
// handler may resolve with ok=false because of the kill, which is expected.
// We don't throw on its failure — the docker daemon owns the lifecycle now.
recreateService({
project: composeProject,
deployDir,
services: stateless,
force: true,
vault: prod.vault || null,
server,
}).catch(() => { /* swallowing — we're dying anyway */ });
// Block here so the process keeps the lock until the daemon kills us.
// 90s ceiling so the lock doesn't leak if the recreate truly fails.
await new Promise(r => setTimeout(r, 90000));
// If we're still alive at this point, the recreate didn't take. Bail.
steps[steps.length - 1] = { step: 'deploy', status: 'failed', error: 'self-recreate timed out — container was not replaced' };
throw new Error('self-recreate did not replace container within 90s');
}
const upRes = await recreateService({
project: composeProject,
deployDir,
@ -1365,6 +1412,78 @@ process.on('unhandledRejection', (reason, promise) => {
fastify.log.error({ reason, promise }, 'Unhandled promise rejection — investigate immediately');
});
/**
 * completeSelfRecreate — finish a kua-deploy self-deploy on startup.
 *
 * Reads `progress-<SELF_APP_NAME>.json` from DATA_DIR. If its phase is
 * "self_recreate_pending" (written by the previous instance just before it
 * asked docker to replace its own container), this NEW instance inspects the
 * container named SELF_APP_NAME and checks that:
 *   - the running image id matches the expected image id stored in progress,
 *   - the container's StartedAt is not earlier than the recorded recreate start,
 *   - the container state is "running".
 * It then rewrites the progress file with phase "succeeded" or "failed",
 * attaches a `verify` record to the "deploy" step (when one exists) and
 * removes the self_recreate_* marker fields. Without this, progress for a
 * self-deploy would stay pending after the old process is killed.
 *
 * No-op when the progress file is missing or unparseable, or when its phase is
 * anything other than "self_recreate_pending" — so repeated calls are harmless.
 *
 * If `docker inspect` fails or throws, none of the checks can pass and the
 * deploy is recorded as failed (it is NOT left pending for a later retry).
 *
 * @returns {Promise<void>}
 * @throws if the progress file cannot be written; the caller treats this as non-fatal.
 */
async function completeSelfRecreate() {
  const progressFile = path.join(DATA_DIR, `progress-${SELF_APP_NAME}.json`);
  let progress;
  try {
    progress = JSON.parse(await fs.readFile(progressFile, 'utf-8'));
  } catch {
    return; // no progress file yet, or it is not valid JSON
  }
  // `!progress` guards against a file whose JSON content is literally `null`.
  if (!progress || progress.phase !== 'self_recreate_pending') return;

  const expectedSha = progress.self_recreate_expected_image || null;
  const recreateStartedAt = progress.self_recreate_started_at
    ? new Date(progress.self_recreate_started_at)
    : null;

  // Check our own container via docker.
  let runningSha = null;
  let startedAtStr = null;
  let state = null;
  try {
    const insp = await run(`docker inspect ${SELF_APP_NAME} --format '{{.Image}}|{{.State.StartedAt}}|{{.State.Status}}'`);
    if (insp.ok) {
      // The formatted line ends with a newline; without trim() the last field
      // would be "running\n" and could never equal 'running'.
      const [image, startedAt, status] = String(insp.stdout || '').trim().split('|');
      runningSha = image || null;
      startedAtStr = startedAt || null;
      state = status || null;
    }
  } catch (err) {
    // Inspect fields stay null, so verification below fails and progress is
    // marked failed. Log the cause instead of swallowing it.
    fastify.log.warn({ err }, 'self-recreate verify: docker inspect failed');
  }

  // Compare on the bare hex digest; require it to be non-empty so that a
  // degenerate "sha256:" value cannot match via endsWith('').
  const runningHex = runningSha ? runningSha.replace(/^sha256:/, '') : '';
  const imageMatches = Boolean(expectedSha && runningHex && expectedSha.endsWith(runningHex));
  // An unparseable date compares false here, which correctly fails the check.
  const freshlyStarted = Boolean(
    startedAtStr && recreateStartedAt && new Date(startedAtStr) >= recreateStartedAt
  );
  const ok = imageMatches && freshlyStarted && state === 'running';

  const verify = {
    expected_image_sha: expectedSha,
    running_image_sha: runningSha,
    started_at: startedAtStr,
    recreate_started_at: progress.self_recreate_started_at,
    state,
    image_matches: imageMatches,
    freshly_started: freshlyStarted,
    completed_at: new Date().toISOString(),
  };

  // Mark the deploy step done/failed and attach the verify record.
  const steps = Array.isArray(progress.steps) ? progress.steps : [];
  const deployIdx = steps.findIndex((s) => s.step === 'deploy');
  if (deployIdx >= 0) {
    steps[deployIdx] = ok
      ? { ...steps[deployIdx], status: 'done', verify, note: 'self-recreate completed; verified by NEW kua-deploy on startup' }
      : { ...steps[deployIdx], status: 'failed', error: 'self-recreate verify failed', verify };
  }

  // Drop the marker fields so a later startup does not re-run verification.
  const {
    self_recreate_expected_image: _expectedImage,
    self_recreate_started_at: _startedAt,
    self_recreate_stateless: _stateless,
    ...carriedOver
  } = progress;
  const updated = {
    ...carriedOver,
    phase: ok ? 'succeeded' : 'failed',
    status: 'done',
    current_step: ok ? 'done' : 'deploy',
    steps,
    updated_at: Math.floor(Date.now() / 1000),
    self_recreate_completed: true,
  };
  await fs.writeFile(progressFile, JSON.stringify(updated, null, 2));

  if (ok) {
    fastify.log.info({ ok, verify }, 'self-recreate completed and verified');
  } else {
    fastify.log.warn({ ok, verify }, 'self-recreate verification failed');
  }
}
const start = async () => {
try {
// Fail fast if webhook secret is missing in production
@ -1374,6 +1493,11 @@ const start = async () => {
await loadRegistry();
await loadHistory();
await fs.mkdir(DATA_DIR, { recursive: true });
// Self-recreate recovery — handles handoff from previous instance that was
// killed mid-recreate during a kua-deploy self-deploy. Idempotent.
try { await completeSelfRecreate(); } catch (e) {
fastify.log.error({ err: e }, 'completeSelfRecreate failed (non-fatal)');
}
await fastify.listen({ port: 3200, host: '0.0.0.0' });
} catch (err) {
fastify.log.error(err);