feat: self-recreate handoff for kua-deploy + startup completeSelfRecreate()
When kua-deploy is recreating itself (target appName == kua-deploy on same host), the OLD process is about to be killed by the docker daemon mid-flight. Without a handoff, progress would be stuck at deploy:running forever and release-app would poll until timeout. Self-recreate path: (1) pre-mark progress phase=self_recreate_pending with the freshly-built image SHA + deployStartTs + stateless services list; (2) fire-and-forget recreateService (do not await its return — the OLD process is dying anyway); (3) sleep 90s as a ceiling — if we're still alive, recreate failed and we throw. On startup, completeSelfRecreate() reads progress-kua-deploy.json; if phase is self_recreate_pending, queries its own container via docker inspect, compares running image SHA to the pre-recreate expected SHA, checks StartedAt > recreate_started_at + state=running, then writes phase=succeeded (or failed) plus a verify struct on the deploy step. Idempotent — no-op if no marker is found.
This commit is contained in:
parent
9169c84381
commit
06852c227c
124
server.js
124
server.js
|
|
@ -9,6 +9,10 @@ import { promisify } from 'util';
|
|||
const exec = promisify(execCb);
|
||||
const execFile = promisify(execFileCb);
|
||||
|
||||
// The app name this kua-deploy reports itself as in deploy-registry.json.
|
||||
// Used to detect "I am being deployed" cases so we can do the self-recreate handoff.
|
||||
const SELF_APP_NAME = 'kua-deploy';
|
||||
|
||||
// Input validation
|
||||
const SAFE_MESSAGE_RE = /^[a-zA-Z0-9 _.,!?:;@#/()[\]{}<>='"+*&^%$~`|-]{1,500}$/;
|
||||
function validateMessage(msg) {
|
||||
|
|
@ -696,6 +700,49 @@ ${detail}`);
|
|||
// apps (not just kua-deploy) and replaces the old runOnServer + kua-vault-run
|
||||
// shell prefix approach.
|
||||
const composeProject = path.basename(deployDir);
|
||||
|
||||
// SELF-RECREATE HANDOFF — when the target IS kua-deploy on the same host,
|
||||
// this process is about to be killed. We pre-mark progress with a
|
||||
// self-recreate-pending sentinel so the NEW kua-deploy can pick up the
|
||||
// verification on startup. See completeSelfRecreate() near init.
|
||||
const selfRecreate = appName === SELF_APP_NAME && isLocal(server) && stateless.includes(SELF_APP_NAME);
|
||||
if (selfRecreate) {
|
||||
// Capture the freshly-built image SHA for post-restart verification.
|
||||
const builtSha = (await run(`docker compose -p ${composeProject} -f ${deployDir}/docker-compose.yml images --quiet ${SELF_APP_NAME} 2>/dev/null | head -1`)).stdout.trim() || null;
|
||||
steps[steps.length - 1] = {
|
||||
step: 'deploy',
|
||||
status: 'running',
|
||||
self_recreate: true,
|
||||
note: 'self-recreate handoff — NEW kua-deploy will verify on startup',
|
||||
};
|
||||
await markProgressPhase(appName, 'self_recreate_pending', {
|
||||
action,
|
||||
triggered_by: opts.triggered_by || 'api',
|
||||
steps,
|
||||
commit: deployCommit,
|
||||
self_recreate_expected_image: builtSha,
|
||||
self_recreate_started_at: deployStartTs.toISOString(),
|
||||
self_recreate_stateless: stateless,
|
||||
});
|
||||
// Fire-and-forget recreate. The OLD process is about to die; spawn close
|
||||
// handler may resolve with ok=false because of the kill, which is expected.
|
||||
// We don't throw on its failure — the docker daemon owns the lifecycle now.
|
||||
recreateService({
|
||||
project: composeProject,
|
||||
deployDir,
|
||||
services: stateless,
|
||||
force: true,
|
||||
vault: prod.vault || null,
|
||||
server,
|
||||
}).catch(() => { /* swallowing — we're dying anyway */ });
|
||||
// Block here so the process keeps the lock until the daemon kills us.
|
||||
// 90s ceiling so the lock doesn't leak if the recreate truly fails.
|
||||
await new Promise(r => setTimeout(r, 90000));
|
||||
// If we're still alive at this point, the recreate didn't take. Bail.
|
||||
steps[steps.length - 1] = { step: 'deploy', status: 'failed', error: 'self-recreate timed out — container was not replaced' };
|
||||
throw new Error('self-recreate did not replace container within 90s');
|
||||
}
|
||||
|
||||
const upRes = await recreateService({
|
||||
project: composeProject,
|
||||
deployDir,
|
||||
|
|
@ -1365,6 +1412,78 @@ process.on('unhandledRejection', (reason, promise) => {
|
|||
fastify.log.error({ reason, promise }, 'Unhandled promise rejection — investigate immediately');
|
||||
});
|
||||
|
||||
// completeSelfRecreate — on startup, if the previous kua-deploy left progress
// in "phase: self_recreate_pending" state, this NEW instance verifies the
// recreate landed (image SHA matches what was built, container is fresh and
// running) and flips progress to succeeded or failed. Without this,
// release-app would poll /progress forever after a self-deploy.
//
// Behaviour:
//   - No progress file, unparsable file, or phase !== 'self_recreate_pending':
//     no-op.
//   - `docker inspect` throws (docker unreachable): progress is left pending
//     so the next startup can retry.
//   - Otherwise: rewrites progress-<SELF_APP_NAME>.json with the final phase,
//     a `verify` struct on the deploy step, and the marker fields removed.
//
// Returns: Promise<void>. Rejects only on unexpected I/O errors while writing
// the progress file; the caller treats that as non-fatal.
async function completeSelfRecreate() {
  const progressFile = path.join(DATA_DIR, `progress-${SELF_APP_NAME}.json`);

  let progress;
  try {
    progress = JSON.parse(await fs.readFile(progressFile, 'utf-8'));
  } catch {
    return; // no progress file yet (or unreadable) — nothing to complete
  }

  if (progress?.phase !== 'self_recreate_pending') return;

  const expectedSha = progress.self_recreate_expected_image || null;
  const recreateStartedAt = parseSelfRecreateTimestamp(progress.self_recreate_started_at);

  // Check our own container via docker.
  let insp;
  try {
    insp = await run(`docker inspect ${SELF_APP_NAME} --format '{{.Image}}|{{.State.StartedAt}}|{{.State.Status}}'`);
  } catch (err) {
    // Docker unreachable — leave progress in pending; next startup retries.
    fastify.log.warn({ err }, 'self-recreate verify: docker inspect threw; leaving progress pending');
    return;
  }

  let runningSha = null;
  let startedAtStr = null;
  let state = null;
  if (insp?.ok) {
    // Trim the whole output AND each field: docker terminates the line with
    // "\n", which would otherwise end up glued to the last field ("running\n")
    // and make the state check below fail on every run.
    const [image, startedAt, status] = String(insp.stdout ?? '')
      .trim()
      .split('|')
      .map((field) => field.trim());
    runningSha = image || null;
    startedAtStr = startedAt || null;
    state = status || null;
  }

  const containerStartedAt = parseSelfRecreateTimestamp(startedAtStr);
  const imageMatches = selfRecreateImageIdsMatch(expectedSha, runningSha);
  const freshlyStarted = containerStartedAt !== null
    && recreateStartedAt !== null
    && containerStartedAt.getTime() >= recreateStartedAt.getTime();
  const ok = imageMatches && freshlyStarted && state === 'running';

  const verify = {
    expected_image_sha: expectedSha,
    running_image_sha: runningSha,
    started_at: startedAtStr,
    recreate_started_at: progress.self_recreate_started_at,
    state,
    image_matches: imageMatches,
    freshly_started: freshlyStarted,
    completed_at: new Date().toISOString(),
  };

  // Mark the deploy step done (or failed), attach the verify struct, then flip
  // phase + status on the top-level progress record.
  const steps = Array.isArray(progress.steps) ? progress.steps : [];
  const deployIdx = steps.findIndex((s) => s.step === 'deploy');
  if (deployIdx >= 0) {
    steps[deployIdx] = ok
      ? { ...steps[deployIdx], status: 'done', verify, note: 'self-recreate completed; verified by NEW kua-deploy on startup' }
      : { ...steps[deployIdx], status: 'failed', error: 'self-recreate verify failed', verify };
  }

  const updated = {
    ...progress,
    phase: ok ? 'succeeded' : 'failed',
    status: 'done',
    current_step: ok ? 'done' : 'deploy',
    steps,
    updated_at: Math.floor(Date.now() / 1000),
    self_recreate_completed: true,
  };
  // Clear the marker fields so a later startup does not re-run verification.
  delete updated.self_recreate_expected_image;
  delete updated.self_recreate_started_at;
  delete updated.self_recreate_stateless;

  await fs.writeFile(progressFile, JSON.stringify(updated, null, 2));

  if (ok) {
    fastify.log.info({ ok, verify }, 'self-recreate completed and verified');
  } else {
    fastify.log.warn({ ok, verify }, 'self-recreate verification failed');
  }
}

// parseSelfRecreateTimestamp — parses a timestamp string into a Date.
// Returns null for missing, non-string, or unparsable input so callers can
// compare without tripping over "Invalid Date".
function parseSelfRecreateTimestamp(value) {
  if (typeof value !== 'string' || value.trim() === '') return null;
  const parsed = new Date(value.trim());
  return Number.isNaN(parsed.getTime()) ? null : parsed;
}

// selfRecreateImageIdsMatch — compares two docker image IDs, ignoring an
// optional "sha256:" prefix and allowing one side to be an abbreviated ID
// (a prefix of the full digest). Requires at least 12 characters on the
// shorter side so a stray fragment cannot match by accident.
function selfRecreateImageIdsMatch(expected, running) {
  const MIN_ID_LENGTH = 12;
  const normalize = (id) => (typeof id === 'string' ? id.trim().replace(/^sha256:/, '') : '');
  const a = normalize(expected);
  const b = normalize(running);
  if (a.length < MIN_ID_LENGTH || b.length < MIN_ID_LENGTH) return false;
  return a.startsWith(b) || b.startsWith(a);
}
|
||||
|
||||
const start = async () => {
|
||||
try {
|
||||
// Fail fast if webhook secret is missing in production
|
||||
|
|
@ -1374,6 +1493,11 @@ const start = async () => {
|
|||
await loadRegistry();
|
||||
await loadHistory();
|
||||
await fs.mkdir(DATA_DIR, { recursive: true });
|
||||
// Self-recreate recovery — handles handoff from previous instance that was
|
||||
// killed mid-recreate during a kua-deploy self-deploy. Idempotent.
|
||||
try { await completeSelfRecreate(); } catch (e) {
|
||||
fastify.log.error({ err: e }, 'completeSelfRecreate failed (non-fatal)');
|
||||
}
|
||||
await fastify.listen({ port: 3200, host: '0.0.0.0' });
|
||||
} catch (err) {
|
||||
fastify.log.error(err);
|
||||
|
|
|
|||
Loading…
Reference in New Issue