feat(engine): ensure-checkout before deploy/rollback + rollback to_ref

- ensureCheckout(server, deployDir, repoUrl): clone-if-missing, inside the
  per-app acquireLock, called before deploy() git_pull and before rollback()
  cd. No-op when .git present (asserts origin==repo_url if set); requires
  registry repo_url when absent; refuses to clobber a non-empty non-repo dir.
- rollback(appName, opts): opts.to_ref (validated /^[A-Za-z0-9._/-]+$/,
  rejected before any mutation) checks out that ref; default = previous
  successful tag from deployHistory. fetch now --prune --tags.
- route POST /api/v1/apps/:app/rollback reads body.to_ref.
This commit is contained in:
Kavi Bruno 2026-05-26 03:51:33 -04:00
parent 58733939e2
commit e33b1e96cb
1 changed file with 121 additions and 16 deletions

133
server.js
View File

@ -397,6 +397,49 @@ async function runOnServer(server, cmd, opts = {}) {
}
}
/**
 * ensureCheckout — guarantee deployDir is a usable git checkout before deploy/rollback.
 *
 * The engine historically assumed the repo already existed (`cd ${deployDir} && git fetch`,
 * see deploy()/rollback()); a registered app whose deploy_dir was never cloned failed at the
 * very first `cd`. This makes a first-time API deploy self-heal by cloning from the
 * registry-declared `repo_url`. It is a NO-OP for existing checkouts, so the conforming apps
 * (which carry no `repo_url`) keep working untouched. Runs inside the caller's per-app lock.
 *
 * Clone source is NOT derived from the app name — origins are heterogeneous (Forgejo :2222,
 * scp-style, and at least one GitHub repo whose name differs from the app) — so it MUST come
 * from the registry. The caller still performs its own branch/tag checkout afterwards.
 *
 * Every remote probe fails closed: if a probe does not print one of its two expected
 * sentinels (e.g. the transport to the server failed and stdout is empty), this throws
 * instead of treating the directory as "missing" and proceeding towards a clone.
 *
 * @param {*} server - Target server handle, passed through unchanged to runOnServer().
 * @param {string} deployDir - Checkout directory on the server. Interpolated unquoted,
 *   consistent with the `cd ${deployDir}` usage in deploy()/rollback().
 * @param {string} [repoUrl] - Registry `repo_url`. Required only when deployDir is not yet a
 *   checkout; when given for an existing checkout it is compared against remote.origin.url.
 * @returns {Promise<{cloned: boolean}>} `cloned` is true only when a fresh clone was made.
 * @throws {Error} On a failed/ambiguous probe, an origin mismatch, a missing repo_url, a
 *   non-empty non-repo directory, a failed clone, or a post-clone origin mismatch.
 */
async function ensureCheckout(server, deployDir, repoUrl) {
  // POSIX single-quote escaping: wrap in '...' and spell each embedded ' as '\''.
  const shellQuote = (value) => `'${String(value).replace(/'/g, "'\\''")}'`;

  // Reads the configured origin URL; `|| true` keeps the command successful when unset.
  const readOrigin = async () => {
    const res = await runOnServer(server, `git -C ${deployDir} config --get remote.origin.url || true`);
    return (res.stdout || '').trim();
  };

  const probe = await runOnServer(server, `test -e ${deployDir}/.git && echo REPO || echo MISSING`);
  const probeState = (probe.stdout || '').trim();
  if (probeState !== 'REPO' && probeState !== 'MISSING') {
    // The probe command always prints one sentinel, so anything else means it never ran.
    const detail = probe.stderr ? `: ${probe.stderr}` : '';
    throw new Error(`ensure-checkout: could not probe ${deployDir} — expected REPO or MISSING, got ${JSON.stringify(probeState)}${detail}`);
  }

  if (probeState === 'REPO') {
    // Already a checkout — leave branch/tag selection to the caller. Optionally assert origin.
    if (repoUrl) {
      const actual = await readOrigin();
      if (actual && actual !== repoUrl) {
        throw new Error(`ensure-checkout: ${deployDir} origin (${actual}) != registry repo_url (${repoUrl}) — refusing to deploy a mismatched checkout`);
      }
    }
    return { cloned: false };
  }

  if (!repoUrl) {
    throw new Error(`ensure-checkout: ${deployDir} is not a git checkout and no "repo_url" is set in the registry — cannot clone. Add repo_url to the app's registry entry (or create the checkout manually).`);
  }

  // Refuse to clobber a non-empty, non-repo directory.
  const dirState = await runOnServer(server, `if [ -e ${deployDir} ] && [ -n "$(ls -A ${deployDir} 2>/dev/null)" ]; then echo NONEMPTY; else echo OK; fi`);
  const dirVerdict = (dirState.stdout || '').trim();
  if (dirVerdict === 'NONEMPTY') {
    throw new Error(`ensure-checkout: ${deployDir} exists, is not a git repo, and is non-empty — refusing to clobber. Inspect/remove it manually.`);
  }
  if (dirVerdict !== 'OK') {
    const detail = dirState.stderr ? `: ${dirState.stderr}` : '';
    throw new Error(`ensure-checkout: could not inspect ${deployDir} — expected NONEMPTY or OK, got ${JSON.stringify(dirVerdict)}${detail}`);
  }

  // Quote the URL and end option parsing with `--` so registry data can never be read as
  // shell syntax or as a git option.
  const cloneRes = await runOnServer(server, `git clone -- ${shellQuote(repoUrl)} ${deployDir}`, { timeout: 180000 });
  if (!cloneRes.ok) {
    throw new Error(`ensure-checkout: git clone ${repoUrl} -> ${deployDir} failed: ${cloneRes.stderr}`);
  }

  // Verify the clone landed and origin matches what we asked for.
  const landed = await readOrigin();
  if (landed !== repoUrl) {
    throw new Error(`ensure-checkout: cloned ${deployDir} but origin is ${landed || '(none)'} (expected ${repoUrl})`);
  }
  return { cloned: true };
}
// --- kua-db integration ---
async function kuaDbSafeCheck(app) {
try {
@ -573,6 +616,16 @@ async function deploy(appName, opts = {}) {
// Step 2: Git pull on production server
steps.push({ step: 'git_pull', status: 'running' });
await markProgressPhase(appName, 'git_pull', { action, triggered_by: opts.triggered_by || 'api', steps });
// ensure-checkout (TUBE step 1): self-heal a missing deploy_dir by cloning from the
// registry repo_url, so a first-time API deploy doesn't die at the `cd` below. No-op
// for existing checkouts. Inside the per-app lock acquired above.
try {
const ec = await ensureCheckout(server, deployDir, app.repo_url);
if (ec.cloned) steps[steps.length - 1].cloned = true;
} catch (err) {
steps[steps.length - 1] = { step: 'git_pull', status: 'failed', error: err.message };
throw err;
}
const fetchCmd = `cd ${deployDir} && git fetch --prune ${remote}`;
const fetchRes = await runOnServer(server, fetchCmd, { timeout: 60000 });
if (!fetchRes.ok) {
@ -636,8 +689,8 @@ ${detail}`);
if (verifyMode === 'off') return { ok: true, results: [], skipped: true };
const results = [];
for (const svc of services) {
const exp = await runOnServer(server, `cd ${deployDir} && docker compose images --quiet ${svc} 2>/dev/null | head -1`);
const cid = await runOnServer(server, `cd ${deployDir} && docker compose ps --quiet ${svc} 2>/dev/null | head -1`);
const exp = await runOnServer(server, `cd ${deployDir} && ${kvPrefix} docker compose images --quiet ${svc} 2>/dev/null | head -1`);
const cid = await runOnServer(server, `cd ${deployDir} && ${kvPrefix} docker compose ps --quiet ${svc} 2>/dev/null | head -1`);
const expectedSha = (exp.stdout || '').trim();
const containerId = (cid.stdout || '').trim();
if (!containerId) {
@ -647,7 +700,8 @@ ${detail}`);
const insp = await runOnServer(server, `docker inspect --format '{{.Image}}|{{.State.StartedAt}}' ${containerId}`);
const [actualSha, startedAtStr] = (insp.stdout || '').trim().split('|');
const startedAt = new Date(startedAtStr || 0);
const imageMatch = !!expectedSha && actualSha === expectedSha;
const stripSha = (s) => (s || '').replace(/^sha256:/, '');
const imageMatch = !!expectedSha && stripSha(actualSha) === stripSha(expectedSha);
const freshlyStarted = !isNaN(startedAt) && startedAt >= deployStartTs;
results.push({
service: svc, ok: imageMatch && freshlyStarted,
@ -673,6 +727,15 @@ ${detail}`);
const svcRes = await runOnServer(server, `cd ${deployDir} && docker compose config --services`);
const allServices = svcRes.stdout.split('\n').filter(Boolean);
// Fail-loud (TUBE step 1): if compose resolved NO services, the recreate+verify
// block below is skipped entirely and the deploy would silently report `done`
// having recreated nothing (a false-success path). Refuse it.
if (allServices.length === 0) {
steps[steps.length - 1] = { step: 'deploy', status: 'failed', error: 'docker compose config returned no services — nothing to recreate' };
await markProgressPhase(appName, 'deploy', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
throw new Error('deploy: docker compose config returned no services — refusing to report success without recreating anything');
}
let stateful = prod.stateful_services || [];
if (stateful.length === 0) {
// Auto-detect stateful services from image names so db/redis are never force-recreated
@ -910,7 +973,7 @@ ${detail}`);
// ROLLBACK ENGINE
// =============================================================================
async function rollback(appName) {
async function rollback(appName, opts = {}) {
const app = getApp(appName);
if (!app) throw new Error(`Unknown app: ${appName}`);
@ -919,13 +982,25 @@ async function rollback(appName) {
const deployDir = prod.deploy_dir;
const remote = app.git_remote || 'origin';
// Find the previous successful deploy
// Determine the rollback target. An explicit to_ref (tag/commit/branch, from MCP or admin)
// overrides the default "previous successful tagged deploy" behavior — this preserves the
// MCP deploy.rollback contract (arbitrary to_ref) now that it routes through this endpoint
// instead of the legacy systemd path. Validate to_ref to keep it out of shell-injection
// range (it interpolates into `git checkout` below).
const history = deployHistory[appName] || [];
const current = history[0];
let tag;
if (opts.to_ref) {
if (!/^[A-Za-z0-9._/-]+$/.test(opts.to_ref)) {
throw new Error(`Invalid to_ref ${JSON.stringify(opts.to_ref)} — must match [A-Za-z0-9._/-]+`);
}
tag = opts.to_ref;
} else {
const previous = history.find((d, i) => i > 0 && d.result === 'success' && d.tag && d.tag !== 'untagged');
if (!previous) {
return { app: appName, result: 'no_rollback_target', message: 'No previous successful deploy with a tag found' };
return { app: appName, result: 'no_rollback_target', message: 'No previous successful deploy with a tag found (pass to_ref to roll back to a specific tag/commit)' };
}
tag = previous.tag;
}
if (!acquireLock(appName)) {
@ -933,7 +1008,6 @@ async function rollback(appName) {
}
try {
const tag = previous.tag;
await writeProgress(appName, {
action: 'rollback',
triggered_by: 'api',
@ -944,8 +1018,12 @@ async function rollback(appName) {
rolled_back_from: current?.tag || current?.commit || 'unknown',
});
// Checkout the previous tag on production
const checkoutRes = await runOnServer(server, `cd ${deployDir} && git fetch --prune ${remote} && git checkout ${tag}`, { timeout: 60000 });
// ensure-checkout (TUBE step 1): rollback shares deploy()'s `cd ${deployDir}` assumption,
// so the deploy_dir must exist here too. No-op for existing checkouts.
await ensureCheckout(server, deployDir, app.repo_url);
// Checkout the rollback target on production (--tags so an explicit to_ref tag resolves).
const checkoutRes = await runOnServer(server, `cd ${deployDir} && git fetch --prune --tags ${remote} && git checkout ${tag}`, { timeout: 60000 });
if (!checkoutRes.ok) throw new Error(`Checkout ${tag} failed: ${checkoutRes.stderr}`);
// Rebuild + recreate via transient-container pattern (consistent with deploy()).
@ -1162,6 +1240,29 @@ fastify.post('/webhook/forgejo', async (request, reply) => {
// --- Apps ---
// Registry dump — used by kua-mcp-core to discover all apps at startup
// without relying on a filesystem path that may not resolve inside its container.
// Responds with the current `registry` object, unchanged.
fastify.get('/api/v1/apps/registry', async () => registry);
// Reload the deploy registry from disk WITHOUT restarting the engine (TUBE step 3 —
// kills the cache-restart wart: kua-deploy caches the registry at startup, so a newly
// registered app otherwise needs `docker restart kua-deploy`). loadRegistry() JSON-parses
// the file; on parse failure it throws BEFORE reassigning `registry`, so the in-memory
// last-good registry is preserved. Auth is handled by the global onRequest hook.
fastify.post('/api/v1/registry/reload', async (request, reply) => {
  // Number of apps in whatever `registry` currently points at (0 when `apps` is absent).
  const countApps = () => Object.keys(registry.apps || {}).length;

  const before = countApps();
  try {
    await loadRegistry();
  } catch (err) {
    // Reload failed: answer 500 and report the app count captured before the attempt.
    const error = `registry reload failed (kept last-good ${before} apps): ${err.message}`;
    return reply.code(500).send({ ok: false, error, apps: before });
  }

  const after = countApps();
  fastify.log.info({ before, after }, 'registry reloaded via /api/v1/registry/reload');
  return { ok: true, apps: after, before };
});
// List all apps
fastify.get('/api/v1/apps', async () => {
const results = [];
@ -1257,9 +1358,11 @@ fastify.post('/api/v1/apps/:app/rebuild', async (request, reply) => {
return { triggered: true, app };
});
// Rollback
// Rollback. Optional body { to_ref } rolls back to a specific tag/commit/branch;
// omitted = previous successful tagged deploy.
fastify.post('/api/v1/apps/:app/rollback', async (request) => {
return await rollback(request.params.app);
const to_ref = request.body && typeof request.body.to_ref === 'string' ? request.body.to_ref : undefined;
return await rollback(request.params.app, { to_ref });
});
// --- Deploy Progress ---
@ -1502,9 +1605,11 @@ async function completeSelfRecreate() {
const start = async () => {
try {
// Fail fast if webhook secret is missing in production
// WEBHOOK_SECRET is optional — the Forgejo webhook path is now retired in
// favour of the admin API (/api/v1/apps/:app/deploy). The handler remains
// but returns 503 when the secret is absent, which is safe.
if (!DEV_MODE && !WEBHOOK_SECRET) {
throw new Error('KUA_DEPLOY_WEBHOOK_SECRET must be set in production — refusing to start');
fastify.log.warn('KUA_DEPLOY_WEBHOOK_SECRET not set — /webhook/forgejo will return 503. Set the secret to re-enable Forgejo push triggers.');
}
await loadRegistry();
await loadHistory();