/**
 * ensureCheckout — guarantee `deployDir` is a usable git checkout before deploy/rollback.
 *
 * The engine's deploy/rollback paths start with `cd ${deployDir} && git fetch`, which fails
 * when the directory was never cloned. This helper clones from the registry-declared
 * `repo_url` in that case and is a no-op for directories that already contain `.git`.
 * The clone source is never derived from the app name; it must be passed in by the caller.
 * Branch/tag selection is left to the caller. Callers are expected to hold the per-app lock.
 *
 * @param {*} server - Target passed through unchanged to `runOnServer`.
 * @param {string} deployDir - Remote directory that should hold the checkout. Interpolated
 *   unquoted, matching the `cd ${deployDir}` convention used by the callers.
 * @param {string} [repoUrl] - Clone source. Optional for existing checkouts; when given for
 *   an existing checkout, its `remote.origin.url` must match exactly.
 * @returns {Promise<{cloned: boolean}>} `cloned` is true only when a fresh clone was made.
 * @throws {Error} When the remote probe gives no usable answer, the origin mismatches,
 *   the directory is missing and no `repoUrl` was given, the directory is a non-empty
 *   non-repo, the clone fails, or the post-clone origin check fails.
 */
async function ensureCheckout(server, deployDir, repoUrl) {
  // POSIX single-quote escaping: safe for URLs containing &, ?, spaces, quotes, etc.
  const shellQuote = (value) => `'${String(value).replace(/'/g, "'\\''")}'`;
  const trimmed = (text) => (text || '').trim();

  const probe = await runOnServer(server, `test -e ${deployDir}/.git && echo REPO || echo MISSING`);
  const probeState = trimmed(probe.stdout);
  if (probeState !== 'REPO' && probeState !== 'MISSING') {
    // The probe always echoes one of the two markers when it actually runs; anything else
    // means the remote command itself failed, so do not misreport that as "no checkout".
    throw new Error(`ensure-checkout: could not probe ${deployDir} (${trimmed(probe.stderr) || 'no output'}) — refusing to guess whether it is a git checkout`);
  }

  if (probeState === 'REPO') {
    // Already a checkout — leave branch/tag selection to the caller. Optionally assert origin.
    if (repoUrl) {
      const originRes = await runOnServer(server, `git -C ${deployDir} config --get remote.origin.url || true`);
      const actual = trimmed(originRes.stdout);
      if (actual && actual !== repoUrl) {
        throw new Error(`ensure-checkout: ${deployDir} origin (${actual}) != registry repo_url (${repoUrl}) — refusing to deploy a mismatched checkout`);
      }
    }
    return { cloned: false };
  }

  if (!repoUrl) {
    throw new Error(`ensure-checkout: ${deployDir} is not a git checkout and no "repo_url" is set in the registry — cannot clone. Add repo_url to the app's registry entry (or create the checkout manually).`);
  }

  // Refuse to clobber a non-empty, non-repo directory. Fail closed: only an explicit OK
  // lets the clone proceed.
  const dirState = await runOnServer(server, `if [ -e ${deployDir} ] && [ -n "$(ls -A ${deployDir} 2>/dev/null)" ]; then echo NONEMPTY; else echo OK; fi`);
  const dirAnswer = trimmed(dirState.stdout);
  if (dirAnswer === 'NONEMPTY') {
    throw new Error(`ensure-checkout: ${deployDir} exists, is not a git repo, and is non-empty — refusing to clobber. Inspect/remove it manually.`);
  }
  if (dirAnswer !== 'OK') {
    throw new Error(`ensure-checkout: could not inspect ${deployDir} before cloning (${trimmed(dirState.stderr) || 'no output'})`);
  }

  // `--` stops option parsing so a repo_url beginning with `-` cannot be read as a git flag;
  // quoting keeps shell metacharacters in the URL literal.
  const cloneRes = await runOnServer(server, `git clone -- ${shellQuote(repoUrl)} ${deployDir}`, { timeout: 180000 });
  if (!cloneRes.ok) {
    throw new Error(`ensure-checkout: git clone ${repoUrl} -> ${deployDir} failed: ${cloneRes.stderr}`);
  }

  // Verify the clone landed and origin matches what we asked for.
  const verifyRes = await runOnServer(server, `git -C ${deployDir} config --get remote.origin.url || true`);
  const landed = trimmed(verifyRes.stdout);
  if (landed !== repoUrl) {
    throw new Error(`ensure-checkout: cloned ${deployDir} but origin is ${landed || '(none)'} (expected ${repoUrl})`);
  }
  return { cloned: true };
}
+ try { + const ec = await ensureCheckout(server, deployDir, app.repo_url); + if (ec.cloned) steps[steps.length - 1].cloned = true; + } catch (err) { + steps[steps.length - 1] = { step: 'git_pull', status: 'failed', error: err.message }; + throw err; + } const fetchCmd = `cd ${deployDir} && git fetch --prune ${remote}`; const fetchRes = await runOnServer(server, fetchCmd, { timeout: 60000 }); if (!fetchRes.ok) { @@ -636,8 +689,8 @@ ${detail}`); if (verifyMode === 'off') return { ok: true, results: [], skipped: true }; const results = []; for (const svc of services) { - const exp = await runOnServer(server, `cd ${deployDir} && docker compose images --quiet ${svc} 2>/dev/null | head -1`); - const cid = await runOnServer(server, `cd ${deployDir} && docker compose ps --quiet ${svc} 2>/dev/null | head -1`); + const exp = await runOnServer(server, `cd ${deployDir} && ${kvPrefix} docker compose images --quiet ${svc} 2>/dev/null | head -1`); + const cid = await runOnServer(server, `cd ${deployDir} && ${kvPrefix} docker compose ps --quiet ${svc} 2>/dev/null | head -1`); const expectedSha = (exp.stdout || '').trim(); const containerId = (cid.stdout || '').trim(); if (!containerId) { @@ -647,7 +700,8 @@ ${detail}`); const insp = await runOnServer(server, `docker inspect --format '{{.Image}}|{{.State.StartedAt}}' ${containerId}`); const [actualSha, startedAtStr] = (insp.stdout || '').trim().split('|'); const startedAt = new Date(startedAtStr || 0); - const imageMatch = !!expectedSha && actualSha === expectedSha; + const stripSha = (s) => (s || '').replace(/^sha256:/, ''); + const imageMatch = !!expectedSha && stripSha(actualSha) === stripSha(expectedSha); const freshlyStarted = !isNaN(startedAt) && startedAt >= deployStartTs; results.push({ service: svc, ok: imageMatch && freshlyStarted, @@ -673,6 +727,15 @@ ${detail}`); const svcRes = await runOnServer(server, `cd ${deployDir} && docker compose config --services`); const allServices = 
svcRes.stdout.split('\n').filter(Boolean); + // Fail-loud (TUBE step 1): if compose resolved NO services, the recreate+verify + // block below is skipped entirely and the deploy would silently report `done` + // having recreated nothing (a false-success path). Refuse it. + if (allServices.length === 0) { + steps[steps.length - 1] = { step: 'deploy', status: 'failed', error: 'docker compose config returned no services — nothing to recreate' }; + await markProgressPhase(appName, 'deploy', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit }); + throw new Error('deploy: docker compose config returned no services — refusing to report success without recreating anything'); + } + let stateful = prod.stateful_services || []; if (stateful.length === 0) { // Auto-detect stateful services from image names so db/redis are never force-recreated @@ -910,7 +973,7 @@ ${detail}`); // ROLLBACK ENGINE // ============================================================================= -async function rollback(appName) { +async function rollback(appName, opts = {}) { const app = getApp(appName); if (!app) throw new Error(`Unknown app: ${appName}`); @@ -919,13 +982,25 @@ async function rollback(appName) { const deployDir = prod.deploy_dir; const remote = app.git_remote || 'origin'; - // Find the previous successful deploy + // Determine the rollback target. An explicit to_ref (tag/commit/branch, from MCP or admin) + // overrides the default "previous successful tagged deploy" behavior — this preserves the + // MCP deploy.rollback contract (arbitrary to_ref) now that it routes through this endpoint + // instead of the legacy systemd path. Validate to_ref to keep it out of shell-injection + // range (it interpolates into `git checkout` below). 
const history = deployHistory[appName] || []; const current = history[0]; - const previous = history.find((d, i) => i > 0 && d.result === 'success' && d.tag && d.tag !== 'untagged'); - - if (!previous) { - return { app: appName, result: 'no_rollback_target', message: 'No previous successful deploy with a tag found' }; + let tag; + if (opts.to_ref) { + if (!/^[A-Za-z0-9._/-]+$/.test(opts.to_ref)) { + throw new Error(`Invalid to_ref ${JSON.stringify(opts.to_ref)} — must match [A-Za-z0-9._/-]+`); + } + tag = opts.to_ref; + } else { + const previous = history.find((d, i) => i > 0 && d.result === 'success' && d.tag && d.tag !== 'untagged'); + if (!previous) { + return { app: appName, result: 'no_rollback_target', message: 'No previous successful deploy with a tag found (pass to_ref to roll back to a specific tag/commit)' }; + } + tag = previous.tag; } if (!acquireLock(appName)) { @@ -933,7 +1008,6 @@ async function rollback(appName) { } try { - const tag = previous.tag; await writeProgress(appName, { action: 'rollback', triggered_by: 'api', @@ -944,8 +1018,12 @@ async function rollback(appName) { rolled_back_from: current?.tag || current?.commit || 'unknown', }); - // Checkout the previous tag on production - const checkoutRes = await runOnServer(server, `cd ${deployDir} && git fetch --prune ${remote} && git checkout ${tag}`, { timeout: 60000 }); + // ensure-checkout (TUBE step 1): rollback shares deploy()'s `cd ${deployDir}` assumption, + // so the deploy_dir must exist here too. No-op for existing checkouts. + await ensureCheckout(server, deployDir, app.repo_url); + + // Checkout the rollback target on production (--tags so an explicit to_ref tag resolves). 
// Registry dump — lets kua-mcp-core discover every app at startup over HTTP instead of
// depending on a filesystem path that may not resolve inside its container.
fastify.get('/api/v1/apps/registry', async () => registry);

// Re-read the deploy registry from disk without restarting the engine, so a newly
// registered app does not need a `docker restart kua-deploy`.
// NOTE(review): loadRegistry() is defined elsewhere; the "last-good registry is kept on
// failure" wording below relies on it throwing before it reassigns `registry` — confirm.
// Auth is expected to be enforced by the global onRequest hook (not visible here).
fastify.post('/api/v1/registry/reload', async (request, reply) => {
  const countApps = () => Object.keys(registry.apps || {}).length;
  const before = countApps();

  try {
    await loadRegistry();
  } catch (err) {
    const error = `registry reload failed (kept last-good ${before} apps): ${err.message}`;
    return reply.code(500).send({ ok: false, error, apps: before });
  }

  const after = countApps();
  fastify.log.info({ before, after }, 'registry reloaded via /api/v1/registry/reload');
  return { ok: true, apps: after, before };
});
// Rollback. Optional JSON body { to_ref } rolls back to a specific tag/commit/branch;
// when to_ref is omitted (or null) rollback() picks the previous successful tagged deploy.
// A to_ref that is present but not a string is rejected with 400 instead of being silently
// dropped: dropping it would roll production back to a target the caller did not ask for.
fastify.post('/api/v1/apps/:app/rollback', async (request, reply) => {
  const requestedRef = request.body?.to_ref;
  if (requestedRef != null && typeof requestedRef !== 'string') {
    return reply.code(400).send({ ok: false, error: 'to_ref must be a string (tag, commit, or branch)' });
  }
  const to_ref = typeof requestedRef === 'string' ? requestedRef : undefined;
  return await rollback(request.params.app, { to_ref });
});