import Fastify from 'fastify';
import fs from 'fs/promises';
import path from 'path';
import crypto from 'crypto';
import http from 'http';
import { exec as execCb, execFile as execFileCb, spawn } from 'child_process';
import { promisify } from 'util';

const exec = promisify(execCb);
const execFile = promisify(execFileCb);

// The app name this kua-deploy reports itself as in deploy-registry.json.
// Used to detect "I am being deployed" cases so we can do the self-recreate handoff.
const SELF_APP_NAME = 'kua-deploy';

// Input validation
const SAFE_MESSAGE_RE = /^[a-zA-Z0-9 _.,!?:;@#/()[\]{}<>='"+*&^%$~`|-]{1,500}$/;

/**
 * Validate a release message.
 *
 * @param {unknown} msg - Caller-supplied message.
 * @returns {string} `msg` when it matches SAFE_MESSAGE_RE, or the default
 *   'Release to production' when `msg` is empty or not a string.
 * @throws {Error} When `msg` is a non-empty string that fails SAFE_MESSAGE_RE.
 */
function validateMessage(msg) {
  if (!msg || typeof msg !== 'string') return 'Release to production';
  if (!SAFE_MESSAGE_RE.test(msg)) {
    throw new Error('Invalid message: only printable ASCII allowed, max 500 chars');
  }
  return msg;
}

// --- Configuration ---
const DATA_DIR = path.join(process.cwd(), 'data');
const LOG_DIR = path.join(process.cwd(), 'logs');
const AUDIT_LOG_FILE = path.join(LOG_DIR, 'audit.log');
const DEPLOY_HISTORY_FILE = path.join(DATA_DIR, 'deploys.json');
const REGISTRY_FILE = path.join(process.cwd(), 'deploy-registry.json');
const ADMIN_TOKEN = process.env.KUA_DEPLOY_ADMIN_TOKEN;
const TAILSCALE_SOCKET = '/var/run/tailscale/tailscaled.sock';
const HOSTNAME = process.env.HOSTNAME || 'gal';
const KUA_DB_URL = process.env.KUA_DB_URL || 'http://localhost:3100';
const KUA_DB_TOKEN = process.env.KUA_DB_ADMIN_TOKEN || ADMIN_TOKEN;
const WEBHOOK_SECRET = process.env.KUA_DEPLOY_WEBHOOK_SECRET || '';
const DEV_MODE = process.env.NODE_ENV !== 'production';

// Per-app webhook rate limiter — max 5 triggers per 60s to prevent spam
const webhookRateLimiter = new Map(); // app -> [timestamps]
const WEBHOOK_RATE_LIMIT = 5;
const WEBHOOK_RATE_WINDOW_MS = 60_000;

/**
 * Sliding-window rate limit check for webhook triggers.
 *
 * @param {string} app - Key the hits are tracked under.
 * @returns {boolean} true (and records the hit) when fewer than
 *   WEBHOOK_RATE_LIMIT hits fall inside the last WEBHOOK_RATE_WINDOW_MS;
 *   false otherwise (the rejected hit is not recorded).
 */
function checkWebhookRateLimit(app) {
  const now = Date.now();
  const hits = (webhookRateLimiter.get(app) || []).filter(t => now - t < WEBHOOK_RATE_WINDOW_MS);
  if (hits.length >= WEBHOOK_RATE_LIMIT) return false;
  hits.push(now);
  webhookRateLimiter.set(app, hits);
  return true;
}

const ALLOWED_NODES = new Set(
  (process.env.KUA_ALLOWED_NODES || 'gal,bruno,genesis').split(',').map(s => s.trim()),
);

/**
 * Decide whether a Tailscale identity may use the API.
 *
 * @param {{ hostname: string, tags?: string[] }} tsIdentity - Result of tailscaleWhois().
 * @returns {boolean} true when the node carries `tag:admin` or its hostname is in ALLOWED_NODES.
 */
function isAuthorizedNode(tsIdentity) {
  if (tsIdentity.tags?.includes('tag:admin')) return true;
  if (ALLOWED_NODES.has(tsIdentity.hostname)) return true;
  return false;
}

const fastify = Fastify({
  logger: true,
});

// Override default JSON parser to capture rawBody for webhook HMAC verification.
// We store it on both req.rawBody (legacy) and req.raw.rawBody so both access paths work.
fastify.addContentTypeParser('application/json', { parseAs: 'buffer' }, (req, body, done) => {
  const raw = body.toString('utf-8');
  req.rawBody = raw;
  req.raw.rawBody = raw;
  try {
    done(null, JSON.parse(raw));
  } catch (err) {
    // Without an explicit status Fastify reports parser failures as 500;
    // malformed JSON is a client error.
    err.statusCode = 400;
    done(err, undefined);
  }
});

// --- Deploy locks (prevent concurrent deploys per app) ---
const deployLocks = new Map();
const LOCK_TTL_MS = 20 * 60 * 1000; // 20 minutes — auto-expire stale locks from crashed deploys

/**
 * Try to take the in-memory deploy lock for an app.
 *
 * @param {string} app
 * @returns {boolean} false when a lock younger than LOCK_TTL_MS exists;
 *   true after storing a fresh lock (an expired lock is replaced with a warning).
 */
function acquireLock(app) {
  const lock = deployLocks.get(app);
  if (lock) {
    if (Date.now() - lock.acquiredAt < LOCK_TTL_MS) return false;
    fastify.log.warn(`Clearing expired deploy lock for ${app} (${Math.round((Date.now() - lock.acquiredAt) / 1000)}s old)`);
  }
  deployLocks.set(app, { acquiredAt: Date.now(), deployId: crypto.randomUUID() });
  return true;
}

/** Drop the deploy lock for `app` (no-op when none is held). */
function releaseLock(app) {
  deployLocks.delete(app);
}

// Returns the active deploy ID for the app, or null if no lock held.
function getDeployId(app) {
  return deployLocks.get(app)?.deployId ?? null;
}

// --- Load Registry ---
let registry = { apps: {} };

/**
 * Read and parse REGISTRY_FILE into the module-level `registry`.
 * Read and parse errors propagate to the caller.
 */
async function loadRegistry() {
  const data = await fs.readFile(REGISTRY_FILE, 'utf-8');
  registry = JSON.parse(data);
  fastify.log.info(`Registry loaded: ${Object.keys(registry.apps).length} apps`);
}

/** @returns {object|null} Registry entry for `name`, or null when absent. */
function getApp(name) {
  return registry.apps[name] || null;
}

/** @returns {string[]} Names of all registered apps. */
function getAllApps() {
  return Object.keys(registry.apps);
}

// --- Deploy History ---
let deployHistory = {};

/** Load DEPLOY_HISTORY_FILE; any read/parse failure resets history to {}. */
async function loadHistory() {
  try {
    const data = await fs.readFile(DEPLOY_HISTORY_FILE, 'utf-8');
    deployHistory = JSON.parse(data);
  } catch {
    deployHistory = {};
  }
}

/** Persist the in-memory deploy history, creating DATA_DIR if needed. */
async function saveHistory() {
  await fs.mkdir(DATA_DIR, { recursive: true });
  await fs.writeFile(DEPLOY_HISTORY_FILE, JSON.stringify(deployHistory, null, 2), 'utf-8');
}

/**
 * Prepend a deploy entry (stamped with `timestamp` and a short `id`) to the
 * app's history and persist it in the background.
 *
 * @param {string} app
 * @param {object} entry - Fields describing the deploy outcome.
 */
function recordDeploy(app, entry) {
  if (!deployHistory[app]) deployHistory[app] = [];
  deployHistory[app].unshift({
    ...entry,
    timestamp: new Date().toISOString(),
    id: crypto.randomUUID().slice(0, 8),
  });
  // Keep last 50 deploys per app
  if (deployHistory[app].length > 50) deployHistory[app] = deployHistory[app].slice(0, 50);
  // Best-effort persistence: never fail the deploy over history, but do not
  // hide the failure either.
  saveHistory().catch((err) => {
    fastify.log.error(`Deploy history write failed: ${err.message}`);
  });
}

/** @returns {object|undefined} Most recent history entry with result 'success'. */
function getLastSuccessfulDeploy(app) {
  return (deployHistory[app] || []).find(d => d.result === 'success');
}

/** @returns {string} Path of the per-app progress file inside DATA_DIR. */
function progressFilePath(app) {
  return path.join(DATA_DIR, `progress-${app}.json`);
}

/** @returns {Promise<object|null>} Parsed progress file, or null when missing/unreadable. */
async function readProgress(app) {
  try {
    const raw = await fs.readFile(progressFilePath(app), 'utf-8');
    return JSON.parse(raw);
  } catch {
    return null;
  }
}

/**
 * Merge `patch` into the app's progress file and write it back.
 * `updated_at` is always refreshed; `started_at` is set once (epoch seconds).
 *
 * @param {string} app
 * @param {object} patch - Fields to overlay on the stored progress.
 * @returns {Promise<object>} The progress object that was written.
 */
async function writeProgress(app, patch) {
  const now = Math.floor(Date.now() / 1000);
  const current = (await readProgress(app)) || {};
  const next = {
    app,
    ...current,
    ...patch,
    updated_at: now,
  };
  if (!next.started_at) next.started_at = now;
  await fs.mkdir(DATA_DIR, { recursive: true });
  await fs.writeFile(progressFilePath(app), JSON.stringify(next, null, 2), 'utf-8');
  return next;
}

/**
 * Record a phase transition: sets `phase` and `current_step` to `phase`,
 * defaults `status` to 'running', then overlays `patch`.
 */
async function markProgressPhase(app, phase, patch = {}) {
  return writeProgress(app, {
    phase,
    current_step: phase,
    status: patch.status || 'running',
    ...patch,
  });
}

// --- Tailscale Whois ---
/**
 * Resolve a peer address to its Tailscale identity via the local tailscaled API.
 *
 * @param {string} remoteAddr - "ip:port" of the peer.
 * @returns {Promise<{stableId: string, hostname: string, tags: string[], user: string}|null>}
 *   The identity, or null on timeout (2s), socket error, unparsable response,
 *   or a response without a `Node`.
 */
async function tailscaleWhois(remoteAddr) {
  return new Promise((resolve) => {
    let req = null;
    const timeout = setTimeout(() => {
      resolve(null);
      // Tear the request down so a hung tailscaled does not leak a socket per
      // request; the resulting 'error' event is a no-op (already resolved).
      req?.destroy();
    }, 2000);
    req = http.request({
      socketPath: TAILSCALE_SOCKET,
      path: `/localapi/v0/whois?addr=${encodeURIComponent(remoteAddr)}`,
      method: 'GET',
    }, (res) => {
      let data = '';
      res.on('data', (chunk) => { data += chunk; });
      res.on('end', () => {
        clearTimeout(timeout);
        try {
          const parsed = JSON.parse(data);
          const node = parsed.Node;
          const user = parsed.UserProfile;
          if (!node) return resolve(null);
          resolve({
            stableId: node.StableID || '',
            hostname: node.ComputedName || node.Hostinfo?.Hostname || '',
            tags: node.Tags || [],
            user: user?.LoginName || '',
          });
        } catch {
          resolve(null);
        }
      });
    });
    req.on('error', () => {
      clearTimeout(timeout);
      resolve(null);
    });
    req.end();
  });
}

// --- Auth Hook ---
const LOOPBACK_ADDRESSES = new Set(['127.0.0.1', '::1', '::ffff:127.0.0.1']);
// RFC 1918 172.16.0.0/12 only. A bare '172.' prefix also matches publicly
// routable ranges (172.0-15.x.x, 172.32-255.x.x), which must not be treated
// as local. NOTE(review): presumably this range is trusted for Docker bridge
// peers — confirm no deployment relies on a 172.x network outside /12.
const PRIVATE_172_RE = /^172\.(1[6-9]|2\d|3[01])\./;

/** @returns {boolean} true for loopback addresses and 172.16.0.0/12 peers. */
function isTrustedLocalAddress(ip) {
  if (typeof ip !== 'string') return false;
  return LOOPBACK_ADDRESSES.has(ip) || PRIVATE_172_RE.test(ip);
}

// Authentication order: open endpoints -> trusted local address -> Tailscale
// identity -> bearer admin token. Anything else gets 401.
fastify.addHook('onRequest', async (request, reply) => {
  if (request.url === '/health') return;
  // Webhook endpoint uses its own auth (HMAC signature verification inside the handler)
  if (request.url === '/webhook/forgejo') return;

  if (isTrustedLocalAddress(request.ip)) {
    request.identity = { stableId: 'local', hostname: HOSTNAME, tags: ['tag:admin'], user: 'local' };
    return;
  }

  const remoteAddr = request.ip + ':' + (request.socket.remotePort || 0);
  const tsIdentity = await tailscaleWhois(remoteAddr);
  if (tsIdentity) {
    if (!isAuthorizedNode(tsIdentity)) {
      return reply.code(403).send({ error: `Node '${tsIdentity.hostname}' not authorized` });
    }
    request.identity = tsIdentity;
    return;
  }

  const authHeader = request.headers.authorization;
  const providedToken = authHeader?.split('Bearer ')[1];
  if (providedToken && ADMIN_TOKEN) {
    const bufP = Buffer.from(providedToken);
    const bufA = Buffer.from(ADMIN_TOKEN);
    // timingSafeEqual throws on length mismatch, so compare lengths first.
    if (bufP.length === bufA.length && crypto.timingSafeEqual(bufP, bufA)) {
      request.identity = { stableId: 'admin-token', hostname: 'admin', tags: ['tag:admin'], user: 'admin' };
      return;
    }
  }
  return reply.code(401).send({ error: 'Unauthorized' });
});

// --- Audit ---
/**
 * Append a timestamped JSON line to the audit log. Failures are logged and
 * never thrown, so auditing cannot break the calling operation.
 */
async function audit(entry) {
  try {
    await fs.mkdir(LOG_DIR, { recursive: true });
    await fs.appendFile(AUDIT_LOG_FILE, JSON.stringify({ ...entry, timestamp: new Date().toISOString() }) + '\n', 'utf-8');
  } catch (err) {
    fastify.log.error(`Audit write failed: ${err.message}`);
  }
}

// --- Shell helpers ---
/** @returns {boolean} true when the host part of `server` ("user@host" or "host") equals HOSTNAME. */
function isLocal(server) {
  const host = server.includes('@') ? server.split('@')[1] : server;
  return host === HOSTNAME;
}

/** @returns {string} Known Tailscale IP for the host part of `server`, or '' when unknown. */
function tailscaleIpForServer(server) {
  const host = server.includes('@') ? server.split('@')[1] : server;
  const ips = {
    bruno: '100.74.17.6',
    gal: '100.122.129.114',
  };
  return ips[host] || '';
}

/** @returns {string} `TAILSCALE_IP=<ip> ` shell prefix for `server`, or '' when no IP is known. */
function composeEnvPrefix(server) {
  const tailscaleIp = tailscaleIpForServer(server);
  return tailscaleIp ? `TAILSCALE_IP=${tailscaleIp} ` : '';
}

// recreateService — spawn a one-shot transient docker:cli container that runs
// `docker compose up -d` against the host docker socket. The transient container
// is OUTSIDE the lifecycle of the service it recreates, so even when the target
// is kua-deploy itself, the recreate completes (the transient survives kua-deploy
// being stopped/started; the docker daemon does the actual work).
//
// Bind paths MUST be identical between host and transient container
// (e.g. -v /root/apps/X:/root/apps/X) so compose's path resolution matches host
// reality. Secrets are pre-fetched via kua-vault into a private --env-file rather
// than being passed on the docker run command line.
//
// Returns the same { ok, stdout, stderr, error? } shape as run()/runOnServer()
// so call sites can swap with minimal change.
async function recreateService({ project, // compose project name (basename(deployDir) typically) deployDir, // absolute path to the deploy dir on the docker host services, // array of service names to recreate force = true, // pass --force-recreate vault = null, // { project, env } — if set, fetch secrets via kua-vault export server = 'bruno', // for TAILSCALE_IP env var composeFile = 'docker-compose.yml', timeout = 300000, } = {}) { if (!Array.isArray(services) || services.length === 0) { return { ok: true, stdout: '', stderr: '', error: null, skipped: true }; } // Stage env file on the kua-deploy data volume so the docker CLI (running // inside kua-deploy) can read it. The transient container picks up vars via // --env-file processed at submit time — no host-side mount needed. const tmpName = `.env-recreate-${crypto.randomBytes(8).toString('hex')}`; const envFilePath = `/app/data/${tmpName}`; let envFileWritten = false; try { if (vault && vault.project) { const envEnv = vault.env || 'prod'; // kua-vault export emits KEY=VALUE lines — directly compatible with --env-file const exportRes = await run(`kua-vault export --project ${vault.project} --env ${envEnv}`, { timeout: 30000 }); if (!exportRes.ok) { return { ok: false, stdout: '', stderr: `kua-vault export ${vault.project}/${envEnv} failed: ${exportRes.stderr?.slice(-300) || exportRes.error}`, error: 'vault export failed' }; } // Strip any non KEY=VALUE lines (e.g. 
status banners) and validate const envLines = exportRes.stdout .split('\n') .filter(l => /^[A-Z_][A-Z0-9_]*=/.test(l)); if (envLines.length === 0) { return { ok: false, stdout: '', stderr: `kua-vault export returned no KEY=VALUE lines for ${vault.project}/${envEnv}`, error: 'empty vault export' }; } await fs.writeFile(envFilePath, envLines.join('\n') + '\n', { mode: 0o600 }); envFileWritten = true; } // Build docker run args const runArgs = [ 'run', '--rm', '-v', '/var/run/docker.sock:/var/run/docker.sock', '-v', `${deployDir}:${deployDir}`, '-w', deployDir, ]; const tailscaleIp = tailscaleIpForServer(server); if (tailscaleIp) runArgs.push('-e', `TAILSCALE_IP=${tailscaleIp}`); if (envFileWritten) runArgs.push('--env-file', envFilePath); runArgs.push('docker:cli'); // Compose command (transient container will run it) runArgs.push('docker', 'compose', '-p', project, '-f', composeFile, 'up', '-d', '--no-deps', '--remove-orphans'); if (force) runArgs.push('--force-recreate'); runArgs.push(...services); return await new Promise((resolve) => { const child = spawn('docker', runArgs); let stdout = ''; let stderr = ''; const tHandle = setTimeout(() => { try { child.kill('SIGKILL'); } catch (_) { /* ignore */ } }, timeout); child.stdout.on('data', d => { stdout += d.toString(); }); child.stderr.on('data', d => { stderr += d.toString(); }); child.on('close', (code) => { clearTimeout(tHandle); if (code === 0) resolve({ ok: true, stdout: stdout.trim(), stderr: stderr.trim() }); else resolve({ ok: false, stdout: stdout.trim(), stderr: stderr.trim(), error: `docker run exit ${code}` }); }); child.on('error', (err) => { clearTimeout(tHandle); resolve({ ok: false, stdout: '', stderr: String(err?.message || err), error: 'spawn failed' }); }); }); } finally { if (envFileWritten) { try { await fs.unlink(envFilePath); } catch (_) { /* ignore */ } } } } async function run(cmd, opts = {}) { const timeout = opts.timeout || 30000; try { const { stdout, stderr } = await exec(cmd, { timeout 
}); return { ok: true, stdout: stdout.trim(), stderr: stderr.trim() }; } catch (err) { return { ok: false, stdout: err.stdout?.trim() || '', stderr: err.stderr?.trim() || '', error: err.message }; } } // Allow only simple host strings: optional user@, then hostname/IP with dots/hyphens only. const SAFE_HOST_RE = /^([a-zA-Z0-9._-]+@)?[a-zA-Z0-9][a-zA-Z0-9._-]*$/; async function runOnServer(server, cmd, opts = {}) { if (!SAFE_HOST_RE.test(server)) throw new Error(`Unsafe server name rejected: ${JSON.stringify(server)}`); if (isLocal(server)) return run(cmd, opts); // Use execFile to avoid shell interpretation of server/cmd const timeout = opts.timeout || 30000; try { const { stdout, stderr } = await execFile( 'ssh', ['-o', 'StrictHostKeyChecking=no', server, cmd], { timeout }, ); return { ok: true, stdout: stdout.trim(), stderr: stderr.trim() }; } catch (err) { return { ok: false, stdout: err.stdout?.trim() || '', stderr: err.stderr?.trim() || '', error: err.message }; } } // --- kua-db integration --- async function kuaDbSafeCheck(app) { try { const res = await fetch(`${KUA_DB_URL}/api/v1/migrations/${encodeURIComponent(app)}/safe-to-deploy?env=production`, { headers: KUA_DB_TOKEN ? { Authorization: `Bearer ${KUA_DB_TOKEN}` } : {}, signal: AbortSignal.timeout(60000), }); if (!res.ok) return { safe: false, reason: `kua-db returned ${res.status} — blocking deploy (use --force to skip)` }; return await res.json(); } catch { return { safe: false, reason: 'kua-db unreachable — blocking deploy (use --force to skip)' }; } } async function kuaDbMigrate(app) { try { const res = await fetch(`${KUA_DB_URL}/api/v1/migrations/${app}/apply`, { method: 'POST', headers: { 'Content-Type': 'application/json', ...(KUA_DB_TOKEN ? 
{ Authorization: `Bearer ${KUA_DB_TOKEN}` } : {}) }, body: JSON.stringify({ env: 'production' }), signal: AbortSignal.timeout(120000), }); return await res.json(); } catch (err) { return { result: 'error', error: err.message }; } } // ============================================================================= // RELEASE ENGINE // ============================================================================= async function release(appName, message = 'Release to production', opts = {}) { const app = getApp(appName); if (!app) throw new Error(`Unknown app: ${appName}`); message = validateMessage(message); const repoDir = app.repo_dir; const remote = app.git_remote; const sourceBranch = opts.source_branch ?? app.source_branch; const deployBranch = opts.target_branch ?? app.deploy_branch; const steps = []; // Check clean worktree const dirty = await run(`git -C ${repoDir} status --porcelain --ignore-submodules=dirty`); if (dirty.stdout) { throw new Error(`${repoDir} has uncommitted changes — commit or stash first`); } // Fetch steps.push({ step: 'fetch', status: 'running' }); const fetchResult = await run(`git -C ${repoDir} fetch ${remote}`, { timeout: 30000 }); if (!fetchResult.ok) throw new Error(`git fetch failed: ${fetchResult.stderr}`); steps[steps.length - 1] = { step: 'fetch', status: 'done' }; // Push source branch steps.push({ step: 'push_source', status: 'running' }); await run(`git -C ${repoDir} checkout ${sourceBranch}`); const pushResult = await run(`git -C ${repoDir} push ${remote} ${sourceBranch}`, { timeout: 60000 }); if (!pushResult.ok) throw new Error(`push ${sourceBranch} failed: ${pushResult.stderr}`); steps[steps.length - 1] = { step: 'push_source', status: 'done' }; // Get source commit const headResult = await run(`git -C ${repoDir} rev-parse --short HEAD`); const sourceCommit = headResult.stdout; // Merge to deploy branch (use execFile to avoid shell injection via message) steps.push({ step: 'merge', status: 'running' }); const branchExists = 
await run(`git -C ${repoDir} ls-remote --exit-code --heads ${remote} ${deployBranch}`); if (branchExists.ok) { await run(`git -C ${repoDir} checkout ${deployBranch}`); try { await execFile('git', ['-C', repoDir, 'merge', `${remote}/${sourceBranch}`, '-m', `[RELEASE] ${message}`], { timeout: 30000 }); } catch (err) { throw new Error(`merge failed: ${err.stderr || err.message}`); } } else { await run(`git -C ${repoDir} checkout -B ${deployBranch} ${remote}/${sourceBranch}`); } steps[steps.length - 1] = { step: 'merge', status: 'done' }; // Tag (use execFile to avoid shell injection via message) const tag = `prod-${new Date().toISOString().replace(/[-:T]/g, '').slice(0, 14)}`; try { await execFile('git', ['-C', repoDir, 'tag', '-a', tag, '-m', `[RELEASE] ${message}`], { timeout: 10000 }); } catch (err) { throw new Error(`tag failed: ${err.stderr || err.message}`); } // Push deploy branch + tags steps.push({ step: 'push_deploy', status: 'running' }); const pushDeploy = await run(`git -C ${repoDir} push ${remote} ${deployBranch} --tags`, { timeout: 60000 }); if (!pushDeploy.ok) throw new Error(`push ${deployBranch} failed: ${pushDeploy.stderr}`); steps[steps.length - 1] = { step: 'push_deploy', status: 'done' }; // Return to source branch await run(`git -C ${repoDir} checkout ${sourceBranch}`); await audit({ action: 'release', app: appName, tag, commit: sourceCommit, message }); return { app: appName, result: 'released', tag, commit: sourceCommit, message, deploy_mode: app.deploy_mode, steps, }; } // ============================================================================= // DEPLOY ENGINE // ============================================================================= async function deploy(appName, opts = {}) { const app = getApp(appName); if (!app) throw new Error(`Unknown app: ${appName}`); const prod = app.production; if (!prod) throw new Error(`${appName} has no production config`); if (!acquireLock(appName)) { return { app: appName, result: 'locked', message: 
'Deploy already in progress' }; } const steps = []; let finalResult = 'success'; const action = opts.action || 'deploy'; try { const server = prod.server; const deployDir = prod.deploy_dir; const remote = app.git_remote || 'origin'; const deployBranch = app.deploy_branch; await writeProgress(appName, { action, triggered_by: opts.triggered_by || 'api', status: 'running', phase: 'started', current_step: 'started', server, steps, }); // Step 1: kua-db safe-to-deploy check (if app has migrations) if (prod.has_migrations && !opts.force) { steps.push({ step: 'db_safety', status: 'running' }); await markProgressPhase(appName, 'db_safety', { action, triggered_by: opts.triggered_by || 'api', steps }); const safety = await kuaDbSafeCheck(appName); if (!safety.safe) { steps[steps.length - 1] = { step: 'db_safety', status: 'blocked', reasons: safety.reasons || [safety.reason] }; finalResult = 'blocked'; await writeProgress(appName, { action, triggered_by: opts.triggered_by || 'api', status: 'blocked', phase: 'db_safety_blocked', current_step: 'db_safety', result: 'blocked', reason: 'db_safety', steps, finished_at: Math.floor(Date.now() / 1000), }); recordDeploy(appName, { result: 'blocked', reason: 'db_safety', steps, action, triggered_by: opts.triggered_by || 'api' }); return { app: appName, result: 'blocked', steps }; } steps[steps.length - 1] = { step: 'db_safety', status: 'passed', note: safety.reason || 'safe' }; await markProgressPhase(appName, 'db_safety_passed', { action, triggered_by: opts.triggered_by || 'api', steps }); } // Step 2: Git pull on production server steps.push({ step: 'git_pull', status: 'running' }); await markProgressPhase(appName, 'git_pull', { action, triggered_by: opts.triggered_by || 'api', steps }); const fetchCmd = `cd ${deployDir} && git fetch --prune ${remote}`; const fetchRes = await runOnServer(server, fetchCmd, { timeout: 60000 }); if (!fetchRes.ok) { steps[steps.length - 1] = { step: 'git_pull', status: 'failed', error: fetchRes.stderr }; 
throw new Error(`git fetch failed on ${server}: ${fetchRes.stderr}`); } const checkoutCmd = `cd ${deployDir} && git checkout -B ${deployBranch} ${remote}/${deployBranch}`; const checkoutRes = await runOnServer(server, checkoutCmd, { timeout: 30000 }); if (!checkoutRes.ok) { steps[steps.length - 1] = { step: 'git_pull', status: 'failed', error: checkoutRes.stderr }; throw new Error(`git checkout failed on ${server}: ${checkoutRes.stderr}`); } // Get current commit const commitRes = await runOnServer(server, `cd ${deployDir} && git rev-parse --short HEAD`); const deployCommit = commitRes.stdout; steps[steps.length - 1] = { step: 'git_pull', status: 'done', commit: deployCommit }; await markProgressPhase(appName, 'git_pull_done', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit }); // Step 3: Pre-build dirty check — untracked/uncommitted files silently pollute COPY . . const dirtyRes = await runOnServer(server, `cd ${deployDir} && git status --porcelain`); if (!dirtyRes.ok || dirtyRes.stdout.trim()) { const detail = dirtyRes.stdout.trim() || dirtyRes.stderr.trim() || 'git status failed'; throw new Error(`Working tree is dirty — clean up before deploying: ${detail}`); } // Step 3: Docker build steps.push({ step: 'build', status: 'running' }); await markProgressPhase(appName, 'build', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit }); const kvPrefix = prod.vault ? 
`kua-vault run --project ${prod.vault.project} --env ${prod.vault.env} --` : ''; const envPrefix = composeEnvPrefix(server); const buildCmd = `cd ${deployDir} && ${envPrefix}${kvPrefix} docker compose build`; const buildRes = await runOnServer(server, buildCmd, { timeout: 600000 }); if (!buildRes.ok) { steps[steps.length - 1] = { step: 'build', status: 'failed', error: buildRes.stderr?.slice(-500) }; throw new Error('docker compose build failed'); } steps[steps.length - 1] = { step: 'build', status: 'done' }; await markProgressPhase(appName, 'build_done', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit }); // ------------------------------------------------------------------ // Post-deploy verification (added 2026-05-20 after the false-success // bug that left muralla on a stale image for ~22h: kua-deploy marked // step:deploy=done while the container kept running the prior image). // For each STATELESS service (the ones we just told compose to // --force-recreate), assert that: // (i) the container's `.Image` SHA equals what `docker compose // images --quiet ` reports as the expected image, AND // (ii) the container's `.State.StartedAt` is newer than the // timestamp captured *before* the `up -d` call. // Either failing means the recreate did not actually take. Stateful // services are intentionally NOT image-checked (they shouldn't have // been recreated); we only assert they are running. // Mode is env-configurable: KUA_DEPLOY_VERIFY=error|warn|off. 
const verifyMode = (process.env.KUA_DEPLOY_VERIFY || 'error').toLowerCase(); async function verifyStatelessRecreated(server, deployDir, services, deployStartTs) { if (verifyMode === 'off') return { ok: true, results: [], skipped: true }; const results = []; for (const svc of services) { const exp = await runOnServer(server, `cd ${deployDir} && docker compose images --quiet ${svc} 2>/dev/null | head -1`); const cid = await runOnServer(server, `cd ${deployDir} && docker compose ps --quiet ${svc} 2>/dev/null | head -1`); const expectedSha = (exp.stdout || '').trim(); const containerId = (cid.stdout || '').trim(); if (!containerId) { results.push({ service: svc, ok: false, reason: 'no running container after up' }); continue; } const insp = await runOnServer(server, `docker inspect --format '{{.Image}}|{{.State.StartedAt}}' ${containerId}`); const [actualSha, startedAtStr] = (insp.stdout || '').trim().split('|'); const startedAt = new Date(startedAtStr || 0); const imageMatch = !!expectedSha && actualSha === expectedSha; const freshlyStarted = !isNaN(startedAt) && startedAt >= deployStartTs; results.push({ service: svc, ok: imageMatch && freshlyStarted, expected_image_sha: expectedSha, running_image_sha: actualSha, started_at: startedAtStr, deploy_started_at: deployStartTs.toISOString(), image_match: imageMatch, freshly_started: freshlyStarted, reason: imageMatch && freshlyStarted ? null : !imageMatch ? 
`image SHA mismatch (expected ${expectedSha}, running ${actualSha})` : `container not freshly started (StartedAt ${startedAtStr} < deploy start)`, }); } const ok = results.every(r => r.ok); return { ok, results, mode: verifyMode }; } // ------------------------------------------------------------------ // Step 4: Docker up (split stateful/stateless if needed) steps.push({ step: 'deploy', status: 'running' }); await markProgressPhase(appName, 'deploy', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit }); // Get all services first — needed for both split and auto-detect paths const svcRes = await runOnServer(server, `cd ${deployDir} && docker compose config --services`); const allServices = svcRes.stdout.split('\n').filter(Boolean); let stateful = prod.stateful_services || []; if (stateful.length === 0) { // Auto-detect stateful services from image names so db/redis are never force-recreated const composeContent = await runOnServer(server, `cat ${deployDir}/docker-compose.yml 2>/dev/null || cat ${deployDir}/docker-compose.yaml 2>/dev/null || echo ""`); const statefulImagePat = /postgres|mysql|mariadb|mongo|redis|rabbitmq|cassandra|elasticsearch|opensearch/i; let currentSvc = null; for (const line of composeContent.stdout.split('\n')) { const svcMatch = line.match(/^ ([a-zA-Z0-9_-]+)\s*:/); if (svcMatch) currentSvc = svcMatch[1]; if (currentSvc && /^\s+image\s*:/.test(line) && statefulImagePat.test(line)) { stateful.push(currentSvc); } } if (stateful.length > 0) { fastify.log.warn({ app: appName, inferred_stateful: stateful }, 'Auto-detected stateful services — add to stateful_services in deploy-registry.json to silence this warning'); } } { const stateless = allServices.filter(s => !stateful.includes(s)); const deployStartTs = new Date(); if (stateless.length > 0) { // Use transient-container recreate so kua-deploy can self-update without // killing the compose-up process mid-flight. 
Same pattern works for all // apps (not just kua-deploy) and replaces the old runOnServer + kua-vault-run // shell prefix approach. const composeProject = path.basename(deployDir); // SELF-RECREATE HANDOFF — when the target IS kua-deploy on the same host, // this process is about to be killed. We pre-mark progress with a // self-recreate-pending sentinel so the NEW kua-deploy can pick up the // verification on startup. See completeSelfRecreate() near init. const selfRecreate = appName === SELF_APP_NAME && isLocal(server) && stateless.includes(SELF_APP_NAME); if (selfRecreate) { // Capture the freshly-built image SHA for post-restart verification. // `docker compose images` returns the image used by the EXISTING container // (still the OLD one before recreate). For the just-built image, query the // image tag that compose builds into: ${project}-${service}:latest. const builtImageTag = `${composeProject}-${SELF_APP_NAME}:latest`; const builtSha = (await run(`docker images ${builtImageTag} --quiet --no-trunc | head -1`)).stdout.trim() || null; steps[steps.length - 1] = { step: 'deploy', status: 'running', self_recreate: true, note: 'self-recreate handoff — NEW kua-deploy will verify on startup', }; await markProgressPhase(appName, 'self_recreate_pending', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit, self_recreate_expected_image: builtSha, self_recreate_started_at: deployStartTs.toISOString(), self_recreate_stateless: stateless, }); // Fire-and-forget recreate. The OLD process is about to die; spawn close // handler may resolve with ok=false because of the kill, which is expected. // We don't throw on its failure — the docker daemon owns the lifecycle now. recreateService({ project: composeProject, deployDir, services: stateless, force: true, vault: prod.vault || null, server, }).catch(() => { /* swallowing — we're dying anyway */ }); // Block here so the process keeps the lock until the daemon kills us. 
// 90s ceiling so the lock doesn't leak if the recreate truly fails. await new Promise(r => setTimeout(r, 90000)); // If we're still alive at this point, the recreate didn't take. Bail. steps[steps.length - 1] = { step: 'deploy', status: 'failed', error: 'self-recreate timed out — container was not replaced' }; throw new Error('self-recreate did not replace container within 90s'); } const upRes = await recreateService({ project: composeProject, deployDir, services: stateless, force: true, vault: prod.vault || null, server, }); if (!upRes.ok) { steps[steps.length - 1] = { step: 'deploy', status: 'failed', error: upRes.stderr?.slice(-500) || upRes.error }; throw new Error('recreateService failed for stateless services'); } // POST-DEPLOY VERIFY — catches false-success (see helper comment above). const verify = await verifyStatelessRecreated(server, deployDir, stateless, deployStartTs); if (!verify.ok) { const bad = verify.results.filter(r => !r.ok).map(r => `${r.service}: ${r.reason}`).join('; '); if (verifyMode === 'error') { steps[steps.length - 1] = { step: 'deploy', status: 'failed', error: `verify: ${bad}`, verify }; await markProgressPhase(appName, 'deploy', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit }); throw new Error(`post-deploy verify failed: ${bad}`); } else { fastify.log.warn({ app: appName, verify }, 'post-deploy verify failed (warn mode — not blocking)'); steps[steps.length - 1] = { step: 'deploy', status: 'done', verify_warn: bad }; } } } if (stateful.length > 0) { // Stateful services: start if not running but don't force-recreate // (db/redis must keep their volume + connection state). 
const composeProject = path.basename(deployDir); const upRes = await recreateService({ project: composeProject, deployDir, services: stateful, force: false, vault: prod.vault || null, server, }); if (!upRes.ok) { steps[steps.length - 1] = { step: 'deploy', status: 'failed', error: upRes.stderr?.slice(-500) || upRes.error }; throw new Error('recreateService failed for stateful services'); } } } steps[steps.length - 1] = { step: 'deploy', status: 'done' }; await markProgressPhase(appName, 'deploy_done', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit }); // Step 5: Run migrations via kua-db (if applicable) if (prod.has_migrations) { steps.push({ step: 'migrate', status: 'running' }); await markProgressPhase(appName, 'migrate', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit }); const migrateResult = await kuaDbMigrate(appName); const migrateOk = ['success', 'no_pending_migrations', 'already_applied'].includes(migrateResult.result); if (!migrateOk) { steps[steps.length - 1] = { step: 'migrate', status: 'failed', error: migrateResult.error || migrateResult.steps?.find(s => s.step === 'migrate')?.error || migrateResult.result }; finalResult = 'partial'; await markProgressPhase(appName, 'migrate_failed', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit, result: finalResult }); } else { steps[steps.length - 1] = { step: 'migrate', status: 'done', result: migrateResult.result }; await markProgressPhase(appName, 'migrate_done', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit }); } } // Step 6: Health check steps.push({ step: 'health', status: 'running' }); await markProgressPhase(appName, 'health', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit }); if (prod.health_url) { let healthy = false; for (let i = 0; i < 20; i++) { try { const res = await fetch(prod.health_url, { signal: AbortSignal.timeout(5000) }); if 
(res.ok) { healthy = true; break; } } catch { /* retry */ } await new Promise(r => setTimeout(r, 3000)); } if (!healthy) { steps[steps.length - 1] = { step: 'health', status: 'failed', url: prod.health_url }; finalResult = 'unhealthy'; await markProgressPhase(appName, 'health_failed', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit, result: finalResult }); } else { steps[steps.length - 1] = { step: 'health', status: 'done', url: prod.health_url }; await markProgressPhase(appName, 'health_done', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit }); } } else { // No health URL — check containers const psRes = await runOnServer(server, `cd ${deployDir} && docker compose ps --format json`); steps[steps.length - 1] = { step: 'health', status: 'done', note: 'no health URL configured' }; await markProgressPhase(appName, 'health_done', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit }); } // Step 7: Post-deploy hooks if (prod.post_deploy) { steps.push({ step: 'post_deploy', status: 'running' }); await markProgressPhase(appName, 'post_deploy', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit }); await runOnServer(server, prod.post_deploy, { timeout: 30000 }); steps[steps.length - 1] = { step: 'post_deploy', status: 'done' }; await markProgressPhase(appName, 'post_deploy_done', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit }); } // Get tag const tagRes = await runOnServer(server, `cd ${deployDir} && git describe --tags --abbrev=0 2>/dev/null || echo "untagged"`); const currentTag = tagRes.stdout; const entry = { result: finalResult, commit: deployCommit, tag: currentTag, server, steps, action, triggered_by: opts.triggered_by || 'api', }; await writeProgress(appName, { action, triggered_by: opts.triggered_by || 'api', status: finalResult === 'success' ? 'done' : 'failed', phase: finalResult === 'success' ? 
'succeeded' : 'completed_with_issues', current_step: 'done', result: finalResult, commit: deployCommit, tag: currentTag, server, steps, finished_at: Math.floor(Date.now() / 1000), }); recordDeploy(appName, entry); await audit({ action, app: appName, ...entry }); return { app: appName, ...entry }; } catch (err) { const entry = { result: 'failed', error: err.message, steps, action, triggered_by: opts.triggered_by || 'api', }; await writeProgress(appName, { action, triggered_by: opts.triggered_by || 'api', status: 'failed', phase: 'failed', current_step: steps[steps.length - 1]?.step || 'unknown', result: 'failed', error: err.message, steps, finished_at: Math.floor(Date.now() / 1000), }); recordDeploy(appName, entry); await audit({ action: `${action}_failed`, app: appName, error: err.message }); return { app: appName, ...entry }; } finally { releaseLock(appName); } } // ============================================================================= // ROLLBACK ENGINE // ============================================================================= async function rollback(appName) { const app = getApp(appName); if (!app) throw new Error(`Unknown app: ${appName}`); const prod = app.production; const server = prod.server; const deployDir = prod.deploy_dir; const remote = app.git_remote || 'origin'; // Find the previous successful deploy const history = deployHistory[appName] || []; const current = history[0]; const previous = history.find((d, i) => i > 0 && d.result === 'success' && d.tag && d.tag !== 'untagged'); if (!previous) { return { app: appName, result: 'no_rollback_target', message: 'No previous successful deploy with a tag found' }; } if (!acquireLock(appName)) { return { app: appName, result: 'locked', message: 'Deploy already in progress' }; } try { const tag = previous.tag; await writeProgress(appName, { action: 'rollback', triggered_by: 'api', status: 'running', phase: 'rollback_started', current_step: 'rollback', rolled_back_to: tag, rolled_back_from: 
current?.tag || current?.commit || 'unknown', }); // Checkout the previous tag on production const checkoutRes = await runOnServer(server, `cd ${deployDir} && git fetch --prune ${remote} && git checkout ${tag}`, { timeout: 60000 }); if (!checkoutRes.ok) throw new Error(`Checkout ${tag} failed: ${checkoutRes.stderr}`); // Rebuild + recreate via transient-container pattern (consistent with deploy()). // Build runs via runOnServer (local exec when server=bruno); the recreate uses // the transient docker:cli so kua-deploy can roll back itself reliably. const kvPrefix = prod.vault ? `kua-vault run --project ${prod.vault.project} --env ${prod.vault.env} --` : ''; const buildRes = await runOnServer(server, `cd ${deployDir} && ${composeEnvPrefix(server)}${kvPrefix} docker compose build`, { timeout: 600000 }); if (!buildRes.ok) throw new Error(`rollback build failed: ${buildRes.stderr?.slice(-500)}`); // Recreate all services for the rollback target. const svcList = (await runOnServer(server, `cd ${deployDir} && docker compose config --services`)).stdout.split('\n').filter(Boolean); const recreateRes = await recreateService({ project: path.basename(deployDir), deployDir, services: svcList, force: true, vault: prod.vault || null, server, }); if (!recreateRes.ok) throw new Error(`rollback recreate failed: ${recreateRes.stderr?.slice(-500) || recreateRes.error}`); // Health check let healthy = true; if (prod.health_url) { healthy = false; for (let i = 0; i < 20; i++) { try { const res = await fetch(prod.health_url, { signal: AbortSignal.timeout(5000) }); if (res.ok) { healthy = true; break; } } catch { /* retry */ } await new Promise(r => setTimeout(r, 3000)); } } const entry = { result: healthy ? 
'success' : 'unhealthy', action: 'rollback', rolled_back_to: tag, rolled_back_from: current?.tag || current?.commit || 'unknown', server, triggered_by: 'api', }; recordDeploy(appName, entry); await audit({ action: 'rollback', app: appName, ...entry }); await writeProgress(appName, { action: 'rollback', triggered_by: 'api', status: healthy ? 'done' : 'failed', phase: healthy ? 'rollback_succeeded' : 'rollback_unhealthy', current_step: 'done', result: healthy ? 'success' : 'unhealthy', rolled_back_to: tag, rolled_back_from: current?.tag || current?.commit || 'unknown', server, finished_at: Math.floor(Date.now() / 1000), }); return { app: appName, ...entry }; } finally { releaseLock(appName); } } // ============================================================================= // APP STATUS // ============================================================================= async function appStatus(appName) { const app = getApp(appName); if (!app) throw new Error(`Unknown app: ${appName}`); const prod = app.production; const server = prod.server; const deployDir = prod.deploy_dir; const status = { app: appName, deploy_mode: app.deploy_mode, server, locked: !!deployLocks.get(appName), }; // Current commit + tag on production try { const commitRes = await runOnServer(server, `cd ${deployDir} && git rev-parse --short HEAD`); status.current_commit = commitRes.stdout; const tagRes = await runOnServer(server, `cd ${deployDir} && git describe --tags --abbrev=0 2>/dev/null || echo "untagged"`); status.current_tag = tagRes.stdout; const branchRes = await runOnServer(server, `cd ${deployDir} && git rev-parse --abbrev-ref HEAD`); status.current_branch = branchRes.stdout; } catch { status.current_commit = 'unreachable'; } // Latest commit on source branch (dev) try { const devCommitRes = await run(`git -C ${app.repo_dir} rev-parse --short HEAD`); status.dev_commit = devCommitRes.stdout; status.dev_ahead = status.current_commit !== status.dev_commit; } catch { status.dev_commit = 
'unknown'; } // Last deploy from history const lastDeploy = (deployHistory[appName] || [])[0]; if (lastDeploy) { status.last_deploy = { result: lastDeploy.result, timestamp: lastDeploy.timestamp, commit: lastDeploy.commit, tag: lastDeploy.tag, triggered_by: lastDeploy.triggered_by, }; } const progress = await readProgress(appName); if (progress) { status.progress = progress; } // Health if (prod.health_url) { try { const res = await fetch(prod.health_url, { signal: AbortSignal.timeout(5000) }); status.healthy = res.ok; status.health_status = res.status; } catch { status.healthy = false; status.health_status = 'unreachable'; } } return status; } // ============================================================================= // ROUTES // ============================================================================= // Health fastify.get('/health', async () => { return { status: 'ok', version: '1.0.0', apps: Object.keys(registry.apps).length }; }); // --- Webhook --- // Forgejo webhook receiver (replaces forgejo-webhook.py) fastify.post('/webhook/forgejo', async (request, reply) => { // Webhook secret is mandatory in production if (!WEBHOOK_SECRET && !DEV_MODE) { return reply.code(503).send({ error: 'Webhook not configured: KUA_DEPLOY_WEBHOOK_SECRET is not set' }); } if (WEBHOOK_SECRET) { const sig = request.headers['x-forgejo-signature'] || request.headers['x-gitea-signature'] || ''; if (!sig) return reply.code(401).send({ error: 'Missing webhook signature' }); // rawBody must have been captured by the content-type parser; reject if it wasn't const rawBody = request.rawBody; if (!rawBody) return reply.code(400).send({ error: 'Could not read raw request body for HMAC verification' }); const expected = crypto.createHmac('sha256', WEBHOOK_SECRET).update(rawBody).digest('hex'); const bufSig = Buffer.from(sig); const bufExp = Buffer.from(expected); if (bufSig.length !== bufExp.length || !crypto.timingSafeEqual(bufSig, bufExp)) { return reply.code(401).send({ error: 
'Invalid webhook signature' }); } } const data = request.body || {}; const ref = data.ref || ''; const repoInfo = data.repository || {}; const repoName = repoInfo.name || ''; if (ref !== 'refs/heads/production') { return { ignored: true, reason: `not production branch (got ${ref})` }; } const app = getApp(repoName); if (!app) { fastify.log.warn(`Webhook for unknown repo: ${repoName}`); return { ignored: true, reason: `unknown repo: ${repoName}` }; } if (!checkWebhookRateLimit(repoName)) { fastify.log.warn(`Webhook rate limit exceeded for ${repoName}`); return reply.code(429).send({ error: `Rate limit exceeded for ${repoName} — max ${WEBHOOK_RATE_LIMIT} triggers per minute` }); } fastify.log.info(`Webhook: deploying ${repoName} (push to production)`); await writeProgress(repoName, { action: 'deploy', triggered_by: 'webhook', status: 'running', phase: 'webhook_received', current_step: 'webhook_received', ref, repo: repoName, }); // Deploy async — use an IIFE to guarantee the .catch() is always attached void (async () => { try { const result = await deploy(repoName, { triggered_by: 'webhook' }); fastify.log.info(`Deploy ${repoName}: ${result.result}`); } catch (err) { fastify.log.error(`Deploy ${repoName} failed: ${err.message}`); } })(); return { triggered: true, app: repoName }; }); // --- Apps --- // List all apps fastify.get('/api/v1/apps', async () => { const results = []; for (const name of getAllApps()) { try { results.push(await appStatus(name)); } catch (err) { results.push({ app: name, error: err.message }); } } return { apps: results }; }); // Single app status fastify.get('/api/v1/apps/:app', async (request) => { return await appStatus(request.params.app); }); // Deploy history fastify.get('/api/v1/apps/:app/deploys', async (request) => { const app = request.params.app; const limit = parseInt(request.query.limit) || 20; const history = (deployHistory[app] || []).slice(0, limit); return { app, deploys: history, total: (deployHistory[app] || []).length }; 
}); // --- Actions --- // Release (merge main→production, tag, push — triggers webhook deploy) fastify.post('/api/v1/apps/:app/release', async (request) => { const { message, source_branch, target_branch } = request.body || {}; return await release(request.params.app, message || 'Release to production', { source_branch, target_branch }); }); // Direct deploy (skip release, just pull + build + deploy on production) fastify.post('/api/v1/apps/:app/deploy', async (request, reply) => { const { app } = request.params; const { force } = request.body || {}; if (!getApp(app)) return reply.code(404).send({ ok: false, error: `Unknown app: ${app}` }); await writeProgress(app, { action: 'deploy', triggered_by: 'api', status: 'running', phase: 'api_received', current_step: 'api_received', }); // Fire-and-forget — mirrors /webhook/forgejo. A blocking response held the // HTTP connection for the full ~3-min deploy; the kua-mcp-core ssh+curl // pipe tore down on idle (http_code 000 -> "via ssh failed (0): {}") even // though the deploy succeeded server-side. Caller polls /progress. void (async () => { try { const result = await deploy(app, { force: force === true, triggered_by: 'api' }); fastify.log.info(`Deploy ${app}: ${result.result}`); } catch (err) { fastify.log.error(`Deploy ${app} failed: ${err.message}`); } })(); return { triggered: true, app }; }); // Force rebuild + recreate using the authoritative deploy path fastify.post('/api/v1/apps/:app/rebuild', async (request, reply) => { const { app } = request.params; const { force } = request.body || {}; if (!getApp(app)) return reply.code(404).send({ ok: false, error: `Unknown app: ${app}` }); await writeProgress(app, { action: 'rebuild', triggered_by: 'api_rebuild', status: 'running', phase: 'api_received', current_step: 'api_received', }); // Fire-and-forget — same rationale as /deploy above. 
void (async () => { try { const result = await deploy(app, { force: force !== false, action: 'rebuild', triggered_by: 'api_rebuild', }); fastify.log.info(`Rebuild ${app}: ${result.result}`); } catch (err) { fastify.log.error(`Rebuild ${app} failed: ${err.message}`); } })(); return { triggered: true, app }; }); // Rollback fastify.post('/api/v1/apps/:app/rollback', async (request) => { return await rollback(request.params.app); }); // --- Deploy Progress --- fastify.get('/api/v1/apps/:app/progress', async (request) => { const { app } = request.params; const data = await readProgress(app); if (!data) { return { ok: false, app, status: 'idle', message: 'No active or recent deployment' }; } const now = Math.floor(Date.now() / 1000); const age = now - (data.updated_at || 0); if (age > 300 && data.status !== 'done' && data.status !== 'failed' && data.status !== 'blocked') { return { ok: false, app, stale: true, age_s: age, ...data }; } return { ok: true, ...data }; }); // External progress reporter — used by forgejo-webhook.py to push phase updates fastify.patch('/api/v1/apps/:app/progress', async (request, reply) => { const { app } = request.params; if (!getApp(app)) return reply.code(404).send({ ok: false, error: `Unknown app: ${app}` }); const patch = request.body || {}; const updated = await writeProgress(app, patch); return { ok: true, ...updated }; }); // Force-clear a stale running progress state (admin only — already requires auth via hook) // GET /api/v1/apps/:app/runtime-status — per-service running-vs-expected // image SHA and freshness. Read-only. Powers release-app post-verify and // the deploy.status MCP tool's stale-detection field. Added 2026-05-20 // alongside the post-deploy verify; together they make "kua-deploy says // done" actually mean "container is running the just-built image." 
fastify.get('/api/v1/apps/:app/runtime-status', async (request, reply) => {
  // Explicit bearer check. The token is compared in constant time, and an
  // unset/empty ADMIN_TOKEN never authorizes anyone.
  const auth = request.headers.authorization || '';
  const presented = Buffer.from(auth.startsWith('Bearer ') ? auth.slice(7) : '');
  const expectedToken = Buffer.from(ADMIN_TOKEN || '');
  const authorized = expectedToken.length > 0
    && presented.length === expectedToken.length
    && crypto.timingSafeEqual(presented, expectedToken);
  if (!authorized) {
    return reply.code(401).send({ error: 'unauthorized' });
  }

  const appName = request.params.app;
  const cfg = getApp(appName);
  if (!cfg) return reply.code(404).send({ error: 'app not in registry' });

  const prod = cfg.production || {};
  const server = prod.server || cfg.deploy_server || 'bruno';
  const deployDir = prod.deploy_dir || cfg.repo_dir;
  if (!deployDir) return reply.code(400).send({ error: 'no deploy_dir for app' });

  // Compare image IDs without the optional "sha256:" prefix, as the
  // self-recreate verification does.
  const normSha = s => (s || '').replace(/^sha256:/, '').trim();

  try {
    const svcRes = await runOnServer(server, `cd ${deployDir} && docker compose config --services`);
    const services = (svcRes.stdout || '').split('\n').filter(Boolean);

    const out = [];
    let anyStale = false;
    for (const svc of services) {
      // NOTE(review): a comment in deploy() says `docker compose images`
      // reports the image of the EXISTING container; if so, "expected" can
      // never differ from "running" here — confirm this detects staleness.
      const exp = await runOnServer(server, `cd ${deployDir} && docker compose images --quiet ${svc} 2>/dev/null | head -1`);
      const cid = await runOnServer(server, `cd ${deployDir} && docker compose ps --quiet ${svc} 2>/dev/null | head -1`);
      const expectedSha = (exp.stdout || '').trim();
      const containerId = (cid.stdout || '').trim();

      let running_image_sha = null;
      let started_at = null;
      let state = null;
      let health = null;
      if (containerId) {
        const insp = await runOnServer(server, `docker inspect --format '{{.Image}}|{{.State.StartedAt}}|{{.State.Status}}|{{if .State.Health}}{{.State.Health.Status}}{{else}}n/a{{end}}' ${containerId}`);
        const [image, startedAt, status, healthStatus] = (insp.stdout || '').trim().split('|');
        running_image_sha = image || null;
        started_at = startedAt || null;
        state = status || null;
        health = healthStatus || null;
      }

      // Stale only when both IDs are known and differ.
      const stale = !!expectedSha
        && !!running_image_sha
        && normSha(expectedSha) !== normSha(running_image_sha);
      if (stale) anyStale = true;

      out.push({
        service: svc,
        container_id: containerId || null,
        expected_image_sha: expectedSha || null,
        running_image_sha,
        started_at,
        state,
        health,
        stale,
      });
    }

    return { ok: true, app: appName, server, services: out, any_stale: anyStale, checked_at: new Date().toISOString() };
  } catch (e) {
    return reply.code(500).send({ ok: false, error: String(e?.message || e) });
  }
});

// Force-clear a stale running progress state together with the in-memory lock.
fastify.post('/api/v1/apps/:app/progress/reset', async (request, reply) => {
  const { app } = request.params;
  if (!getApp(app)) return reply.code(404).send({ ok: false, error: `Unknown app: ${app}` });
  const progressFile = progressFilePath(app);
  try {
    await fs.rm(progressFile, { force: true });
  } catch { /* already gone */ }
  releaseLock(app); // also clear any in-memory lock
  return { ok: true, app, message: 'Progress state and lock cleared' };
});

// Force-release the in-memory deploy lock (admin only — allows recovery without container restart)
fastify.post('/api/v1/apps/:app/unlock', async (request, reply) => {
  const { app } = request.params;
  if (!getApp(app)) return reply.code(404).send({ ok: false, error: `Unknown app: ${app}` });
  const hadLock = deployLocks.has(app);
  releaseLock(app);
  return { ok: true, app, had_lock: hadLock, message: hadLock ? 'Lock released' : 'No lock was held' };
});

// --- Alerts ---

fastify.get('/api/v1/alerts', async () => {
  const alerts = [];

  for (const name of getAllApps()) {
    const app = getApp(name);
    const prod = app?.production;
    // An app without a production section has nothing to check; skip it
    // instead of failing the whole alerts listing.
    if (!prod) continue;

    // Check health
    if (prod.health_url) {
      try {
        const res = await fetch(prod.health_url, { signal: AbortSignal.timeout(5000) });
        if (!res.ok) {
          alerts.push({ app: name, severity: 'critical', type: 'unhealthy', status: res.status, url: prod.health_url });
        }
      } catch {
        alerts.push({ app: name, severity: 'critical', type: 'unreachable', url: prod.health_url });
      }
    }

    // Check if dev is ahead
    try {
      const devCommit = (await run(`git -C ${app.repo_dir} rev-parse --short HEAD`)).stdout;
      const prodCommit = (await runOnServer(prod.server, `cd ${prod.deploy_dir} && git rev-parse --short HEAD`)).stdout;
      if (devCommit && prodCommit && devCommit !== prodCommit) {
        alerts.push({ app: name, severity: 'info', type: 'dev_ahead', dev: devCommit, prod: prodCommit });
      }
    } catch { /* skip if unreachable */ }

    // Check last deploy
    const lastDeploy = (deployHistory[name] || [])[0];
    if (lastDeploy?.result === 'failed') {
      alerts.push({ app: name, severity: 'warning', type: 'last_deploy_failed', timestamp: lastDeploy.timestamp });
    }
  }

  return { alerts, count: alerts.length };
});

// --- Audit ---

// Returns the newest audit entries first. `limit` defaults to 50; non-numeric,
// zero or negative values fall back to that default.
fastify.get('/api/v1/audit', async (request) => {
  const requested = Number.parseInt(request.query.limit, 10);
  const limit = Number.isInteger(requested) && requested > 0 ? requested : 50;
  try {
    const data = await fs.readFile(AUDIT_LOG_FILE, 'utf-8');
    // Drop blank lines so an empty log is not reported as one unparseable entry.
    const lines = data.split('\n').filter(line => line.trim() !== '').reverse();
    const logs = lines.slice(0, limit).map(l => {
      try {
        return JSON.parse(l);
      } catch {
        return { error: 'unparseable', raw: l };
      }
    });
    return { logs, total: lines.length };
  } catch (err) {
    // A missing log file just means nothing was audited yet; anything else is
    // worth a warning, but the endpoint still answers with an empty list.
    if (err?.code !== 'ENOENT') {
      fastify.log.warn({ err }, 'could not read audit log');
    }
    return { logs: [], total: 0 };
  }
});

// =============================================================================
// START
// =============================================================================

process.on('unhandledRejection', (reason, promise) => {
  fastify.log.error({ reason, promise }, 'Unhandled promise rejection — investigate immediately');
});

/**
 * completeSelfRecreate — on startup, if the previous kua-deploy left progress
 * in "phase: self_recreate_pending" state, this NEW instance verifies the
 * recreate landed (image SHA matches what was built, container is fresh and
 * running) and flips progress to succeeded or failed. Without this,
 * release-app would poll /progress forever after a self-deploy.
 *
 * Does nothing when there is no readable progress file or the phase is not
 * `self_recreate_pending`.
 *
 * @returns {Promise<void>}
 */
async function completeSelfRecreate() {
  const progressFile = path.join(DATA_DIR, `progress-${SELF_APP_NAME}.json`);

  let progress;
  try {
    progress = JSON.parse(await fs.readFile(progressFile, 'utf-8'));
  } catch {
    return; /* no progress file yet (or it is not valid JSON) */
  }
  if (progress?.phase !== 'self_recreate_pending') return;

  const expectedSha = progress.self_recreate_expected_image || null;
  const recreateStartedAt = progress.self_recreate_started_at
    ? new Date(progress.self_recreate_started_at)
    : null;

  // Check our own container via docker. If the inspect fails we cannot prove
  // the recreate landed, so verification fails below; the reason is kept in
  // the verify record for diagnosis.
  let runningSha = null;
  let startedAtStr = null;
  let state = null;
  let inspectError = null;
  try {
    const insp = await run(`docker inspect ${SELF_APP_NAME} --format '{{.Image}}|{{.State.StartedAt}}|{{.State.Status}}'`);
    if (insp.ok) {
      // Trim first: a trailing newline would otherwise make `state` never
      // equal 'running'.
      const [image, startedAt, status] = (insp.stdout || '').trim().split('|');
      runningSha = image || null;
      startedAtStr = startedAt || null;
      state = status || null;
    } else {
      inspectError = (insp.stderr || '').slice(-500) || 'docker inspect failed';
    }
  } catch (err) {
    inspectError = String(err?.message || err);
  }

  // Normalize: strip sha256: prefix from both sides for tolerant comparison.
  const normSha = s => (s || '').replace(/^sha256:/, '').trim();
  const imageMatches = expectedSha && runningSha && normSha(expectedSha) === normSha(runningSha);
  const freshlyStarted = startedAtStr && recreateStartedAt && new Date(startedAtStr) >= recreateStartedAt;
  const ok = !!(imageMatches && freshlyStarted && state === 'running');

  const verify = {
    expected_image_sha: expectedSha,
    running_image_sha: runningSha,
    started_at: startedAtStr,
    recreate_started_at: progress.self_recreate_started_at,
    state,
    image_matches: !!imageMatches,
    freshly_started: !!freshlyStarted,
    completed_at: new Date().toISOString(),
    ...(inspectError ? { inspect_error: inspectError } : {}),
  };

  // Mark the deploy step done/failed and attach the verify record.
  const steps = Array.isArray(progress.steps) ? progress.steps : [];
  const deployIdx = steps.findIndex(s => s.step === 'deploy');
  if (deployIdx >= 0) {
    steps[deployIdx] = ok
      ? { ...steps[deployIdx], status: 'done', verify, note: 'self-recreate completed; verified by NEW kua-deploy on startup' }
      : { ...steps[deployIdx], status: 'failed', error: 'self-recreate verify failed', verify };
  }

  // Terminal progress state. `status` follows deploy(): 'done' only on
  // success, 'failed' otherwise, so pollers do not read a failed verification
  // as a successful deploy.
  const finishedAt = Math.floor(Date.now() / 1000);
  const updated = {
    ...progress,
    phase: ok ? 'succeeded' : 'failed',
    status: ok ? 'done' : 'failed',
    current_step: ok ? 'done' : 'deploy',
    result: ok ? 'success' : 'failed',
    steps,
    updated_at: finishedAt,
    finished_at: finishedAt,
    self_recreate_completed: true,
  };
  if (!ok) updated.error = 'self-recreate verify failed';

  // Clear the marker fields
  delete updated.self_recreate_expected_image;
  delete updated.self_recreate_started_at;
  delete updated.self_recreate_stateless;

  await fs.writeFile(progressFile, JSON.stringify(updated, null, 2));
  if (ok) {
    fastify.log.info({ ok, verify }, 'self-recreate completed and verified');
  } else {
    fastify.log.warn({ ok, verify }, 'self-recreate verification failed');
  }
}

/**
 * Boot sequence: validate configuration, load registry and history, finish a
 * pending self-recreate handoff, then start listening on port 3200. Any
 * failure outside the self-recreate recovery exits the process with code 1.
 */
const start = async () => {
  try {
    // Fail fast if webhook secret is missing in production
    if (!DEV_MODE && !WEBHOOK_SECRET) {
      throw new Error('KUA_DEPLOY_WEBHOOK_SECRET must be set in production — refusing to start');
    }
    await loadRegistry();
    await loadHistory();
    await fs.mkdir(DATA_DIR, { recursive: true });

    // Self-recreate recovery — handles handoff from previous instance that was
    // killed mid-recreate during a kua-deploy self-deploy. Idempotent.
    try {
      await completeSelfRecreate();
    } catch (e) {
      fastify.log.error({ err: e }, 'completeSelfRecreate failed (non-fatal)');
    }

    await fastify.listen({ port: 3200, host: '0.0.0.0' });
  } catch (err) {
    fastify.log.error(err);
    process.exit(1);
  }
};

start();