kua-deploy/server.js

1534 lines
62 KiB
JavaScript

import Fastify from 'fastify';
import fs from 'fs/promises';
import path from 'path';
import crypto from 'crypto';
import http from 'http';
import { exec as execCb, execFile as execFileCb, spawn } from 'child_process';
import { promisify } from 'util';
// Promise-returning wrapper around child_process.exec (runs the command through a shell).
const exec = promisify(execCb);
// Promise-returning wrapper around child_process.execFile (argv array, no shell).
const execFile = promisify(execFileCb);
// The app name this kua-deploy reports itself as in deploy-registry.json.
// Used to detect "I am being deployed" cases so we can do the self-recreate handoff.
const SELF_APP_NAME = 'kua-deploy';
// Input validation
// Whitelist of characters accepted in a release message (1 to 500 characters).
const SAFE_MESSAGE_RE = /^[a-zA-Z0-9 _.,!?:;@#/()[\]{}<>='"+*&^%$~`|-]{1,500}$/;

/**
 * Normalise a caller-supplied release message.
 *
 * @param {*} msg - candidate message.
 * @returns {string} the message itself when it is a non-empty string made only
 *   of whitelisted characters; the default 'Release to production' when the
 *   value is empty or not a string.
 * @throws {Error} when a non-empty string contains a character outside the
 *   whitelist or is longer than 500 characters.
 */
function validateMessage(msg) {
  const isUsableString = typeof msg === 'string' && msg.length > 0;
  if (!isUsableString) {
    return 'Release to production';
  }
  if (SAFE_MESSAGE_RE.test(msg)) {
    return msg;
  }
  throw new Error('Invalid message: only printable ASCII allowed, max 500 chars');
}
// --- Configuration ---
// Directories and files are resolved against the process working directory.
const DATA_DIR = path.join(process.cwd(), 'data');
const LOG_DIR = path.join(process.cwd(), 'logs');
// Append-only JSON-lines file written by audit().
const AUDIT_LOG_FILE = path.join(LOG_DIR, 'audit.log');
// Persisted copy of the in-memory deploy history.
const DEPLOY_HISTORY_FILE = path.join(DATA_DIR, 'deploys.json');
// App registry read by loadRegistry().
const REGISTRY_FILE = path.join(process.cwd(), 'deploy-registry.json');
// Bearer token accepted by the auth hook; undefined when the variable is unset,
// in which case token authentication never succeeds.
const ADMIN_TOKEN = process.env.KUA_DEPLOY_ADMIN_TOKEN;
// Unix socket used for tailscaled whois lookups (see tailscaleWhois).
const TAILSCALE_SOCKET = '/var/run/tailscale/tailscaled.sock';
// Name of the host this process considers "local" (compared in isLocal); defaults to 'gal'.
const HOSTNAME = process.env.HOSTNAME || 'gal';
// Base URL of the kua-db migration service.
const KUA_DB_URL = process.env.KUA_DB_URL || 'http://localhost:3100';
// Token sent to kua-db; falls back to this service's own admin token.
const KUA_DB_TOKEN = process.env.KUA_DB_ADMIN_TOKEN || ADMIN_TOKEN;
// NOTE(review): presumably the HMAC secret for the Forgejo webhook — the handler
// is not visible here; an empty string is the default when the variable is unset.
const WEBHOOK_SECRET = process.env.KUA_DEPLOY_WEBHOOK_SECRET || '';
// True for every NODE_ENV value other than the exact string 'production' (including unset).
const DEV_MODE = process.env.NODE_ENV !== 'production';
// Sliding-window limiter for webhook triggers: each app may fire at most
// WEBHOOK_RATE_LIMIT times within any WEBHOOK_RATE_WINDOW_MS span.
const webhookRateLimiter = new Map(); // app -> [timestamps]
const WEBHOOK_RATE_LIMIT = 5;
const WEBHOOK_RATE_WINDOW_MS = 60_000;

/**
 * Record a webhook trigger for `app` if the app is still under its limit.
 *
 * @param {string} app - key the hits are counted under.
 * @returns {boolean} true when the trigger is allowed (and has been recorded),
 *   false when the app already has WEBHOOK_RATE_LIMIT hits inside the window.
 *   A rejected call leaves the stored timestamps untouched.
 */
function checkWebhookRateLimit(app) {
  const now = Date.now();
  const recentHits = [];
  for (const stamp of webhookRateLimiter.get(app) || []) {
    if (now - stamp < WEBHOOK_RATE_WINDOW_MS) {
      recentHits.push(stamp);
    }
  }
  const allowed = recentHits.length < WEBHOOK_RATE_LIMIT;
  if (allowed) {
    recentHits.push(now);
    webhookRateLimiter.set(app, recentHits);
  }
  return allowed;
}
// Hostnames allowed to call the API, taken from the comma-separated
// KUA_ALLOWED_NODES variable (entries are trimmed).
const ALLOWED_NODES = new Set(
  (process.env.KUA_ALLOWED_NODES || 'gal,bruno,genesis')
    .split(',')
    .map((name) => name.trim()),
);

/**
 * Decide whether a Tailscale identity may use the API.
 *
 * @param {{ hostname?: string, tags?: string[] }} tsIdentity - identity as
 *   produced by the whois lookup.
 * @returns {boolean} true when the identity carries 'tag:admin' or its
 *   hostname is in ALLOWED_NODES.
 */
function isAuthorizedNode(tsIdentity) {
  const hasAdminTag = Boolean(tsIdentity.tags?.includes('tag:admin'));
  return hasAdminTag || ALLOWED_NODES.has(tsIdentity.hostname);
}
// Raw-body capture for webhook HMAC verification is done by the JSON content
// type parser registered below, not by a factory option.
const fastify = Fastify({
  logger: true,
});

// Override default JSON parser to capture rawBody for webhook HMAC verification.
// We store it on both req.rawBody (legacy) and req.raw.rawBody so both access paths work.
// NOTE(review): this replaces Fastify's built-in JSON parser, so any protections
// that parser applies (e.g. prototype-poisoning checks) no longer run — confirm
// that is acceptable for the routes in this service.
fastify.addContentTypeParser('application/json', { parseAs: 'buffer' }, (req, body, done) => {
  const raw = body.toString('utf-8');
  req.rawBody = raw;
  req.raw.rawBody = raw;

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    // Malformed (or empty) JSON is a client error; without a statusCode the
    // error would surface as a 500.
    err.statusCode = 400;
    done(err, undefined);
    return;
  }
  // Called outside the try so an exception thrown downstream of done() cannot
  // trigger a second done() call from the catch branch.
  done(null, parsed);
});
// --- Deploy locks (prevent concurrent deploys per app) ---
// app name -> { acquiredAt: epoch ms, deployId: uuid }
const deployLocks = new Map();
const LOCK_TTL_MS = 20 * 60 * 1000; // 20 minutes — auto-expire stale locks from crashed deploys

/**
 * Try to take the per-app deploy lock.
 *
 * @param {string} app - app name the lock is keyed on.
 * @returns {boolean} false when a lock younger than LOCK_TTL_MS exists;
 *   true once a fresh lock (with a new deployId) has been stored. An expired
 *   lock is replaced and a warning is logged.
 */
function acquireLock(app) {
  const existing = deployLocks.get(app);
  if (existing && Date.now() - existing.acquiredAt < LOCK_TTL_MS) {
    return false;
  }
  if (existing) {
    const ageSeconds = Math.round((Date.now() - existing.acquiredAt) / 1000);
    fastify.log.warn(`Clearing expired deploy lock for ${app} (${ageSeconds}s old)`);
  }
  const fresh = { acquiredAt: Date.now(), deployId: crypto.randomUUID() };
  deployLocks.set(app, fresh);
  return true;
}

/**
 * Drop the lock for `app`, whoever holds it.
 * NOTE(review): there is no ownership check, so a deploy that outlived the TTL
 * would also release the lock of the deploy that replaced it — confirm callers
 * tolerate this.
 *
 * @param {string} app - app name.
 */
function releaseLock(app) {
  deployLocks.delete(app);
}

/**
 * Look up the id of the deploy currently holding the lock.
 *
 * @param {string} app - app name.
 * @returns {string|null} the active deploy id, or null if no lock is held.
 */
function getDeployId(app) {
  const held = deployLocks.get(app);
  if (held === undefined || held === null) {
    return null;
  }
  return held.deployId ?? null;
}
// --- Load Registry ---
// In-memory copy of deploy-registry.json; `apps` maps app name -> app config.
let registry = { apps: {} };

/**
 * Read and parse REGISTRY_FILE and replace the in-memory registry.
 * The file is validated before the swap, so a malformed registry never
 * replaces a previously loaded good one.
 *
 * @returns {Promise<void>}
 * @throws {Error} when the file cannot be read, is not valid JSON, or has no
 *   top-level "apps" object.
 */
async function loadRegistry() {
  const data = await fs.readFile(REGISTRY_FILE, 'utf-8');
  const parsed = JSON.parse(data);
  const apps = parsed?.apps;
  if (apps === null || typeof apps !== 'object' || Array.isArray(apps)) {
    throw new Error(`Invalid registry ${REGISTRY_FILE}: expected a top-level "apps" object`);
  }
  registry = parsed;
  fastify.log.info(`Registry loaded: ${Object.keys(registry.apps).length} apps`);
}

/**
 * Look up an app's registry entry.
 *
 * @param {string} name - app name.
 * @returns {object|null} the entry, or null when the app is not registered.
 *   Only own keys count: names such as "toString" or "__proto__" that exist on
 *   Object.prototype are not treated as apps.
 */
function getApp(name) {
  if (!Object.hasOwn(registry.apps, name)) return null;
  return registry.apps[name] || null;
}

/**
 * @returns {string[]} names of all registered apps.
 */
function getAllApps() {
  return Object.keys(registry.apps);
}
// --- Deploy History ---
// app name -> array of deploy records, newest first.
let deployHistory = {};

/**
 * Load the persisted deploy history into memory.
 * A missing file is the normal first-run case and yields an empty history.
 * Any other read failure, invalid JSON, or a non-object payload also yields an
 * empty history, but is logged so the reset does not go unnoticed.
 *
 * @returns {Promise<void>}
 */
async function loadHistory() {
  let data;
  try {
    data = await fs.readFile(DEPLOY_HISTORY_FILE, 'utf-8');
  } catch (err) {
    if (err.code !== 'ENOENT') {
      fastify.log.warn(`Could not read deploy history, starting empty: ${err.message}`);
    }
    deployHistory = {};
    return;
  }
  try {
    const parsed = JSON.parse(data);
    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
      throw new Error('expected a JSON object');
    }
    deployHistory = parsed;
  } catch (err) {
    fastify.log.error(`Deploy history file is not usable, starting empty: ${err.message}`);
    deployHistory = {};
  }
}

/**
 * Write the in-memory deploy history to DEPLOY_HISTORY_FILE, creating DATA_DIR
 * if needed.
 *
 * @returns {Promise<void>} rejects when the directory or file cannot be written.
 */
async function saveHistory() {
  await fs.mkdir(DATA_DIR, { recursive: true });
  await fs.writeFile(DEPLOY_HISTORY_FILE, JSON.stringify(deployHistory, null, 2), 'utf-8');
}

/**
 * Prepend a deploy record for `app` and persist the history in the background.
 * The record is `entry` plus an ISO `timestamp` and a short random `id`.
 * Persistence stays best-effort (the caller is never failed by it), but a
 * failed save is logged instead of being dropped silently.
 *
 * @param {string} app - app name.
 * @param {object} entry - fields describing the deploy outcome.
 */
function recordDeploy(app, entry) {
  const maxPerApp = 50; // keep last 50 deploys per app
  if (!Array.isArray(deployHistory[app])) deployHistory[app] = [];
  deployHistory[app].unshift({
    ...entry,
    timestamp: new Date().toISOString(),
    id: crypto.randomUUID().slice(0, 8),
  });
  if (deployHistory[app].length > maxPerApp) {
    deployHistory[app] = deployHistory[app].slice(0, maxPerApp);
  }
  saveHistory().catch((err) => {
    fastify.log.error(`Failed to persist deploy history: ${err.message}`);
  });
}

/**
 * @param {string} app - app name.
 * @returns {object|undefined} the most recent record whose `result` is
 *   'success', or undefined when there is none.
 */
function getLastSuccessfulDeploy(app) {
  return (deployHistory[app] || []).find((d) => d.result === 'success');
}
/**
 * Location of the progress snapshot for one app.
 *
 * @param {string} app - app name, embedded verbatim in the file name.
 * @returns {string} `<DATA_DIR>/progress-<app>.json`.
 */
function progressFilePath(app) {
  const fileName = `progress-${app}.json`;
  return path.join(DATA_DIR, fileName);
}
/**
 * Read the stored progress snapshot for an app.
 *
 * @param {string} app - app name.
 * @returns {Promise<object|null>} the parsed snapshot, or null when the file
 *   is missing, unreadable, or not valid JSON.
 */
async function readProgress(app) {
  let snapshot;
  try {
    const text = await fs.readFile(progressFilePath(app), 'utf-8');
    snapshot = JSON.parse(text);
  } catch {
    snapshot = null;
  }
  return snapshot;
}
/**
 * Merge `patch` into the stored progress snapshot for `app` and write it back.
 * Fields already in the snapshot are kept unless `patch` overrides them;
 * `updated_at` is always refreshed and `started_at` is only set when the
 * merged snapshot does not have one yet (both in epoch seconds).
 * NOTE(review): because earlier fields persist, values from a previous run
 * (e.g. started_at, result) carry over unless something outside this function
 * resets the file — confirm that is intended.
 *
 * @param {string} app - app name.
 * @param {object} patch - fields to overlay on the current snapshot.
 * @returns {Promise<object>} the snapshot that was written.
 */
async function writeProgress(app, patch) {
  const now = Math.floor(Date.now() / 1000);
  const previous = await readProgress(app);
  const merged = { app, ...(previous || {}), ...patch, updated_at: now };
  if (!merged.started_at) {
    merged.started_at = now;
  }
  await fs.mkdir(DATA_DIR, { recursive: true });
  const target = progressFilePath(app);
  await fs.writeFile(target, JSON.stringify(merged, null, 2), 'utf-8');
  return merged;
}
/**
 * Record that `app` has entered `phase`.
 * Writes `phase` and `current_step` (both set to `phase`) plus a status that
 * defaults to 'running'; every field in `patch` is applied last and therefore
 * wins over those defaults.
 *
 * @param {string} app - app name.
 * @param {string} phase - phase identifier.
 * @param {object} [patch={}] - extra fields to store alongside the phase.
 * @returns {Promise<object>} whatever writeProgress resolves with.
 */
async function markProgressPhase(app, phase, patch = {}) {
  const update = {
    phase,
    current_step: phase,
    status: patch.status || 'running',
    ...patch,
  };
  return writeProgress(app, update);
}
// --- Tailscale Whois ---
/**
 * Ask the local tailscaled daemon who owns `remoteAddr`.
 *
 * @param {string} remoteAddr - peer address in "ip:port" form.
 * @returns {Promise<{stableId: string, hostname: string, tags: string[], user: string}|null>}
 *   the peer identity, or null when the lookup fails for any reason (socket
 *   missing, no answer within 2 seconds, invalid JSON, or a reply without a
 *   `Node`). The promise never rejects.
 */
async function tailscaleWhois(remoteAddr) {
  return new Promise((resolve) => {
    let req = null;
    let settled = false;
    let timer = null;

    // Single exit point: resolves once and always clears the timeout.
    const settle = (identity) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      resolve(identity);
    };

    timer = setTimeout(() => {
      settle(null);
      // Tear the request down so a hung daemon does not leave a socket open.
      req?.destroy();
    }, 2000);

    req = http.request({
      socketPath: TAILSCALE_SOCKET,
      path: `/localapi/v0/whois?addr=${encodeURIComponent(remoteAddr)}`,
      method: 'GET',
    }, (res) => {
      let data = '';
      // Decode as a stream so a multi-byte character split across chunks is
      // not corrupted by per-chunk string conversion.
      res.setEncoding('utf8');
      res.on('data', (chunk) => { data += chunk; });
      res.on('error', () => settle(null));
      res.on('end', () => {
        try {
          const parsed = JSON.parse(data);
          const node = parsed.Node;
          const user = parsed.UserProfile;
          if (!node) return settle(null);
          settle({
            stableId: node.StableID || '',
            hostname: node.ComputedName || node.Hostinfo?.Hostname || '',
            tags: node.Tags || [],
            user: user?.LoginName || '',
          });
        } catch {
          settle(null);
        }
      });
    });
    req.on('error', () => settle(null));
    req.end();
  });
}
// --- Auth Hook ---
/**
 * Whether a peer address is treated as "this host" and granted local admin.
 * Accepts the loopback addresses and the private 172.16.0.0/12 block (the
 * range Docker bridge networks are allocated from). The rest of 172.0.0.0/8
 * is publicly routable and must go through normal authentication.
 * NOTE(review): any container on a 172.16/12 bridge is still trusted as admin
 * — confirm that is the intended trust boundary.
 *
 * @param {string} ip - peer address as reported by Fastify.
 * @returns {boolean}
 */
function isTrustedLocalAddress(ip) {
  if (typeof ip !== 'string') return false;
  if (ip === '127.0.0.1' || ip === '::1' || ip === '::ffff:127.0.0.1') return true;
  const match = /^172\.(\d{1,3})\.\d{1,3}\.\d{1,3}$/.exec(ip);
  if (!match) return false;
  const secondOctet = Number(match[1]);
  return secondOctet >= 16 && secondOctet <= 31;
}

// Authenticates every request except /health and the Forgejo webhook, in this
// order: trusted local address, Tailscale identity, admin bearer token.
// On success `request.identity` is set; otherwise the hook replies 401/403.
fastify.addHook('onRequest', async (request, reply) => {
  if (request.url === '/health') return;
  // Webhook endpoint uses its own auth (HMAC signature verification inside the handler)
  if (request.url === '/webhook/forgejo') return;

  if (isTrustedLocalAddress(request.ip)) {
    request.identity = { stableId: 'local', hostname: HOSTNAME, tags: ['tag:admin'], user: 'local' };
    return;
  }

  const remoteAddr = `${request.ip}:${request.socket.remotePort || 0}`;
  const tsIdentity = await tailscaleWhois(remoteAddr);
  if (tsIdentity) {
    if (!isAuthorizedNode(tsIdentity)) {
      return reply.code(403).send({ error: `Node '${tsIdentity.hostname}' not authorized` });
    }
    request.identity = tsIdentity;
    return;
  }

  // Only a header of the exact form "Bearer <token>" is considered.
  const authHeader = request.headers.authorization;
  const providedToken = typeof authHeader === 'string' ? /^Bearer (.+)$/.exec(authHeader)?.[1] : undefined;
  if (providedToken && ADMIN_TOKEN) {
    const bufP = Buffer.from(providedToken);
    const bufA = Buffer.from(ADMIN_TOKEN);
    // timingSafeEqual throws on length mismatch, hence the explicit length check.
    if (bufP.length === bufA.length && crypto.timingSafeEqual(bufP, bufA)) {
      request.identity = { stableId: 'admin-token', hostname: 'admin', tags: ['tag:admin'], user: 'admin' };
      return;
    }
  }
  return reply.code(401).send({ error: 'Unauthorized' });
});
// --- Audit ---
/**
 * Append one JSON line to the audit log.
 * The record is `entry` with a `timestamp` (ISO string) that overrides any
 * timestamp the caller supplied. Failures are logged and never thrown.
 *
 * @param {object} entry - fields describing the audited action.
 * @returns {Promise<void>}
 */
async function audit(entry) {
  try {
    await fs.mkdir(LOG_DIR, { recursive: true });
    const record = { ...entry, timestamp: new Date().toISOString() };
    await fs.appendFile(AUDIT_LOG_FILE, `${JSON.stringify(record)}\n`, 'utf-8');
  } catch (writeErr) {
    fastify.log.error(`Audit write failed: ${writeErr.message}`);
  }
}
// --- Shell helpers ---
/**
 * Whether `server` names the host this process runs on.
 *
 * @param {string} server - "host" or "user@host".
 * @returns {boolean} true when the host part equals HOSTNAME.
 */
function isLocal(server) {
  const pieces = server.split('@');
  const host = pieces.length > 1 ? pieces[1] : server;
  return host === HOSTNAME;
}
/**
 * Tailscale IP of a known deploy host.
 *
 * @param {string} server - "host" or "user@host".
 * @returns {string} the host's Tailscale IP, or '' when the host is not in the
 *   table. A Map is used so that host names which happen to match
 *   Object.prototype members (e.g. "toString") are correctly reported as unknown.
 */
function tailscaleIpForServer(server) {
  const host = server.includes('@') ? server.split('@')[1] : server;
  const ips = new Map([
    ['bruno', '100.74.17.6'],
    ['gal', '100.122.129.114'],
  ]);
  return ips.get(host) ?? '';
}
/**
 * Shell prefix that exports TAILSCALE_IP for a compose command.
 *
 * @param {string} server - "host" or "user@host".
 * @returns {string} `TAILSCALE_IP=<ip> ` (with trailing space) when the host
 *   has a known Tailscale IP, otherwise ''.
 */
function composeEnvPrefix(server) {
  const ip = tailscaleIpForServer(server);
  if (!ip) {
    return '';
  }
  return `TAILSCALE_IP=${ip} `;
}
// recreateService — spawn a one-shot transient docker:cli container that runs
// `docker compose up -d` against the host docker socket. The transient container
// is OUTSIDE the lifecycle of the service it recreates, so even when the target
// is kua-deploy itself, the recreate completes (the transient survives kua-deploy
// being stopped/started; the docker daemon does the actual work).
//
// Bind paths MUST be identical between host and transient container
// (e.g. -v /root/apps/X:/root/apps/X) so compose's path resolution matches host
// reality. Secrets are pre-fetched via kua-vault into a private --env-file rather
// than being passed on the docker run command line.
//
// Returns the same { ok, stdout, stderr, error? } shape as run()/runOnServer()
// so call sites can swap with minimal change. An empty or non-array `services`
// is a no-op that returns { ok: true, ..., skipped: true }. When the docker
// process has to be killed because `timeout` elapsed, `error` says so explicitly
// instead of reporting a meaningless exit code.
//
// NOTE(review): the vault project/env values are interpolated into a shell
// command; they come from the registry config here — confirm they are never
// caller-controlled.
async function recreateService({
  project, // compose project name (basename(deployDir) typically)
  deployDir, // absolute path to the deploy dir on the docker host
  services, // array of service names to recreate
  force = true, // pass --force-recreate
  vault = null, // { project, env } — if set, fetch secrets via kua-vault export
  server = 'bruno', // for TAILSCALE_IP env var
  composeFile = 'docker-compose.yml',
  timeout = 300000, // ms before the docker process is killed
} = {}) {
  if (!Array.isArray(services) || services.length === 0) {
    return { ok: true, stdout: '', stderr: '', error: null, skipped: true };
  }
  // Stage env file on the kua-deploy data volume so the docker CLI (running
  // inside kua-deploy) can read it. The transient container picks up vars via
  // --env-file processed at submit time — no host-side mount needed.
  const tmpName = `.env-recreate-${crypto.randomBytes(8).toString('hex')}`;
  const envFilePath = `/app/data/${tmpName}`;
  let envFileWritten = false;
  try {
    if (vault && vault.project) {
      const envEnv = vault.env || 'prod';
      // kua-vault export emits KEY=VALUE lines — directly compatible with --env-file
      const exportRes = await run(`kua-vault export --project ${vault.project} --env ${envEnv}`, { timeout: 30000 });
      if (!exportRes.ok) {
        return { ok: false, stdout: '', stderr: `kua-vault export ${vault.project}/${envEnv} failed: ${exportRes.stderr?.slice(-300) || exportRes.error}`, error: 'vault export failed' };
      }
      // Strip any non KEY=VALUE lines (e.g. status banners) and validate
      const envLines = exportRes.stdout
        .split('\n')
        .filter((l) => /^[A-Z_][A-Z0-9_]*=/.test(l));
      if (envLines.length === 0) {
        return { ok: false, stdout: '', stderr: `kua-vault export returned no KEY=VALUE lines for ${vault.project}/${envEnv}`, error: 'empty vault export' };
      }
      // Owner-only permissions: the file holds secrets until the finally block removes it.
      await fs.writeFile(envFilePath, envLines.join('\n') + '\n', { mode: 0o600 });
      envFileWritten = true;
    }

    // Build docker run args
    const runArgs = [
      'run', '--rm',
      '-v', '/var/run/docker.sock:/var/run/docker.sock',
      '-v', `${deployDir}:${deployDir}`,
      '-w', deployDir,
    ];
    const tailscaleIp = tailscaleIpForServer(server);
    if (tailscaleIp) runArgs.push('-e', `TAILSCALE_IP=${tailscaleIp}`);
    if (envFileWritten) runArgs.push('--env-file', envFilePath);
    runArgs.push('docker:cli');
    // Compose command (transient container will run it)
    runArgs.push('docker', 'compose', '-p', project, '-f', composeFile, 'up', '-d', '--no-deps', '--remove-orphans');
    if (force) runArgs.push('--force-recreate');
    runArgs.push(...services);

    return await new Promise((resolve) => {
      const child = spawn('docker', runArgs);
      let stdout = '';
      let stderr = '';
      let timedOut = false;
      const tHandle = setTimeout(() => {
        timedOut = true;
        try { child.kill('SIGKILL'); } catch (_) { /* ignore */ }
      }, timeout);
      child.stdout.on('data', (d) => { stdout += d.toString(); });
      child.stderr.on('data', (d) => { stderr += d.toString(); });
      child.on('close', (code) => {
        clearTimeout(tHandle);
        if (code === 0) {
          resolve({ ok: true, stdout: stdout.trim(), stderr: stderr.trim() });
          return;
        }
        // A killed child closes with code null; name the real cause.
        const error = timedOut ? `docker run timed out after ${timeout}ms` : `docker run exit ${code}`;
        resolve({ ok: false, stdout: stdout.trim(), stderr: stderr.trim(), error });
      });
      child.on('error', (err) => {
        clearTimeout(tHandle);
        resolve({ ok: false, stdout: '', stderr: String(err?.message || err), error: 'spawn failed' });
      });
    });
  } finally {
    if (envFileWritten) {
      try { await fs.unlink(envFilePath); } catch (_) { /* ignore */ }
    }
  }
}
/**
 * Run a shell command on this host and capture its output.
 *
 * @param {string} cmd - command line, interpreted by a shell.
 * @param {{ timeout?: number, maxBuffer?: number }} [opts]
 *   timeout   - ms before the command is killed (default 30000).
 *   maxBuffer - max bytes of stdout/stderr to capture (default 64 MiB). The
 *               Node default of 1 MiB terminates chatty commands such as
 *               `docker compose build` and makes them look like failures.
 * @returns {Promise<{ok: boolean, stdout: string, stderr: string, error?: string}>}
 *   never rejects; on failure `ok` is false and `error` holds the message.
 */
async function run(cmd, opts = {}) {
  const timeout = opts.timeout || 30000;
  const maxBuffer = opts.maxBuffer || 64 * 1024 * 1024;
  try {
    const { stdout, stderr } = await exec(cmd, { timeout, maxBuffer });
    return { ok: true, stdout: stdout.trim(), stderr: stderr.trim() };
  } catch (err) {
    return { ok: false, stdout: err.stdout?.trim() || '', stderr: err.stderr?.trim() || '', error: err.message };
  }
}
// Allow only simple host strings: optional user@, then hostname/IP with dots/hyphens only.
const SAFE_HOST_RE = /^([a-zA-Z0-9._-]+@)?[a-zA-Z0-9][a-zA-Z0-9._-]*$/;

/**
 * Run a command on `server`: locally via run() when the server is this host,
 * otherwise over ssh.
 *
 * @param {string} server - "host" or "user@host"; must match SAFE_HOST_RE.
 * @param {string} cmd - command line; the remote shell (or the local shell for
 *   the local case) interprets it.
 * @param {{ timeout?: number, maxBuffer?: number }} [opts]
 *   timeout   - ms before the command is killed (default 30000).
 *   maxBuffer - max bytes of captured output on the ssh path (default 64 MiB,
 *               so verbose remote builds are not cut off at Node's 1 MiB default).
 * @returns {Promise<{ok: boolean, stdout: string, stderr: string, error?: string}>}
 * @throws {Error} (as a rejection) when `server` fails the SAFE_HOST_RE check.
 *
 * NOTE(review): StrictHostKeyChecking=no disables host-key verification for
 * every ssh call; kept as-is because changing it would break hosts that are
 * not in known_hosts — consider `accept-new` plus a managed known_hosts file.
 */
async function runOnServer(server, cmd, opts = {}) {
  if (!SAFE_HOST_RE.test(server)) throw new Error(`Unsafe server name rejected: ${JSON.stringify(server)}`);
  if (isLocal(server)) return run(cmd, opts);
  // Use execFile to avoid local shell interpretation of server/cmd
  const timeout = opts.timeout || 30000;
  const maxBuffer = opts.maxBuffer || 64 * 1024 * 1024;
  try {
    const { stdout, stderr } = await execFile(
      'ssh',
      ['-o', 'StrictHostKeyChecking=no', server, cmd],
      { timeout, maxBuffer },
    );
    return { ok: true, stdout: stdout.trim(), stderr: stderr.trim() };
  } catch (err) {
    return { ok: false, stdout: err.stdout?.trim() || '', stderr: err.stderr?.trim() || '', error: err.message };
  }
}
// --- kua-db integration ---
/**
 * Ask kua-db whether `app` can be deployed to production right now.
 * Fails closed: any non-2xx status, network error, timeout (60 s) or
 * unparseable response yields `{ safe: false, reason }`.
 *
 * @param {string} app - app name (URL-encoded into the request path).
 * @returns {Promise<object>} the JSON body returned by kua-db on success,
 *   otherwise `{ safe: false, reason: string }`. Never rejects.
 */
async function kuaDbSafeCheck(app) {
  try {
    const endpoint = `${KUA_DB_URL}/api/v1/migrations/${encodeURIComponent(app)}/safe-to-deploy?env=production`;
    const headers = KUA_DB_TOKEN ? { Authorization: `Bearer ${KUA_DB_TOKEN}` } : {};
    const res = await fetch(endpoint, { headers, signal: AbortSignal.timeout(60000) });
    if (res.ok) {
      return await res.json();
    }
    return { safe: false, reason: `kua-db returned ${res.status} — blocking deploy (use --force to skip)` };
  } catch {
    return { safe: false, reason: 'kua-db unreachable — blocking deploy (use --force to skip)' };
  }
}
/**
 * Ask kua-db to apply pending production migrations for `app`.
 *
 * @param {string} app - app name; URL-encoded into the request path, matching
 *   kuaDbSafeCheck.
 * @returns {Promise<object>} the JSON object returned by kua-db, or
 *   `{ result: 'error', error }` when the request fails, times out (120 s),
 *   or the response is not a JSON object (the HTTP status is included in that
 *   case so an HTML error page is diagnosable). Never rejects.
 */
async function kuaDbMigrate(app) {
  try {
    const res = await fetch(`${KUA_DB_URL}/api/v1/migrations/${encodeURIComponent(app)}/apply`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json', ...(KUA_DB_TOKEN ? { Authorization: `Bearer ${KUA_DB_TOKEN}` } : {}) },
      body: JSON.stringify({ env: 'production' }),
      signal: AbortSignal.timeout(120000),
    });
    let body;
    try {
      body = await res.json();
    } catch (parseErr) {
      return { result: 'error', error: `kua-db returned ${res.status} with an unparseable body: ${parseErr.message}` };
    }
    if (body === null || typeof body !== 'object') {
      return { result: 'error', error: `kua-db returned ${res.status} with an unexpected body` };
    }
    return body;
  } catch (err) {
    return { result: 'error', error: err.message };
  }
}
// =============================================================================
// RELEASE ENGINE
// =============================================================================
/**
 * Cut a production release for a registered app: push the source branch, merge
 * it into the deploy branch (creating that branch on first release), tag the
 * result `prod-<YYYYMMDDhhmmss>` and push branch + tags.
 *
 * Every git command is executed with an argv array (no shell), so branch names
 * and the message cannot be interpreted as shell syntax, and every step that
 * the following steps rely on is checked before continuing.
 *
 * @param {string} appName - name of an app in the registry.
 * @param {string} [message='Release to production'] - release message, passed
 *   through validateMessage.
 * @param {{ source_branch?: string, target_branch?: string }} [opts] - optional
 *   overrides for the registry's source / deploy branches.
 * @returns {Promise<{app: string, result: 'released', tag: string, commit: string,
 *   message: string, deploy_mode: *, steps: Array<{step: string, status: string}>}>}
 * @throws {Error} for an unknown app, an invalid message or branch name, a
 *   dirty worktree, or any failing git step.
 */
async function release(appName, message = 'Release to production', opts = {}) {
  const app = getApp(appName);
  if (!app) throw new Error(`Unknown app: ${appName}`);
  const releaseMessage = validateMessage(message);

  const repoDir = app.repo_dir;
  // Same default as deploy(), which also falls back to 'origin'.
  const remote = app.git_remote || 'origin';
  const sourceBranch = opts.source_branch ?? app.source_branch;
  const deployBranch = opts.target_branch ?? app.deploy_branch;
  for (const [label, branch] of [['source', sourceBranch], ['deploy', deployBranch]]) {
    // A leading '-' would be parsed by git as an option rather than a ref.
    if (typeof branch !== 'string' || branch === '' || branch.startsWith('-')) {
      throw new Error(`Invalid ${label} branch: ${JSON.stringify(branch)}`);
    }
  }

  // Runs `git -C <repoDir> ...args` without a shell; never throws.
  const git = async (args, timeout = 30000) => {
    try {
      const { stdout, stderr } = await execFile('git', ['-C', repoDir, ...args], { timeout });
      return { ok: true, stdout: stdout.trim(), stderr: stderr.trim(), code: 0 };
    } catch (err) {
      return { ok: false, stdout: err.stdout?.trim() || '', stderr: err.stderr?.trim() || '', error: err.message, code: err.code };
    }
  };
  // Checkout that aborts the release on failure, so later steps never run on the wrong branch.
  const checkout = async (args, label) => {
    const res = await git(['checkout', ...args]);
    if (!res.ok) throw new Error(`checkout ${label} failed: ${res.stderr || res.error}`);
  };

  const steps = [];
  const startStep = (step) => { steps.push({ step, status: 'running' }); };
  const finishStep = (step) => { steps[steps.length - 1] = { step, status: 'done' }; };

  // Check clean worktree (a failing `git status` must not be mistaken for "clean")
  const status = await git(['status', '--porcelain', '--ignore-submodules=dirty']);
  if (!status.ok) throw new Error(`git status failed in ${repoDir}: ${status.stderr || status.error}`);
  if (status.stdout) {
    throw new Error(`${repoDir} has uncommitted changes — commit or stash first`);
  }

  // Fetch
  startStep('fetch');
  const fetchResult = await git(['fetch', remote], 30000);
  if (!fetchResult.ok) throw new Error(`git fetch failed: ${fetchResult.stderr}`);
  finishStep('fetch');

  // Push source branch
  startStep('push_source');
  await checkout([sourceBranch], sourceBranch);
  const pushResult = await git(['push', remote, sourceBranch], 60000);
  if (!pushResult.ok) throw new Error(`push ${sourceBranch} failed: ${pushResult.stderr}`);
  finishStep('push_source');

  // Get source commit
  const headResult = await git(['rev-parse', '--short', 'HEAD']);
  if (!headResult.ok) throw new Error(`git rev-parse failed: ${headResult.stderr || headResult.error}`);
  const sourceCommit = headResult.stdout;

  // Merge to deploy branch
  startStep('merge');
  const remoteHead = await git(['ls-remote', '--exit-code', '--heads', remote, deployBranch]);
  // With --exit-code, status 2 means "no matching ref"; anything else is a real
  // failure (e.g. network) and must not be read as "branch does not exist".
  if (!remoteHead.ok && remoteHead.code !== 2) {
    throw new Error(`git ls-remote failed: ${remoteHead.stderr || remoteHead.error}`);
  }
  if (remoteHead.ok) {
    await checkout([deployBranch], deployBranch);
    const merge = await git(['merge', `${remote}/${sourceBranch}`, '-m', `[RELEASE] ${releaseMessage}`], 30000);
    if (!merge.ok) {
      // Best effort: leave the repo out of the half-merged state so the next
      // release is not blocked by the clean-worktree check.
      await git(['merge', '--abort']);
      throw new Error(`merge failed: ${merge.stderr || merge.error}`);
    }
  } else {
    // First release: create the deploy branch from the pushed source branch.
    await checkout(['-B', deployBranch, `${remote}/${sourceBranch}`], deployBranch);
  }
  finishStep('merge');

  // Tag
  const tag = `prod-${new Date().toISOString().replace(/[-:T]/g, '').slice(0, 14)}`;
  const tagResult = await git(['tag', '-a', tag, '-m', `[RELEASE] ${releaseMessage}`], 10000);
  if (!tagResult.ok) throw new Error(`tag failed: ${tagResult.stderr || tagResult.error}`);

  // Push deploy branch + tags
  startStep('push_deploy');
  const pushDeploy = await git(['push', remote, deployBranch, '--tags'], 60000);
  if (!pushDeploy.ok) throw new Error(`push ${deployBranch} failed: ${pushDeploy.stderr}`);
  finishStep('push_deploy');

  // Return to source branch. The release is already pushed at this point, so a
  // failure here is reported but does not fail the release.
  const back = await git(['checkout', sourceBranch]);
  if (!back.ok) {
    fastify.log.warn(`release ${appName}: could not return to ${sourceBranch}: ${back.stderr || back.error}`);
  }

  await audit({ action: 'release', app: appName, tag, commit: sourceCommit, message: releaseMessage });
  return {
    app: appName,
    result: 'released',
    tag,
    commit: sourceCommit,
    message: releaseMessage,
    deploy_mode: app.deploy_mode,
    steps,
  };
}
// =============================================================================
// DEPLOY ENGINE
// =============================================================================
async function deploy(appName, opts = {}) {
const app = getApp(appName);
if (!app) throw new Error(`Unknown app: ${appName}`);
const prod = app.production;
if (!prod) throw new Error(`${appName} has no production config`);
if (!acquireLock(appName)) {
return { app: appName, result: 'locked', message: 'Deploy already in progress' };
}
const steps = [];
let finalResult = 'success';
const action = opts.action || 'deploy';
try {
const server = prod.server;
const deployDir = prod.deploy_dir;
const remote = app.git_remote || 'origin';
const deployBranch = app.deploy_branch;
await writeProgress(appName, {
action,
triggered_by: opts.triggered_by || 'api',
status: 'running',
phase: 'started',
current_step: 'started',
server,
steps,
});
// Step 1: kua-db safe-to-deploy check (if app has migrations)
if (prod.has_migrations && !opts.force) {
steps.push({ step: 'db_safety', status: 'running' });
await markProgressPhase(appName, 'db_safety', { action, triggered_by: opts.triggered_by || 'api', steps });
const safety = await kuaDbSafeCheck(appName);
if (!safety.safe) {
steps[steps.length - 1] = { step: 'db_safety', status: 'blocked', reasons: safety.reasons || [safety.reason] };
finalResult = 'blocked';
await writeProgress(appName, {
action,
triggered_by: opts.triggered_by || 'api',
status: 'blocked',
phase: 'db_safety_blocked',
current_step: 'db_safety',
result: 'blocked',
reason: 'db_safety',
steps,
finished_at: Math.floor(Date.now() / 1000),
});
recordDeploy(appName, { result: 'blocked', reason: 'db_safety', steps, action, triggered_by: opts.triggered_by || 'api' });
return { app: appName, result: 'blocked', steps };
}
steps[steps.length - 1] = { step: 'db_safety', status: 'passed', note: safety.reason || 'safe' };
await markProgressPhase(appName, 'db_safety_passed', { action, triggered_by: opts.triggered_by || 'api', steps });
}
// Step 2: Git pull on production server
steps.push({ step: 'git_pull', status: 'running' });
await markProgressPhase(appName, 'git_pull', { action, triggered_by: opts.triggered_by || 'api', steps });
const fetchCmd = `cd ${deployDir} && git fetch --prune ${remote}`;
const fetchRes = await runOnServer(server, fetchCmd, { timeout: 60000 });
if (!fetchRes.ok) {
steps[steps.length - 1] = { step: 'git_pull', status: 'failed', error: fetchRes.stderr };
throw new Error(`git fetch failed on ${server}: ${fetchRes.stderr}`);
}
const checkoutCmd = `cd ${deployDir} && git checkout -B ${deployBranch} ${remote}/${deployBranch}`;
const checkoutRes = await runOnServer(server, checkoutCmd, { timeout: 30000 });
if (!checkoutRes.ok) {
steps[steps.length - 1] = { step: 'git_pull', status: 'failed', error: checkoutRes.stderr };
throw new Error(`git checkout failed on ${server}: ${checkoutRes.stderr}`);
}
// Get current commit
const commitRes = await runOnServer(server, `cd ${deployDir} && git rev-parse --short HEAD`);
const deployCommit = commitRes.stdout;
steps[steps.length - 1] = { step: 'git_pull', status: 'done', commit: deployCommit };
await markProgressPhase(appName, 'git_pull_done', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
// Step 3: Pre-build dirty check — untracked/uncommitted files silently pollute COPY . .
const dirtyRes = await runOnServer(server, `cd ${deployDir} && git status --porcelain`);
if (!dirtyRes.ok || dirtyRes.stdout.trim()) {
const detail = dirtyRes.stdout.trim() || dirtyRes.stderr.trim() || 'git status failed';
throw new Error(`Working tree is dirty — clean up before deploying:
${detail}`);
}
// Step 3: Docker build
steps.push({ step: 'build', status: 'running' });
await markProgressPhase(appName, 'build', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
const kvPrefix = prod.vault
? `kua-vault run --project ${prod.vault.project} --env ${prod.vault.env} --`
: '';
const envPrefix = composeEnvPrefix(server);
const buildCmd = `cd ${deployDir} && ${envPrefix}${kvPrefix} docker compose build`;
const buildRes = await runOnServer(server, buildCmd, { timeout: 600000 });
if (!buildRes.ok) {
steps[steps.length - 1] = { step: 'build', status: 'failed', error: buildRes.stderr?.slice(-500) };
throw new Error('docker compose build failed');
}
steps[steps.length - 1] = { step: 'build', status: 'done' };
await markProgressPhase(appName, 'build_done', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
// ------------------------------------------------------------------
// Post-deploy verification (added 2026-05-20 after the false-success
// bug that left muralla on a stale image for ~22h: kua-deploy marked
// step:deploy=done while the container kept running the prior image).
// For each STATELESS service (the ones we just told compose to
// --force-recreate), assert that:
// (i) the container's `.Image` SHA equals what `docker compose
// images --quiet <svc>` reports as the expected image, AND
// (ii) the container's `.State.StartedAt` is newer than the
// timestamp captured *before* the `up -d` call.
// Either failing means the recreate did not actually take. Stateful
// services are intentionally NOT image-checked (they shouldn't have
// been recreated); we only assert they are running.
// Mode is env-configurable: KUA_DEPLOY_VERIFY=error|warn|off.
const verifyMode = (process.env.KUA_DEPLOY_VERIFY || 'error').toLowerCase();
async function verifyStatelessRecreated(server, deployDir, services, deployStartTs) {
if (verifyMode === 'off') return { ok: true, results: [], skipped: true };
const results = [];
for (const svc of services) {
const exp = await runOnServer(server, `cd ${deployDir} && ${kvPrefix} docker compose images --quiet ${svc} 2>/dev/null | head -1`);
const cid = await runOnServer(server, `cd ${deployDir} && ${kvPrefix} docker compose ps --quiet ${svc} 2>/dev/null | head -1`);
const expectedSha = (exp.stdout || '').trim();
const containerId = (cid.stdout || '').trim();
if (!containerId) {
results.push({ service: svc, ok: false, reason: 'no running container after up' });
continue;
}
const insp = await runOnServer(server, `docker inspect --format '{{.Image}}|{{.State.StartedAt}}' ${containerId}`);
const [actualSha, startedAtStr] = (insp.stdout || '').trim().split('|');
const startedAt = new Date(startedAtStr || 0);
const stripSha = (s) => (s || '').replace(/^sha256:/, '');
const imageMatch = !!expectedSha && stripSha(actualSha) === stripSha(expectedSha);
const freshlyStarted = !isNaN(startedAt) && startedAt >= deployStartTs;
results.push({
service: svc, ok: imageMatch && freshlyStarted,
expected_image_sha: expectedSha, running_image_sha: actualSha,
started_at: startedAtStr,
deploy_started_at: deployStartTs.toISOString(),
image_match: imageMatch, freshly_started: freshlyStarted,
reason: imageMatch && freshlyStarted ? null
: !imageMatch ? `image SHA mismatch (expected ${expectedSha}, running ${actualSha})`
: `container not freshly started (StartedAt ${startedAtStr} < deploy start)`,
});
}
const ok = results.every(r => r.ok);
return { ok, results, mode: verifyMode };
}
// ------------------------------------------------------------------
// Step 4: Docker up (split stateful/stateless if needed)
steps.push({ step: 'deploy', status: 'running' });
await markProgressPhase(appName, 'deploy', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
// Get all services first — needed for both split and auto-detect paths
const svcRes = await runOnServer(server, `cd ${deployDir} && docker compose config --services`);
const allServices = svcRes.stdout.split('\n').filter(Boolean);
let stateful = prod.stateful_services || [];
if (stateful.length === 0) {
// Auto-detect stateful services from image names so db/redis are never force-recreated
const composeContent = await runOnServer(server, `cat ${deployDir}/docker-compose.yml 2>/dev/null || cat ${deployDir}/docker-compose.yaml 2>/dev/null || echo ""`);
const statefulImagePat = /postgres|mysql|mariadb|mongo|redis|rabbitmq|cassandra|elasticsearch|opensearch/i;
let currentSvc = null;
for (const line of composeContent.stdout.split('\n')) {
const svcMatch = line.match(/^ ([a-zA-Z0-9_-]+)\s*:/);
if (svcMatch) currentSvc = svcMatch[1];
if (currentSvc && /^\s+image\s*:/.test(line) && statefulImagePat.test(line)) {
stateful.push(currentSvc);
}
}
if (stateful.length > 0) {
fastify.log.warn({ app: appName, inferred_stateful: stateful }, 'Auto-detected stateful services — add to stateful_services in deploy-registry.json to silence this warning');
}
}
{
const stateless = allServices.filter(s => !stateful.includes(s));
const deployStartTs = new Date();
if (stateless.length > 0) {
// Use transient-container recreate so kua-deploy can self-update without
// killing the compose-up process mid-flight. Same pattern works for all
// apps (not just kua-deploy) and replaces the old runOnServer + kua-vault-run
// shell prefix approach.
const composeProject = path.basename(deployDir);
// SELF-RECREATE HANDOFF — when the target IS kua-deploy on the same host,
// this process is about to be killed. We pre-mark progress with a
// self-recreate-pending sentinel so the NEW kua-deploy can pick up the
// verification on startup. See completeSelfRecreate() near init.
const selfRecreate = appName === SELF_APP_NAME && isLocal(server) && stateless.includes(SELF_APP_NAME);
if (selfRecreate) {
// Capture the freshly-built image SHA for post-restart verification.
// `docker compose images` returns the image used by the EXISTING container
// (still the OLD one before recreate). For the just-built image, query the
// image tag that compose builds into: ${project}-${service}:latest.
const builtImageTag = `${composeProject}-${SELF_APP_NAME}:latest`;
const builtSha = (await run(`docker images ${builtImageTag} --quiet --no-trunc | head -1`)).stdout.trim() || null;
steps[steps.length - 1] = {
step: 'deploy',
status: 'running',
self_recreate: true,
note: 'self-recreate handoff — NEW kua-deploy will verify on startup',
};
await markProgressPhase(appName, 'self_recreate_pending', {
action,
triggered_by: opts.triggered_by || 'api',
steps,
commit: deployCommit,
self_recreate_expected_image: builtSha,
self_recreate_started_at: deployStartTs.toISOString(),
self_recreate_stateless: stateless,
});
// Fire-and-forget recreate. The OLD process is about to die; spawn close
// handler may resolve with ok=false because of the kill, which is expected.
// We don't throw on its failure — the docker daemon owns the lifecycle now.
recreateService({
project: composeProject,
deployDir,
services: stateless,
force: true,
vault: prod.vault || null,
server,
}).catch(() => { /* swallowing — we're dying anyway */ });
// Block here so the process keeps the lock until the daemon kills us.
// 90s ceiling so the lock doesn't leak if the recreate truly fails.
await new Promise(r => setTimeout(r, 90000));
// If we're still alive at this point, the recreate didn't take. Bail.
steps[steps.length - 1] = { step: 'deploy', status: 'failed', error: 'self-recreate timed out — container was not replaced' };
throw new Error('self-recreate did not replace container within 90s');
}
const upRes = await recreateService({
project: composeProject,
deployDir,
services: stateless,
force: true,
vault: prod.vault || null,
server,
});
if (!upRes.ok) {
steps[steps.length - 1] = { step: 'deploy', status: 'failed', error: upRes.stderr?.slice(-500) || upRes.error };
throw new Error('recreateService failed for stateless services');
}
// POST-DEPLOY VERIFY — catches false-success (see helper comment above).
const verify = await verifyStatelessRecreated(server, deployDir, stateless, deployStartTs);
if (!verify.ok) {
const bad = verify.results.filter(r => !r.ok).map(r => `${r.service}: ${r.reason}`).join('; ');
if (verifyMode === 'error') {
steps[steps.length - 1] = { step: 'deploy', status: 'failed', error: `verify: ${bad}`, verify };
await markProgressPhase(appName, 'deploy', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
throw new Error(`post-deploy verify failed: ${bad}`);
} else {
fastify.log.warn({ app: appName, verify }, 'post-deploy verify failed (warn mode — not blocking)');
steps[steps.length - 1] = { step: 'deploy', status: 'done', verify_warn: bad };
}
}
}
if (stateful.length > 0) {
// Stateful services: start if not running but don't force-recreate
// (db/redis must keep their volume + connection state).
const composeProject = path.basename(deployDir);
const upRes = await recreateService({
project: composeProject,
deployDir,
services: stateful,
force: false,
vault: prod.vault || null,
server,
});
if (!upRes.ok) {
steps[steps.length - 1] = { step: 'deploy', status: 'failed', error: upRes.stderr?.slice(-500) || upRes.error };
throw new Error('recreateService failed for stateful services');
}
}
}
steps[steps.length - 1] = { step: 'deploy', status: 'done' };
await markProgressPhase(appName, 'deploy_done', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
// Step 5: Run migrations via kua-db (if applicable)
if (prod.has_migrations) {
steps.push({ step: 'migrate', status: 'running' });
await markProgressPhase(appName, 'migrate', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
const migrateResult = await kuaDbMigrate(appName);
const migrateOk = ['success', 'no_pending_migrations', 'already_applied'].includes(migrateResult.result);
if (!migrateOk) {
steps[steps.length - 1] = { step: 'migrate', status: 'failed', error: migrateResult.error || migrateResult.steps?.find(s => s.step === 'migrate')?.error || migrateResult.result };
finalResult = 'partial';
await markProgressPhase(appName, 'migrate_failed', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit, result: finalResult });
} else {
steps[steps.length - 1] = { step: 'migrate', status: 'done', result: migrateResult.result };
await markProgressPhase(appName, 'migrate_done', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
}
}
// Step 6: Health check
steps.push({ step: 'health', status: 'running' });
await markProgressPhase(appName, 'health', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
if (prod.health_url) {
let healthy = false;
for (let i = 0; i < 20; i++) {
try {
const res = await fetch(prod.health_url, { signal: AbortSignal.timeout(5000) });
if (res.ok) { healthy = true; break; }
} catch { /* retry */ }
await new Promise(r => setTimeout(r, 3000));
}
if (!healthy) {
steps[steps.length - 1] = { step: 'health', status: 'failed', url: prod.health_url };
finalResult = 'unhealthy';
await markProgressPhase(appName, 'health_failed', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit, result: finalResult });
} else {
steps[steps.length - 1] = { step: 'health', status: 'done', url: prod.health_url };
await markProgressPhase(appName, 'health_done', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
}
} else {
// No health URL — check containers
const psRes = await runOnServer(server, `cd ${deployDir} && docker compose ps --format json`);
steps[steps.length - 1] = { step: 'health', status: 'done', note: 'no health URL configured' };
await markProgressPhase(appName, 'health_done', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
}
// Step 7: Post-deploy hooks
if (prod.post_deploy) {
steps.push({ step: 'post_deploy', status: 'running' });
await markProgressPhase(appName, 'post_deploy', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
await runOnServer(server, prod.post_deploy, { timeout: 30000 });
steps[steps.length - 1] = { step: 'post_deploy', status: 'done' };
await markProgressPhase(appName, 'post_deploy_done', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
}
// Get tag
const tagRes = await runOnServer(server, `cd ${deployDir} && git describe --tags --abbrev=0 2>/dev/null || echo "untagged"`);
const currentTag = tagRes.stdout;
const entry = {
result: finalResult,
commit: deployCommit,
tag: currentTag,
server,
steps,
action,
triggered_by: opts.triggered_by || 'api',
};
await writeProgress(appName, {
action,
triggered_by: opts.triggered_by || 'api',
status: finalResult === 'success' ? 'done' : 'failed',
phase: finalResult === 'success' ? 'succeeded' : 'completed_with_issues',
current_step: 'done',
result: finalResult,
commit: deployCommit,
tag: currentTag,
server,
steps,
finished_at: Math.floor(Date.now() / 1000),
});
recordDeploy(appName, entry);
await audit({ action, app: appName, ...entry });
return { app: appName, ...entry };
} catch (err) {
const entry = {
result: 'failed',
error: err.message,
steps,
action,
triggered_by: opts.triggered_by || 'api',
};
await writeProgress(appName, {
action,
triggered_by: opts.triggered_by || 'api',
status: 'failed',
phase: 'failed',
current_step: steps[steps.length - 1]?.step || 'unknown',
result: 'failed',
error: err.message,
steps,
finished_at: Math.floor(Date.now() / 1000),
});
recordDeploy(appName, entry);
await audit({ action: `${action}_failed`, app: appName, error: err.message });
return { app: appName, ...entry };
} finally {
releaseLock(appName);
}
}
// =============================================================================
// ROLLBACK ENGINE
// =============================================================================
/**
 * Run the mutating part of a rollback on the production server: check out the
 * target tag, rebuild images, force-recreate every compose service, and poll
 * the health URL when one is configured.
 *
 * @param {object} ctx
 * @param {object} ctx.prod - The app's `production` registry section.
 * @param {string} ctx.remote - Git remote to fetch from.
 * @param {string} ctx.tag - Tag to check out.
 * @returns {Promise<boolean>} true when healthy (or no health_url is configured).
 * @throws {Error} If checkout, build or recreate fails.
 */
async function executeRollbackSteps({ prod, remote, tag }) {
  const server = prod.server;
  const deployDir = prod.deploy_dir;
  // Checkout the previous tag on production
  const checkoutRes = await runOnServer(server, `cd ${deployDir} && git fetch --prune ${remote} && git checkout ${tag}`, { timeout: 60000 });
  if (!checkoutRes.ok) throw new Error(`Checkout ${tag} failed: ${checkoutRes.stderr}`);
  // Rebuild + recreate via transient-container pattern (consistent with deploy()).
  // Build runs via runOnServer (local exec when server=bruno); the recreate uses
  // the transient docker:cli so kua-deploy can roll back itself reliably.
  const kvPrefix = prod.vault
    ? `kua-vault run --project ${prod.vault.project} --env ${prod.vault.env} --`
    : '';
  const buildRes = await runOnServer(server, `cd ${deployDir} && ${composeEnvPrefix(server)}${kvPrefix} docker compose build`, { timeout: 600000 });
  if (!buildRes.ok) throw new Error(`rollback build failed: ${buildRes.stderr?.slice(-500)}`);
  // Recreate all services for the rollback target. The listing is wrapped in
  // kua-vault like the /runtime-status route does: compose files that
  // interpolate vault-backed ${VAR}s otherwise produce empty/error output.
  const svcRes = await runOnServer(server, `cd ${deployDir} && ${kvPrefix} docker compose config --services`);
  const svcList = (svcRes.stdout || '').split('\n').filter(Boolean);
  const recreateRes = await recreateService({
    project: path.basename(deployDir),
    deployDir,
    services: svcList,
    force: true,
    vault: prod.vault || null,
    server,
  });
  if (!recreateRes.ok) throw new Error(`rollback recreate failed: ${recreateRes.stderr?.slice(-500) || recreateRes.error}`);
  // Health check: up to 20 probes, 5s timeout each, 3s apart.
  if (!prod.health_url) return true;
  for (let i = 0; i < 20; i++) {
    try {
      const res = await fetch(prod.health_url, { signal: AbortSignal.timeout(5000) });
      if (res.ok) return true;
    } catch { /* retry */ }
    await new Promise(r => setTimeout(r, 3000));
  }
  return false;
}

/**
 * Roll an app's production checkout back to the most recent earlier deploy
 * that succeeded and carries a real tag, then rebuild and recreate its services.
 *
 * @param {string} appName - Registry name of the app to roll back.
 * @returns {Promise<object>} `{ app, result, ... }` where result is 'success',
 *   'unhealthy', 'no_rollback_target' or 'locked'.
 * @throws {Error} If the app is unknown, or checkout/build/recreate fails. In
 *   the latter case the progress record is flipped to failed before the error
 *   is rethrown, so pollers never see a rollback stuck in "running".
 */
async function rollback(appName) {
  const app = getApp(appName);
  if (!app) throw new Error(`Unknown app: ${appName}`);
  const prod = app.production;
  const server = prod.server;
  const remote = app.git_remote || 'origin';
  // Find the previous successful deploy (index 0 is the current one).
  const history = deployHistory[appName] || [];
  const current = history[0];
  const previous = history.find((d, i) => i > 0 && d.result === 'success' && d.tag && d.tag !== 'untagged');
  if (!previous) {
    return { app: appName, result: 'no_rollback_target', message: 'No previous successful deploy with a tag found' };
  }
  if (!acquireLock(appName)) {
    return { app: appName, result: 'locked', message: 'Deploy already in progress' };
  }
  const tag = previous.tag;
  const rolledBackFrom = current?.tag || current?.commit || 'unknown';
  try {
    await writeProgress(appName, {
      action: 'rollback',
      triggered_by: 'api',
      status: 'running',
      phase: 'rollback_started',
      current_step: 'rollback',
      rolled_back_to: tag,
      rolled_back_from: rolledBackFrom,
    });
    let healthy;
    try {
      healthy = await executeRollbackSteps({ prod, remote, tag });
    } catch (err) {
      // Mirror deploy()'s failure path: leave a terminal progress record and an
      // audit line. Reporting is best-effort and must not mask the real error.
      try {
        await writeProgress(appName, {
          action: 'rollback',
          triggered_by: 'api',
          status: 'failed',
          phase: 'rollback_failed',
          current_step: 'rollback',
          result: 'failed',
          error: err.message,
          rolled_back_to: tag,
          rolled_back_from: rolledBackFrom,
          finished_at: Math.floor(Date.now() / 1000),
        });
        await audit({ action: 'rollback_failed', app: appName, error: err.message });
      } catch (reportErr) {
        fastify.log.error({ err: reportErr, app: appName }, 'failed to record rollback failure');
      }
      throw err;
    }
    const entry = {
      result: healthy ? 'success' : 'unhealthy',
      action: 'rollback',
      rolled_back_to: tag,
      rolled_back_from: rolledBackFrom,
      server,
      triggered_by: 'api',
    };
    recordDeploy(appName, entry);
    await audit({ action: 'rollback', app: appName, ...entry });
    await writeProgress(appName, {
      action: 'rollback',
      triggered_by: 'api',
      status: healthy ? 'done' : 'failed',
      phase: healthy ? 'rollback_succeeded' : 'rollback_unhealthy',
      current_step: 'done',
      result: healthy ? 'success' : 'unhealthy',
      rolled_back_to: tag,
      rolled_back_from: rolledBackFrom,
      server,
      finished_at: Math.floor(Date.now() / 1000),
    });
    return { app: appName, ...entry };
  } finally {
    releaseLock(appName);
  }
}
// =============================================================================
// APP STATUS
// =============================================================================
/**
 * Assemble a point-in-time status snapshot for one registered app: what is
 * checked out on production, what the local source repo is at, the most recent
 * recorded deploy, any progress record, and a single health probe.
 *
 * @param {string} appName - Registry name of the app.
 * @returns {Promise<object>} Status object; optional fields are added only when
 *   the corresponding lookup applies.
 * @throws {Error} If the app is not in the registry.
 */
async function appStatus(appName) {
  const app = getApp(appName);
  if (!app) {
    throw new Error(`Unknown app: ${appName}`);
  }
  const prod = app.production;
  const server = prod.server;
  const deployDir = prod.deploy_dir;
  const status = {
    app: appName,
    deploy_mode: app.deploy_mode,
    server,
    locked: Boolean(deployLocks.get(appName)),
  };

  // Production checkout: commit, nearest tag, branch. Any thrown error marks
  // the whole production side as unreachable.
  const inDeployDir = (gitCmd) => runOnServer(server, `cd ${deployDir} && ${gitCmd}`);
  try {
    status.current_commit = (await inDeployDir('git rev-parse --short HEAD')).stdout;
    status.current_tag = (await inDeployDir('git describe --tags --abbrev=0 2>/dev/null || echo "untagged"')).stdout;
    status.current_branch = (await inDeployDir('git rev-parse --abbrev-ref HEAD')).stdout;
  } catch {
    status.current_commit = 'unreachable';
  }

  // Local source repo HEAD, compared against what production reports.
  try {
    const localHead = await run(`git -C ${app.repo_dir} rev-parse --short HEAD`);
    status.dev_commit = localHead.stdout;
    status.dev_ahead = status.current_commit !== status.dev_commit;
  } catch {
    status.dev_commit = 'unknown';
  }

  // Most recent entry in the in-memory deploy history (newest first).
  const recorded = deployHistory[appName] || [];
  const lastDeploy = recorded[0];
  if (lastDeploy) {
    const { result, timestamp, commit, tag, triggered_by } = lastDeploy;
    status.last_deploy = { result, timestamp, commit, tag, triggered_by };
  }

  const progress = await readProgress(appName);
  if (progress) {
    status.progress = progress;
  }

  // One-shot health probe with a 5s timeout.
  if (!prod.health_url) {
    return status;
  }
  try {
    const probe = await fetch(prod.health_url, { signal: AbortSignal.timeout(5000) });
    status.healthy = probe.ok;
    status.health_status = probe.status;
  } catch {
    status.healthy = false;
    status.health_status = 'unreachable';
  }
  return status;
}
// =============================================================================
// ROUTES
// =============================================================================
// Liveness endpoint: reports a fixed version string and how many apps the
// loaded registry contains.
fastify.get('/health', async () => {
  const appCount = Object.keys(registry.apps).length;
  return {
    status: 'ok',
    version: '1.0.0',
    apps: appCount,
  };
});
// --- Webhook ---
// Forgejo webhook receiver (replaces forgejo-webhook.py).
// Verifies the HMAC-SHA256 signature when a secret is configured, ignores
// pushes that are not to refs/heads/production or that name an unregistered
// repo, applies the per-app rate limit, then starts a deploy in the background
// and answers immediately.
fastify.post('/webhook/forgejo', async (request, reply) => {
  const secretConfigured = Boolean(WEBHOOK_SECRET);

  // Webhook secret is mandatory in production
  if (!secretConfigured && !DEV_MODE) {
    return reply.code(503).send({ error: 'Webhook not configured: KUA_DEPLOY_WEBHOOK_SECRET is not set' });
  }

  if (secretConfigured) {
    const signature = request.headers['x-forgejo-signature'] || request.headers['x-gitea-signature'] || '';
    if (!signature) {
      return reply.code(401).send({ error: 'Missing webhook signature' });
    }
    // rawBody must have been captured by the content-type parser; reject if it wasn't
    const { rawBody } = request;
    if (!rawBody) {
      return reply.code(400).send({ error: 'Could not read raw request body for HMAC verification' });
    }
    const digest = crypto.createHmac('sha256', WEBHOOK_SECRET).update(rawBody).digest('hex');
    const given = Buffer.from(signature);
    const wanted = Buffer.from(digest);
    // timingSafeEqual throws on unequal lengths, so the length test comes first.
    const signatureOk = given.length === wanted.length && crypto.timingSafeEqual(given, wanted);
    if (!signatureOk) {
      return reply.code(401).send({ error: 'Invalid webhook signature' });
    }
  }

  const payload = request.body || {};
  const ref = payload.ref || '';
  const repository = payload.repository || {};
  const repoName = repository.name || '';

  if (ref !== 'refs/heads/production') {
    return { ignored: true, reason: `not production branch (got ${ref})` };
  }
  if (!getApp(repoName)) {
    fastify.log.warn(`Webhook for unknown repo: ${repoName}`);
    return { ignored: true, reason: `unknown repo: ${repoName}` };
  }
  if (!checkWebhookRateLimit(repoName)) {
    fastify.log.warn(`Webhook rate limit exceeded for ${repoName}`);
    return reply.code(429).send({ error: `Rate limit exceeded for ${repoName} — max ${WEBHOOK_RATE_LIMIT} triggers per minute` });
  }

  fastify.log.info(`Webhook: deploying ${repoName} (push to production)`);
  await writeProgress(repoName, {
    action: 'deploy',
    triggered_by: 'webhook',
    status: 'running',
    phase: 'webhook_received',
    current_step: 'webhook_received',
    ref,
    repo: repoName,
  });

  // Background deploy. Errors are caught inside the task so no rejection can
  // escape the intentionally un-awaited promise.
  const runDeployInBackground = async () => {
    try {
      const result = await deploy(repoName, { triggered_by: 'webhook' });
      fastify.log.info(`Deploy ${repoName}: ${result.result}`);
    } catch (err) {
      fastify.log.error(`Deploy ${repoName} failed: ${err.message}`);
    }
  };
  void runDeployInBackground();

  return { triggered: true, app: repoName };
});
// --- Apps ---
// Registry dump — returns the loaded registry object as-is. Used by
// kua-mcp-core to discover all apps at startup without relying on a
// filesystem path that may not resolve inside its container.
fastify.get('/api/v1/apps/registry', async () => registry);
// List all apps: one status snapshot per registered app, gathered one after
// another. An app whose status lookup throws is reported as { app, error }
// instead of failing the whole listing.
fastify.get('/api/v1/apps', async () => {
  const describeApp = async (name) => {
    try {
      return await appStatus(name);
    } catch (err) {
      return { app: name, error: err.message };
    }
  };

  const apps = [];
  for (const name of getAllApps()) {
    apps.push(await describeApp(name));
  }
  return { apps };
});
// Single app status.
// Unknown apps get the same 404 { ok: false, error } body the other per-app
// routes return, instead of a 500 from appStatus() throwing.
fastify.get('/api/v1/apps/:app', async (request, reply) => {
  const { app } = request.params;
  if (!getApp(app)) return reply.code(404).send({ ok: false, error: `Unknown app: ${app}` });
  return await appStatus(app);
});
// Deploy history (newest first).
// Query: ?limit=N — positive integer, defaults to 20. Zero, negative or
// non-numeric values fall back to the default; a negative value would
// otherwise make slice(0, -n) silently drop the *oldest* n entries.
fastify.get('/api/v1/apps/:app/deploys', async (request) => {
  const { app } = request.params;
  const requested = Number.parseInt(request.query.limit, 10);
  const limit = Number.isInteger(requested) && requested > 0 ? requested : 20;
  // Array check keeps odd path params (e.g. "__proto__") from resolving to a
  // non-array value and crashing on .slice.
  const history = Array.isArray(deployHistory[app]) ? deployHistory[app] : [];
  return { app, deploys: history.slice(0, limit), total: history.length };
});
// --- Actions ---
// Release (merge main→production, tag, push — triggers webhook deploy).
// Body: { message?, source_branch?, target_branch? }. A missing or empty
// message falls back to the default release message.
fastify.post('/api/v1/apps/:app/release', async (request) => {
  const body = request.body || {};
  const releaseMessage = body.message || 'Release to production';
  const branches = {
    source_branch: body.source_branch,
    target_branch: body.target_branch,
  };
  const outcome = await release(request.params.app, releaseMessage, branches);
  return outcome;
});
// Direct deploy (skip release, just pull + build + deploy on production).
// Body: { force? } — only a literal `true` enables force.
fastify.post('/api/v1/apps/:app/deploy', async (request, reply) => {
  const { app } = request.params;
  const body = request.body || {};
  const forceRequested = body.force === true;

  if (!getApp(app)) {
    return reply.code(404).send({ ok: false, error: `Unknown app: ${app}` });
  }

  await writeProgress(app, {
    action: 'deploy',
    triggered_by: 'api',
    status: 'running',
    phase: 'api_received',
    current_step: 'api_received',
  });

  // Fire-and-forget — mirrors /webhook/forgejo. A blocking response held the
  // HTTP connection for the full ~3-min deploy; the kua-mcp-core ssh+curl
  // pipe tore down on idle (http_code 000 -> "via ssh failed (0): {}") even
  // though the deploy succeeded server-side. Caller polls /progress.
  const runDeployInBackground = async () => {
    try {
      const result = await deploy(app, { force: forceRequested, triggered_by: 'api' });
      fastify.log.info(`Deploy ${app}: ${result.result}`);
    } catch (err) {
      fastify.log.error(`Deploy ${app} failed: ${err.message}`);
    }
  };
  void runDeployInBackground();

  return { triggered: true, app };
});
// Force rebuild + recreate using the authoritative deploy path.
// Body: { force? } — force stays on unless the caller sends a literal `false`.
fastify.post('/api/v1/apps/:app/rebuild', async (request, reply) => {
  const { app } = request.params;
  const body = request.body || {};
  const forceEnabled = body.force !== false;

  if (!getApp(app)) {
    return reply.code(404).send({ ok: false, error: `Unknown app: ${app}` });
  }

  await writeProgress(app, {
    action: 'rebuild',
    triggered_by: 'api_rebuild',
    status: 'running',
    phase: 'api_received',
    current_step: 'api_received',
  });

  // Fire-and-forget: the response returns at once and the caller polls
  // /progress; failures are logged inside the background task.
  const runRebuildInBackground = async () => {
    try {
      const result = await deploy(app, {
        force: forceEnabled,
        action: 'rebuild',
        triggered_by: 'api_rebuild',
      });
      fastify.log.info(`Rebuild ${app}: ${result.result}`);
    } catch (err) {
      fastify.log.error(`Rebuild ${app} failed: ${err.message}`);
    }
  };
  void runRebuildInBackground();

  return { triggered: true, app };
});
// Rollback — blocks until the rollback finishes and returns its result entry.
// Unknown apps get the same 404 { ok: false, error } body the other per-app
// routes return, instead of a 500 from rollback() throwing.
fastify.post('/api/v1/apps/:app/rollback', async (request, reply) => {
  const { app } = request.params;
  if (!getApp(app)) return reply.code(404).send({ ok: false, error: `Unknown app: ${app}` });
  return await rollback(app);
});
// --- Deploy Progress ---
// Read the progress record for an app. No record -> "idle". A record that is
// not in a terminal status and has not been updated for more than 300 seconds
// is flagged as stale.
fastify.get('/api/v1/apps/:app/progress', async (request) => {
  const { app } = request.params;
  const progress = await readProgress(app);
  if (!progress) {
    return { ok: false, app, status: 'idle', message: 'No active or recent deployment' };
  }

  const terminalStatuses = ['done', 'failed', 'blocked'];
  const nowSeconds = Math.floor(Date.now() / 1000);
  const ageSeconds = nowSeconds - (progress.updated_at || 0);
  const isStale = ageSeconds > 300 && !terminalStatuses.includes(progress.status);

  return isStale
    ? { ok: false, app, stale: true, age_s: ageSeconds, ...progress }
    : { ok: true, ...progress };
});
// External progress reporter — used by forgejo-webhook.py to push phase updates.
// The request body is handed to writeProgress unchanged; the response echoes
// whatever writeProgress returns.
fastify.patch('/api/v1/apps/:app/progress', async (request, reply) => {
  const appName = request.params.app;
  if (!getApp(appName)) {
    return reply.code(404).send({ ok: false, error: `Unknown app: ${appName}` });
  }
  const written = await writeProgress(appName, request.body || {});
  return { ok: true, ...written };
});
// NOTE: the route that force-clears a stale running progress state (POST /progress/reset; admin only — already requires auth via hook) is registered further down, after /runtime-status.
// GET /api/v1/apps/:app/runtime-status — per-service running-vs-expected
// image SHA and freshness. Read-only. Powers release-app post-verify and
// the deploy.status MCP tool's stale-detection field. Added 2026-05-20
// alongside the post-deploy verify; together they make "kua-deploy says
// done" actually mean "container is running the just-built image."
//
// Responses: 401 without a valid admin bearer token, 404 for an unregistered
// app, 400 when no deploy directory can be resolved, 500 if a lookup throws.
fastify.get('/api/v1/apps/:app/runtime-status', async (request, reply) => {
  // Admin bearer check. Compared in constant time (same approach as the
  // webhook HMAC check) and failing closed when ADMIN_TOKEN is unset or empty,
  // so "Bearer " with an empty token can never match.
  const auth = request.headers.authorization || '';
  const presented = Buffer.from(auth.startsWith('Bearer ') ? auth.slice(7) : '');
  const expectedToken = Buffer.from(ADMIN_TOKEN || '');
  const authorized = expectedToken.length > 0
    && presented.length === expectedToken.length
    && crypto.timingSafeEqual(presented, expectedToken);
  if (!authorized) {
    return reply.code(401).send({ error: 'unauthorized' });
  }
  const appName = request.params.app;
  const cfg = getApp(appName);
  if (!cfg) return reply.code(404).send({ error: 'app not in registry' });
  const prod = cfg.production || {};
  const server = prod.server || cfg.deploy_server || 'bruno';
  const deployDir = prod.deploy_dir || cfg.repo_dir;
  if (!deployDir) return reply.code(400).send({ error: 'no deploy_dir for app' });
  // kua-vault wrap — compose files in this org use ${VAR} interpolations
  // backed by vault-injected env (KUA_SESSIONS_ADMIN_TOKEN, AGENT_API_KEY,
  // STRIPE_*, etc). Without the wrap, `docker compose config --services`
  // emits empty/error output for most apps, which made /runtime-status
  // return services: [] (the original symptom the coordinator session
  // diagnosed). Mirrors the canonical kvPrefix pattern from deploy().
  const kvPrefix = prod.vault
    ? `kua-vault run --project ${prod.vault.project} --env ${prod.vault.env} --`
    : '';
  // Image IDs may or may not carry the "sha256:" prefix depending on which
  // docker command printed them; compare without it.
  const stripSha = (s) => (s || '').replace(/^sha256:/, '');
  try {
    const svcRes = await runOnServer(server, `cd ${deployDir} && ${kvPrefix} docker compose config --services`);
    const services = (svcRes.stdout || '').split('\n').filter(Boolean);
    const out = [];
    let anyStale = false;
    for (const svc of services) {
      // Expected = image compose resolves for the service; running = image of
      // the service's current container (if there is one).
      const exp = await runOnServer(server, `cd ${deployDir} && ${kvPrefix} docker compose images --quiet ${svc} 2>/dev/null | head -1`);
      const cid = await runOnServer(server, `cd ${deployDir} && ${kvPrefix} docker compose ps --quiet ${svc} 2>/dev/null | head -1`);
      const expectedSha = (exp.stdout || '').trim();
      const containerId = (cid.stdout || '').trim();
      let running_image_sha = null;
      let started_at = null;
      let state = null;
      let health = null;
      if (containerId) {
        const insp = await runOnServer(server, `docker inspect --format '{{.Image}}|{{.State.StartedAt}}|{{.State.Status}}|{{if .State.Health}}{{.State.Health.Status}}{{else}}n/a{{end}}' ${containerId}`);
        const parts = (insp.stdout || '').trim().split('|');
        running_image_sha = parts[0] || null;
        started_at = parts[1] || null;
        state = parts[2] || null;
        health = parts[3] || null;
      }
      // Stale only when both SHAs are known and differ; a missing value on
      // either side is reported as not stale.
      const stale = !!expectedSha && !!running_image_sha && stripSha(expectedSha) !== stripSha(running_image_sha);
      if (stale) anyStale = true;
      out.push({ service: svc, container_id: containerId || null, expected_image_sha: expectedSha || null, running_image_sha, started_at, state, health, stale });
    }
    return { ok: true, app: appName, server, services: out, any_stale: anyStale, checked_at: new Date().toISOString() };
  } catch (e) {
    return reply.code(500).send({ ok: false, error: String(e?.message || e) });
  }
});
// Force-clear a stale running progress state: deletes the app's progress file
// (a missing file is fine) and drops the in-memory deploy lock.
fastify.post('/api/v1/apps/:app/progress/reset', async (request, reply) => {
  const appName = request.params.app;
  if (!getApp(appName)) {
    return reply.code(404).send({ ok: false, error: `Unknown app: ${appName}` });
  }

  const target = progressFilePath(appName);
  try {
    await fs.rm(target, { force: true });
  } catch {
    /* already gone */
  }

  // also clear any in-memory lock
  releaseLock(appName);
  return { ok: true, app: appName, message: 'Progress state and lock cleared' };
});
// Force-release the in-memory deploy lock (admin only — allows recovery without container restart).
// The response reports whether a lock was actually held before the release.
fastify.post('/api/v1/apps/:app/unlock', async (request, reply) => {
  const appName = request.params.app;
  if (!getApp(appName)) {
    return reply.code(404).send({ ok: false, error: `Unknown app: ${appName}` });
  }

  const hadLock = deployLocks.has(appName);
  releaseLock(appName);

  const message = hadLock ? 'Lock released' : 'No lock was held';
  return { ok: true, app: appName, had_lock: hadLock, message };
});
// --- Alerts ---
// Alert sweep over every registered app, one app at a time. Per app it emits:
//   critical — health URL answers non-2xx (unhealthy) or cannot be fetched (unreachable)
//   info     — local source HEAD differs from the production checkout (dev_ahead)
//   warning  — newest recorded deploy has result "failed"
fastify.get('/api/v1/alerts', async () => {
  const alerts = [];

  for (const name of getAllApps()) {
    const app = getApp(name);
    const prod = app.production;
    const healthUrl = prod.health_url;

    // Check health
    if (healthUrl) {
      try {
        const probe = await fetch(healthUrl, { signal: AbortSignal.timeout(5000) });
        if (!probe.ok) {
          alerts.push({ app: name, severity: 'critical', type: 'unhealthy', status: probe.status, url: healthUrl });
        }
      } catch {
        alerts.push({ app: name, severity: 'critical', type: 'unreachable', url: healthUrl });
      }
    }

    // Check if dev is ahead
    try {
      const devHead = await run(`git -C ${app.repo_dir} rev-parse --short HEAD`);
      const devCommit = devHead.stdout;
      const prodHead = await runOnServer(prod.server, `cd ${prod.deploy_dir} && git rev-parse --short HEAD`);
      const prodCommit = prodHead.stdout;
      const bothKnown = devCommit && prodCommit;
      if (bothKnown && devCommit !== prodCommit) {
        alerts.push({ app: name, severity: 'info', type: 'dev_ahead', dev: devCommit, prod: prodCommit });
      }
    } catch {
      /* skip if unreachable */
    }

    // Check last deploy
    const recorded = deployHistory[name] || [];
    const lastDeploy = recorded[0];
    if (lastDeploy?.result === 'failed') {
      alerts.push({ app: name, severity: 'warning', type: 'last_deploy_failed', timestamp: lastDeploy.timestamp });
    }
  }

  return { alerts, count: alerts.length };
});
// --- Audit ---
// Audit log reader — newest entries first.
// Query: ?limit=N — positive integer, defaults to 50. Zero, negative or
// non-numeric values fall back to the default; a negative value would
// otherwise make slice(0, -n) drop entries from the wrong end.
// Lines that are not valid JSON are returned as { error: 'unparseable', raw }.
fastify.get('/api/v1/audit', async (request) => {
  const requested = Number.parseInt(request.query.limit, 10);
  const limit = Number.isInteger(requested) && requested > 0 ? requested : 50;
  try {
    const data = await fs.readFile(AUDIT_LOG_FILE, 'utf-8');
    // Drop blank lines so an empty file reports zero entries rather than one
    // bogus "unparseable" record.
    const lines = data.split('\n').filter((line) => line.trim() !== '').reverse();
    const logs = lines.slice(0, limit).map((line) => {
      try {
        return JSON.parse(line);
      } catch {
        return { error: 'unparseable', raw: line };
      }
    });
    return { logs, total: lines.length };
  } catch (err) {
    // A missing log file just means nothing has been audited yet; anything
    // else is worth a log line, but the endpoint stays best-effort.
    if (err?.code !== 'ENOENT') {
      fastify.log.warn({ err }, 'could not read audit log');
    }
    return { logs: [], total: 0 };
  }
});
// =============================================================================
// START
// =============================================================================
// Process-wide safety net: log any promise rejection that nothing handled,
// together with the offending promise.
process.on('unhandledRejection', function logUnhandledRejection(reason, promise) {
  const details = { reason, promise };
  fastify.log.error(details, 'Unhandled promise rejection — investigate immediately');
});
// completeSelfRecreate — on startup, if the previous kua-deploy left progress
// in "phase: self_recreate_pending" state, this NEW instance verifies the
// recreate landed (image SHA matches what was built, container is fresh) and
// flips progress to succeeded or failed. Without this, release-app would poll
// /progress forever after a self-deploy.
//
// Returns nothing. It is a no-op when there is no readable progress file or
// the recorded phase is anything other than self_recreate_pending.
async function completeSelfRecreate() {
  const progressFile = path.join(DATA_DIR, `progress-${SELF_APP_NAME}.json`);
  let progress;
  try {
    progress = JSON.parse(await fs.readFile(progressFile, 'utf-8'));
  } catch { return; /* no progress file yet */ }
  if (progress.phase !== 'self_recreate_pending') return;
  const expectedSha = progress.self_recreate_expected_image || null;
  const recreateStartedAt = progress.self_recreate_started_at
    ? new Date(progress.self_recreate_started_at)
    : null;
  // Check our own container via docker
  let runningSha = null;
  let startedAtStr = null;
  let state = null;
  try {
    const insp = await run(`docker inspect ${SELF_APP_NAME} --format '{{.Image}}|{{.State.StartedAt}}|{{.State.Status}}'`);
    if (insp.ok) {
      // Trim first (as /runtime-status does) so a trailing newline cannot make
      // the state comparison below fail.
      const parts = (insp.stdout || '').trim().split('|');
      runningSha = parts[0] || null;
      startedAtStr = parts[1] || null;
      state = parts[2] || null;
    }
  } catch (err) {
    // docker unreachable — the fields stay null, so the verify below fails and
    // the deploy is recorded as failed rather than left pending.
    fastify.log.warn({ err }, 'docker inspect failed during self-recreate verify');
  }
  // Normalize: strip sha256: prefix from both sides for tolerant comparison.
  const normSha = s => (s || '').replace(/^sha256:/, '').trim();
  const imageMatches = expectedSha && runningSha && normSha(expectedSha) === normSha(runningSha);
  const freshlyStarted = startedAtStr && recreateStartedAt && new Date(startedAtStr) >= recreateStartedAt;
  const ok = !!(imageMatches && freshlyStarted && state === 'running');
  const verify = {
    expected_image_sha: expectedSha,
    running_image_sha: runningSha,
    started_at: startedAtStr,
    recreate_started_at: progress.self_recreate_started_at,
    state,
    image_matches: !!imageMatches,
    freshly_started: !!freshlyStarted,
    completed_at: new Date().toISOString(),
  };
  // Mark the deploy step done, append the verify, flip phase + status
  const steps = Array.isArray(progress.steps) ? progress.steps : [];
  const deployIdx = steps.findIndex(s => s.step === 'deploy');
  if (deployIdx >= 0) {
    steps[deployIdx] = ok
      ? { ...steps[deployIdx], status: 'done', verify, note: 'self-recreate completed; verified by NEW kua-deploy on startup' }
      : { ...steps[deployIdx], status: 'failed', error: 'self-recreate verify failed', verify };
  }
  const nowSeconds = Math.floor(Date.now() / 1000);
  const updated = {
    ...progress,
    phase: ok ? 'succeeded' : 'failed',
    // Same terminal vocabulary deploy() writes: 'done' only on success, so a
    // poller keyed on status cannot read a failed verify as a finished deploy.
    status: ok ? 'done' : 'failed',
    result: ok ? 'success' : 'failed',
    current_step: ok ? 'done' : 'deploy',
    steps,
    updated_at: nowSeconds,
    finished_at: nowSeconds,
    self_recreate_completed: true,
  };
  // Clear the marker fields
  delete updated.self_recreate_expected_image;
  delete updated.self_recreate_started_at;
  delete updated.self_recreate_stateless;
  await fs.writeFile(progressFile, JSON.stringify(updated, null, 2));
  if (ok) {
    fastify.log.info({ ok, verify }, 'self-recreate completed and verified');
  } else {
    fastify.log.warn({ ok, verify }, 'self-recreate verification failed');
  }
}
// Boot sequence: warn about a disabled webhook, load the registry and deploy
// history, make sure the data directory exists, finish any interrupted
// self-recreate, then listen on 0.0.0.0:3200. Any failure outside the
// self-recreate recovery is logged and ends the process with exit code 1.
const start = async () => {
  try {
    // WEBHOOK_SECRET is optional — the Forgejo webhook path is now retired in
    // favour of the admin API (/api/v1/apps/:app/deploy). The handler remains
    // but returns 503 when the secret is absent, which is safe.
    const webhookDisabled = !DEV_MODE && !WEBHOOK_SECRET;
    if (webhookDisabled) {
      fastify.log.warn('KUA_DEPLOY_WEBHOOK_SECRET not set — /webhook/forgejo will return 503. Set the secret to re-enable Forgejo push triggers.');
    }

    await loadRegistry();
    await loadHistory();
    await fs.mkdir(DATA_DIR, { recursive: true });

    // Self-recreate recovery — handles handoff from previous instance that was
    // killed mid-recreate during a kua-deploy self-deploy. Idempotent.
    // A failure here is logged and does not stop startup.
    try {
      await completeSelfRecreate();
    } catch (e) {
      fastify.log.error({ err: e }, 'completeSelfRecreate failed (non-fatal)');
    }

    const listenOptions = { port: 3200, host: '0.0.0.0' };
    await fastify.listen(listenOptions);
  } catch (err) {
    fastify.log.error(err);
    process.exit(1);
  }
};
start();