Compare commits
3 Commits
main
...
production
| Author | SHA1 | Date |
|---|---|---|
|
|
ed476804f9 | |
|
|
d75a145d90 | |
|
|
e33b1e96cb |
|
|
@ -5,8 +5,11 @@
|
|||
# - kua-services: reach kua-vault, kua-db, kua-mcp-core, etc.
|
||||
# - production_proxy: reach forgejo (git operations) + Caddy edge labels
|
||||
#
|
||||
# Registry: deploy-registry.json is bind-mounted from coder-core's checkout
|
||||
# during this transition. Future cleanup can migrate it into this repo.
|
||||
# Registry: ENGINE-OWNED at /app/data/registry.json (on the kua-deploy-data volume),
|
||||
# mutated only via the authenticated API (PUT/PATCH/DELETE /api/v1/apps/:app). The old
|
||||
# git bind-mount was removed 2026-05-26 — git is no longer in the registry path. On a
|
||||
# fresh volume the engine seeds from KUA_REGISTRY_SEED (default /app/deploy-registry.json)
|
||||
# if present; the cutover pre-seeds /app/data/registry.json from the live registry first.
|
||||
services:
|
||||
kua-deploy:
|
||||
build:
|
||||
|
|
@ -31,7 +34,6 @@ services:
|
|||
- kua-deploy-data:/app/data
|
||||
- /root/.ssh:/root/.ssh:ro
|
||||
- /root/apps:/root/apps
|
||||
- /root/apps/coder-core/services/kua-deploy/deploy-registry.json:/app/deploy-registry.json:ro
|
||||
- /usr/local/bin/kua-vault:/usr/local/bin/kua-vault:ro
|
||||
- /root/.config/kua-vault:/root/.config/kua-vault:ro
|
||||
networks:
|
||||
|
|
|
|||
464
server.js
464
server.js
|
|
@ -21,12 +21,31 @@ function validateMessage(msg) {
|
|||
return msg;
|
||||
}
|
||||
|
||||
// Validate a request-supplied git branch name before it is interpolated into a
|
||||
// shell command in release(). Registry-derived defaults are trusted and skip this.
|
||||
// Allow-list for branch names: letters, digits, '.', '_', '/', '-'; 1-200 chars.
const SAFE_BRANCH_RE = /^[A-Za-z0-9._/-]{1,200}$/;

// Returns `name` unchanged when it is an acceptable branch name; throws otherwise.
// `label` only appears in the error message so the caller can tell which field
// was rejected.
function validateBranchName(name, label) {
  const acceptable =
    typeof name === 'string' &&
    SAFE_BRANCH_RE.test(name) &&
    !name.includes('..') &&   // no parent-range / traversal sequences
    !name.startsWith('-') &&  // must not be parsed as a command-line option
    !name.includes('@{');     // no reflog-style syntax
  if (acceptable) {
    return name;
  }
  throw new Error(`Invalid ${label}: ${JSON.stringify(name)} — must match ${SAFE_BRANCH_RE}, with no '..', leading '-', or '@{'`);
}
|
||||
|
||||
// --- Configuration ---
|
||||
const DATA_DIR = path.join(process.cwd(), 'data');
|
||||
const LOG_DIR = path.join(process.cwd(), 'logs');
|
||||
const AUDIT_LOG_FILE = path.join(LOG_DIR, 'audit.log');
|
||||
// Audit log moved under DATA_DIR (the only mounted/persistent volume) — the old
|
||||
// /app/logs path was never mounted, so the audit trail was lost on every restart.
|
||||
const LOG_DIR = DATA_DIR;
|
||||
const AUDIT_LOG_FILE = path.join(DATA_DIR, 'audit.log');
|
||||
const DEPLOY_HISTORY_FILE = path.join(DATA_DIR, 'deploys.json');
|
||||
const REGISTRY_FILE = path.join(process.cwd(), 'deploy-registry.json');
|
||||
// The registry is ENGINE-OWNED runtime state on the persistent volume — NOT a
|
||||
// git-committed bind-mount. Mutated only via the authenticated API below. On first
|
||||
// boot it seeds from the legacy bind-mount (if still present) or a seed file, then
|
||||
// owns the file thereafter. See REGISTRY_SEED.
|
||||
const REGISTRY_FILE = path.join(DATA_DIR, 'registry.json');
|
||||
const REGISTRY_SEED = process.env.KUA_REGISTRY_SEED || path.join(process.cwd(), 'deploy-registry.json');
|
||||
const REGISTRY_EVENTS_FILE = path.join(DATA_DIR, 'registry-events.jsonl');
|
||||
const ADMIN_TOKEN = process.env.KUA_DEPLOY_ADMIN_TOKEN;
|
||||
const TAILSCALE_SOCKET = '/var/run/tailscale/tailscaled.sock';
|
||||
const HOSTNAME = process.env.HOSTNAME || 'gal';
|
||||
|
|
@ -93,14 +112,106 @@ function getDeployId(app) {
|
|||
return deployLocks.get(app)?.deployId ?? null;
|
||||
}
|
||||
|
||||
// --- Load Registry ---
|
||||
// --- Load Registry (engine-owned) ---
|
||||
let registry = { apps: {} };
|
||||
|
||||
// Seed-once: if the engine-owned file is missing, import from the legacy seed
|
||||
// (the old git bind-mount) so we don't lose the existing apps on cutover. NEVER
|
||||
// overwrites an existing engine-owned file.
|
||||
// Create the engine-owned registry file on first boot.
//
// Returns false when REGISTRY_FILE already exists (nothing is touched) and true
// when this call created it. The initial content is REGISTRY_SEED when that file
// is readable, parses as JSON and carries an "apps" object; otherwise an empty
// registry is written so a later load still succeeds.
//
// Throws when the existence probe fails for a reason other than "no such file",
// or when the data directory / registry file cannot be written.
async function seedRegistryIfMissing() {
  try {
    await fs.access(REGISTRY_FILE);
    return false; // already engine-owned
  } catch (err) {
    // Only a missing file means "seed it". Any other failure (permissions, a
    // non-directory path component, ...) is surfaced instead of being treated
    // as a licence to write over whatever is there.
    if (err.code !== 'ENOENT') throw err;
  }

  let seed;
  let adoptedSeed = false;
  try {
    const raw = await fs.readFile(REGISTRY_SEED, 'utf-8');
    const parsed = JSON.parse(raw); // validate it parses before adopting
    const apps = parsed?.apps;
    if (!apps || typeof apps !== 'object' || Array.isArray(apps)) {
      // A seed without an "apps" object would be adopted once and then rejected
      // by every subsequent load, with no chance to re-seed.
      throw new Error('seed has no "apps" object');
    }
    seed = raw;
    adoptedSeed = true;
  } catch (err) {
    fastify.log.warn(`No registry seed at ${REGISTRY_SEED} (${err.message}); starting with empty registry`);
    seed = JSON.stringify({ apps: {} }, null, 2) + '\n';
  }

  await fs.mkdir(DATA_DIR, { recursive: true });
  try {
    // 'wx' = create exclusively: fail instead of overwriting if the file appeared
    // between the probe above and this write.
    await fs.writeFile(REGISTRY_FILE, seed, { encoding: 'utf-8', flag: 'wx' });
  } catch (err) {
    if (err.code === 'EEXIST') return false;
    throw err;
  }

  if (adoptedSeed) {
    fastify.log.info(`Registry seeded from ${REGISTRY_SEED} -> ${REGISTRY_FILE}`);
  } else {
    fastify.log.info(`Registry initialized empty at ${REGISTRY_FILE} (no usable seed)`);
  }
  return true;
}
|
||||
|
||||
// Load the registry file into the in-memory `registry`.
//
// Seeds the file first if it does not exist, then reads and parses it. The parsed
// document is validated BEFORE `registry` is reassigned, so any failure (read
// error, invalid JSON, wrong shape) throws and leaves the previously loaded
// registry in place.
async function loadRegistry() {
  await seedRegistryIfMissing();
  const data = await fs.readFile(REGISTRY_FILE, 'utf-8');
  const parsed = JSON.parse(data);
  const apps = parsed?.apps;
  // `typeof null` and `typeof []` are both 'object', so test those explicitly:
  // either would otherwise be accepted here and crash later lookups.
  if (!parsed || typeof parsed !== 'object' || !apps || typeof apps !== 'object' || Array.isArray(apps)) {
    throw new Error('registry.json malformed: missing "apps" object');
  }
  registry = parsed;
  fastify.log.info(`Registry loaded: ${Object.keys(registry.apps).length} apps`);
}
|
||||
|
||||
// Atomic write of the in-memory registry to the engine-owned file (temp + rename).
|
||||
// Persist the in-memory registry to REGISTRY_FILE.
//
// The JSON is written to a uniquely named temp file next to the target and then
// renamed over it, so readers never observe a half-written registry. If either
// step fails the temp file is removed (best-effort) and the original error is
// rethrown; the existing registry file is left as it was.
async function writeRegistry() {
  await fs.mkdir(DATA_DIR, { recursive: true });
  const tmp = `${REGISTRY_FILE}.tmp.${process.pid}.${Date.now()}`;
  try {
    await fs.writeFile(tmp, JSON.stringify(registry, null, 2) + '\n', 'utf-8');
    await fs.rename(tmp, REGISTRY_FILE);
  } catch (err) {
    // Don't let failed writes accumulate stray temp files on the data volume.
    // Cleanup failure is ignored on purpose: the write error is what matters.
    await fs.unlink(tmp).catch(() => {});
    throw err;
  }
}
|
||||
|
||||
// --- Registry mutation lock (engine is single-process; serialize writers) ---
|
||||
// True while a registry mutation is executing inside withRegistryLock().
let registryMutating = false;
// Tail of the waiter chain: resolves when the most recently queued mutation ends.
let registryLockTail = Promise.resolve();

// Run `fn` with exclusive access to the registry and return (or rethrow) its result.
//
// Callers are served strictly in arrival order: each call waits on the promise of
// the call queued before it, so there is no polling delay and no caller can be
// overtaken. The lock is released even when `fn` throws.
async function withRegistryLock(fn) {
  const previous = registryLockTail;
  let release;
  registryLockTail = new Promise((resolve) => { release = resolve; });
  await previous; // never rejects: it is only ever resolved by `release`
  registryMutating = true;
  try {
    return await fn();
  } finally {
    registryMutating = false;
    release();
  }
}
|
||||
|
||||
// Append-only hash-chained audit of registry mutations.
|
||||
// Hash of the most recently appended event; null until first use in this process.
let lastRegistryEventHash = null;

// Best-effort recovery of the chain head: the event_hash of the last line in the
// events file, or '' when the file is absent, empty or unparseable.
async function recoverLastRegistryEventHash() {
  try {
    const raw = await fs.readFile(REGISTRY_EVENTS_FILE, 'utf-8');
    const entries = raw.trim().split('\n').filter(Boolean);
    if (entries.length === 0) return '';
    return JSON.parse(entries[entries.length - 1]).event_hash || '';
  } catch {
    return '';
  }
}

// Append one registry-mutation event to the JSONL audit file and return the
// stored record. Each record carries the previous record's hash (prev_hash) and
// its own SHA-256 (event_hash) computed over the record before event_hash is set.
async function appendRegistryEvent(ev) {
  if (lastRegistryEventHash === null) {
    lastRegistryEventHash = await recoverLastRegistryEventHash();
  }

  const record = {
    ...ev,
    ts: new Date().toISOString(),
    prev_hash: lastRegistryEventHash,
  };
  const digest = crypto.createHash('sha256');
  digest.update(JSON.stringify(record));
  record.event_hash = digest.digest('hex');

  await fs.mkdir(DATA_DIR, { recursive: true });
  await fs.appendFile(REGISTRY_EVENTS_FILE, JSON.stringify(record) + '\n', 'utf-8');

  // Advance the chain head only after the line is safely on disk.
  lastRegistryEventHash = record.event_hash;
  return record;
}
|
||||
|
||||
// Validate a registry entry coming from the API (request input — privileged but typed).
|
||||
// Server names an entry may target: comma-separated KUA_REGISTRY_SERVERS, or the
// built-in default. Blank items (e.g. from a trailing comma) are dropped so an
// empty server name can never be considered valid.
const REGISTRY_SERVERS = new Set(
  (process.env.KUA_REGISTRY_SERVERS || 'bruno,gal,genesis,eva')
    .split(',')
    .map(s => s.trim())
    .filter(Boolean));

// Characters accepted in values that are interpolated unquoted into shell commands.
const SHELL_SAFE_VALUE_RE = /^[A-Za-z0-9._/@:+%~-]+$/;

// Validate an app name and its registry entry.
// Returns an array of human-readable problems; an empty array means the entry is
// acceptable. Never throws.
function validateEntry(name, entry) {
  const errs = [];
  // typeof null and typeof [] are both 'object'; neither is a usable record.
  const isRecord = (v) => v !== null && typeof v === 'object' && !Array.isArray(v);
  // repo_dir / repo_url / deploy_dir are spliced unquoted into shell command
  // strings elsewhere in this file (cd, test -e, git -C, git clone). Reject
  // whitespace and shell metacharacters, and a leading '-' that would be read as
  // a command-line option. Type and presence are reported by the other checks.
  const checkShellSafe = (label, value) => {
    if (typeof value !== 'string' || !value) return;
    if (!SHELL_SAFE_VALUE_RE.test(value) || value.startsWith('-')) {
      errs.push(`${label} contains characters that are unsafe in a shell command (allowed: [A-Za-z0-9._/@:+%~-], no leading '-')`);
    }
  };

  if (!/^[a-z0-9][a-z0-9._-]*$/.test(name)) errs.push(`app name "${name}" must be lowercase [a-z0-9._-]`);
  if (!isRecord(entry)) { errs.push('entry must be an object'); return errs; }

  const reqTop = ['repo_dir', 'source_branch', 'deploy_branch', 'deploy_mode'];
  for (const k of reqTop) {
    if (typeof entry[k] !== 'string' || !entry[k]) errs.push(`missing/invalid "${k}"`);
  }
  checkShellSafe('repo_dir', entry.repo_dir);
  if (entry.deploy_mode && !['direct', 'webhook', 'script'].includes(entry.deploy_mode)) {
    errs.push(`deploy_mode "${entry.deploy_mode}" not direct|webhook|script`);
  }
  for (const k of ['source_branch', 'deploy_branch']) {
    if (!entry[k]) continue;
    try { validateBranchName(entry[k], k); } catch (e) { errs.push(e.message); }
  }
  if (entry.repo_url && typeof entry.repo_url !== 'string') errs.push('repo_url must be a string');
  checkShellSafe('repo_url', entry.repo_url);

  const prod = entry.production;
  if (!isRecord(prod)) { errs.push('missing "production" object'); return errs; }
  if (!REGISTRY_SERVERS.has(prod.server)) {
    errs.push(`production.server "${prod.server}" not in allowed set {${[...REGISTRY_SERVERS].join(',')}}`);
  }
  if (typeof prod.deploy_dir !== 'string' || !prod.deploy_dir.startsWith('/')) {
    errs.push('production.deploy_dir must be an absolute path');
  }
  checkShellSafe('production.deploy_dir', prod.deploy_dir);
  if ('has_migrations' in prod && typeof prod.has_migrations !== 'boolean') {
    errs.push('production.has_migrations must be boolean');
  }
  if (prod.stateful_services && !Array.isArray(prod.stateful_services)) {
    errs.push('production.stateful_services must be an array');
  }
  return errs;
}
|
||||
|
||||
// Compare two registry entries irrespective of key order.
// Returns { changed, before, after }; `before`/`after` are the inputs as given
// (null when falsy) and `changed` is true when their canonical JSON differs.
function normalizedDiff(before, after) {
  // Rebuild the value with object keys sorted at EVERY depth. Passing a key array
  // as the JSON.stringify replacer is not equivalent: that array is an allowlist
  // applied to nested objects too, so nested keys absent from the top level are
  // silently dropped and nested-only changes would compare as equal.
  const canonical = (value) => {
    if (Array.isArray(value)) return value.map(canonical);
    if (value && typeof value === 'object') {
      return Object.fromEntries(
        Object.keys(value).sort().map((k) => [k, canonical(value[k])]));
    }
    return value;
  };
  const b = before ? JSON.stringify(canonical(before)) : null;
  const a = after ? JSON.stringify(canonical(after)) : null;
  return { changed: b !== a, before: before || null, after: after || null };
}
|
||||
|
||||
// Look up an app's registry entry by name; returns null when it is not registered.
function getApp(name) {
  const apps = registry.apps;
  // Own properties only: a bare `apps[name]` also resolves inherited members
  // ("constructor", "toString", ...), which would be mistaken for app entries.
  if (!Object.prototype.hasOwnProperty.call(apps, name)) return null;
  return apps[name] || null;
}
|
||||
|
|
@ -217,7 +328,11 @@ fastify.addHook('onRequest', async (request, reply) => {
|
|||
// Webhook endpoint uses its own auth (HMAC signature verification inside the handler)
|
||||
if (request.url === '/webhook/forgejo') return;
|
||||
|
||||
const isLocalhost = ['127.0.0.1', '::1', '::ffff:127.0.0.1'].includes(request.ip) || request.ip.startsWith('172.');
|
||||
// Genuine loopback only. The Docker-bridge "172.*" shortcut was removed: the
|
||||
// service binds 0.0.0.0, so ANY container on the bridge inherited tag:admin
|
||||
// (including /unlock and /progress/reset). Bridge callers now go through the
|
||||
// normal Tailscale-Whois-or-bearer-token path like everyone else.
|
||||
const isLocalhost = ['127.0.0.1', '::1', '::ffff:127.0.0.1'].includes(request.ip);
|
||||
if (isLocalhost) {
|
||||
request.identity = { stableId: 'local', hostname: HOSTNAME, tags: ['tag:admin'], user: 'local' };
|
||||
return;
|
||||
|
|
@ -260,17 +375,53 @@ function isLocal(server) {
|
|||
return host === HOSTNAME;
|
||||
}
|
||||
|
||||
function tailscaleIpForServer(server) {
|
||||
const host = server.includes('@') ? server.split('@')[1] : server;
|
||||
const ips = {
|
||||
bruno: '100.74.17.6',
|
||||
gal: '100.122.129.114',
|
||||
};
|
||||
return ips[host] || '';
|
||||
// Resolve a server's Tailscale IPv4 at runtime via the tailscaled LocalAPI over
|
||||
// the mounted socket (the same mechanism tailscaleWhois uses). The kua-deploy
|
||||
// container has the socket but NOT the `tailscale` CLI, so we query
|
||||
// /localapi/v0/status and match the host by HostName/DNSName rather than shelling
|
||||
// out. Cached per host for the process lifetime; falls back to '' (TAILSCALE_IP
|
||||
// left unset, prior behavior for unknown hosts) if resolution fails.
|
||||
// host -> Tailscale IPv4, filled by tailscaleIpForServer().
const _tailscaleIpCache = new Map();

// Ask tailscaled (LocalAPI over TAILSCALE_SOCKET) for its status and return the
// first IPv4 address of the node whose HostName equals `host`, or whose DNSName
// starts with `host.` (both case-insensitive). Resolves to '' when no node
// matches, the response is not valid JSON, the request errors, or no answer
// arrives within 2 seconds. Never rejects.
async function tailscaleStatusLookup(host) {
  return new Promise((resolve) => {
    let req;
    const timeout = setTimeout(() => {
      resolve('');
      // Abort the in-flight request too; otherwise it stays open on the socket
      // after we have already given up on it.
      if (req) req.destroy();
    }, 2000);
    const fail = () => { clearTimeout(timeout); resolve(''); };

    req = http.request({
      socketPath: TAILSCALE_SOCKET,
      path: '/localapi/v0/status',
      method: 'GET',
      headers: { Host: 'local-tailscaled.sock' }, // anti-DNS-rebind guard the LocalAPI requires
    }, (res) => {
      let data = '';
      // Decode as a stream so a multi-byte character split across two chunks is
      // not corrupted by per-chunk string conversion.
      res.setEncoding('utf8');
      res.on('data', (chunk) => { data += chunk; });
      res.on('error', fail);
      res.on('end', () => {
        clearTimeout(timeout);
        try {
          const status = JSON.parse(data);
          const nodes = [status.Self, ...Object.values(status.Peer || {})].filter(Boolean);
          const want = host.toLowerCase();
          const match = nodes.find(p =>
            (p.HostName || '').toLowerCase() === want ||
            (p.DNSName || '').toLowerCase().startsWith(want + '.'));
          const ip = (match?.TailscaleIPs || []).find(a => /^\d+\.\d+\.\d+\.\d+$/.test(a)) || '';
          resolve(ip);
        } catch { resolve(''); }
      });
    });
    req.on('error', fail);
    req.end();
  });
}
|
||||
|
||||
function composeEnvPrefix(server) {
|
||||
const tailscaleIp = tailscaleIpForServer(server);
|
||||
// Resolve the Tailscale IPv4 for `server` ("host" or "user@host").
// Successful lookups are cached per host. A failed lookup returns '' and is NOT
// cached, so a transient tailscaled hiccup is retried on the next call instead of
// leaving the host without an IP for the rest of the process lifetime.
async function tailscaleIpForServer(server) {
  const host = server.includes('@') ? server.split('@')[1] : server;
  const cached = _tailscaleIpCache.get(host);
  if (cached) return cached;
  const ip = await tailscaleStatusLookup(host);
  if (ip) _tailscaleIpCache.set(host, ip);
  return ip;
}
|
||||
|
||||
// Build the environment-variable prefix placed in front of a compose command:
// "TAILSCALE_IP=<ip> " (note the trailing space) when an IP is known for
// `server`, otherwise the empty string.
async function composeEnvPrefix(server) {
  const ip = await tailscaleIpForServer(server);
  if (!ip) {
    return '';
  }
  return `TAILSCALE_IP=${ip} `;
}
|
||||
|
||||
|
|
@ -332,7 +483,7 @@ async function recreateService({
|
|||
'-v', `${deployDir}:${deployDir}`,
|
||||
'-w', deployDir,
|
||||
];
|
||||
const tailscaleIp = tailscaleIpForServer(server);
|
||||
const tailscaleIp = await tailscaleIpForServer(server);
|
||||
if (tailscaleIp) runArgs.push('-e', `TAILSCALE_IP=${tailscaleIp}`);
|
||||
if (envFileWritten) runArgs.push('--env-file', envFilePath);
|
||||
runArgs.push('docker:cli');
|
||||
|
|
@ -397,6 +548,55 @@ async function runOnServer(server, cmd, opts = {}) {
|
|||
}
|
||||
}
|
||||
|
||||
// ensureCheckout — guarantee deployDir is a usable git checkout before deploy/rollback.
|
||||
// The engine historically assumed the repo already existed (`cd ${deployDir} && git fetch`,
|
||||
// see deploy()/rollback()); a registered app whose deploy_dir was never cloned failed at the
|
||||
// very first `cd`. This makes a first-time API deploy self-heal by cloning from the
|
||||
// registry-declared `repo_url`. It is a NO-OP for existing checkouts, so the conforming apps
|
||||
// (which carry no `repo_url`) keep working untouched. Runs inside the caller's per-app lock.
|
||||
// Clone source is NOT derived from the app name — origins are heterogeneous (Forgejo :2222,
|
||||
// scp-style, and at least one GitHub repo whose name differs from the app) — so it MUST come
|
||||
// from the registry. The caller still performs its own branch/tag checkout afterwards.
|
||||
// repoDir is the GIT ROOT (registry `repo_dir`), which is NOT always the deploy_dir:
|
||||
// sub-monorepo apps like coder-core have repo_dir=/root/apps/coder-core but
|
||||
// deploy_dir=/root/apps/coder-core/services/production (compose lives in a subdir).
|
||||
// Probing/cloning must target the git root — probing deploy_dir/.git would falsely
|
||||
// report MISSING for those apps (the bug that broke coder-core deploys 2026-05-26).
|
||||
// For the 18 normal apps repo_dir == deploy_dir, so behavior is unchanged.
|
||||
// Make sure `repoDir` on `server` is a git checkout, cloning `repoUrl` if needed.
//
// Returns { cloned: false } when a checkout already exists (its origin is compared
// with `repoUrl` when one is given) and { cloned: true } after a successful clone.
// Throws when: the remote probe produced no usable answer, the existing origin
// differs from `repoUrl`, the directory is missing and no `repoUrl` is given, the
// directory exists but is a non-empty non-repo, the clone fails, or the cloned
// origin does not match.
async function ensureCheckout(server, repoDir, repoUrl) {
  const probe = await runOnServer(server, `test -e ${repoDir}/.git && echo REPO || echo MISSING`);
  const probeState = (probe.stdout || '').trim();
  // The probe always prints exactly one of the two words when it actually runs.
  // Anything else means the command never executed (unreachable host, SSH error);
  // report that instead of misdiagnosing it as "not a git checkout".
  if (probeState !== 'REPO' && probeState !== 'MISSING') {
    throw new Error(`ensure-checkout: could not probe ${repoDir} on ${server}: ${probe.stderr || probe.error || 'no output from probe'}`);
  }

  if (probeState === 'REPO') {
    // Already a checkout — leave branch/tag selection to the caller. Optionally assert origin.
    if (repoUrl) {
      const originRes = await runOnServer(server, `git -C ${repoDir} config --get remote.origin.url || true`);
      const actual = (originRes.stdout || '').trim();
      if (actual && actual !== repoUrl) {
        throw new Error(`ensure-checkout: ${repoDir} origin (${actual}) != registry repo_url (${repoUrl}) — refusing to deploy a mismatched checkout`);
      }
    }
    return { cloned: false };
  }

  if (!repoUrl) {
    throw new Error(`ensure-checkout: ${repoDir} is not a git checkout and no "repo_url" is set in the registry — cannot clone. Add repo_url to the app's registry entry (or create the checkout manually).`);
  }

  // Refuse to clobber a non-empty, non-repo directory.
  const dirRes = await runOnServer(server, `if [ -e ${repoDir} ] && [ -n "$(ls -A ${repoDir} 2>/dev/null)" ]; then echo NONEMPTY; else echo OK; fi`);
  const dirState = (dirRes.stdout || '').trim();
  if (dirState === 'NONEMPTY') {
    throw new Error(`ensure-checkout: ${repoDir} exists, is not a git repo, and is non-empty — refusing to clobber. Inspect/remove it manually.`);
  }
  if (dirState !== 'OK') {
    // Same reasoning as the probe: without a definite answer we must not clone.
    throw new Error(`ensure-checkout: could not inspect ${repoDir} on ${server}: ${dirRes.stderr || dirRes.error || 'no output from check'}`);
  }

  // `--` ends option parsing so neither argument can be read as a git option.
  const cloneRes = await runOnServer(server, `git clone -- ${repoUrl} ${repoDir}`, { timeout: 180000 });
  if (!cloneRes.ok) {
    throw new Error(`ensure-checkout: git clone ${repoUrl} -> ${repoDir} failed: ${cloneRes.stderr}`);
  }

  // Verify the clone landed and origin matches what we asked for.
  const verifyRes = await runOnServer(server, `git -C ${repoDir} config --get remote.origin.url || true`);
  const landed = (verifyRes.stdout || '').trim();
  if (landed !== repoUrl) {
    throw new Error(`ensure-checkout: cloned ${repoDir} but origin is ${landed || '(none)'} (expected ${repoUrl})`);
  }
  return { cloned: true };
}
|
||||
|
||||
// --- kua-db integration ---
|
||||
async function kuaDbSafeCheck(app) {
|
||||
try {
|
||||
|
|
@ -573,6 +773,19 @@ async function deploy(appName, opts = {}) {
|
|||
// Step 2: Git pull on production server
|
||||
steps.push({ step: 'git_pull', status: 'running' });
|
||||
await markProgressPhase(appName, 'git_pull', { action, triggered_by: opts.triggered_by || 'api', steps });
|
||||
// ensure-checkout (TUBE step 1): self-heal a missing deploy_dir by cloning from the
|
||||
// registry repo_url, so a first-time API deploy doesn't die at the `cd` below. No-op
|
||||
// for existing checkouts. Inside the per-app lock acquired above.
|
||||
try {
|
||||
// Probe/clone the GIT ROOT (repo_dir), not deploy_dir — they differ for
|
||||
// sub-monorepo apps (coder-core). git fetch/checkout below run from deploy_dir
|
||||
// and git walks up to the root, so only the ensure-checkout probe needs repo_dir.
|
||||
const ec = await ensureCheckout(server, app.repo_dir || deployDir, app.repo_url);
|
||||
if (ec.cloned) steps[steps.length - 1].cloned = true;
|
||||
} catch (err) {
|
||||
steps[steps.length - 1] = { step: 'git_pull', status: 'failed', error: err.message };
|
||||
throw err;
|
||||
}
|
||||
const fetchCmd = `cd ${deployDir} && git fetch --prune ${remote}`;
|
||||
const fetchRes = await runOnServer(server, fetchCmd, { timeout: 60000 });
|
||||
if (!fetchRes.ok) {
|
||||
|
|
@ -607,7 +820,7 @@ ${detail}`);
|
|||
const kvPrefix = prod.vault
|
||||
? `kua-vault run --project ${prod.vault.project} --env ${prod.vault.env} --`
|
||||
: '';
|
||||
const envPrefix = composeEnvPrefix(server);
|
||||
const envPrefix = await composeEnvPrefix(server);
|
||||
const buildCmd = `cd ${deployDir} && ${envPrefix}${kvPrefix} docker compose build`;
|
||||
const buildRes = await runOnServer(server, buildCmd, { timeout: 600000 });
|
||||
if (!buildRes.ok) {
|
||||
|
|
@ -636,8 +849,8 @@ ${detail}`);
|
|||
if (verifyMode === 'off') return { ok: true, results: [], skipped: true };
|
||||
const results = [];
|
||||
for (const svc of services) {
|
||||
const exp = await runOnServer(server, `cd ${deployDir} && docker compose images --quiet ${svc} 2>/dev/null | head -1`);
|
||||
const cid = await runOnServer(server, `cd ${deployDir} && docker compose ps --quiet ${svc} 2>/dev/null | head -1`);
|
||||
const exp = await runOnServer(server, `cd ${deployDir} && ${kvPrefix} docker compose images --quiet ${svc} 2>/dev/null | head -1`);
|
||||
const cid = await runOnServer(server, `cd ${deployDir} && ${kvPrefix} docker compose ps --quiet ${svc} 2>/dev/null | head -1`);
|
||||
const expectedSha = (exp.stdout || '').trim();
|
||||
const containerId = (cid.stdout || '').trim();
|
||||
if (!containerId) {
|
||||
|
|
@ -647,7 +860,8 @@ ${detail}`);
|
|||
const insp = await runOnServer(server, `docker inspect --format '{{.Image}}|{{.State.StartedAt}}' ${containerId}`);
|
||||
const [actualSha, startedAtStr] = (insp.stdout || '').trim().split('|');
|
||||
const startedAt = new Date(startedAtStr || 0);
|
||||
const imageMatch = !!expectedSha && actualSha === expectedSha;
|
||||
const stripSha = (s) => (s || '').replace(/^sha256:/, '');
|
||||
const imageMatch = !!expectedSha && stripSha(actualSha) === stripSha(expectedSha);
|
||||
const freshlyStarted = !isNaN(startedAt) && startedAt >= deployStartTs;
|
||||
results.push({
|
||||
service: svc, ok: imageMatch && freshlyStarted,
|
||||
|
|
@ -673,6 +887,15 @@ ${detail}`);
|
|||
const svcRes = await runOnServer(server, `cd ${deployDir} && docker compose config --services`);
|
||||
const allServices = svcRes.stdout.split('\n').filter(Boolean);
|
||||
|
||||
// Fail-loud (TUBE step 1): if compose resolved NO services, the recreate+verify
|
||||
// block below is skipped entirely and the deploy would silently report `done`
|
||||
// having recreated nothing (a false-success path). Refuse it.
|
||||
if (allServices.length === 0) {
|
||||
steps[steps.length - 1] = { step: 'deploy', status: 'failed', error: 'docker compose config returned no services — nothing to recreate' };
|
||||
await markProgressPhase(appName, 'deploy', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
|
||||
throw new Error('deploy: docker compose config returned no services — refusing to report success without recreating anything');
|
||||
}
|
||||
|
||||
let stateful = prod.stateful_services || [];
|
||||
if (stateful.length === 0) {
|
||||
// Auto-detect stateful services from image names so db/redis are never force-recreated
|
||||
|
|
@ -831,20 +1054,37 @@ ${detail}`);
|
|||
await markProgressPhase(appName, 'health_done', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
|
||||
}
|
||||
} else {
|
||||
// No health URL — check containers
|
||||
// No health URL — fall back to confirming containers are listable. runOnServer
|
||||
// returns {ok:false} on failure (it does not throw); if we can't even run
|
||||
// `docker compose ps` we cannot claim health, so mark it failed (mirrors the
|
||||
// health_url failure path: finalResult='unhealthy') instead of silent 'done'.
|
||||
const psRes = await runOnServer(server, `cd ${deployDir} && docker compose ps --format json`);
|
||||
if (!psRes.ok) {
|
||||
steps[steps.length - 1] = { step: 'health', status: 'failed', error: psRes.stderr?.slice(-300) || psRes.error, note: 'no health URL; docker compose ps failed' };
|
||||
finalResult = 'unhealthy';
|
||||
await markProgressPhase(appName, 'health_failed', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit, result: finalResult });
|
||||
} else {
|
||||
steps[steps.length - 1] = { step: 'health', status: 'done', note: 'no health URL configured' };
|
||||
await markProgressPhase(appName, 'health_done', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
|
||||
}
|
||||
}
|
||||
|
||||
// Step 7: Post-deploy hooks
|
||||
if (prod.post_deploy) {
|
||||
steps.push({ step: 'post_deploy', status: 'running' });
|
||||
await markProgressPhase(appName, 'post_deploy', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
|
||||
await runOnServer(server, prod.post_deploy, { timeout: 30000 });
|
||||
// runOnServer returns {ok:false} on failure (it does not throw) — check it,
|
||||
// else a failing post-deploy hook was silently reported as success.
|
||||
const postRes = await runOnServer(server, prod.post_deploy, { timeout: 30000 });
|
||||
if (!postRes.ok) {
|
||||
steps[steps.length - 1] = { step: 'post_deploy', status: 'failed', error: postRes.stderr?.slice(-500) || postRes.error };
|
||||
finalResult = 'partial';
|
||||
await markProgressPhase(appName, 'post_deploy_failed', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit, result: finalResult });
|
||||
} else {
|
||||
steps[steps.length - 1] = { step: 'post_deploy', status: 'done' };
|
||||
await markProgressPhase(appName, 'post_deploy_done', { action, triggered_by: opts.triggered_by || 'api', steps, commit: deployCommit });
|
||||
}
|
||||
}
|
||||
|
||||
// Get tag
|
||||
const tagRes = await runOnServer(server, `cd ${deployDir} && git describe --tags --abbrev=0 2>/dev/null || echo "untagged"`);
|
||||
|
|
@ -910,7 +1150,7 @@ ${detail}`);
|
|||
// ROLLBACK ENGINE
|
||||
// =============================================================================
|
||||
|
||||
async function rollback(appName) {
|
||||
async function rollback(appName, opts = {}) {
|
||||
const app = getApp(appName);
|
||||
if (!app) throw new Error(`Unknown app: ${appName}`);
|
||||
|
||||
|
|
@ -919,13 +1159,25 @@ async function rollback(appName) {
|
|||
const deployDir = prod.deploy_dir;
|
||||
const remote = app.git_remote || 'origin';
|
||||
|
||||
// Find the previous successful deploy
|
||||
// Determine the rollback target. An explicit to_ref (tag/commit/branch, from MCP or admin)
|
||||
// overrides the default "previous successful tagged deploy" behavior — this preserves the
|
||||
// MCP deploy.rollback contract (arbitrary to_ref) now that it routes through this endpoint
|
||||
// instead of the legacy systemd path. Validate to_ref to keep it out of shell-injection
|
||||
// range (it interpolates into `git checkout` below).
|
||||
const history = deployHistory[appName] || [];
|
||||
const current = history[0];
|
||||
let tag;
|
||||
if (opts.to_ref) {
|
||||
if (!/^[A-Za-z0-9._/-]+$/.test(opts.to_ref)) {
|
||||
throw new Error(`Invalid to_ref ${JSON.stringify(opts.to_ref)} — must match [A-Za-z0-9._/-]+`);
|
||||
}
|
||||
tag = opts.to_ref;
|
||||
} else {
|
||||
const previous = history.find((d, i) => i > 0 && d.result === 'success' && d.tag && d.tag !== 'untagged');
|
||||
|
||||
if (!previous) {
|
||||
return { app: appName, result: 'no_rollback_target', message: 'No previous successful deploy with a tag found' };
|
||||
return { app: appName, result: 'no_rollback_target', message: 'No previous successful deploy with a tag found (pass to_ref to roll back to a specific tag/commit)' };
|
||||
}
|
||||
tag = previous.tag;
|
||||
}
|
||||
|
||||
if (!acquireLock(appName)) {
|
||||
|
|
@ -933,7 +1185,6 @@ async function rollback(appName) {
|
|||
}
|
||||
|
||||
try {
|
||||
const tag = previous.tag;
|
||||
await writeProgress(appName, {
|
||||
action: 'rollback',
|
||||
triggered_by: 'api',
|
||||
|
|
@ -944,8 +1195,12 @@ async function rollback(appName) {
|
|||
rolled_back_from: current?.tag || current?.commit || 'unknown',
|
||||
});
|
||||
|
||||
// Checkout the previous tag on production
|
||||
const checkoutRes = await runOnServer(server, `cd ${deployDir} && git fetch --prune ${remote} && git checkout ${tag}`, { timeout: 60000 });
|
||||
// ensure-checkout (TUBE step 1): probe/clone the git root (repo_dir), not deploy_dir
|
||||
// (they differ for sub-monorepo apps like coder-core). No-op for existing checkouts.
|
||||
await ensureCheckout(server, app.repo_dir || deployDir, app.repo_url);
|
||||
|
||||
// Checkout the rollback target on production (--tags so an explicit to_ref tag resolves).
|
||||
const checkoutRes = await runOnServer(server, `cd ${deployDir} && git fetch --prune --tags ${remote} && git checkout ${tag}`, { timeout: 60000 });
|
||||
if (!checkoutRes.ok) throw new Error(`Checkout ${tag} failed: ${checkoutRes.stderr}`);
|
||||
|
||||
// Rebuild + recreate via transient-container pattern (consistent with deploy()).
|
||||
|
|
@ -954,7 +1209,8 @@ async function rollback(appName) {
|
|||
const kvPrefix = prod.vault
|
||||
? `kua-vault run --project ${prod.vault.project} --env ${prod.vault.env} --`
|
||||
: '';
|
||||
const buildRes = await runOnServer(server, `cd ${deployDir} && ${composeEnvPrefix(server)}${kvPrefix} docker compose build`, { timeout: 600000 });
|
||||
const envPrefix = await composeEnvPrefix(server);
|
||||
const buildRes = await runOnServer(server, `cd ${deployDir} && ${envPrefix}${kvPrefix} docker compose build`, { timeout: 600000 });
|
||||
if (!buildRes.ok) throw new Error(`rollback build failed: ${buildRes.stderr?.slice(-500)}`);
|
||||
// Recreate all services for the rollback target.
|
||||
const svcList = (await runOnServer(server, `cd ${deployDir} && docker compose config --services`)).stdout.split('\n').filter(Boolean);
|
||||
|
|
@ -1007,6 +1263,37 @@ async function rollback(appName) {
|
|||
|
||||
return { app: appName, ...entry };
|
||||
|
||||
} catch (err) {
|
||||
// The checkout/build/recreate steps above return {ok:false}/throw on failure;
|
||||
// without this catch a failed rollback left progress stuck in 'running' and was
|
||||
// never recorded. Mirror deploy()'s catch: record the failure and return a
|
||||
// {result:'failed'} object (the rollback route's contract) rather than 500ing.
|
||||
const entry = {
|
||||
result: 'failed',
|
||||
action: 'rollback',
|
||||
error: err.message,
|
||||
rolled_back_to: tag,
|
||||
rolled_back_from: current?.tag || current?.commit || 'unknown',
|
||||
server,
|
||||
triggered_by: 'api',
|
||||
};
|
||||
await writeProgress(appName, {
|
||||
action: 'rollback',
|
||||
triggered_by: 'api',
|
||||
status: 'failed',
|
||||
phase: 'rollback_failed',
|
||||
current_step: 'rollback',
|
||||
result: 'failed',
|
||||
error: err.message,
|
||||
rolled_back_to: tag,
|
||||
rolled_back_from: current?.tag || current?.commit || 'unknown',
|
||||
server,
|
||||
finished_at: Math.floor(Date.now() / 1000),
|
||||
});
|
||||
recordDeploy(appName, entry);
|
||||
await audit({ action: 'rollback_failed', app: appName, error: err.message });
|
||||
return { app: appName, ...entry };
|
||||
|
||||
} finally {
|
||||
releaseLock(appName);
|
||||
}
|
||||
|
|
@ -1162,6 +1449,103 @@ fastify.post('/webhook/forgejo', async (request, reply) => {
|
|||
|
||||
// --- Apps ---
|
||||
|
||||
// Registry dump — used by kua-mcp-core to discover all apps at startup
|
||||
// without relying on a filesystem path that may not resolve inside its container.
|
||||
// GET /api/v1/apps/registry — respond with the whole in-memory registry object.
fastify.get('/api/v1/apps/registry', async () => registry);
|
||||
|
||||
// Reload the deploy registry from disk WITHOUT restarting the engine (TUBE step 3 —
|
||||
// kills the cache-restart wart: kua-deploy caches the registry at startup, so a newly
|
||||
// registered app otherwise needs `docker restart kua-deploy`). loadRegistry() JSON-parses
|
||||
// the file; on parse failure it throws BEFORE reassigning `registry`, so the in-memory
|
||||
// last-good registry is preserved. Auth is handled by the global onRequest hook.
|
||||
// POST /api/v1/registry/reload — re-run loadRegistry() and report the app count
// before and after. If loading throws, respond 500 with the error message and the
// count measured before the attempt.
fastify.post('/api/v1/registry/reload', async (request, reply) => {
  const countApps = () => Object.keys(registry.apps || {}).length;
  const before = countApps();

  try {
    await loadRegistry();
  } catch (err) {
    const error = `registry reload failed (kept last-good ${before} apps): ${err.message}`;
    return reply.code(500).send({ ok: false, error, apps: before });
  }

  const after = countApps();
  fastify.log.info({ before, after }, 'registry reloaded via /api/v1/registry/reload');
  return { ok: true, apps: after, before };
});
|
||||
|
||||
// Export the full registry + write a timestamped snapshot to the data volume.
|
||||
// Git can consume these as generated audit artifacts; it is NOT the source of truth.
|
||||
// GET /api/v1/registry/export — write the in-memory registry to a timestamped
// snapshot file under DATA_DIR and return it together with the snapshot path.
// Responds 500 with the error message if the snapshot cannot be written.
fastify.get('/api/v1/registry/export', async (request, reply) => {
  try {
    // Compact UTC stamp YYYYMMDDTHHMMSS (15 chars). The '-' separators must be
    // stripped as well: leaving them in makes the 15-char cut land after the
    // minutes, so two exports within the same minute would share a file name and
    // the later one would overwrite the earlier snapshot.
    const ts = new Date().toISOString().replace(/[-:.]/g, '').slice(0, 15);
    const snap = path.join(DATA_DIR, `registry-snapshot-${ts}.json`);
    await fs.mkdir(DATA_DIR, { recursive: true });
    await fs.writeFile(snap, JSON.stringify(registry, null, 2) + '\n', 'utf-8');
    return { ok: true, apps: Object.keys(registry.apps).length, snapshot: snap, registry };
  } catch (err) {
    return reply.code(500).send({ ok: false, error: err.message });
  }
});
|
||||
|
||||
// --- Registry mutations (engine-owned) — authenticated via the global onRequest hook ---
// PUT upserts a complete app entry. With ?dry_run=1 (or =true) the entry is validated and
// a diff against the current entry is returned, but nothing is written.
//
// Replies: 400 { ok: false, errors } when validation fails or the name is reserved,
//          200 { ok, dry_run, op, app, diff } for a dry run,
//          200 { ok, op, app, apps } after a write (op is 'create' or 'update').
fastify.put('/api/v1/apps/:app', async (request, reply) => {
  const { app: name } = request.params;
  const entry = request.body;

  const errs = validateEntry(name, entry);
  // NOTE(review): presumably reserved because /api/v1/apps/registry is itself a route — confirm.
  if (name === 'registry') errs.push('"registry" is a reserved name');
  if (errs.length > 0) {
    return reply.code(400).send({ ok: false, errors: errs });
  }

  // null (not undefined) marks "no previous entry" in both the diff and the audit event.
  const before = registry.apps[name] || null;
  const op = before ? 'update' : 'create';
  const diff = normalizedDiff(before, entry);

  const { dry_run: dryRun } = request.query;
  if (dryRun === '1' || dryRun === 'true') {
    return { ok: true, dry_run: true, op, app: name, diff };
  }

  const actor = request.identity?.user || 'unknown';
  await withRegistryLock(async () => {
    registry.apps[name] = entry;
    await writeRegistry();
    await appendRegistryEvent({ op, app: name, actor, before, after: entry });
  });

  const apps = Object.keys(registry.apps).length;
  return { ok: true, op, app: name, apps };
});
|
||||
|
||||
// PATCH deep-merges a partial update onto an existing entry. Nested plain objects are merged
// key by key; arrays and scalars in the patch replace the existing value wholesale.
// With ?dry_run=1 (or =true) the merged entry is validated and a diff is returned, but
// nothing is written.
//
// Replies: 404 { ok: false, error } for an unknown app,
//          400 { ok: false, errors } for a non-object body or validateEntry() errors,
//          200 { ok, dry_run, op, app, diff } for a dry run,
//          200 { ok, op, app } after a write.
fastify.patch('/api/v1/apps/:app', async (request, reply) => {
  const name = request.params.app;
  const patch = request.body || {};

  // Object.entries() on a string or an array yields index keys ("0", "1", ...), which the
  // merge below would copy onto the entry, so anything but a JSON object is rejected here.
  if (typeof patch !== 'object' || Array.isArray(patch)) {
    return reply.code(400).send({ ok: false, errors: ['request body must be a JSON object'] });
  }

  const deepMerge = (a, b) => {
    const out = Array.isArray(a) ? [...a] : { ...a };
    for (const [k, v] of Object.entries(b || {})) {
      // Recurse only when both sides hold a non-array object; otherwise the patch value wins.
      const bothObjects =
        v && typeof v === 'object' && !Array.isArray(v) &&
        a && typeof a[k] === 'object' && !Array.isArray(a[k]);
      out[k] = bothObjects ? deepMerge(a[k], v) : v;
    }
    return out;
  };

  // Merge the patch onto the CURRENT entry and validate the result. Returns either
  // { before, merged } or { failure: { code, body } } describing the error reply.
  const plan = () => {
    const before = registry.apps[name];
    if (!before) {
      return { failure: { code: 404, body: { ok: false, error: `Unknown app: ${name}` } } };
    }
    const merged = deepMerge(before, patch);
    const errs = validateEntry(name, merged);
    if (errs.length) {
      return { failure: { code: 400, body: { ok: false, errors: errs } } };
    }
    return { before, merged };
  };

  // Fail fast (and serve dry runs) without taking the lock.
  const preview = plan();
  if (preview.failure) {
    return reply.code(preview.failure.code).send(preview.failure.body);
  }
  if (request.query.dry_run === '1' || request.query.dry_run === 'true') {
    return { ok: true, dry_run: true, op: 'patch', app: name, diff: normalizedDiff(preview.before, preview.merged) };
  }

  // Re-plan inside the lock: the entry may have been changed or deleted by another mutation
  // while this request waited, and merging onto the earlier read would silently drop that
  // change. NOTE(review): relies on withRegistryLock() serializing its callbacks — confirm.
  let committed = preview;
  await withRegistryLock(async () => {
    committed = plan();
    if (committed.failure) return;
    registry.apps[name] = committed.merged;
    await writeRegistry();
    await appendRegistryEvent({
      op: 'patch',
      app: name,
      actor: request.identity?.user || 'unknown',
      before: committed.before,
      after: committed.merged,
    });
  });

  if (committed.failure) {
    return reply.code(committed.failure.code).send(committed.failure.body);
  }
  return { ok: true, op: 'patch', app: name };
});
|
||||
|
||||
// DELETE drops an app's entry from the registry.
//
// Replies: 404 { ok: false, error } when the app is not registered,
//          200 { ok, op, app, apps } after the removal, with `apps` the remaining count.
fastify.delete('/api/v1/apps/:app', async (request, reply) => {
  const { app: name } = request.params;

  const before = registry.apps[name];
  if (!before) {
    return reply.code(404).send({ ok: false, error: `Unknown app: ${name}` });
  }

  const actor = request.identity?.user || 'unknown';
  await withRegistryLock(async () => {
    delete registry.apps[name];
    await writeRegistry();
    // The removed entry is kept in the audit event as `before`; `after` is null.
    await appendRegistryEvent({ op: 'delete', app: name, actor, before, after: null });
  });

  const apps = Object.keys(registry.apps).length;
  return { ok: true, op: 'delete', app: name, apps };
});
|
||||
|
||||
// List all apps
|
||||
fastify.get('/api/v1/apps', async () => {
|
||||
const results = [];
|
||||
|
|
@ -1191,8 +1575,14 @@ fastify.get('/api/v1/apps/:app/deploys', async (request) => {
|
|||
// --- Actions ---
|
||||
|
||||
// Release (merge main→production, tag, push — triggers webhook deploy)
|
||||
fastify.post('/api/v1/apps/:app/release', async (request) => {
|
||||
fastify.post('/api/v1/apps/:app/release', async (request, reply) => {
  const { message, source_branch, target_branch } = request.body || {};

  // Only branch names the caller actually supplied are checked; a field left undefined is
  // handed to release() unchanged. validateBranchName() throws on a bad name, which is
  // turned into a 400 reply carrying the error message.
  const supplied = [
    ['source_branch', source_branch],
    ['target_branch', target_branch],
  ];
  try {
    for (const [label, branch] of supplied) {
      if (branch !== undefined) validateBranchName(branch, label);
    }
  } catch (err) {
    return reply.code(400).send({ ok: false, error: err.message });
  }

  const releaseMessage = message || 'Release to production';
  return await release(request.params.app, releaseMessage, { source_branch, target_branch });
});
|
||||
|
||||
|
|
@ -1257,9 +1647,11 @@ fastify.post('/api/v1/apps/:app/rebuild', async (request, reply) => {
|
|||
return { triggered: true, app };
|
||||
});
|
||||
|
||||
// Rollback. Optional body { to_ref } rolls back to a specific tag/commit/branch;
// omitted = previous successful tagged deploy.
// NOTE(review): to_ref is request-supplied and is forwarded unchecked from this handler —
// confirm that rollback() validates it before it reaches any git/shell command.
fastify.post('/api/v1/apps/:app/rollback', async (request) => {
  // The stale `return await rollback(request.params.app);` that preceded this line returned
  // before to_ref was ever read, so the documented option could not take effect.
  // A to_ref that is not a string is treated as absent rather than rejected.
  const to_ref = typeof request.body?.to_ref === 'string' ? request.body.to_ref : undefined;
  return await rollback(request.params.app, { to_ref });
});
|
||||
|
||||
// --- Deploy Progress ---
|
||||
|
|
@ -1502,9 +1894,11 @@ async function completeSelfRecreate() {
|
|||
|
||||
const start = async () => {
|
||||
try {
|
||||
// Fail fast if webhook secret is missing in production
|
||||
// WEBHOOK_SECRET is optional — the Forgejo webhook path is now retired in
|
||||
// favour of the admin API (/api/v1/apps/:app/deploy). The handler remains
|
||||
// but returns 503 when the secret is absent, which is safe.
|
||||
if (!DEV_MODE && !WEBHOOK_SECRET) {
|
||||
throw new Error('KUA_DEPLOY_WEBHOOK_SECRET must be set in production — refusing to start');
|
||||
fastify.log.warn('KUA_DEPLOY_WEBHOOK_SECRET not set — /webhook/forgejo will return 503. Set the secret to re-enable Forgejo push triggers.');
|
||||
}
|
||||
await loadRegistry();
|
||||
await loadHistory();
|
||||
|
|
|
|||
Loading…
Reference in New Issue