Problem: the agent polled /poll every 30s while waiting for admin approval. At 10 req/15min, the 11th poll hit 429 after ~5 min and every subsequent one also failed — recovery required an agent restart. A human-paced approval SLA is longer than 5 minutes. CCP side (agents.routes.ts): Split the one-size-fits-all agentRegistrationLimiter into two. /register stays tight (10/15min — invite-code brute force is the real attack surface). /poll gets a new agentPollLimiter at 180/15min (one poll per ~5s upper bound), scoped to registrationId+slug so blast radius is bounded. Agent side (server.ts): Replaced fixed 30s setInterval with a self-scheduling setTimeout loop that backs off exponentially on HTTP 429 (30s → 60s → 120s → 300s cap) and resets to 30s on any 2xx. Stop-flag protects against re-entry after approval. Fixes the "agent wedged at 429, restart to recover" workaround. Bunker Admin
243 lines
9.4 KiB
TypeScript
243 lines
9.4 KiB
TypeScript
import 'express-async-errors';
|
|
import express from 'express';
|
|
import https from 'https';
|
|
import http from 'http';
|
|
import fs from 'fs';
|
|
import { env } from './config/env';
|
|
import { logger } from './utils/logger';
|
|
import { mtlsAuth } from './middleware/mtls-auth';
|
|
import { errorHandler } from './middleware/error-handler';
|
|
import healthRoutes from './routes/health.routes';
|
|
import composeRoutes from './routes/compose.routes';
|
|
import filesRoutes from './routes/files.routes';
|
|
import registryRoutes from './routes/registry.routes';
|
|
import backupRoutes from './routes/backup.routes';
|
|
import upgradeRoutes from './routes/upgrade.routes';
|
|
|
|
// Express application instance; it is handed to whichever server (HTTPS or
// plain HTTP) gets created further down in this file.
const app = express();

// JSON body parsing with a 50MB ceiling, sized for template file uploads.
const jsonBodyParser = express.json({ limit: '50mb' });
app.use(jsonBodyParser);

// Health routes are mounted ahead of any authentication middleware, so they
// remain reachable without a client certificate.
app.use(healthRoutes);
|
|
|
|
// All other routes require mTLS authentication; they are mounted below only when certificates are present
|
|
/**
 * Reports whether the agent certificate, the agent key and the CA certificate
 * are all accessible on disk.
 *
 * @returns `true` when `fs.accessSync` succeeds for all three configured
 *          paths; `false` as soon as any of the checks throws.
 */
function hasCerts(): boolean {
  const requiredPaths = [env.AGENT_CERT_PATH, env.AGENT_KEY_PATH, env.AGENT_CA_CERT_PATH];

  for (const candidate of requiredPaths) {
    try {
      fs.accessSync(candidate);
    } catch {
      return false;
    }
  }

  return true;
}
|
|
|
|
// Server bootstrap. With certificates on disk the agent serves every route
// over HTTPS with mandatory client certificates; without them it serves only
// the health routes over plain HTTP and (optionally) phones home to the CCP to
// obtain certificates.
if (hasCerts()) {
  // mTLS mode — certificates are installed
  const tlsOptions: https.ServerOptions = {
    key: fs.readFileSync(env.AGENT_KEY_PATH),
    cert: fs.readFileSync(env.AGENT_CERT_PATH),
    ca: fs.readFileSync(env.AGENT_CA_CERT_PATH),
    requestCert: true,
    rejectUnauthorized: true,
  };

  // Order matters: mtlsAuth must precede the protected routers, and the
  // error handler must come last.
  app.use(mtlsAuth);
  app.use(composeRoutes);
  app.use(filesRoutes);
  app.use(registryRoutes);
  app.use(backupRoutes);
  app.use(upgradeRoutes);
  app.use(errorHandler);

  const server = https.createServer(tlsOptions, app);
  server.listen(env.AGENT_PORT, async () => {
    logger.info(`CCP Agent (mTLS) listening on port ${env.AGENT_PORT}`);

    // Auto-register this instance's slug if configured
    if (env.INSTANCE_SLUG && env.INSTANCE_BASE_PATH) {
      // This callback is async and nothing awaits it, so any rejection here
      // would surface as an unhandled rejection. Catch and log instead: a
      // failed auto-registration must not take down a server that is already
      // listening.
      try {
        const { registerSlug, getSlugEntry } = await import('./services/registry.service');
        try {
          await getSlugEntry(env.INSTANCE_SLUG);
          logger.debug(`[registry] Slug ${env.INSTANCE_SLUG} already registered`);
        } catch {
          // A throwing lookup is treated as "not registered yet".
          // Detect compose project name: use env override, or derive from basePath directory name
          // (Docker Compose default: directory name with special chars stripped)
          const pathMod = await import('path');
          const composeProject = env.COMPOSE_PROJECT
            || pathMod.basename(env.INSTANCE_BASE_PATH).replace(/[^a-zA-Z0-9]/g, '').toLowerCase();
          await registerSlug(env.INSTANCE_SLUG, env.INSTANCE_BASE_PATH, composeProject);
        }
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        logger.error(`[registry] Auto-registration of slug ${env.INSTANCE_SLUG} failed: ${reason}`);
      }
    }
  });
} else {
  // Pre-approval mode — start HTTP, only health + phone-home polling
  logger.info('No certificates found — starting in phone-home registration mode');

  app.use(errorHandler);

  const server = http.createServer(app);
  server.listen(env.AGENT_PORT, () => {
    logger.info(`CCP Agent (registration mode) listening on port ${env.AGENT_PORT}`);
  });

  // Start phone-home polling if CCP_URL and CCP_INVITE_CODE are set
  if (env.CCP_URL && env.CCP_INVITE_CODE) {
    // Fire-and-forget: the registration flow runs for as long as approval
    // takes. The catch keeps an unexpected rejection from going unhandled.
    startPhoneHome().catch((err: unknown) => {
      const reason = err instanceof Error ? err.message : String(err);
      logger.error(`[phone-home] Registration flow aborted: ${reason}`);
    });
  }
}
|
|
|
|
// NOTE: everything in this section is a hoisted declaration (type alias or
// function) on purpose. startPhoneHome() is invoked from the bootstrap code
// above, i.e. before module evaluation reaches these lines, so module-level
// `const`/`let` bindings would not be initialised yet at call time.

/** Certificate material returned by the poll endpoint once a registration is approved. */
type PhoneHomeCertBundle = {
  caCertPem: string;
  agentCertPem: string;
  agentKeyPem: string;
  ccpFingerprint: string;
};

/** Turns an arbitrary thrown value into a loggable message. */
function phoneHomeErrorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Persists an approved certificate bundle to the configured paths.
 *
 * @param bundle PEM strings and CCP fingerprint taken from the poll response.
 * @throws Whatever `fs/promises` rejects with (permissions, read-only
 *         filesystem, ...). The caller decides how to recover.
 */
async function savePhoneHomeCertBundle(bundle: PhoneHomeCertBundle): Promise<void> {
  const fsp = await import('fs/promises');
  const pathMod = await import('path');

  // The three files may live in different directories; make sure each exists.
  const targetDirs = new Set([
    pathMod.dirname(env.AGENT_CERT_PATH),
    pathMod.dirname(env.AGENT_KEY_PATH),
    pathMod.dirname(env.AGENT_CA_CERT_PATH),
  ]);
  for (const dir of targetDirs) {
    await fsp.mkdir(dir, { recursive: true });
  }

  await fsp.writeFile(env.AGENT_CERT_PATH, bundle.agentCertPem);
  // The private key must not be readable by other users. `mode` takes effect
  // when the file is newly created.
  await fsp.writeFile(env.AGENT_KEY_PATH, bundle.agentKeyPem, { mode: 0o600 });
  await fsp.writeFile(env.AGENT_CA_CERT_PATH, bundle.caCertPem);

  // SECURITY: Write the CCP fingerprint to a config file so the agent
  // can verify the CCP's identity on subsequent connections.
  if (bundle.ccpFingerprint) {
    const configPath = pathMod.join(env.AGENT_DATA_DIR, 'ccp-fingerprint');
    await fsp.mkdir(env.AGENT_DATA_DIR, { recursive: true });
    await fsp.writeFile(configPath, bundle.ccpFingerprint);
    logger.info(`[phone-home] CCP fingerprint saved: ${bundle.ccpFingerprint.substring(0, 16)}...`);
  }
}

/**
 * Phone-home registration flow:
 * 1. POST to CCP with invite code + instance metadata
 * 2. Poll CCP for the registration status: every 30s normally, doubling the
 *    delay on HTTP 429 (capped at 5 min) and resetting to 30s on any 2xx
 * 3. On approval, save certs and exit so the process restarts with mTLS
 *
 * Never rejects for network or HTTP failures: those are logged. A failed
 * registration POST ends the flow; a failed poll is retried.
 */
async function startPhoneHome(): Promise<void> {
  logger.info(`[phone-home] Registering with CCP at ${env.CCP_URL}...`);

  // Step 1: Send registration request
  try {
    const response = await fetch(`${env.CCP_URL}/api/agents/register`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        inviteCode: env.CCP_INVITE_CODE,
        slug: env.INSTANCE_SLUG,
        name: env.INSTANCE_SLUG,
        domain: env.INSTANCE_DOMAIN,
        agentUrl: env.CCP_AGENT_URL,
        basePath: env.INSTANCE_BASE_PATH,
      }),
    });

    if (!response.ok) {
      const err = await response.text();
      logger.error(`[phone-home] Registration failed: ${response.status} ${err}`);
      return;
    }

    const result = (await response.json()) as { registrationId?: unknown } | null;
    const rawRegistrationId = result?.registrationId;
    if (rawRegistrationId == null || rawRegistrationId === '') {
      // Without an id every poll would ask about "undefined" forever.
      logger.error('[phone-home] Registration response did not contain a registrationId — cannot poll for approval');
      return;
    }
    const registrationId = String(rawRegistrationId);
    logger.info(`[phone-home] Registration submitted (id: ${registrationId}). Waiting for approval...`);

    // Step 2: Poll for approval with a self-scheduling setTimeout loop.
    //
    // Every path inside runPoll is wrapped in try/catch so an unexpected throw
    // never kills the loop silently. We log each status transition, plus a
    // debug heartbeat every 10th attempt, so admins can see the loop is alive.
    //
    // Background: a fixed 30s setInterval exhausted a 10-requests-per-15-min
    // rate limit after about 5 minutes; from the 11th poll on, every request
    // got HTTP 429 and the only recovery was an agent restart. This loop
    // doubles the delay on 429 (cap 5 min) and resets on any 2xx response.
    let pollCount = 0;
    let lastLoggedStatus: string | null = null;
    const BASE_DELAY_MS = 30_000;
    const MAX_DELAY_MS = 300_000;
    let currentDelayMs = BASE_DELAY_MS;
    let pollTimer: NodeJS.Timeout | null = null;
    let stopped = false;

    // Loop-invariant; both values are URL-encoded so characters such as
    // '&', '#' or spaces cannot corrupt the query string.
    const pollUrl =
      `${env.CCP_URL}/api/agents/poll` +
      `?registrationId=${encodeURIComponent(registrationId)}` +
      `&slug=${encodeURIComponent(String(env.INSTANCE_SLUG))}`;

    const scheduleNext = (delayMs: number): void => {
      if (stopped) return;
      pollTimer = setTimeout(() => {
        void runPoll();
      }, delayMs);
    };

    const runPoll = async (): Promise<void> => {
      if (stopped) return;
      pollCount += 1;
      try {
        const pollResp = await fetch(pollUrl);

        if (pollResp.status === 429) {
          // Rate limited — back off, don't try to parse body.
          const newDelay = Math.min(currentDelayMs * 2, MAX_DELAY_MS);
          logger.warn(`[phone-home] Poll #${pollCount} HTTP 429 — backing off ${currentDelayMs}ms → ${newDelay}ms`);
          currentDelayMs = newDelay;
          scheduleNext(currentDelayMs);
          return;
        }

        if (!pollResp.ok) {
          // Other failures keep the current delay (no extra backoff, no reset).
          logger.warn(`[phone-home] Poll #${pollCount} HTTP ${pollResp.status} ${pollResp.statusText}`);
          scheduleNext(currentDelayMs);
          return;
        }

        // Success — reset backoff if previously inflated.
        if (currentDelayMs !== BASE_DELAY_MS) {
          logger.info(`[phone-home] Poll succeeded — resetting delay ${currentDelayMs}ms → ${BASE_DELAY_MS}ms`);
          currentDelayMs = BASE_DELAY_MS;
        }

        const pollData = await pollResp.json() as {
          status: string;
          certBundle?: PhoneHomeCertBundle;
          message?: string;
        };

        if (pollData.status !== lastLoggedStatus) {
          logger.info(`[phone-home] Poll #${pollCount}: status=${pollData.status}${pollData.message ? ` — ${pollData.message}` : ''}`);
          lastLoggedStatus = pollData.status;
        } else if (pollCount % 10 === 0) {
          logger.debug(`[phone-home] Poll #${pollCount}: still ${pollData.status}`);
        }

        if (pollData.status === 'APPROVED' && pollData.certBundle) {
          logger.info('[phone-home] Approved! Saving certificates...');

          // Save first, stop afterwards. Setting the stop flag before the
          // writes meant that a failed write turned scheduleNext into a no-op
          // and left the agent wedged until a manual restart. No re-entry
          // guard is needed while saving: the next poll is only ever scheduled
          // at the end of the current one.
          try {
            await savePhoneHomeCertBundle(pollData.certBundle);
          } catch (saveErr) {
            logger.error(
              `[phone-home] Failed to save certificates: ${phoneHomeErrorMessage(saveErr)} — ` +
              'will keep polling; the certificates may need to be re-issued by a CCP admin'
            );
            scheduleNext(currentDelayMs);
            return;
          }

          stopped = true;
          if (pollTimer) clearTimeout(pollTimer);
          logger.info('[phone-home] Certificates saved. Restarting with mTLS...');
          process.exit(0);
        } else if (pollData.status === 'APPROVED' && !pollData.certBundle) {
          // Admin approved but cert bundle was consumed (e.g. by debug curl).
          // Keep polling — admin can re-issue certs via the new endpoint.
          scheduleNext(currentDelayMs);
        } else if (pollData.status === 'REJECTED') {
          // Terminal state: stop polling for good.
          stopped = true;
          if (pollTimer) clearTimeout(pollTimer);
          logger.error('[phone-home] Registration was rejected by CCP admin');
        } else {
          scheduleNext(currentDelayMs);
        }
      } catch (err) {
        // CRITICAL: this catch MUST swallow every error — previously an unhandled
        // rejection could silently kill the poll loop depending on Node config.
        logger.warn(`[phone-home] Poll #${pollCount} failed: ${phoneHomeErrorMessage(err)}`);
        scheduleNext(currentDelayMs);
      }
    };

    scheduleNext(currentDelayMs);

    // Defensive: if the Node process receives an unhandled rejection that
    // somehow originates from the poll path, log it instead of dying quietly.
    // NOTE: this listener is process-wide, so it also logs (with this prefix)
    // unhandled rejections that originate elsewhere.
    process.on('unhandledRejection', (reason) => {
      logger.error(`[phone-home] Unhandled rejection in poll loop: ${reason}`);
    });
  } catch (err) {
    logger.error(`[phone-home] Registration request failed: ${phoneHomeErrorMessage(err)}`);
  }
}
|