diff --git a/changemaker-control-panel/agent/src/server.ts b/changemaker-control-panel/agent/src/server.ts index ed6d6b33..590c4ec2 100644 --- a/changemaker-control-panel/agent/src/server.ts +++ b/changemaker-control-panel/agent/src/server.ts @@ -126,28 +126,59 @@ async function startPhoneHome() { // try/catch so an unexpected throw never kills the interval silently. // On every poll we log either the status transition or a heartbeat every // 10th attempt, so admins can see the loop is alive. + // Self-scheduling poll with exponential backoff on HTTP 429. Fixed-interval + // setInterval blew up at the 15-min mark when CCP's rate limiter rejected + // the 11th poll and subsequent polls — the only recovery was an agent + // restart. This loop doubles the delay on 429 (cap 5 min) and resets on + // any 2xx response. let pollCount = 0; let lastLoggedStatus: string | null = null; - const pollInterval = setInterval(async () => { + const BASE_DELAY_MS = 30_000; + const MAX_DELAY_MS = 300_000; + let currentDelayMs = BASE_DELAY_MS; + let pollTimer: NodeJS.Timeout | null = null; + let stopped = false; + + const scheduleNext = (delayMs: number) => { + if (stopped) return; + pollTimer = setTimeout(runPoll, delayMs); + }; + + const runPoll = async () => { + if (stopped) return; pollCount += 1; try { const pollResp = await fetch( `${env.CCP_URL}/api/agents/poll?registrationId=${result.registrationId}&slug=${env.INSTANCE_SLUG}` ); + if (pollResp.status === 429) { + // Rate limited — back off, don't try to parse body. + const newDelay = Math.min(currentDelayMs * 2, MAX_DELAY_MS); + logger.warn(`[phone-home] Poll #${pollCount} HTTP 429 — backing off ${currentDelayMs}ms → ${newDelay}ms`); + currentDelayMs = newDelay; + scheduleNext(currentDelayMs); + return; + } + if (!pollResp.ok) { logger.warn(`[phone-home] Poll #${pollCount} HTTP ${pollResp.status} ${pollResp.statusText}`); + scheduleNext(currentDelayMs); return; } + // Success — reset backoff if previously inflated. 
+ if (currentDelayMs !== BASE_DELAY_MS) { + logger.info(`[phone-home] Poll succeeded — resetting delay ${currentDelayMs}ms → ${BASE_DELAY_MS}ms`); + currentDelayMs = BASE_DELAY_MS; + } + const pollData = await pollResp.json() as { status: string; certBundle?: { caCertPem: string; agentCertPem: string; agentKeyPem: string; ccpFingerprint: string }; message?: string; }; - // Log status transitions and periodic heartbeats so the loop is never - // invisible. Previously a stuck loop left no trace in logs. if (pollData.status !== lastLoggedStatus) { logger.info(`[phone-home] Poll #${pollCount}: status=${pollData.status}${pollData.message ? ` — ${pollData.message}` : ''}`); lastLoggedStatus = pollData.status; @@ -156,7 +187,8 @@ async function startPhoneHome() { } if (pollData.status === 'APPROVED' && pollData.certBundle) { - clearInterval(pollInterval); + stopped = true; + if (pollTimer) clearTimeout(pollTimer); logger.info('[phone-home] Approved! Saving certificates...'); // Save certs @@ -177,25 +209,27 @@ async function startPhoneHome() { } logger.info('[phone-home] Certificates saved. Restarting with mTLS...'); - - // Exit so Docker restart policy brings us back with certs process.exit(0); } else if (pollData.status === 'APPROVED' && !pollData.certBundle) { // Admin approved but cert bundle was consumed (e.g. by debug curl). - // Keep polling — admin can re-issue certs via the new endpoint and we'll - // pick them up on the next poll. - // (No action needed; the status-transition log above covers visibility.) + // Keep polling — admin can re-issue certs via the new endpoint. 
+ scheduleNext(currentDelayMs); } else if (pollData.status === 'REJECTED') { - clearInterval(pollInterval); + stopped = true; + if (pollTimer) clearTimeout(pollTimer); logger.error('[phone-home] Registration was rejected by CCP admin'); + } else { + scheduleNext(currentDelayMs); } } catch (err) { - // CRITICAL: this catch MUST swallow every error — if it rethrows the - // setInterval callback becomes an unhandled rejection and Node may kill - // the interval depending on the runtime config. We saw this in prod. + // CRITICAL: this catch MUST swallow every error — previously an unhandled + // rejection could silently kill the poll loop depending on Node config. logger.warn(`[phone-home] Poll #${pollCount} failed: ${(err as Error).message}`); + scheduleNext(currentDelayMs); } - }, 30_000); + }; + + scheduleNext(currentDelayMs); // Defensive: if the Node process receives an unhandled rejection that // somehow originates from the poll path, log it instead of dying quietly. diff --git a/changemaker-control-panel/api/src/modules/agents/agents.routes.ts b/changemaker-control-panel/api/src/modules/agents/agents.routes.ts index e33a0910..025fbe03 100644 --- a/changemaker-control-panel/api/src/modules/agents/agents.routes.ts +++ b/changemaker-control-panel/api/src/modules/agents/agents.routes.ts @@ -10,7 +10,8 @@ import { logger } from '../../utils/logger'; const router = Router(); -// SECURITY: Strict rate limiter for unauthenticated agent endpoints +// SECURITY: Strict rate limiter for unauthenticated agent /register endpoint. +// Registration is the real attack surface (invite-code guessing); keep it tight. const agentRegistrationLimiter = rateLimit({ windowMs: 15 * 60 * 1000, // 15 minutes max: 10, // 10 attempts per window per IP @@ -19,6 +20,18 @@ const agentRegistrationLimiter = rateLimit({ message: { error: 'RATE_LIMITED', message: 'Too many registration attempts, try again later' }, }); +// Looser limiter for /poll. 
The agent polls every 30s (30/15min) while waiting +// for admin approval; at the old 10/15min limit it hit 429 after 5 min and +// required a restart to recover. /poll is scoped to a specific registrationId +// and slug, so the blast radius is bounded even at 180/15min (one poll per 5s). +const agentPollLimiter = rateLimit({ + windowMs: 15 * 60 * 1000, + max: 180, + standardHeaders: true, + legacyHeaders: false, + message: { error: 'RATE_LIMITED', message: 'Poll rate exceeded — back off' }, +}); + // ─── Public Endpoints (used by remote agents during phone-home) ────── /** @@ -73,7 +86,7 @@ router.post('/register', agentRegistrationLimiter, async (req: Request, res: Res * Agent polls to check if registration was approved. * Returns cert bundle on approval. */ -router.get('/poll', agentRegistrationLimiter, async (req: Request, res: Response) => { +router.get('/poll', agentPollLimiter, async (req: Request, res: Response) => { const { registrationId, slug } = req.query; if (!registrationId && !slug) {