ccp: split /register and /poll rate limits; agent backoff on 429

Problem: the agent polled /poll every 30s while waiting for admin
approval. At 10 req/15min, the 11th poll hit 429 after ~5 min and
every subsequent one also failed — recovery required an agent
restart. A human-paced approval SLA is longer than 5 minutes.

CCP side (agents.routes.ts):
  Split the one-size-fits-all agentRegistrationLimiter into two.
  /register stays tight (10/15min — invite-code brute force is the
  real attack surface). /poll gets a new agentPollLimiter at 180/15min
  (one poll per ~5s upper bound). The /poll endpoint itself is scoped to
  registrationId+slug, so blast radius is bounded.

Agent side (server.ts):
  Replaced fixed 30s setInterval with a self-scheduling setTimeout
  loop that backs off exponentially on HTTP 429 (30s → 60s → 120s →
  240s → 300s cap) and resets to 30s on any 2xx. A stop flag prevents
  further polls from being scheduled after approval or rejection.
  Removes the need for the "agent wedged at 429, restart to recover"
  workaround.

Bunker Admin
This commit is contained in:
bunker-admin 2026-04-16 13:11:39 -06:00
parent 6504598752
commit d2da13929a
2 changed files with 63 additions and 16 deletions

View File

@ -126,28 +126,59 @@ async function startPhoneHome() {
// try/catch so an unexpected throw never kills the interval silently.
// On every poll we log either the status transition or a heartbeat every
// 10th attempt, so admins can see the loop is alive.
// Self-scheduling poll with exponential backoff on HTTP 429. The old
// fixed-interval setInterval got stuck about 5 minutes in, when CCP's
// 10-per-15-min rate limiter rejected the 11th poll and every poll after
// it — the only recovery was an agent restart. This loop doubles the
// delay on 429 (cap 5 min) and resets on any 2xx response.
let pollCount = 0;
let lastLoggedStatus: string | null = null;
const pollInterval = setInterval(async () => {
const BASE_DELAY_MS = 30_000;
const MAX_DELAY_MS = 300_000;
let currentDelayMs = BASE_DELAY_MS;
let pollTimer: NodeJS.Timeout | null = null;
let stopped = false;
const scheduleNext = (delayMs: number) => {
if (stopped) return;
pollTimer = setTimeout(runPoll, delayMs);
};
const runPoll = async () => {
if (stopped) return;
pollCount += 1;
try {
const pollResp = await fetch(
`${env.CCP_URL}/api/agents/poll?registrationId=${result.registrationId}&slug=${env.INSTANCE_SLUG}`
);
if (pollResp.status === 429) {
// Rate limited — back off, don't try to parse body.
const newDelay = Math.min(currentDelayMs * 2, MAX_DELAY_MS);
logger.warn(`[phone-home] Poll #${pollCount} HTTP 429 — backing off ${currentDelayMs}ms → ${newDelay}ms`);
currentDelayMs = newDelay;
scheduleNext(currentDelayMs);
return;
}
if (!pollResp.ok) {
logger.warn(`[phone-home] Poll #${pollCount} HTTP ${pollResp.status} ${pollResp.statusText}`);
scheduleNext(currentDelayMs);
return;
}
// Success — reset backoff if previously inflated.
if (currentDelayMs !== BASE_DELAY_MS) {
logger.info(`[phone-home] Poll succeeded — resetting delay ${currentDelayMs}ms → ${BASE_DELAY_MS}ms`);
currentDelayMs = BASE_DELAY_MS;
}
const pollData = await pollResp.json() as {
status: string;
certBundle?: { caCertPem: string; agentCertPem: string; agentKeyPem: string; ccpFingerprint: string };
message?: string;
};
// Log status transitions and periodic heartbeats so the loop is never
// invisible. Previously a stuck loop left no trace in logs.
if (pollData.status !== lastLoggedStatus) {
logger.info(`[phone-home] Poll #${pollCount}: status=${pollData.status}${pollData.message ? `${pollData.message}` : ''}`);
lastLoggedStatus = pollData.status;
@ -156,7 +187,8 @@ async function startPhoneHome() {
}
if (pollData.status === 'APPROVED' && pollData.certBundle) {
clearInterval(pollInterval);
stopped = true;
if (pollTimer) clearTimeout(pollTimer);
logger.info('[phone-home] Approved! Saving certificates...');
// Save certs
@ -177,25 +209,27 @@ async function startPhoneHome() {
}
logger.info('[phone-home] Certificates saved. Restarting with mTLS...');
// Exit so Docker restart policy brings us back with certs
process.exit(0);
} else if (pollData.status === 'APPROVED' && !pollData.certBundle) {
// Admin approved but cert bundle was consumed (e.g. by debug curl).
// Keep polling — admin can re-issue certs via the new endpoint and we'll
// pick them up on the next poll.
// (No action needed; the status-transition log above covers visibility.)
// Keep polling — admin can re-issue certs via the new endpoint.
scheduleNext(currentDelayMs);
} else if (pollData.status === 'REJECTED') {
clearInterval(pollInterval);
stopped = true;
if (pollTimer) clearTimeout(pollTimer);
logger.error('[phone-home] Registration was rejected by CCP admin');
} else {
scheduleNext(currentDelayMs);
}
} catch (err) {
// CRITICAL: this catch MUST swallow every error — if it rethrows the
// setInterval callback becomes an unhandled rejection and Node may kill
// the interval depending on the runtime config. We saw this in prod.
// CRITICAL: this catch MUST swallow every error — previously an unhandled
// rejection could silently kill the poll loop depending on Node config.
logger.warn(`[phone-home] Poll #${pollCount} failed: ${(err as Error).message}`);
scheduleNext(currentDelayMs);
}
}, 30_000);
};
scheduleNext(currentDelayMs);
// Defensive: if the Node process receives an unhandled rejection that
// somehow originates from the poll path, log it instead of dying quietly.

View File

@ -10,7 +10,8 @@ import { logger } from '../../utils/logger';
const router = Router();
// SECURITY: Strict rate limiter for unauthenticated agent endpoints
// SECURITY: Strict rate limiter for unauthenticated agent /register endpoint.
// Registration is the real attack surface (invite-code guessing); keep it tight.
const agentRegistrationLimiter = rateLimit({
windowMs: 15 * 60 * 1000, // 15 minutes
max: 10, // 10 attempts per window per IP
@ -19,6 +20,18 @@ const agentRegistrationLimiter = rateLimit({
message: { error: 'RATE_LIMITED', message: 'Too many registration attempts, try again later' },
});
// Looser limiter for /poll. The agent polls every 30s (30/15min) while waiting
// for admin approval; at the old 10/15min limit it hit 429 after 5 min and
// required a restart to recover. /poll is scoped to a specific registrationId
// and slug, so the blast radius is bounded even at 180/15min (one poll per 5s).
// NOTE(review): no key option is set here, so this presumably counts requests
// per client IP like the registration limiter — confirm against the rate-limit
// library's defaults.
const agentPollLimiter = rateLimit({
windowMs: 15 * 60 * 1000, // 15 minutes
max: 180, // 180 polls per window — 6x the agent's 30s base polling rate
standardHeaders: true,
legacyHeaders: false,
message: { error: 'RATE_LIMITED', message: 'Poll rate exceeded — back off' },
});
// ─── Public Endpoints (used by remote agents during phone-home) ──────
/**
@ -73,7 +86,7 @@ router.post('/register', agentRegistrationLimiter, async (req: Request, res: Res
* Agent polls to check if registration was approved.
* Returns cert bundle on approval.
*/
router.get('/poll', agentRegistrationLimiter, async (req: Request, res: Response) => {
router.get('/poll', agentPollLimiter, async (req: Request, res: Response) => {
const { registrationId, slug } = req.query;
if (!registrationId && !slug) {