ccp: split /register and /poll rate limits; agent backoff on 429
Problem: the agent polled /poll every 30s while waiting for admin approval. At 10 req/15min, the 11th poll hit 429 after ~5 min and every subsequent one also failed — recovery required an agent restart. A human-paced approval SLA is longer than 5 minutes. CCP side (agents.routes.ts): Split the one-size-fits-all agentRegistrationLimiter into two. /register stays tight (10/15min — invite-code brute force is the real attack surface). /poll gets a new agentPollLimiter at 180/15min (one poll per ~5s upper bound), scoped to registrationId+slug so blast radius is bounded. Agent side (server.ts): Replaced fixed 30s setInterval with a self-scheduling setTimeout loop that backs off exponentially on HTTP 429 (30s → 60s → 120s → 300s cap) and resets to 30s on any 2xx. Stop-flag protects against re-entry after approval. Fixes the "agent wedged at 429, restart to recover" workaround. Bunker Admin
This commit is contained in:
parent
6504598752
commit
d2da13929a
@ -126,28 +126,59 @@ async function startPhoneHome() {
|
||||
// try/catch so an unexpected throw never kills the interval silently.
|
||||
// On every poll we log either the status transition or a heartbeat every
|
||||
// 10th attempt, so admins can see the loop is alive.
|
||||
// Self-scheduling poll with exponential backoff on HTTP 429. Fixed-interval
|
||||
// setInterval blew up ~5 minutes in, once CCP's 15-min rate-limit window rejected
|
||||
// the 11th poll and subsequent polls — the only recovery was an agent
|
||||
// restart. This loop doubles the delay on 429 (cap 5 min) and resets on
|
||||
// any 2xx response.
|
||||
let pollCount = 0;
|
||||
let lastLoggedStatus: string | null = null;
|
||||
const pollInterval = setInterval(async () => {
|
||||
const BASE_DELAY_MS = 30_000;
|
||||
const MAX_DELAY_MS = 300_000;
|
||||
let currentDelayMs = BASE_DELAY_MS;
|
||||
let pollTimer: NodeJS.Timeout | null = null;
|
||||
let stopped = false;
|
||||
|
||||
const scheduleNext = (delayMs: number) => {
|
||||
if (stopped) return;
|
||||
pollTimer = setTimeout(runPoll, delayMs);
|
||||
};
|
||||
|
||||
const runPoll = async () => {
|
||||
if (stopped) return;
|
||||
pollCount += 1;
|
||||
try {
|
||||
const pollResp = await fetch(
|
||||
`${env.CCP_URL}/api/agents/poll?registrationId=${result.registrationId}&slug=${env.INSTANCE_SLUG}`
|
||||
);
|
||||
|
||||
if (pollResp.status === 429) {
|
||||
// Rate limited — back off, don't try to parse body.
|
||||
const newDelay = Math.min(currentDelayMs * 2, MAX_DELAY_MS);
|
||||
logger.warn(`[phone-home] Poll #${pollCount} HTTP 429 — backing off ${currentDelayMs}ms → ${newDelay}ms`);
|
||||
currentDelayMs = newDelay;
|
||||
scheduleNext(currentDelayMs);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!pollResp.ok) {
|
||||
logger.warn(`[phone-home] Poll #${pollCount} HTTP ${pollResp.status} ${pollResp.statusText}`);
|
||||
scheduleNext(currentDelayMs);
|
||||
return;
|
||||
}
|
||||
|
||||
// Success — reset backoff if previously inflated.
|
||||
if (currentDelayMs !== BASE_DELAY_MS) {
|
||||
logger.info(`[phone-home] Poll succeeded — resetting delay ${currentDelayMs}ms → ${BASE_DELAY_MS}ms`);
|
||||
currentDelayMs = BASE_DELAY_MS;
|
||||
}
|
||||
|
||||
const pollData = await pollResp.json() as {
|
||||
status: string;
|
||||
certBundle?: { caCertPem: string; agentCertPem: string; agentKeyPem: string; ccpFingerprint: string };
|
||||
message?: string;
|
||||
};
|
||||
|
||||
// Log status transitions and periodic heartbeats so the loop is never
|
||||
// invisible. Previously a stuck loop left no trace in logs.
|
||||
if (pollData.status !== lastLoggedStatus) {
|
||||
logger.info(`[phone-home] Poll #${pollCount}: status=${pollData.status}${pollData.message ? ` — ${pollData.message}` : ''}`);
|
||||
lastLoggedStatus = pollData.status;
|
||||
@ -156,7 +187,8 @@ async function startPhoneHome() {
|
||||
}
|
||||
|
||||
if (pollData.status === 'APPROVED' && pollData.certBundle) {
|
||||
clearInterval(pollInterval);
|
||||
stopped = true;
|
||||
if (pollTimer) clearTimeout(pollTimer);
|
||||
logger.info('[phone-home] Approved! Saving certificates...');
|
||||
|
||||
// Save certs
|
||||
@ -177,25 +209,27 @@ async function startPhoneHome() {
|
||||
}
|
||||
|
||||
logger.info('[phone-home] Certificates saved. Restarting with mTLS...');
|
||||
|
||||
// Exit so Docker restart policy brings us back with certs
|
||||
process.exit(0);
|
||||
} else if (pollData.status === 'APPROVED' && !pollData.certBundle) {
|
||||
// Admin approved but cert bundle was consumed (e.g. by debug curl).
|
||||
// Keep polling — admin can re-issue certs via the new endpoint and we'll
|
||||
// pick them up on the next poll.
|
||||
// (No action needed; the status-transition log above covers visibility.)
|
||||
// Keep polling — admin can re-issue certs via the new endpoint.
|
||||
scheduleNext(currentDelayMs);
|
||||
} else if (pollData.status === 'REJECTED') {
|
||||
clearInterval(pollInterval);
|
||||
stopped = true;
|
||||
if (pollTimer) clearTimeout(pollTimer);
|
||||
logger.error('[phone-home] Registration was rejected by CCP admin');
|
||||
} else {
|
||||
scheduleNext(currentDelayMs);
|
||||
}
|
||||
} catch (err) {
|
||||
// CRITICAL: this catch MUST swallow every error — if it rethrows the
|
||||
// setInterval callback becomes an unhandled rejection and Node may kill
|
||||
// the interval depending on the runtime config. We saw this in prod.
|
||||
// CRITICAL: this catch MUST swallow every error — previously an unhandled
|
||||
// rejection could silently kill the poll loop depending on Node config.
|
||||
logger.warn(`[phone-home] Poll #${pollCount} failed: ${(err as Error).message}`);
|
||||
scheduleNext(currentDelayMs);
|
||||
}
|
||||
}, 30_000);
|
||||
};
|
||||
|
||||
scheduleNext(currentDelayMs);
|
||||
|
||||
// Defensive: if the Node process receives an unhandled rejection that
|
||||
// somehow originates from the poll path, log it instead of dying quietly.
|
||||
|
||||
@ -10,7 +10,8 @@ import { logger } from '../../utils/logger';
|
||||
|
||||
const router = Router();
|
||||
|
||||
// SECURITY: Strict rate limiter for unauthenticated agent endpoints
|
||||
// SECURITY: Strict rate limiter for unauthenticated agent /register endpoint.
|
||||
// Registration is the real attack surface (invite-code guessing); keep it tight.
|
||||
const agentRegistrationLimiter = rateLimit({
|
||||
windowMs: 15 * 60 * 1000, // 15 minutes
|
||||
max: 10, // 10 attempts per window per IP
|
||||
@ -19,6 +20,18 @@ const agentRegistrationLimiter = rateLimit({
|
||||
message: { error: 'RATE_LIMITED', message: 'Too many registration attempts, try again later' },
|
||||
});
|
||||
|
||||
// Looser limiter for /poll. The agent polls every 30s (30/15min) while waiting
|
||||
// for admin approval; at the old 10/15min limit it hit 429 after 5 min and
|
||||
// required a restart to recover. /poll is scoped to a specific registrationId
|
||||
// and slug, so the blast radius is bounded even at 180/15min (one poll per 5s).
|
||||
const agentPollLimiter = rateLimit({
|
||||
windowMs: 15 * 60 * 1000,
|
||||
max: 180,
|
||||
standardHeaders: true,
|
||||
legacyHeaders: false,
|
||||
message: { error: 'RATE_LIMITED', message: 'Poll rate exceeded — back off' },
|
||||
});
|
||||
|
||||
// ─── Public Endpoints (used by remote agents during phone-home) ──────
|
||||
|
||||
/**
|
||||
@ -73,7 +86,7 @@ router.post('/register', agentRegistrationLimiter, async (req: Request, res: Res
|
||||
* Agent polls to check if registration was approved.
|
||||
* Returns cert bundle on approval.
|
||||
*/
|
||||
router.get('/poll', agentRegistrationLimiter, async (req: Request, res: Response) => {
|
||||
router.get('/poll', agentPollLimiter, async (req: Request, res: Response) => {
|
||||
const { registrationId, slug } = req.query;
|
||||
|
||||
if (!registrationId && !slug) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user