diff --git a/admin/src/components/volunteer/dashboard/ActionStepsList.tsx b/admin/src/components/volunteer/dashboard/ActionStepsList.tsx index 98731738..6a333ade 100644 --- a/admin/src/components/volunteer/dashboard/ActionStepsList.tsx +++ b/admin/src/components/volunteer/dashboard/ActionStepsList.tsx @@ -10,6 +10,8 @@ import { LinkOutlined, CheckSquareOutlined, CheckCircleFilled, + RightOutlined, + ThunderboltOutlined, } from '@ant-design/icons'; import { useNavigate } from 'react-router-dom'; import { api } from '@/lib/api'; @@ -66,6 +68,97 @@ function resolveStepLink(step: DashboardActionStep): { to: string; external: boo } } +function HighlightedStep({ + step, + onNavigate, + onSelfReport, + loading, +}: { + step: DashboardActionStep; + onNavigate: (step: DashboardActionStep) => void; + onSelfReport: (step: DashboardActionStep) => void; + loading: boolean; +}) { + const isSelfReport = step.kind === 'CUSTOM' || step.kind === 'VISIT_LINK'; + const canNavigate = resolveStepLink(step) !== null; + + return ( +
+
+ + + Next Up + +
+
+
+ {KIND_ICONS[step.kind]} +
+
+ + {step.label} + + {step.description && ( + + {step.description} + + )} +
+
+
+ {isSelfReport ? ( + <> + {canNavigate && ( + + )} + + + ) : ( + + )} +
+
+ ); +} + export default function ActionStepsList({ campaign, onRefresh }: ActionStepsListProps) { const navigate = useNavigate(); const { message } = App.useApp(); @@ -95,6 +188,8 @@ export default function ActionStepsList({ campaign, onRefresh }: ActionStepsList }; const sortedSteps = [...campaign.steps].sort((a, b) => a.order - b.order); + const highlightedStep = sortedSteps.find((s) => !s.completed); + const remainingSteps = sortedSteps.filter((s) => s.id !== highlightedStep?.id); return ( } > - {sortedSteps.map((step, i) => { + {highlightedStep && ( +
+ +
+ )} + + {remainingSteps.map((step, i) => { const isSelfReport = step.kind === 'CUSTOM' || step.kind === 'VISIT_LINK'; const canNavigate = resolveStepLink(step) !== null; @@ -119,8 +225,8 @@ export default function ActionStepsList({ campaign, onRefresh }: ActionStepsList display: 'flex', alignItems: 'center', justifyContent: 'space-between', - padding: '12px 20px', - borderTop: i > 0 ? '1px solid rgba(255,255,255,0.04)' : undefined, + padding: '10px 20px', + borderTop: (highlightedStep || i > 0) ? '1px solid rgba(255,255,255,0.04)' : undefined, opacity: step.completed ? 0.55 : 1, gap: 12, }} @@ -128,22 +234,22 @@ export default function ActionStepsList({ campaign, onRefresh }: ActionStepsList
{step.completed ? : KIND_ICONS[step.kind]}
- + {KIND_LABELS[step.kind]} {step.completed ? ( - Done + Done ) : isSelfReport ? ( {canNavigate && ( diff --git a/admin/src/pages/events/TicketedEventsPage.tsx b/admin/src/pages/events/TicketedEventsPage.tsx index 0fc69c15..74373630 100644 --- a/admin/src/pages/events/TicketedEventsPage.tsx +++ b/admin/src/pages/events/TicketedEventsPage.tsx @@ -7,7 +7,7 @@ import { import { PlusOutlined, SearchOutlined, EditOutlined, EyeOutlined, DeleteOutlined, CheckCircleOutlined, CloseCircleOutlined, CopyOutlined, ScanOutlined, - TagOutlined, VideoCameraOutlined, EnvironmentOutlined, + TagOutlined, VideoCameraOutlined, EnvironmentOutlined, StarOutlined, StarFilled, } from '@ant-design/icons'; import { api } from '@/lib/api'; import dayjs from 'dayjs'; @@ -45,6 +45,7 @@ interface TicketedEvent { currentAttendees: number; coverImageUrl: string | null; organizerName: string | null; + featured: boolean; ticketTiers: TicketTier[]; _count: { tickets: number; checkIns: number }; createdAt: string; @@ -198,18 +199,55 @@ export default function TicketedEventsPage() { } }; + const handleFeature = async (id: string, featured: boolean) => { + try { + if (featured) { + // Unfeature all others first (exclusive toggle) + const othersToUnfeature = events.filter((e) => e.featured && e.id !== id); + await Promise.all( + othersToUnfeature.map((e) => api.put(`/api/ticketed-events/admin/${e.id}`, { featured: false })) + ); + } + await api.put(`/api/ticketed-events/admin/${id}`, { featured }); + message.success(featured ? 
'Event featured on volunteer dashboard' : 'Event unfeatured'); + fetchEvents(); + } catch { + message.error('Failed to update featured status'); + } + }; + const copyLink = (slug: string) => { navigator.clipboard.writeText(`${window.location.origin}/event/${slug}`); message.success('Link copied'); }; const columns = [ + { + title: '', + key: 'featured', + width: 36, + render: (_: unknown, record: TicketedEvent) => ( + + @@ -784,20 +879,36 @@ export default function InstanceDetailPage() { { title: 'Size', dataIndex: 'sizeBytes', - render: (b: number | null) => (b ? `${(b / 1024 / 1024).toFixed(1)} MB` : '-'), + render: (b: number | string | null) => { + if (b == null) return '-'; + const n = typeof b === 'string' ? parseInt(b, 10) : b; + return `${(n / 1024 / 1024).toFixed(1)} MB`; + }, }, { title: 'Actions', - width: 120, + width: 160, render: (_: unknown, record: Backup) => ( {record.status === 'COMPLETED' && ( - + + + } + > + s || '(root)' }, + { title: 'Name', dataIndex: 'name' }, + { title: 'Target', render: (_: unknown, r: { hasTarget: boolean; targetIp?: string; targetPort?: number }) => + r.hasTarget ? `${r.targetIp}:${r.targetPort}` : No target + }, + ]} + /> + + )} + + + + + + ) : ( + <> + + + +
+ -app.${instance.domain}, -api.${instance.domain}, etc.`} + rules={[{ required: true }, { pattern: /^[a-z0-9-]+$/, message: 'Lowercase alphanumeric + hyphens only' }]} + > + + + + + + +
+ + )} + + ); + + const localTunnelTab = ( + + {!isManaged && ( )} - {!isRegistered && tunnelConfigured && ( + {isManaged && tunnelConfigured && ( )} - {!isRegistered && !tunnelConfigured && ( + {isManaged && !tunnelConfigured && ( )} - {canConfigureTunnel && ( + {canConfigureTunnel && !isRemote && (
); + const tunnelTab = isRemote ? remoteTunnelTab : localTunnelTab; + // ─── Updates Tab ────────────────────────────────────────────── const isUpgrading = currentUpgrade?.status === 'IN_PROGRESS' || currentUpgrade?.status === 'PENDING'; @@ -1278,7 +1559,7 @@ export default function InstanceDetailPage() { )} {/* Upgrade Action */} - {!isRegistered && ( + {isManaged && ( {isUpgrading && currentUpgrade ? ( @@ -1340,7 +1621,7 @@ export default function InstanceDetailPage() { )} - {isRegistered && ( + {!isManaged && ( )} + {isRemote && ( + + )} {/* Upgrade History */} @@ -1794,6 +2083,108 @@ export default function InstanceDetailPage() { { key: 'tunnel', label: 'Tunnel', children: tunnelTab }, ]} /> + + {/* Restore confirmation modal (destructive action guard) */} + setRestoreModal(null)} + onOk={handleRestoreConfirm} + okText="Restore" + okButtonProps={{ + danger: true, + loading: restoring, + disabled: restoreModal?.typedSlug !== instance.slug, + }} + cancelButtonProps={{ disabled: restoring }} + width={560} + > + + {restoreModal && ( + + + {restoreModal.backup.id.substring(0, 8)} + + + {restoreModal.backup.sizeBytes + ? `${(Number(restoreModal.backup.sizeBytes) / 1024 / 1024).toFixed(1)} MB` + : '-'} + + + {restoreModal.backup.completedAt + ? dayjs(restoreModal.backup.completedAt).format('YYYY-MM-DD HH:mm') + : '-'} + + + )} + + Type the instance slug {instance.slug} to confirm: + + + setRestoreModal((cur) => (cur ? { ...cur, typedSlug: e.target.value } : cur)) + } + placeholder={instance.slug} + autoFocus + /> + + + {/* Active restore progress banner */} + {activeRestoreId && activeRestoreState && ( + + +
+ + {activeRestoreState.status} + + {activeRestoreState.status === 'RUNNING' && ( + + Agent is running scripts/restore.sh — this can take several minutes + + )} +
+ {activeRestoreState.errorMessage && ( + + )} + {activeRestoreState.logTail && ( +
+                {activeRestoreState.logTail}
+              
+ )} +
+
+ )} ); } diff --git a/changemaker-control-panel/admin/src/pages/InviteCodesPage.tsx b/changemaker-control-panel/admin/src/pages/InviteCodesPage.tsx index c03ec8dd..d53ccc61 100644 --- a/changemaker-control-panel/admin/src/pages/InviteCodesPage.tsx +++ b/changemaker-control-panel/admin/src/pages/InviteCodesPage.tsx @@ -14,7 +14,7 @@ export default function InviteCodesPage() { const fetchCodes = useCallback(async () => { try { setLoading(true); - const { data } = await api.get('/api/invite-codes'); + const { data } = await api.get('/invite-codes'); setCodes(data.data || []); } catch { message.error('Failed to load invite codes'); @@ -28,7 +28,7 @@ export default function InviteCodesPage() { const handleCreate = async () => { try { setCreating(true); - const { data } = await api.post('/api/invite-codes'); + const { data } = await api.post('/invite-codes'); message.success(`Invite code created: ${data.code}`); fetchCodes(); } catch { @@ -40,7 +40,7 @@ export default function InviteCodesPage() { const handleRevoke = async (id: string) => { try { - await api.delete(`/api/invite-codes/${id}`); + await api.delete(`/invite-codes/${id}`); message.success('Invite code revoked'); fetchCodes(); } catch { diff --git a/changemaker-control-panel/agent/src/config/env.ts b/changemaker-control-panel/agent/src/config/env.ts index a69ad6e5..94e29c50 100644 --- a/changemaker-control-panel/agent/src/config/env.ts +++ b/changemaker-control-panel/agent/src/config/env.ts @@ -26,6 +26,7 @@ const envSchema = z.object({ INSTANCE_SLUG: z.string().default(''), INSTANCE_DOMAIN: z.string().default(''), INSTANCE_BASE_PATH: z.string().default(''), + COMPOSE_PROJECT: z.string().default(''), }); function validateEnv() { diff --git a/changemaker-control-panel/agent/src/routes/backup.routes.ts b/changemaker-control-panel/agent/src/routes/backup.routes.ts index 7494fabb..8e949e5d 100644 --- a/changemaker-control-panel/agent/src/routes/backup.routes.ts +++ 
b/changemaker-control-panel/agent/src/routes/backup.routes.ts @@ -1,105 +1,623 @@ import { Router, Request, Response } from 'express'; import { param } from '../utils/params'; import fs from 'fs/promises'; -import path from 'path'; -import { exec as execCb } from 'child_process'; +import { createReadStream, createWriteStream } from 'fs'; +import { pipeline as pipelineCb, Transform } from 'stream'; import { promisify } from 'util'; -import * as docker from '../services/docker.service'; +import path from 'path'; +import crypto from 'crypto'; +import { spawn } from 'child_process'; import { getSlugEntry } from '../services/registry.service'; import { env } from '../config/env'; import { logger } from '../utils/logger'; +import { withSlugLock, SlugBusyError, isSlugLocked } from '../services/slug-mutex'; +import { AgentError } from '../middleware/error-handler'; + +const pipeline = promisify(pipelineCb); -const exec = promisify(execCb); const router = Router(); -// POST /instance/:slug/backup — Run pg_dump + tar uploads → return backup info -router.post('/instance/:slug/backup', async (req: Request, res: Response) => { - const entry = await getSlugEntry(param(req, 'slug')); - const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); - const backupDir = path.join(env.AGENT_DATA_DIR, 'backups', param(req, 'slug'), timestamp); - await fs.mkdir(backupDir, { recursive: true }); +// ─── Helpers ────────────────────────────────────────────────────────── - const { pgPassword } = req.body; +const ID_REGEX = /^[a-zA-Z0-9_-]+$/; +const ARCHIVE_PREFIX = 'changemaker-v2-backup-'; +const ARCHIVE_SUFFIX = '.tar.gz'; + +function backupsDirFor(slug: string): string { + return path.join(env.AGENT_DATA_DIR, 'backups', slug); +} + +function archivePathFor(slug: string, id: string): string { + return path.join(backupsDirFor(slug), `${ARCHIVE_PREFIX}${id}${ARCHIVE_SUFFIX}`); +} + +async function sha256File(filePath: string): Promise { + return new Promise((resolve, reject) => { + 
const hash = crypto.createHash('sha256'); + const stream = createReadStream(filePath); + stream.on('data', (chunk) => hash.update(chunk)); + stream.on('end', () => resolve(hash.digest('hex'))); + stream.on('error', reject); + }); +} + +/** + * Read the manifest.json out of a backup archive without extracting it. + * backup.sh stores it at /changemaker-v2-backup-/manifest.json + */ +async function readManifestFromArchive(archivePath: string): Promise { + return new Promise((resolve) => { + const proc = spawn('tar', ['-xzOf', archivePath, '--wildcards', '*/manifest.json'], { + stdio: ['ignore', 'pipe', 'ignore'], + }); + let buf = ''; + proc.stdout.on('data', (chunk) => (buf += chunk.toString('utf-8'))); + proc.on('error', () => resolve(null)); + proc.on('close', (code) => { + if (code !== 0 || !buf.trim()) return resolve(null); + try { + resolve(JSON.parse(buf)); + } catch { + resolve(null); + } + }); + }); +} + +/** + * Extract the timestamp ID from a filename like "changemaker-v2-backup-20260409_143000.tar.gz". + */ +function idFromFilename(filename: string): string | null { + if (!filename.startsWith(ARCHIVE_PREFIX) || !filename.endsWith(ARCHIVE_SUFFIX)) return null; + return filename.slice(ARCHIVE_PREFIX.length, filename.length - ARCHIVE_SUFFIX.length); +} + +// ─── Routes ─────────────────────────────────────────────────────────── + +/** + * POST /instance/:slug/backup + * Shells out to the remote CML's scripts/backup.sh. Returns archive metadata + * so the CCP can immediately stream it down via the /download endpoint. + */ +router.post('/instance/:slug/backup', async (req: Request, res: Response) => { + const slug = param(req, 'slug'); + const entry = await getSlugEntry(slug); try { - // 1. pg_dump - const dumpFile = path.join(backupDir, 'database.sql'); - const dump = await docker.composeExec( - entry.basePath, entry.composeProject, - 'v2-postgres', - 'pg_dump -U changemaker -d changemaker', - 300_000, - pgPassword ? 
{ PGPASSWORD: pgPassword } : undefined - ); - await fs.writeFile(dumpFile, dump, 'utf-8'); + const result = await withSlugLock(slug, 'backup', async () => { + const backupsDir = backupsDirFor(slug); + await fs.mkdir(backupsDir, { recursive: true }); - // Gzip the dump - await exec(`gzip '${dumpFile}'`, { timeout: 120_000 }); + // Verify scripts/backup.sh exists + const scriptPath = path.join(entry.basePath, 'scripts', 'backup.sh'); + try { + await fs.access(scriptPath); + } catch { + throw new AgentError(500, `scripts/backup.sh not found at ${scriptPath}`, 'BACKUP_SCRIPT_MISSING'); + } - // 2. Tar uploads if exists - const uploadsDir = path.join(entry.basePath, 'uploads'); - let hasUploads = false; - try { - await fs.access(uploadsDir); - hasUploads = true; - } catch { /* no uploads dir */ } - - if (hasUploads) { - await exec( - `tar -czf '${path.join(backupDir, 'uploads.tar.gz')}' -C '${entry.basePath}' uploads`, - { timeout: 300_000 } + // Snapshot existing archive filenames so we can identify the new one + const existingFiles = new Set( + (await fs.readdir(backupsDir)).filter((f) => f.startsWith(ARCHIVE_PREFIX) && f.endsWith(ARCHIVE_SUFFIX)) ); - } - // 3. Create final archive - const archiveName = `backup-${param(req, 'slug')}-${timestamp}.tar.gz`; - const archivePath = path.join(env.AGENT_DATA_DIR, 'backups', archiveName); - await exec( - `tar -czf '${archivePath}' -C '${path.dirname(backupDir)}' '${timestamp}'`, - { timeout: 300_000 } - ); + const logPath = path.join(backupsDir, `backup-${Date.now()}.log`); + const logFd = await fs.open(logPath, 'w'); - // Clean up temp dir - await fs.rm(backupDir, { recursive: true, force: true }); + // Spawn backup.sh with cwd=basePath so its .env detection works. + // Retention is effectively disabled here — CCP manages retention of + // the streamed-down archives, not the agent's transient copies. 
+ // + // Container names: backup.sh defaults to `changemaker-v2-postgres` and + // `listmonk-db`, which match the main CML's `container_name:` overrides. + // If a deployment has custom naming, the operator can set PG_CONTAINER / + // LISTMONK_PG_CONTAINER in the instance's own .env (backup.sh loads it). + const spawnEnv: NodeJS.ProcessEnv = { + ...process.env, + BACKUP_DIR: backupsDir, + RETENTION_DAYS: '36500', // ~100 years; CCP controls retention + }; - const stats = await fs.stat(archivePath); - const backupId = timestamp; + logger.info(`[backup] Running scripts/backup.sh for ${slug} (basePath=${entry.basePath})`); - logger.info(`[backup] Created backup for ${param(req, 'slug')}: ${archivePath} (${stats.size} bytes)`); + const exitCode: number = await new Promise((resolve, reject) => { + const proc = spawn('bash', ['scripts/backup.sh'], { + cwd: entry.basePath, + env: spawnEnv, + stdio: ['ignore', 'pipe', 'pipe'], + }); + proc.stdout.on('data', (chunk) => logFd.write(chunk).catch(() => {})); + proc.stderr.on('data', (chunk) => logFd.write(chunk).catch(() => {})); + proc.on('error', reject); + proc.on('close', (code) => resolve(code ?? 
1)); + }); - res.json({ - backupId, - archivePath, - sizeBytes: stats.size, - timestamp, + await logFd.close(); + + if (exitCode !== 0) { + // Return the tail of the log so the CCP can display it + let logTail = ''; + try { + const fullLog = await fs.readFile(logPath, 'utf-8'); + logTail = fullLog.split('\n').slice(-40).join('\n'); + } catch { /* ignore */ } + throw new AgentError(500, `backup.sh exited with code ${exitCode}\n${logTail}`, 'BACKUP_FAILED'); + } + + // Find the new archive + const afterFiles = (await fs.readdir(backupsDir)).filter( + (f) => f.startsWith(ARCHIVE_PREFIX) && f.endsWith(ARCHIVE_SUFFIX) + ); + const newFiles = afterFiles.filter((f) => !existingFiles.has(f)); + if (newFiles.length === 0) { + throw new AgentError(500, 'backup.sh succeeded but no new archive was created', 'BACKUP_NO_OUTPUT'); + } + // Pick the most recently modified (in case of oddities) + newFiles.sort(); + const newest = newFiles[newFiles.length - 1] as string; + const archivePath = path.join(backupsDir, newest); + const backupId = idFromFilename(newest); + if (!backupId || !ID_REGEX.test(backupId)) { + throw new AgentError(500, `Unexpected archive filename: ${newest}`, 'BACKUP_NAME_INVALID'); + } + + const stats = await fs.stat(archivePath); + const sha256 = await sha256File(archivePath); + const manifest = await readManifestFromArchive(archivePath); + + // Delete the log file once we know the backup succeeded + try { await fs.unlink(logPath); } catch { /* ignore */ } + + logger.info(`[backup] ${slug}: created ${newest} (${stats.size} bytes, sha256=${sha256.substring(0, 16)}...)`); + + return { + backupId, + filename: newest, + sizeBytes: stats.size, + sha256, + manifest, + createdAt: stats.mtime.toISOString(), + }; }); + + res.json(result); } catch (err) { - // Clean up on failure - try { await fs.rm(backupDir, { recursive: true, force: true }); } catch { /* ignore */ } + if (err instanceof SlugBusyError) { + res.status(409).json({ error: 'SLUG_BUSY', message: 
err.message }); + return; + } throw err; } }); -// GET /instance/:slug/backup/:id/download — Stream backup archive -router.get('/instance/:slug/backup/:id/download', async (req: Request, res: Response) => { - const archiveName = `backup-${param(req, 'slug')}-${param(req, 'id')}.tar.gz`; - const archivePath = path.join(env.AGENT_DATA_DIR, 'backups', archiveName); +/** + * GET /instance/:slug/backups + * Lists backup archives currently held on the agent for this slug. + */ +router.get('/instance/:slug/backups', async (req: Request, res: Response) => { + const slug = param(req, 'slug'); + await getSlugEntry(slug); // validate slug is registered + const backupsDir = backupsDirFor(slug); + let entries: string[] = []; try { - await fs.access(archivePath); + entries = await fs.readdir(backupsDir); + } catch { + res.json({ data: [] }); + return; + } + + const results = []; + for (const filename of entries) { + const id = idFromFilename(filename); + if (!id) continue; + try { + const stats = await fs.stat(path.join(backupsDir, filename)); + results.push({ + backupId: id, + filename, + sizeBytes: stats.size, + createdAt: stats.mtime.toISOString(), + }); + } catch { /* skip */ } + } + results.sort((a, b) => (a.createdAt < b.createdAt ? 1 : -1)); + res.json({ data: results }); +}); + +/** + * GET /instance/:slug/backup/:id/download + * Streams the backup archive (supports Content-Length so the CCP can verify size). 
+ */ +router.get('/instance/:slug/backup/:id/download', async (req: Request, res: Response) => { + const slug = param(req, 'slug'); + const id = param(req, 'id'); + if (!ID_REGEX.test(id)) { + res.status(400).json({ error: 'INVALID_ID', message: 'Invalid backup id' }); + return; + } + await getSlugEntry(slug); + + const archivePath = archivePathFor(slug, id); + try { + const stats = await fs.stat(archivePath); + res.setHeader('Content-Type', 'application/gzip'); + res.setHeader('Content-Length', String(stats.size)); + res.setHeader('Content-Disposition', `attachment; filename="${path.basename(archivePath)}"`); + const stream = createReadStream(archivePath); + stream.on('error', (err) => { + logger.error(`[backup] stream error for ${archivePath}: ${err.message}`); + if (!res.headersSent) res.status(500).end(); + else res.destroy(err); + }); + stream.pipe(res); } catch { res.status(404).json({ error: 'NOT_FOUND', message: 'Backup archive not found' }); return; } +}); - const stats = await fs.stat(archivePath); - res.setHeader('Content-Type', 'application/gzip'); - res.setHeader('Content-Length', stats.size); - res.setHeader('Content-Disposition', `attachment; filename="${archiveName}"`); +/** + * DELETE /instance/:slug/backup/:id + * Deletes the archive from the agent's disk. The CCP calls this after it has + * successfully streamed the archive to its own storage. 
+ */ +router.delete('/instance/:slug/backup/:id', async (req: Request, res: Response) => { + const slug = param(req, 'slug'); + const id = param(req, 'id'); + if (!ID_REGEX.test(id)) { + res.status(400).json({ error: 'INVALID_ID', message: 'Invalid backup id' }); + return; + } + await getSlugEntry(slug); - const { createReadStream } = await import('fs'); - const stream = createReadStream(archivePath); - stream.pipe(res); + const archivePath = archivePathFor(slug, id); + // Path traversal defense: ensure the resolved path is still inside the slug's backups dir + const resolved = path.resolve(archivePath); + const boundary = path.resolve(backupsDirFor(slug)); + if (!resolved.startsWith(boundary + path.sep)) { + res.status(400).json({ error: 'INVALID_ID', message: 'Invalid backup id' }); + return; + } + + try { + await fs.unlink(archivePath); + logger.info(`[backup] ${slug}: deleted ${path.basename(archivePath)}`); + res.json({ deleted: true }); + } catch (err) { + const code = (err as NodeJS.ErrnoException).code; + if (code === 'ENOENT') { + res.status(404).json({ error: 'NOT_FOUND', message: 'Backup archive not found' }); + return; + } + throw err; + } +}); + +// ─── Restore ────────────────────────────────────────────────────────── + +// Hard cap on a single restore upload. The CCP is trusted, but a buggy or +// compromised CCP shouldn't be able to fill the agent's disk in one request. +// 20 GB is well above any realistic Changemaker Lite backup size. 
+const MAX_RESTORE_UPLOAD_BYTES = 20 * 1024 * 1024 * 1024; + +function restoresDirFor(slug: string): string { + return path.join(env.AGENT_DATA_DIR, 'restores', slug); +} + +function restoreUploadDir(slug: string, uploadId: string): string { + return path.join(restoresDirFor(slug), uploadId); +} + +interface RestoreState { + status: 'UPLOADED' | 'RUNNING' | 'COMPLETED' | 'FAILED'; + uploadId: string; + startedAt: string; + completedAt?: string; + exitCode?: number; + logTail?: string; + errorMessage?: string; + options?: Record; +} + +async function readRestoreState(slug: string, uploadId: string): Promise { + const statePath = path.join(restoreUploadDir(slug, uploadId), 'restore-state.json'); + try { + const content = await fs.readFile(statePath, 'utf-8'); + return JSON.parse(content) as RestoreState; + } catch { + return null; + } +} + +async function writeRestoreState(slug: string, uploadId: string, state: RestoreState): Promise { + const statePath = path.join(restoreUploadDir(slug, uploadId), 'restore-state.json'); + await fs.writeFile(statePath, JSON.stringify(state, null, 2), 'utf-8'); +} + +/** + * POST /instance/:slug/restore/upload?sha256= + * Accepts an application/octet-stream upload of a backup archive and writes + * it to the agent's restores directory. Verifies SHA256 as it streams — if + * the hash doesn't match, the partial file is deleted and we return 400. + * + * Returns `{ uploadId, sizeBytes, sha256 }`. + */ +router.post('/instance/:slug/restore/upload', async (req: Request, res: Response) => { + const slug = param(req, 'slug'); + await getSlugEntry(slug); + + if (isSlugLocked(slug, 'restore')) { + res.status(409).json({ error: 'SLUG_BUSY', message: 'A restore is already in progress for this slug' }); + return; + } + if (isSlugLocked(slug, 'backup')) { + res.status(409).json({ error: 'SLUG_BUSY', message: 'A backup is in progress for this slug' }); + return; + } + + const expectedSha256 = typeof req.query.sha256 === 'string' ? 
req.query.sha256.toLowerCase() : undefined; + if (!expectedSha256 || !/^[a-f0-9]{64}$/.test(expectedSha256)) { + res.status(400).json({ error: 'VALIDATION', message: 'sha256 query parameter required (64 hex chars)' }); + return; + } + + const uploadId = crypto.randomBytes(16).toString('hex'); + const uploadDir = restoreUploadDir(slug, uploadId); + await fs.mkdir(uploadDir, { recursive: true }); + const archivePath = path.join(uploadDir, 'archive.tar.gz'); + + const hash = crypto.createHash('sha256'); + let bytesWritten = 0; + const hashTransform = new Transform({ + transform(chunk: Buffer, _enc, cb) { + bytesWritten += chunk.length; + if (bytesWritten > MAX_RESTORE_UPLOAD_BYTES) { + // Abort the stream — pipeline() will reject and the catch block below + // will remove the partial upload directory. + cb(new AgentError( + 413, + `Upload exceeds maximum allowed size of ${MAX_RESTORE_UPLOAD_BYTES} bytes`, + 'UPLOAD_TOO_LARGE' + )); + return; + } + hash.update(chunk); + cb(null, chunk); + }, + }); + + try { + const writeStream = createWriteStream(archivePath); + await pipeline(req, hashTransform, writeStream); + const sha256 = hash.digest('hex'); + + if (sha256 !== expectedSha256) { + // Integrity failure — nuke the upload + await fs.rm(uploadDir, { recursive: true, force: true }); + res.status(400).json({ + error: 'SHA256_MISMATCH', + message: `Expected sha256 ${expectedSha256}, got ${sha256}`, + }); + return; + } + + const stats = await fs.stat(archivePath); + + // Persist initial state so the progress endpoint works even before apply + await writeRestoreState(slug, uploadId, { + status: 'UPLOADED', + uploadId, + startedAt: new Date().toISOString(), + }); + + logger.info(`[restore] ${slug}: uploaded ${bytesWritten} bytes (sha256=${sha256.substring(0, 16)}...) 
upload_id=${uploadId}`); + + res.json({ + uploadId, + sizeBytes: stats.size, + sha256, + }); + } catch (err) { + // Stream error or write error — clean up + try { await fs.rm(uploadDir, { recursive: true, force: true }); } catch { /* ignore */ } + throw err; + } +}); + +/** + * POST /instance/:slug/restore/:uploadId/apply + * Body: { confirm: true, skipDb?, skipUploads?, skipListmonk?, dryRun? } + * + * Fires off `scripts/restore.sh --archive --force` in the background + * and writes progress to restore-state.json. The CCP polls the progress + * endpoint for updates. Mutex prevents concurrent restores/backups. + */ +router.post('/instance/:slug/restore/:uploadId/apply', async (req: Request, res: Response) => { + const slug = param(req, 'slug'); + const uploadId = param(req, 'uploadId'); + if (!ID_REGEX.test(uploadId)) { + res.status(400).json({ error: 'INVALID_ID', message: 'Invalid upload id' }); + return; + } + const entry = await getSlugEntry(slug); + + const { confirm, skipDb, skipUploads, skipListmonk, dryRun } = req.body ?? 
{}; + if (confirm !== true) { + res.status(400).json({ error: 'CONFIRMATION_REQUIRED', message: 'Body must include { confirm: true }' }); + return; + } + + const uploadDir = restoreUploadDir(slug, uploadId); + // Path traversal defense + const resolvedDir = path.resolve(uploadDir); + const boundary = path.resolve(restoresDirFor(slug)); + if (!resolvedDir.startsWith(boundary + path.sep)) { + res.status(400).json({ error: 'INVALID_ID', message: 'Invalid upload id' }); + return; + } + + const archivePath = path.join(uploadDir, 'archive.tar.gz'); + try { + await fs.access(archivePath); + } catch { + res.status(404).json({ error: 'NOT_FOUND', message: 'Upload not found or already applied' }); + return; + } + + // Verify scripts/restore.sh exists + const scriptPath = path.join(entry.basePath, 'scripts', 'restore.sh'); + try { + await fs.access(scriptPath); + } catch { + res.status(500).json({ error: 'RESTORE_SCRIPT_MISSING', message: `scripts/restore.sh not found at ${scriptPath}` }); + return; + } + + // Check mutex state (don't block — tell caller it's busy) + if (isSlugLocked(slug, 'restore') || isSlugLocked(slug, 'backup')) { + res.status(409).json({ error: 'SLUG_BUSY', message: 'Slug is busy with backup or restore' }); + return; + } + + // Fire-and-forget: acquire lock and run in background. Return immediately + // so CCP can start polling /progress. 
+ const options = { + skipDb: !!skipDb, + skipUploads: !!skipUploads, + skipListmonk: !!skipListmonk, + dryRun: !!dryRun, + }; + + await writeRestoreState(slug, uploadId, { + status: 'RUNNING', + uploadId, + startedAt: new Date().toISOString(), + options, + }); + + // Build restore.sh args (all flags, no user input interpolated into a shell string) + const args = ['scripts/restore.sh', '--archive', archivePath, '--force']; + if (options.skipDb) args.push('--skip-db'); + if (options.skipUploads) args.push('--skip-uploads'); + if (options.skipListmonk) args.push('--skip-listmonk'); + if (options.dryRun) args.push('--dry-run'); + + const logPath = path.join(uploadDir, 'restore.log'); + + // Schedule the background task — don't await inside the handler + void withSlugLock(slug, 'restore', async () => { + const logFd = await fs.open(logPath, 'w'); + logger.info(`[restore] ${slug}: running ${args.join(' ')} (cwd=${entry.basePath})`); + + const exitCode: number = await new Promise((resolve, reject) => { + const proc = spawn('bash', args, { + cwd: entry.basePath, + env: { ...process.env }, + stdio: ['ignore', 'pipe', 'pipe'], + }); + proc.stdout.on('data', (chunk) => logFd.write(chunk).catch(() => {})); + proc.stderr.on('data', (chunk) => logFd.write(chunk).catch(() => {})); + proc.on('error', reject); + proc.on('close', (code) => resolve(code ?? 1)); + }); + + await logFd.close(); + + // Read the tail of the log for the state file + let logTail = ''; + try { + const fullLog = await fs.readFile(logPath, 'utf-8'); + logTail = fullLog.split('\n').slice(-80).join('\n'); + } catch { /* ignore */ } + + const state: RestoreState = { + status: exitCode === 0 ? 'COMPLETED' : 'FAILED', + uploadId, + startedAt: (await readRestoreState(slug, uploadId))?.startedAt || new Date().toISOString(), + completedAt: new Date().toISOString(), + exitCode, + logTail, + options, + ...(exitCode !== 0 ? 
{ errorMessage: `restore.sh exited with code ${exitCode}` } : {}), + }; + await writeRestoreState(slug, uploadId, state); + + logger.info(`[restore] ${slug}: restore.sh finished with exit ${exitCode}`); + }).catch(async (err) => { + logger.error(`[restore] ${slug}: background restore failed: ${(err as Error).message}`); + // If the mutex was the issue, state is already written. Otherwise, mark failed. + if (!(err instanceof SlugBusyError)) { + try { + await writeRestoreState(slug, uploadId, { + status: 'FAILED', + uploadId, + startedAt: new Date().toISOString(), + completedAt: new Date().toISOString(), + errorMessage: (err as Error).message, + options, + }); + } catch { /* ignore */ } + } + }); + + res.status(202).json({ applied: true, uploadId, options }); +}); + +/** + * GET /instance/:slug/restore/:uploadId/progress + * Returns the current state of a running or completed restore. + */ +router.get('/instance/:slug/restore/:uploadId/progress', async (req: Request, res: Response) => { + const slug = param(req, 'slug'); + const uploadId = param(req, 'uploadId'); + if (!ID_REGEX.test(uploadId)) { + res.status(400).json({ error: 'INVALID_ID', message: 'Invalid upload id' }); + return; + } + await getSlugEntry(slug); + + const state = await readRestoreState(slug, uploadId); + if (!state) { + res.status(404).json({ error: 'NOT_FOUND', message: 'Restore not found' }); + return; + } + res.json(state); +}); + +/** + * DELETE /instance/:slug/restore/:uploadId + * Removes a restore upload directory. Refuses if a restore is currently running. 
+ */ +router.delete('/instance/:slug/restore/:uploadId', async (req: Request, res: Response) => { + const slug = param(req, 'slug'); + const uploadId = param(req, 'uploadId'); + if (!ID_REGEX.test(uploadId)) { + res.status(400).json({ error: 'INVALID_ID', message: 'Invalid upload id' }); + return; + } + await getSlugEntry(slug); + + const uploadDir = restoreUploadDir(slug, uploadId); + const resolvedDir = path.resolve(uploadDir); + const boundary = path.resolve(restoresDirFor(slug)); + if (!resolvedDir.startsWith(boundary + path.sep)) { + res.status(400).json({ error: 'INVALID_ID', message: 'Invalid upload id' }); + return; + } + + const state = await readRestoreState(slug, uploadId); + if (state?.status === 'RUNNING') { + res.status(409).json({ error: 'RESTORE_RUNNING', message: 'Cannot delete a running restore' }); + return; + } + + try { + await fs.rm(uploadDir, { recursive: true, force: true }); + res.json({ deleted: true }); + } catch (err) { + throw err; + } }); export default router; diff --git a/changemaker-control-panel/agent/src/routes/registry.routes.ts b/changemaker-control-panel/agent/src/routes/registry.routes.ts index 157921ab..e1b0db21 100644 --- a/changemaker-control-panel/agent/src/routes/registry.routes.ts +++ b/changemaker-control-panel/agent/src/routes/registry.routes.ts @@ -4,6 +4,13 @@ import { registerSlug, unregisterSlug, listSlugs } from '../services/registry.se const router = Router(); +// SECURITY: defense-in-depth slug validation. The CCP enforces ^[a-z0-9-]+$ +// upstream via Zod, but the registry slug is later interpolated into +// filesystem paths (backupsDirFor, etc.), so we validate independently here. +// A poisoned registry entry could otherwise let a compromised or buggy CCP +// escape AGENT_DATA_DIR. 
+const SLUG_RE = /^[a-z0-9-]{2,50}$/; + // POST /instances/register — Register a slug→basePath mapping router.post('/instances/register', async (req: Request, res: Response) => { const { slug, basePath, composeProject } = req.body; @@ -11,14 +18,23 @@ router.post('/instances/register', async (req: Request, res: Response) => { res.status(400).json({ error: 'VALIDATION', message: 'slug, basePath, and composeProject required' }); return; } + if (typeof slug !== 'string' || !SLUG_RE.test(slug)) { + res.status(400).json({ error: 'VALIDATION', message: 'Invalid slug format (expected ^[a-z0-9-]{2,50}$)' }); + return; + } await registerSlug(slug, basePath, composeProject); res.json({ registered: slug }); }); // DELETE /instances/:slug — Unregister slug router.delete('/instances/:slug', async (req: Request, res: Response) => { - await unregisterSlug(param(req, 'slug')); - res.json({ unregistered: param(req, 'slug') }); + const slug = param(req, 'slug'); + if (!SLUG_RE.test(slug)) { + res.status(400).json({ error: 'VALIDATION', message: 'Invalid slug format' }); + return; + } + await unregisterSlug(slug); + res.json({ unregistered: slug }); }); // GET /instances — List all managed slugs diff --git a/changemaker-control-panel/agent/src/routes/upgrade.routes.ts b/changemaker-control-panel/agent/src/routes/upgrade.routes.ts index 90bc1cdf..efdd74b8 100644 --- a/changemaker-control-panel/agent/src/routes/upgrade.routes.ts +++ b/changemaker-control-panel/agent/src/routes/upgrade.routes.ts @@ -1,11 +1,12 @@ import { Router, Request, Response } from 'express'; import { param } from '../utils/params'; -import { execFile } from 'child_process'; +import { execFile, spawn } from 'child_process'; import { promisify } from 'util'; import fs from 'fs/promises'; import path from 'path'; import { getSlugEntry } from '../services/registry.service'; import { logger } from '../utils/logger'; +import { withSlugLock, SlugBusyError, isSlugLocked } from '../services/slug-mutex'; const 
execFileAsync = promisify(execFile); const router = Router(); @@ -13,9 +14,108 @@ const router = Router(); /** Validate a git branch name — prevent shell injection. */ const SAFE_BRANCH = /^[a-zA-Z0-9][a-zA-Z0-9_.\/-]{0,99}$/; -// POST /instance/:slug/upgrade/start — Run upgrade.sh +/** + * Max age of an in-progress upgrade (by progress.json mtime) before we + * consider a previous attempt dead and allow a new one through. + * + * SECURITY NOTE: this must be LONGER than the CCP's REMOTE_UPGRADE_TIMEOUT + * AND longer than any realistic legitimate upgrade duration. The concern is + * a concurrent-upgrade scenario: + * - upgrade.sh is running and legitimately slow (large image pull + DB + * migration) + * - at 15 min the CCP side times out and marks the row FAILED + * - admin clicks "Upgrade" again → CCP's DB check sees no active row + * - if this staleness window is <= realistic upgrade time, the second + * /upgrade/start call would ALSO pass this check, spawning a second + * upgrade.sh process racing against the still-running first one + * + * 45 min gives headroom over the 15-min CCP timeout and covers realistic + * upgrade durations. For a truly bulletproof guard, switch to a PID lock + * file that verifies the process is still alive. + */ +const STALE_UPGRADE_MTIME_MS = 45 * 60 * 1000; + +/** + * Returns true if there's an in-progress upgrade visible on disk. + * + * Used as a second-line guard in case the in-memory mutex was lost to an + * agent restart mid-upgrade. The check looks at progress.json mtime and + * the absence of a result.json — together they indicate "started but not + * finished within the staleness window". 
+ */
+async function isUpgradeRunningOnDisk(basePath: string): Promise<boolean> {
+  const progressPath = path.join(basePath, 'data', 'upgrade', 'progress.json');
+  const resultPath = path.join(basePath, 'data', 'upgrade', 'result.json');
+
+  let progressStat: import('fs').Stats;
+  try {
+    progressStat = await fs.stat(progressPath);
+  } catch {
+    return false; // no progress file → no in-progress upgrade
+  }
+
+  // If a result file exists with mtime >= progress mtime, the run is finished
+  try {
+    const resultStat = await fs.stat(resultPath);
+    if (resultStat.mtimeMs >= progressStat.mtimeMs) return false;
+  } catch { /* no result file yet */ }
+
+  // Stale: progress file is old and no result was written → assume the
+  // previous attempt died and let a new one through
+  if (Date.now() - progressStat.mtimeMs > STALE_UPGRADE_MTIME_MS) return false;
+
+  return true;
+}
+
+// POST /instance/:slug/upgrade/check — Run upgrade-check.sh and return status.json
+router.post('/instance/:slug/upgrade/check', async (req: Request, res: Response) => {
+  const slug = param(req, 'slug');
+  const entry = await getSlugEntry(slug);
+
+  // Refuse during a running upgrade — check writes status.json which could
+  // race with upgrade.sh writing other files in data/upgrade/
+  if (isSlugLocked(slug, 'upgrade') || await isUpgradeRunningOnDisk(entry.basePath)) {
+    res.status(409).json({ error: 'SLUG_BUSY', message: 'An upgrade is currently running' });
+    return;
+  }
+
+  const scriptPath = path.join(entry.basePath, 'scripts', 'upgrade-check.sh');
+  try {
+    await fs.access(scriptPath);
+  } catch {
+    res.status(404).json({ error: 'SCRIPT_NOT_FOUND', message: `upgrade-check.sh not found at ${scriptPath}` });
+    return;
+  }
+
+  // Run upgrade-check.sh — it writes data/upgrade/status.json. Use execFile
+  // (no shell) and a 60s timeout. Failures are non-fatal: the script may
+  // still have written status.json before erroring out, so we always try
+  // to read it afterwards.
+ try { + await execFileAsync('bash', [scriptPath], { + cwd: entry.basePath, + timeout: 60_000, + maxBuffer: 4 * 1024 * 1024, + env: { ...process.env, COMPOSE_ANSI: 'never' }, + }); + } catch (err) { + logger.warn(`[upgrade] ${slug}: upgrade-check.sh failed: ${(err as Error).message}`); + // continue — try to read status.json anyway + } + + const statusPath = path.join(entry.basePath, 'data', 'upgrade', 'status.json'); + try { + const content = await fs.readFile(statusPath, 'utf-8'); + res.json(JSON.parse(content)); + } catch { + res.status(500).json({ error: 'STATUS_NOT_AVAILABLE', message: 'upgrade-check.sh did not produce status.json' }); + } +}); + +// POST /instance/:slug/upgrade/start — Run upgrade.sh in the background router.post('/instance/:slug/upgrade/start', async (req: Request, res: Response) => { - const entry = await getSlugEntry(param(req, 'slug')); + const slug = param(req, 'slug'); + const entry = await getSlugEntry(slug); const { skipBackup, useRegistry, branch } = req.body || {}; // SECURITY: Validate branch name to prevent injection @@ -28,26 +128,64 @@ router.post('/instance/:slug/upgrade/start', async (req: Request, res: Response) try { await fs.access(scriptPath); } catch { - res.status(400).json({ error: 'NOT_FOUND', message: 'upgrade.sh not found' }); + res.status(404).json({ error: 'NOT_FOUND', message: 'upgrade.sh not found' }); return; } - // SECURITY: Use execFile with args array — no shell interpolation - const args = ['--api-mode', '--force']; + // Refuse if an upgrade is already running (in-memory or on-disk indicators) + if (isSlugLocked(slug, 'upgrade') || await isUpgradeRunningOnDisk(entry.basePath)) { + res.status(409).json({ error: 'SLUG_BUSY', message: 'An upgrade is already in progress' }); + return; + } + // Backup or restore concurrency: refuse to start an upgrade while either is running + if (isSlugLocked(slug, 'backup') || isSlugLocked(slug, 'restore')) { + res.status(409).json({ error: 'SLUG_BUSY', message: 'A backup or 
restore is currently running' }); + return; + } + + // Clear stale progress/result files before starting so the on-disk staleness + // check doesn't think a brand-new upgrade is still finishing. + const progressPath = path.join(entry.basePath, 'data', 'upgrade', 'progress.json'); + const resultPath = path.join(entry.basePath, 'data', 'upgrade', 'result.json'); + await fs.mkdir(path.dirname(progressPath), { recursive: true }); + await fs.rm(progressPath, { force: true }); + await fs.rm(resultPath, { force: true }); + + // SECURITY: Use spawn with args array — no shell interpolation + const args: string[] = [scriptPath, '--api-mode', '--force']; if (skipBackup) args.push('--skip-backup'); if (useRegistry) args.push('--use-registry'); if (branch) args.push('--branch', branch); - // Fire-and-forget — CCP polls progress - execFileAsync('bash', [scriptPath, ...args], { - cwd: entry.basePath, - timeout: 600_000, - maxBuffer: 10 * 1024 * 1024, + // Schedule the background task under the slug lock. Use void so the + // promise doesn't block the response. Errors are caught and logged; the + // CCP detects them via the absence of a result file or via the timeout. 
+  void withSlugLock(slug, 'upgrade', async () => {
+    logger.info(`[upgrade] ${slug}: spawning ${args.join(' ')} (cwd=${entry.basePath})`);
+    try {
+      await new Promise<void>((resolve, reject) => {
+        const proc = spawn('bash', args, {
+          cwd: entry.basePath,
+          env: { ...process.env, COMPOSE_ANSI: 'never' },
+          stdio: ['ignore', 'ignore', 'ignore'], // upgrade.sh writes its own logs
+        });
+        proc.on('error', reject);
+        proc.on('close', (code) => {
+          if (code === 0) resolve();
+          else reject(new Error(`upgrade.sh exited with code ${code}`));
+        });
+      });
+      logger.info(`[upgrade] ${slug}: upgrade.sh completed`);
+    } catch (err) {
+      logger.error(`[upgrade] ${slug}: ${(err as Error).message}`);
+    }
   }).catch((err) => {
-    logger.error(`[upgrade] ${param(req, 'slug')} failed: ${(err as Error).message}`);
+    if (!(err instanceof SlugBusyError)) {
+      logger.error(`[upgrade] ${slug}: lock or background error: ${(err as Error).message}`);
+    }
   });
 
-  res.json({ started: true });
+  res.status(202).json({ started: true });
 });
 
 // GET /instance/:slug/upgrade/progress — Read progress.json
diff --git a/changemaker-control-panel/agent/src/server.ts b/changemaker-control-panel/agent/src/server.ts
index c6b3c811..db16d2c0 100644
--- a/changemaker-control-panel/agent/src/server.ts
+++ b/changemaker-control-panel/agent/src/server.ts
@@ -53,8 +53,24 @@ if (hasCerts()) {
   app.use(errorHandler);
 
   const server = https.createServer(tlsOptions, app);
-  server.listen(env.AGENT_PORT, () => {
+  server.listen(env.AGENT_PORT, async () => {
     logger.info(`CCP Agent (mTLS) listening on port ${env.AGENT_PORT}`);
+
+    // Auto-register this instance's slug if configured
+    if (env.INSTANCE_SLUG && env.INSTANCE_BASE_PATH) {
+      const { registerSlug, getSlugEntry } = await import('./services/registry.service');
+      try {
+        await getSlugEntry(env.INSTANCE_SLUG);
+        logger.debug(`[registry] Slug ${env.INSTANCE_SLUG} already registered`);
+      } catch {
+        // Detect compose project name: use env override, or derive from basePath directory 
name
+        // (Docker Compose default: directory name with special chars stripped)
+        const pathMod = await import('path');
+        const composeProject = env.COMPOSE_PROJECT
+          || pathMod.basename(env.INSTANCE_BASE_PATH).replace(/[^a-zA-Z0-9]/g, '').toLowerCase();
+        await registerSlug(env.INSTANCE_SLUG, env.INSTANCE_BASE_PATH, composeProject);
+      }
+    }
   });
 } else {
   // Pre-approval mode — start HTTP, only health + phone-home polling
diff --git a/changemaker-control-panel/agent/src/services/slug-mutex.ts b/changemaker-control-panel/agent/src/services/slug-mutex.ts
new file mode 100644
index 00000000..6497e654
--- /dev/null
+++ b/changemaker-control-panel/agent/src/services/slug-mutex.ts
@@ -0,0 +1,65 @@
+/**
+ * Per-slug single-flight mutex.
+ *
+ * Guards long-running, mutating operations (backup, restore, upgrade) so that
+ * two concurrent CCP calls for the same slug can't trample each other.
+ *
+ * Usage:
+ *   await withSlugLock(slug, 'backup', async () => { ... });
+ *
+ * If a lock is already held for (slug, op), throws SlugBusyError which the
+ * route handler should convert to HTTP 409.
+ */
+
+export class SlugBusyError extends Error {
+  constructor(public slug: string, public op: string) {
+    super(`Slug ${slug} is busy: ${op} already in progress`);
+    this.name = 'SlugBusyError';
+  }
+}
+
+type LockKey = string;
+const locks = new Map<LockKey, { op: string; startedAt: number }>();
+
+function key(slug: string, op: string): LockKey {
+  return `${slug}::${op}`;
+}
+
+/**
+ * Run `fn` while holding a single-flight lock on (slug, op).
+ * Throws SlugBusyError immediately if another call is already running.
+ */
+export async function withSlugLock<T>(
+  slug: string,
+  op: string,
+  fn: () => Promise<T>
+): Promise<T> {
+  const k = key(slug, op);
+  if (locks.has(k)) {
+    throw new SlugBusyError(slug, op);
+  }
+  locks.set(k, { op, startedAt: Date.now() });
+  try {
+    return await fn();
+  } finally {
+    locks.delete(k);
+  }
+}
+
+/**
+ * Returns true if a lock is currently held for (slug, op). 
+ */ +export function isSlugLocked(slug: string, op: string): boolean { + return locks.has(key(slug, op)); +} + +/** + * Returns debug info about all active locks. + */ +export function listActiveLocks(): Array<{ slug: string; op: string; ageMs: number }> { + const now = Date.now(); + return Array.from(locks.entries()).map(([k, v]) => { + const [slug] = k.split('::'); + return { slug: slug ?? '', op: v.op, ageMs: now - v.startedAt }; + }); +} diff --git a/changemaker-control-panel/api/prisma/migrations/20260410233048_add_instance_restore/migration.sql b/changemaker-control-panel/api/prisma/migrations/20260410233048_add_instance_restore/migration.sql new file mode 100644 index 00000000..b23fe7bb --- /dev/null +++ b/changemaker-control-panel/api/prisma/migrations/20260410233048_add_instance_restore/migration.sql @@ -0,0 +1,34 @@ +-- CreateEnum +CREATE TYPE "RestoreStatus" AS ENUM ('PENDING', 'UPLOADING', 'RUNNING', 'COMPLETED', 'FAILED'); + +-- AlterEnum +ALTER TYPE "AuditAction" ADD VALUE 'BACKUP_RESTORE'; + +-- CreateTable +CREATE TABLE "instance_restores" ( + "id" TEXT NOT NULL, + "instance_id" TEXT NOT NULL, + "backup_id" TEXT NOT NULL, + "status" "RestoreStatus" NOT NULL DEFAULT 'PENDING', + "upload_id" TEXT, + "progress_json" JSONB, + "log_tail" TEXT, + "error_message" TEXT, + "triggered_by_id" TEXT, + "started_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP, + "completed_at" TIMESTAMP(3), + + CONSTRAINT "instance_restores_pkey" PRIMARY KEY ("id") +); + +-- CreateIndex +CREATE INDEX "instance_restores_instance_id_started_at_idx" ON "instance_restores"("instance_id", "started_at"); + +-- CreateIndex +CREATE INDEX "instance_restores_backup_id_idx" ON "instance_restores"("backup_id"); + +-- AddForeignKey +ALTER TABLE "instance_restores" ADD CONSTRAINT "instance_restores_instance_id_fkey" FOREIGN KEY ("instance_id") REFERENCES "instances"("id") ON DELETE CASCADE ON UPDATE CASCADE; + +-- AddForeignKey +ALTER TABLE "instance_restores" ADD CONSTRAINT 
"instance_restores_backup_id_fkey" FOREIGN KEY ("backup_id") REFERENCES "backups"("id") ON DELETE CASCADE ON UPDATE CASCADE; diff --git a/changemaker-control-panel/api/prisma/migrations/20260412045433_add_pangolin_subdomain_prefix/migration.sql b/changemaker-control-panel/api/prisma/migrations/20260412045433_add_pangolin_subdomain_prefix/migration.sql new file mode 100644 index 00000000..b936e59c --- /dev/null +++ b/changemaker-control-panel/api/prisma/migrations/20260412045433_add_pangolin_subdomain_prefix/migration.sql @@ -0,0 +1,2 @@ +-- AlterTable +ALTER TABLE "instances" ADD COLUMN "pangolin_subdomain_prefix" TEXT; diff --git a/changemaker-control-panel/api/prisma/migrations/20260412155638_add_pangolin_teardown_action/migration.sql b/changemaker-control-panel/api/prisma/migrations/20260412155638_add_pangolin_teardown_action/migration.sql new file mode 100644 index 00000000..6cbb9ad4 --- /dev/null +++ b/changemaker-control-panel/api/prisma/migrations/20260412155638_add_pangolin_teardown_action/migration.sql @@ -0,0 +1,2 @@ +-- AlterEnum +ALTER TYPE "AuditAction" ADD VALUE 'PANGOLIN_TEARDOWN'; diff --git a/changemaker-control-panel/api/prisma/schema.prisma b/changemaker-control-panel/api/prisma/schema.prisma index 8a81d33b..c8bdbf7b 100644 --- a/changemaker-control-panel/api/prisma/schema.prisma +++ b/changemaker-control-panel/api/prisma/schema.prisma @@ -109,6 +109,7 @@ model Instance { pangolinSiteId String? @map("pangolin_site_id") pangolinNewtId String? @map("pangolin_newt_id") pangolinNewtSecret String? @map("pangolin_newt_secret") + pangolinSubdomainPrefix String? @map("pangolin_subdomain_prefix") // SMTP smtpHost String? @map("smtp_host") @@ -125,6 +126,7 @@ model Instance { portAllocations PortAllocation[] healthChecks HealthCheck[] backups Backup[] + restores InstanceRestore[] auditLogs AuditLog[] upgrades InstanceUpgrade[] events InstanceEvent[] @@ -196,12 +198,44 @@ model Backup { s3Uploaded Boolean @default(false) @map("s3_uploaded") s3Key String? 
@map("s3_key") - instance Instance @relation(fields: [instanceId], references: [id], onDelete: Cascade) + instance Instance @relation(fields: [instanceId], references: [id], onDelete: Cascade) + restores InstanceRestore[] @@index([instanceId, startedAt]) @@map("backups") } +// ─── Restore ─────────────────────────────────────────────── + +enum RestoreStatus { + PENDING + UPLOADING + RUNNING + COMPLETED + FAILED +} + +model InstanceRestore { + id String @id @default(uuid()) + instanceId String @map("instance_id") + backupId String @map("backup_id") + status RestoreStatus @default(PENDING) + uploadId String? @map("upload_id") + progressJson Json? @map("progress_json") + logTail String? @map("log_tail") + errorMessage String? @map("error_message") + triggeredById String? @map("triggered_by_id") + startedAt DateTime @default(now()) @map("started_at") + completedAt DateTime? @map("completed_at") + + instance Instance @relation(fields: [instanceId], references: [id], onDelete: Cascade) + backup Backup @relation(fields: [backupId], references: [id], onDelete: Cascade) + + @@index([instanceId, startedAt]) + @@index([backupId]) + @@map("instance_restores") +} + // ─── Audit Log ───────────────────────────────────────────── enum AuditAction { @@ -215,7 +249,9 @@ enum AuditAction { SECRETS_VIEWED BACKUP_CREATE BACKUP_DELETE + BACKUP_RESTORE PANGOLIN_SETUP + PANGOLIN_TEARDOWN PANGOLIN_SYNC AGENT_CONNECT AGENT_REGISTER diff --git a/changemaker-control-panel/api/src/config/env.ts b/changemaker-control-panel/api/src/config/env.ts index 3841d130..1fcf5bcf 100644 --- a/changemaker-control-panel/api/src/config/env.ts +++ b/changemaker-control-panel/api/src/config/env.ts @@ -54,10 +54,11 @@ const envSchema = z.object({ USE_REGISTRY_IMAGES: z.enum(['true', 'false']).default('true').transform((v) => v === 'true'), IMAGE_TAG: z.string().default('latest'), - // Pangolin (optional) + // Pangolin (optional — for remote tunnel management) PANGOLIN_API_URL: z.string().default(''), 
PANGOLIN_API_KEY: z.string().default(''), PANGOLIN_ORG_ID: z.string().default(''), + PANGOLIN_ENDPOINT: z.string().default(''), // Newt WebSocket URL (may differ from API URL) // Health checks HEALTH_CHECK_INTERVAL_MS: z.coerce.number().default(300_000), // 5 min (0 to disable) diff --git a/changemaker-control-panel/api/src/modules/agents/agents.routes.ts b/changemaker-control-panel/api/src/modules/agents/agents.routes.ts index 4c9fc0d3..64586ada 100644 --- a/changemaker-control-panel/api/src/modules/agents/agents.routes.ts +++ b/changemaker-control-panel/api/src/modules/agents/agents.routes.ts @@ -169,7 +169,7 @@ router.post('/registrations/:id/approve', authenticate, requireRole('SUPER_ADMIN }); // Issue mTLS certificates - const certMaterials = await issueAgentCert(instance.id, registration.slug); + const certMaterials = await issueAgentCert(instance.id, registration.slug, registration.agentUrl); // Mark invite code as used const invite = await prisma.agentInviteCode.findUnique({ where: { id: registration.inviteCodeId } }); @@ -189,7 +189,7 @@ router.post('/registrations/:id/approve', authenticate, requireRole('SUPER_ADMIN caCertPem: certMaterials.caCertPem, agentCertPem: certMaterials.agentCertPem, agentKeyPem: certMaterials.agentKeyPem, - ccpFingerprint: certMaterials.caFingerprint, + ccpFingerprint: certMaterials.fingerprint, }, }, }); diff --git a/changemaker-control-panel/api/src/modules/instances/instances.routes.ts b/changemaker-control-panel/api/src/modules/instances/instances.routes.ts index 0f6f6a93..b8102bcb 100644 --- a/changemaker-control-panel/api/src/modules/instances/instances.routes.ts +++ b/changemaker-control-panel/api/src/modules/instances/instances.routes.ts @@ -4,11 +4,13 @@ import rateLimit from 'express-rate-limit'; import { prisma } from '../../lib/prisma'; import { authenticate, requireRole } from '../../middleware/auth'; import { validate } from '../../middleware/validate'; -import { createInstanceSchema, updateInstanceSchema, 
registerInstanceSchema, reconfigureInstanceSchema, configureTunnelSchema, importInstancesSchema } from './instances.schemas'; +import { createInstanceSchema, updateInstanceSchema, registerInstanceSchema, reconfigureInstanceSchema, configureTunnelSchema, importInstancesSchema, startUpgradeSchema, setupRemoteTunnelSchema } from './instances.schemas'; import * as instancesService from './instances.service'; import * as healthService from '../../services/health.service'; import * as backupService from '../../services/backup.service'; +import * as restoreService from '../../services/restore.service'; import * as upgradeService from '../../services/upgrade.service'; +import * as tunnelService from '../../services/tunnel.service'; import { discoverInstances } from '../../services/discovery.service'; const secretsLimiter = rateLimit({ @@ -186,6 +188,18 @@ router.delete( '/:id/tunnel', requireRole('SUPER_ADMIN', 'OPERATOR'), async (req: Request, res: Response) => { + // Branch: remote instances use the CCP's Pangolin API to teardown; + // local instances use the existing manual removal logic. 
+ const instance = await prisma.instance.findUnique({ where: { id: req.params.id as string } }); + if (instance?.isRemote && instance.pangolinSiteId) { + const result = await tunnelService.teardownTunnel( + req.params.id as string, + req.user!.id, + req.ip + ); + res.json({ data: result }); + return; + } const result = await instancesService.removeTunnel( req.params.id as string, req.user!.id, @@ -195,6 +209,47 @@ router.delete( } ); +// Remote tunnel setup via CCP's Pangolin API credentials +router.post( + '/:id/tunnel/setup', + requireRole('SUPER_ADMIN'), + validate(setupRemoteTunnelSchema), + async (req: Request, res: Response) => { + const { subdomainPrefix } = req.body || {}; + const result = await tunnelService.setupTunnel( + req.params.id as string, + { subdomainPrefix }, + req.user!.id, + req.ip + ); + res.status(201).json({ data: result }); + } +); + +// Get tunnel status (resource matrix) — works for both local and remote +router.get( + '/:id/tunnel/status', + requireRole('SUPER_ADMIN', 'OPERATOR'), + async (req: Request, res: Response) => { + const status = await tunnelService.getTunnelStatus(req.params.id as string); + res.json({ data: status }); + } +); + +// Re-sync resources (idempotent — creates missing, leaves existing) +router.post( + '/:id/tunnel/sync', + requireRole('SUPER_ADMIN'), + async (req: Request, res: Response) => { + const result = await tunnelService.syncResources( + req.params.id as string, + req.user!.id, + req.ip + ); + res.json({ data: result }); + } +); + // ─── Lifecycle Endpoints ───────────────────────────────────────────── router.post( @@ -280,6 +335,7 @@ router.post( router.post( '/:id/upgrade', requireRole('SUPER_ADMIN', 'OPERATOR'), + validate(startUpgradeSchema), async (req: Request, res: Response) => { const { skipBackup, useRegistry, branch } = req.body || {}; const upgrade = await upgradeService.startUpgrade( @@ -356,4 +412,76 @@ router.get( } ); +// ─── Restores ────────────────────────────────────────────────────── + 
+/** + * POST /:id/restore + * Body: { backupId, options? } + * Starts a restore of the given backup onto this instance. Returns the + * InstanceRestore row immediately; caller polls GET /:id/restores or + * GET /:id/restores/:restoreId for status. + * + * DESTRUCTIVE: overwrites databases and uploads. Requires SUPER_ADMIN. + */ +router.post( + '/:id/restore', + requireRole('SUPER_ADMIN'), + async (req: Request, res: Response) => { + const instanceId = req.params.id as string; + const { backupId, options } = req.body ?? {}; + if (!backupId || typeof backupId !== 'string') { + res.status(400).json({ error: { message: 'backupId (string) is required', code: 'VALIDATION' } }); + return; + } + + // Defensive: ensure the backup belongs to this instance + const backup = await prisma.backup.findUnique({ where: { id: backupId } }); + if (!backup) { + res.status(404).json({ error: { message: 'Backup not found', code: 'NOT_FOUND' } }); + return; + } + if (backup.instanceId !== instanceId) { + res.status(400).json({ + error: { + message: 'Backup does not belong to this instance (cross-instance restore is not supported)', + code: 'CROSS_INSTANCE_RESTORE', + }, + }); + return; + } + + const restore = await restoreService.createRestore({ + backupId, + triggeredById: req.user!.id, + ipAddress: req.ip, + options, + }); + res.status(201).json({ data: restore }); + } +); + +router.get( + '/:id/restores', + requireRole('SUPER_ADMIN', 'OPERATOR'), + async (req: Request, res: Response) => { + const page = Math.max(1, parseInt(req.query.page as string, 10) || 1); + const limit = Math.min(100, Math.max(1, parseInt(req.query.limit as string, 10) || 50)); + const result = await restoreService.listRestores(req.params.id as string, page, limit); + res.json(result); + } +); + +router.get( + '/:id/restores/:restoreId', + requireRole('SUPER_ADMIN', 'OPERATOR'), + async (req: Request, res: Response) => { + const restore = await restoreService.getRestore(req.params.restoreId as string); + if 
(restore.instanceId !== req.params.id) { + res.status(404).json({ error: { message: 'Restore not found', code: 'NOT_FOUND' } }); + return; + } + res.json({ data: restore }); + } +); + export default router; diff --git a/changemaker-control-panel/api/src/modules/instances/instances.schemas.ts b/changemaker-control-panel/api/src/modules/instances/instances.schemas.ts index 1348ad07..4b03029d 100644 --- a/changemaker-control-panel/api/src/modules/instances/instances.schemas.ts +++ b/changemaker-control-panel/api/src/modules/instances/instances.schemas.ts @@ -108,9 +108,32 @@ export const importInstancesSchema = z.object({ instances: z.array(registerInstanceSchema).min(1).max(50), }); +// SECURITY: branch name is interpolated into a shell command string in the +// local `runUpgrade` path (exec, not spawn), so we must enforce the same +// strict allow-list the agent uses on its own end. This blocks names starting +// with `-` (avoiding flag confusion), shell metachars, and anything exotic. +export const startUpgradeSchema = z.object({ + skipBackup: z.boolean().optional(), + useRegistry: z.boolean().optional(), + branch: z + .string() + .regex(/^[a-zA-Z0-9][a-zA-Z0-9_.\/-]{0,99}$/, 'Invalid branch name') + .optional(), +}); + +export const setupRemoteTunnelSchema = z.object({ + subdomainPrefix: z + .string() + .min(1) + .max(50) + .regex(/^[a-z0-9-]+$/, 'Prefix must be lowercase alphanumeric with hyphens') + .optional(), +}); + export type CreateInstanceInput = z.infer; export type UpdateInstanceInput = z.infer; export type RegisterInstanceInput = z.infer; export type ReconfigureInstanceInput = z.infer; export type ConfigureTunnelInput = z.infer; export type ImportInstancesInput = z.infer; +export type StartUpgradeInput = z.infer; diff --git a/changemaker-control-panel/api/src/server.ts b/changemaker-control-panel/api/src/server.ts index 90f92e9b..6c73ce7e 100644 --- a/changemaker-control-panel/api/src/server.ts +++ b/changemaker-control-panel/api/src/server.ts @@ -8,6 
+8,12 @@ import { env } from './config/env'; import { logger } from './utils/logger'; import { errorHandler } from './middleware/error-handler'; +// BigInt JSON serialization. Prisma's BigInt columns (e.g. Backup.sizeBytes) +// don't have a toJSON method by default, so res.json() throws. Stringify them. +(BigInt.prototype as unknown as { toJSON: () => string }).toJSON = function () { + return this.toString(); +}; + // Route imports import authRoutes from './modules/auth/auth.routes'; import instanceRoutes from './modules/instances/instances.routes'; diff --git a/changemaker-control-panel/api/src/services/backup.service.ts b/changemaker-control-panel/api/src/services/backup.service.ts index 2cfa32b2..a52de2c0 100644 --- a/changemaker-control-panel/api/src/services/backup.service.ts +++ b/changemaker-control-panel/api/src/services/backup.service.ts @@ -1,5 +1,6 @@ import { Prisma, BackupStatus, AuditAction, InstanceStatus } from '@prisma/client'; import fs from 'fs/promises'; +import { createReadStream } from 'fs'; import path from 'path'; import crypto from 'crypto'; import { execFile as execFileCb } from 'child_process'; @@ -10,6 +11,7 @@ import { AppError } from '../middleware/error-handler'; import { decryptJson } from '../utils/encryption'; import * as docker from './docker.service'; import { logger } from '../utils/logger'; +import { getRemoteDriverForInstance } from './execution-driver'; const execFile = promisify(execFileCb); /** @@ -24,11 +26,16 @@ function assertPathWithinBoundary(filePath: string, boundary: string, label: str } /** - * Compute SHA-256 hash of a file. + * Compute SHA-256 hash of a file by streaming its contents. 
  */
 async function fileHash(filePath: string): Promise<string> {
-  const fileBuffer = await fs.readFile(filePath);
-  return crypto.createHash('sha256').update(fileBuffer).digest('hex');
+  return new Promise((resolve, reject) => {
+    const hash = crypto.createHash('sha256');
+    const stream = createReadStream(filePath);
+    stream.on('data', (chunk) => hash.update(chunk));
+    stream.on('end', () => resolve(hash.digest('hex')));
+    stream.on('error', reject);
+  });
 }
 
 /**
@@ -52,7 +59,11 @@ export async function createBackup(instanceId: string, userId?: string, ipAddres
     throw new AppError(400, `Cannot backup instance in ${instance.status} state`, 'INVALID_STATE');
   }
 
-  if ((instance as { isRegistered?: boolean }).isRegistered) {
+  // `isRegistered` + `isRemote` = a remote CCP-managed instance (agent on the
+  // far side). `isRegistered` alone (without `isRemote`) would mean a local
+  // host-managed instance that CCP doesn't own the compose files for — that
+  // case we still can't back up.
+  if (instance.isRegistered && !instance.isRemote) {
     throw new AppError(400, 'Backups not managed by CCP for registered instances', 'NOT_MANAGED');
   }
 
@@ -72,9 +83,31 @@ export async function createBackup(instanceId: string, userId?: string, ipAddres
   return backup;
 }
 
+type BackupInstance = {
+  id: string;
+  slug: string;
+  basePath: string;
+  composeProject: string;
+  encryptedSecrets: string | null;
+  isRemote: boolean;
+  agentUrl: string | null;
+};
+
 async function performBackup(
   backupId: string,
-  instance: { id: string; slug: string; basePath: string; composeProject: string; encryptedSecrets: string | null },
+  instance: BackupInstance,
+  userId?: string,
+  ipAddress?: string
+) {
+  if (instance.isRemote) {
+    return performRemoteBackup(backupId, instance, userId, ipAddress);
+  }
+  return performLocalBackup(backupId, instance, userId, ipAddress);
+}
+
+async function performLocalBackup(
+  backupId: string,
+  instance: BackupInstance,
   userId?: string,
   ipAddress?: string
 ) {
@@ -221,6 +254,168
@@ async function performBackup( } } +/** + * Run a backup on a remote agent and stream the resulting archive to CCP storage. + * + * Flow: + * 1. Tell agent to run scripts/backup.sh → { backupId, sizeBytes, sha256, manifest } + * 2. Stream archive from agent → $BACKUP_STORAGE_PATH/{slug}/backup-{slug}-{backupId}.tar.gz + * 3. Verify local SHA256 matches what the agent reported (defense in depth) + * 4. Tell agent to delete its local copy (reclaim remote disk) + * 5. Update Backup row as COMPLETED + * + * On failure at any step after the remote backup was created, we leave the + * agent-side archive in place so the operator can retry the download. + */ +async function performRemoteBackup( + backupId: string, + instance: BackupInstance, + userId?: string, + ipAddress?: string +) { + let archivePath: string | null = null; + let agentBackupId: string | null = null; + + try { + await prisma.backup.update({ + where: { id: backupId }, + data: { status: BackupStatus.IN_PROGRESS }, + }); + + const driver = await getRemoteDriverForInstance({ + id: instance.id, + slug: instance.slug, + isRemote: instance.isRemote, + agentUrl: instance.agentUrl, + }); + + // 1. Trigger the backup on the agent (this blocks until backup.sh completes) + logger.info(`[backup] ${instance.slug}: triggering remote backup via agent`); + const result = await driver.createBackup(); + agentBackupId = result.backupId; + + logger.info( + `[backup] ${instance.slug}: agent backup complete — ${result.filename} ` + + `(${(result.sizeBytes / 1024 / 1024).toFixed(1)} MB, sha256=${result.sha256.substring(0, 16)}...)` + ); + + // 2. 
Resolve the destination archive path on CCP storage + const archiveName = `backup-${instance.slug}-${result.backupId}.tar.gz`; + archivePath = path.join(env.BACKUP_STORAGE_PATH, instance.slug, archiveName); + // Path traversal guard (slug should be safe but better to assert) + assertPathWithinBoundary(archivePath, env.BACKUP_STORAGE_PATH, 'Backup archive'); + await fs.mkdir(path.dirname(archivePath), { recursive: true }); + + // 3. Stream the archive from the agent to CCP storage + logger.info(`[backup] ${instance.slug}: streaming archive to ${archivePath}`); + const { bytesWritten } = await driver.downloadBackup(result.backupId, archivePath); + if (bytesWritten !== result.sizeBytes) { + throw new Error( + `Downloaded size ${bytesWritten} does not match agent-reported size ${result.sizeBytes}` + ); + } + + // 4. Re-hash the downloaded file and compare to the agent-reported hash. + // + // SECURITY NOTE: this check authenticates *transmission integrity* only, + // not content integrity against a malicious agent. Both the file bytes + // and the expected hash are supplied by the (semi-trusted) agent, so a + // compromised agent can trivially make this check pass while delivering + // arbitrary content. The check still catches accidental corruption (bit + // flips, truncation) and is essentially free. + // + // The mTLS channel guarantees that the bytes weren't modified in transit + // by an outside attacker. The remaining trust gap — "what if the agent + // itself is compromised?" — must be addressed before Phase B (restore) + // ships, since restore feeds the archive into pg_restore. Either: + // (a) HMAC-sign the hash on the agent with its mTLS private key and + // verify on the CCP using the agent cert public key, or + // (b) limit restore operations to require an additional out-of-band + // admin confirmation step. 
+ const localSha256 = await fileHash(archivePath); + if (localSha256 !== result.sha256) { + throw new Error( + `SHA256 mismatch: agent reported ${result.sha256}, local file hashed ${localSha256}` + ); + } + + // 5. Reclaim disk on the remote agent + try { + await driver.deleteBackup(result.backupId); + } catch (err) { + logger.warn( + `[backup] ${instance.slug}: failed to delete remote backup ${result.backupId}: ${(err as Error).message}` + ); + // Non-fatal — CCP has the archive, remote copy will age out next retention sweep + } + + // 6. Persist the result. Store sha256 and agentBackupId inside the manifest + // since we don't have dedicated columns. + const mergedManifest = { + ...(result.manifest as Record | null ?? {}), + source: 'remote', + agentBackupId: result.backupId, + sha256: result.sha256, + createdAt: result.createdAt, + }; + + await prisma.backup.update({ + where: { id: backupId }, + data: { + status: BackupStatus.COMPLETED, + archivePath, + sizeBytes: BigInt(bytesWritten), + manifest: mergedManifest as unknown as Prisma.InputJsonValue, + completedAt: new Date(), + }, + }); + + if (userId) { + await prisma.auditLog.create({ + data: { + userId, + instanceId: instance.id, + action: AuditAction.BACKUP_CREATE, + details: { + backupId, + archiveName, + sizeBytes: bytesWritten, + source: 'remote', + agentBackupId: result.backupId, + }, + ipAddress, + }, + }); + } + + logger.info( + `[backup] ${instance.slug}: remote backup stored at ${archivePath} ` + + `(${(bytesWritten / 1024 / 1024).toFixed(1)} MB)` + ); + } catch (err) { + await prisma.backup.update({ + where: { id: backupId }, + data: { + status: BackupStatus.FAILED, + errorMessage: (err as Error).message, + completedAt: new Date(), + }, + }); + + // Clean up any partial local file; leave the remote copy so retry is possible + if (archivePath) { + try { await fs.unlink(archivePath); } catch { /* ignore */ } + } + if (agentBackupId) { + logger.warn( + `[backup] ${instance.slug}: leaving agent-side 
backup ${agentBackupId} in place for retry` + ); + } + + throw err; + } +} + /** * Delete a backup (file + DB record). */ diff --git a/changemaker-control-panel/api/src/services/ccp-pangolin.client.ts b/changemaker-control-panel/api/src/services/ccp-pangolin.client.ts new file mode 100644 index 00000000..444b0137 --- /dev/null +++ b/changemaker-control-panel/api/src/services/ccp-pangolin.client.ts @@ -0,0 +1,368 @@ +/** + * Pangolin Integration API client for the CCP. + * + * Ported from the main CML's pangolin.client.ts. Adapted to: + * - Accept credentials via constructor (not env singleton) + * - Be instantiable per-call so the CCP can use its own API token + * to manage tunnels for multiple remote instances + * + * The CCP never exposes its Pangolin API key to remote instances — it + * only pushes the resulting Newt credentials via the agent's writeFiles. + */ +import { logger } from '../utils/logger'; + +// ─── Types ───────────────────────────────────────────────────────── + +export interface PangolinSite { + siteId: string; + name: string; + orgId: string; + niceId: string; + pubKey?: string; + subnet?: string; + megabytesIn?: number; + megabytesOut?: number; + lastSeen?: string; + online?: boolean; + type?: string; + address?: string; +} + +export interface PangolinExitNode { + exitNodeId: string; + name: string; + location?: string; + region?: string; + online: boolean; + capacity?: number; + latency?: number; +} + +export interface PangolinResource { + resourceId: string; + name: string; + subdomain?: string; + fullDomain?: string; + ssl?: boolean; + blockAccess?: boolean; + active?: boolean; + proxyPort?: number; + protocol?: string; + domainBindings?: string[]; + http?: boolean; + targets?: PangolinTarget[]; +} + +export interface PangolinTarget { + targetId: string; + resourceId: string; + siteId: string; + ip: string; + port: number; + method: string; + enabled?: boolean; +} + +export interface PangolinNewt { + newtId: string; + secret: string; + 
siteId: string; +} + +export interface PangolinSiteDefaults { + newtId: string; + newtSecret: string; + address: string; +} + +export interface CreateSitePayload { + name: string; + type?: string; + subnet?: string; + exitNodeId?: string; + newtId?: string; + secret?: string; + address?: string; +} + +export interface CreateHttpResourcePayload { + name: string; + domainId: string; + subdomain?: string; + http: true; + protocol: 'tcp'; +} + +export interface CreateTargetPayload { + siteId: string | number; + ip: string; + port: number; + method: 'http' | 'https'; + enabled?: boolean; +} + +export interface PangolinDomain { + domainId: string; + baseDomain: string; + verified: boolean; + type?: string; + failed?: boolean; + configManaged?: boolean; +} + +export interface UpdateResourcePayload { + name?: string; + subdomain?: string; + fullDomain?: string; + ssl?: boolean; + sso?: boolean; + active?: boolean; + blockAccess?: boolean; + proxyPort?: number; + protocol?: string; + domainBindings?: string[]; +} + +export interface UpdateCertificatePayload { + autoRenew?: boolean; +} + +export interface PangolinCertificate { + certId: string; + domainId: string; + domain: string; + status: 'PENDING' | 'ACTIVE' | 'EXPIRED' | 'FAILED'; + issuedAt?: string; + expiresAt?: string; + autoRenew?: boolean; + issuer?: string; +} + +export interface PangolinConnectedClient { + clientId: string; + resourceId: string; + ipAddress: string; + connectedAt: string; + lastSeen: string; + bytesIn: number; + bytesOut: number; + online: boolean; +} + +// ─── Helpers ─────────────────────────────────────────────────────── + +/** Redact credential fields from Pangolin API request bodies before logging. 
*/ +function redactSecrets(body: unknown): unknown { + if (!body || typeof body !== 'object') return body; + const obj = body as Record<string, unknown>; + const redacted = { ...obj }; + if ('secret' in redacted) redacted.secret = '[REDACTED]'; + if ('newtSecret' in redacted) redacted.newtSecret = '[REDACTED]'; + return redacted; +} + +// ─── Client ──────────────────────────────────────────────────────── + +export class CcpPangolinClient { + constructor( + private baseUrl: string, + private apiKey: string, + private orgId: string + ) {} + + get configured(): boolean { + return !!(this.baseUrl && this.apiKey && this.orgId); + } + + private async request<T>(method: string, path: string, body?: unknown): Promise<T> { + if (!this.configured) { + throw new Error('Pangolin API not configured. Set PANGOLIN_API_URL, PANGOLIN_API_KEY, PANGOLIN_ORG_ID in CCP .env'); + } + + const url = `${this.baseUrl}${path}`; + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), 15000); + + try { + logger.debug(`[pangolin] ${method} ${path}${body ? ` body=${JSON.stringify(redactSecrets(body))}` : ''}`); + + const res = await fetch(url, { + method, + headers: { + 'Authorization': `Bearer ${this.apiKey}`, + 'Content-Type': 'application/json', + }, + body: body ? 
JSON.stringify(body) : undefined, + signal: controller.signal, + }); + + if (!res.ok) { + const text = await res.text().catch(() => ''); + throw new Error(`Pangolin API ${method} ${path} returned ${res.status}: ${text}`); + } + + const contentType = res.headers.get('content-type') || ''; + if (contentType.includes('application/json')) { + const json = await res.json(); + return this.unwrapResponse<T>(json); + } + return {} as T; + } finally { + clearTimeout(timeout); + } + } + + private unwrapResponse<T>(json: unknown): T { + if (json && typeof json === 'object' && !Array.isArray(json)) { + const obj = json as Record<string, unknown>; + if ('data' in obj && 'success' in obj) { + return obj.data as T; + } + } + return json as T; + } + + // ─── Health ─────────────────────────────────────────────────── + + async healthCheck(): Promise<boolean> { + try { + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), 5000); + try { + const res = await fetch(`${this.baseUrl}/`, { + headers: { 'Authorization': `Bearer ${this.apiKey}` }, + signal: controller.signal, + }); + return res.ok; + } finally { + clearTimeout(timeout); + } + } catch { + return false; + } + } + + // ─── Site Defaults ──────────────────────────────────────────── + + async pickSiteDefaults(): Promise<PangolinSiteDefaults> { + const res = await this.request('GET', `/org/${this.orgId}/pick-site-defaults`); + const obj = res as Record<string, unknown>; + const newtId = obj.newtId as string || ''; + const newtSecret = obj.newtSecret as string || obj.secret as string || ''; + const address = obj.clientAddress as string || obj.address as string || ''; + if (!newtId || !newtSecret) { + throw new Error('Pangolin did not return Newt credentials from pick-site-defaults'); + } + return { newtId, newtSecret, address }; + } + + // ─── Sites ──────────────────────────────────────────────────── + + async listSites(): Promise<PangolinSite[]> { + const res = await this.request('GET', `/org/${this.orgId}/sites`); + return this.extractArray<PangolinSite>(res, 'sites', 'listSites'); 
+ } + + async getSite(siteId: string): Promise { + return this.request('GET', `/site/${siteId}`); + } + + async createSite(data: CreateSitePayload): Promise { + return this.request('PUT', `/org/${this.orgId}/site`, data); + } + + async deleteSite(siteId: string): Promise { + await this.request('DELETE', `/site/${siteId}`); + } + + async listExitNodes(): Promise { + try { + const res = await this.request('GET', `/org/${this.orgId}/exit-nodes`); + return this.extractArray(res, 'exitNodes', 'listExitNodes'); + } catch { + return []; + } + } + + // ─── Resources ──────────────────────────────────────────────── + + async listResources(): Promise { + const res = await this.request('GET', `/org/${this.orgId}/resources`); + return this.extractArray(res, 'resources', 'listResources'); + } + + async getResource(resourceId: string): Promise { + return this.request('GET', `/resource/${resourceId}`); + } + + async createResource(data: CreateHttpResourcePayload): Promise { + logger.info(`[pangolin] createResource: ${data.name} (subdomain: ${data.subdomain || '(root)'})`); + return this.request('PUT', `/org/${this.orgId}/resource`, data); + } + + async updateResource(resourceId: string, data: UpdateResourcePayload): Promise { + return this.request('POST', `/resource/${resourceId}`, data); + } + + async deleteResource(resourceId: string): Promise { + await this.request('DELETE', `/resource/${resourceId}`); + } + + // ─── Targets ────────────────────────────────────────────────── + + async createTarget(resourceId: string, data: CreateTargetPayload): Promise { + logger.info(`[pangolin] createTarget: resource=${resourceId}, ip=${data.ip}:${data.port}`); + const payload = { ...data, siteId: Number(data.siteId) }; + return this.request('PUT', `/resource/${resourceId}/target`, payload); + } + + async listTargets(resourceId: string): Promise { + const res = await this.request('GET', `/resource/${resourceId}/targets`); + return this.extractArray(res, 'targets', 'listTargets'); + } + + 
async deleteTarget(targetId: string): Promise { + await this.request('DELETE', `/target/${targetId}`); + } + + // ─── Domains ────────────────────────────────────────────────── + + async listDomains(): Promise { + const res = await this.request('GET', `/org/${this.orgId}/domains`); + return this.extractArray(res, 'domains', 'listDomains'); + } + + // ─── Certificates ───────────────────────────────────────────── + + async getCertificate(domainId: string, domain: string): Promise { + return this.request('GET', `/org/${this.orgId}/certificate/${domainId}/${domain}`); + } + + async updateCertificate(certId: string, data: UpdateCertificatePayload): Promise { + return this.request('POST', `/certificate/${certId}`, data); + } + + // ─── Clients ────────────────────────────────────────────────── + + async listClients(resourceId: string): Promise { + const res = await this.request('GET', `/resource/${resourceId}/clients`); + return this.extractArray(res, 'clients', 'listClients'); + } + + // ─── Helpers ────────────────────────────────────────────────── + + private extractArray(res: unknown, key: string, context: string): T[] { + if (Array.isArray(res)) return res as T[]; + if (res && typeof res === 'object') { + const obj = res as Record; + if (Array.isArray(obj[key])) return obj[key] as T[]; + if (obj.data && typeof obj.data === 'object') { + const dataObj = obj.data as Record; + if (Array.isArray(dataObj[key])) return dataObj[key] as T[]; + } + if (Array.isArray(obj.data)) return obj.data as T[]; + } + logger.warn(`[pangolin] ${context}: could not extract array from response`); + return []; + } +} diff --git a/changemaker-control-panel/api/src/services/certificate.service.ts b/changemaker-control-panel/api/src/services/certificate.service.ts index acc7b21e..7fc21bbf 100644 --- a/changemaker-control-panel/api/src/services/certificate.service.ts +++ b/changemaker-control-panel/api/src/services/certificate.service.ts @@ -90,7 +90,7 @@ export async function ensureCA() { * 
Issue a certificate for a remote agent, signed by the CA. * Returns the certificate materials (plaintext) for one-time display. */ -export async function issueAgentCert(instanceId: string, slug: string) { +export async function issueAgentCert(instanceId: string, slug: string, agentUrl?: string) { const ca = await ensureCA(); const caKeyPem = decrypt(ca.encryptedKey); @@ -110,12 +110,29 @@ export async function issueAgentCert(instanceId: string, slug: string) { await fs.writeFile(caCertFile, ca.certPem); await fs.writeFile(serialFile, crypto.randomBytes(16).toString('hex')); - // Extensions for server+client auth - await fs.writeFile(extFile, [ + // Build SAN entries from the agent URL hostname + const sanEntries: string[] = []; + if (agentUrl) { + try { + const hostname = new URL(agentUrl).hostname; + // Detect IP vs DNS name + if (/^\d{1,3}(\.\d{1,3}){3}$/.test(hostname) || hostname.includes(':')) { + sanEntries.push(`IP:${hostname}`); + } else { + sanEntries.push(`DNS:${hostname}`); + } + } catch { /* ignore invalid URL */ } + } + sanEntries.push(`DNS:${commonName}`); + + // Extensions for server+client auth with SANs + const extLines = [ 'basicConstraints=CA:FALSE', 'keyUsage=digitalSignature,keyEncipherment', 'extendedKeyUsage=serverAuth,clientAuth', - ].join('\n')); + `subjectAltName=${sanEntries.join(',')}`, + ]; + await fs.writeFile(extFile, extLines.join('\n')); // Generate agent key await exec( diff --git a/changemaker-control-panel/api/src/services/execution-driver.ts b/changemaker-control-panel/api/src/services/execution-driver.ts index cbaa0462..c1d39040 100644 --- a/changemaker-control-panel/api/src/services/execution-driver.ts +++ b/changemaker-control-panel/api/src/services/execution-driver.ts @@ -60,7 +60,20 @@ export async function getDriverForInstance(instance: DriverInstance): Promise; + checkedAt: string; + error: string | null; +} + +export interface AgentUpgradeProgress { + phase?: number; + phaseName?: string; + percentage?: number; + 
message?: string; + timestamp?: string; +} + +export interface AgentUpgradeResult { + success: boolean; + message?: string; + previousCommit?: string; + newCommit?: string; + commitCount?: number; + durationSeconds?: number; + warnings?: string[]; +} + +export interface StartAgentUpgradeOptions { + skipBackup?: boolean; + useRegistry?: boolean; + branch?: string; +} + interface AgentRequestOptions { method: 'GET' | 'POST' | 'DELETE'; path: string; @@ -261,4 +338,261 @@ export class RemoteDriver implements ExecutionDriver { timeoutMs: env.AGENT_LONG_OP_TIMEOUT_MS, }); } + + // ─── Backup Operations ────────────────────────────────────── + + /** + * Trigger a backup on the remote agent. The agent shells out to scripts/backup.sh + * and returns metadata for the resulting archive. The archive stays on the + * agent's disk until downloadBackup() + deleteBackup() are called. + */ + async createBackup(): Promise { + return this.request({ + method: 'POST', + path: `/instance/${this.slug}/backup`, + timeoutMs: env.AGENT_LONG_OP_TIMEOUT_MS, + }); + } + + /** + * List backup archives currently held on the agent for this slug. + */ + async listAgentBackups(): Promise { + const resp = await this.request<{ data: AgentBackupListEntry[] }>({ + method: 'GET', + path: `/instance/${this.slug}/backups`, + }); + return resp.data; + } + + /** + * Delete an archive from the agent's disk. Called after a successful download. + */ + async deleteBackup(backupId: string): Promise { + await this.request({ + method: 'DELETE', + path: `/instance/${this.slug}/backup/${encodeURIComponent(backupId)}`, + }); + } + + /** + * Stream a backup archive from the agent to a local file path. + * Verifies the Content-Length header matches the bytes written. 
+ */ + async downloadBackup(backupId: string, destPath: string): Promise<{ bytesWritten: number }> { + const url = new URL( + `/instance/${this.slug}/backup/${encodeURIComponent(backupId)}/download`, + this.agentUrl + ); + const timeoutMs = env.AGENT_LONG_OP_TIMEOUT_MS; + + return new Promise((resolve, reject) => { + const req = https.request( + { + hostname: url.hostname, + port: url.port || 7443, + path: url.pathname + url.search, + method: 'GET', + headers: { Accept: 'application/gzip' }, + cert: this.clientCert, + key: this.clientKey, + ca: this.caCert, + rejectUnauthorized: true, + timeout: timeoutMs, + }, + async (res) => { + if (res.statusCode && res.statusCode >= 400) { + let body = ''; + res.on('data', (c) => (body += c)); + res.on('end', () => reject(new Error(`Agent returned ${res.statusCode}: ${body.substring(0, 500)}`))); + return; + } + + const expectedSize = res.headers['content-length'] + ? parseInt(res.headers['content-length'] as string, 10) + : null; + + try { + const out = fs.createWriteStream(destPath); + await pipeline(res, out); + const stats = await fs.promises.stat(destPath); + if (expectedSize !== null && stats.size !== expectedSize) { + reject(new Error(`Downloaded size ${stats.size} does not match Content-Length ${expectedSize}`)); + return; + } + resolve({ bytesWritten: stats.size }); + } catch (err) { + reject(err); + } + } + ); + + req.on('error', (err) => { + reject(new AgentUnreachableError(this.agentUrl, err)); + }); + req.on('timeout', () => { + req.destroy(); + reject(new AgentUnreachableError(this.agentUrl, new Error(`Timed out after ${timeoutMs}ms`))); + }); + + req.end(); + }); + } + + // ─── Restore Operations ───────────────────────────────────── + + /** + * Stream a backup archive from a local path to the agent's upload endpoint. + * The expected SHA256 is passed as a query parameter and the agent verifies + * it during ingestion — if it mismatches, the upload is rejected with 400. 
+ */ + async uploadRestore( + archivePath: string, + expectedSha256: string + ): Promise { + const stats = await fs.promises.stat(archivePath); + const url = new URL( + `/instance/${this.slug}/restore/upload?sha256=${encodeURIComponent(expectedSha256)}`, + this.agentUrl + ); + const timeoutMs = env.AGENT_LONG_OP_TIMEOUT_MS; + + return new Promise((resolve, reject) => { + const req = https.request( + { + hostname: url.hostname, + port: url.port || 7443, + path: url.pathname + url.search, + method: 'POST', + headers: { + 'Content-Type': 'application/octet-stream', + 'Content-Length': String(stats.size), + }, + cert: this.clientCert, + key: this.clientKey, + ca: this.caCert, + rejectUnauthorized: true, + timeout: timeoutMs, + }, + (res) => { + let body = ''; + res.on('data', (c) => (body += c)); + res.on('end', () => { + if (res.statusCode && res.statusCode >= 400) { + try { + const err = JSON.parse(body); + reject(new Error(err.message || `Agent returned ${res.statusCode}`)); + } catch { + reject(new Error(`Agent returned ${res.statusCode}: ${body.substring(0, 500)}`)); + } + return; + } + try { + resolve(JSON.parse(body) as AgentRestoreUploadResult); + } catch (err) { + reject(err); + } + }); + } + ); + + req.on('error', (err) => { + reject(new AgentUnreachableError(this.agentUrl, err)); + }); + req.on('timeout', () => { + req.destroy(); + reject(new AgentUnreachableError(this.agentUrl, new Error(`Timed out after ${timeoutMs}ms`))); + }); + + const fileStream = fs.createReadStream(archivePath); + fileStream.on('error', (err) => { + req.destroy(); + reject(err); + }); + fileStream.pipe(req); + }); + } + + /** + * Tell the agent to apply a previously-uploaded restore archive. The agent + * fires `scripts/restore.sh` in the background and returns immediately. + * Use `getRestoreProgress()` to poll for completion. 
+ */ + async applyRestore(uploadId: string, options: AgentRestoreOptions = {}): Promise { + await this.request({ + method: 'POST', + path: `/instance/${this.slug}/restore/${encodeURIComponent(uploadId)}/apply`, + body: { confirm: true, ...options }, + }); + } + + /** + * Poll the agent for the current state of a restore. + */ + async getRestoreProgress(uploadId: string): Promise { + return this.request({ + method: 'GET', + path: `/instance/${this.slug}/restore/${encodeURIComponent(uploadId)}/progress`, + }); + } + + /** + * Delete a restore upload dir from the agent's disk. Called after the CCP + * has finalized the InstanceRestore row. + */ + async deleteRestoreUpload(uploadId: string): Promise { + await this.request({ + method: 'DELETE', + path: `/instance/${this.slug}/restore/${encodeURIComponent(uploadId)}`, + }); + } + + // ─── Upgrade Operations ───────────────────────────────────── + + /** + * Run upgrade-check.sh on the remote and return the parsed status.json. + */ + async checkForUpdates(): Promise { + return this.request({ + method: 'POST', + path: `/instance/${this.slug}/upgrade/check`, + timeoutMs: 90_000, + }); + } + + /** + * Trigger upgrade.sh --api-mode on the remote. Fire-and-forget; agent + * spawns the script in the background and returns 202 immediately. + * Use getUpgradeProgress / getUpgradeResult to track completion. + */ + async startUpgrade(options: StartAgentUpgradeOptions = {}): Promise { + await this.request({ + method: 'POST', + path: `/instance/${this.slug}/upgrade/start`, + body: options, + timeoutMs: 30_000, + }); + } + + /** + * Read the agent's data/upgrade/progress.json. Returns the default zero-state + * if no progress has been written yet. + */ + async getUpgradeProgress(): Promise { + return this.request({ + method: 'GET', + path: `/instance/${this.slug}/upgrade/progress`, + }); + } + + /** + * Read the agent's data/upgrade/result.json. Throws if no result is yet + * available; the caller should treat that as "still running". 
+ */ + async getUpgradeResult(): Promise<AgentUpgradeResult> { + return this.request<AgentUpgradeResult>({ + method: 'GET', + path: `/instance/${this.slug}/upgrade/result`, + }); + } } diff --git a/changemaker-control-panel/api/src/services/restore.service.ts b/changemaker-control-panel/api/src/services/restore.service.ts new file mode 100644 index 00000000..284a8e98 --- /dev/null +++ b/changemaker-control-panel/api/src/services/restore.service.ts @@ -0,0 +1,376 @@ +import fs from 'fs/promises'; +import path from 'path'; +import crypto from 'crypto'; +import { createReadStream } from 'fs'; +import { Prisma, RestoreStatus, AuditAction, InstanceStatus } from '@prisma/client'; +import { prisma } from '../lib/prisma'; +import { env } from '../config/env'; +import { AppError } from '../middleware/error-handler'; +import { logger } from '../utils/logger'; +import { getRemoteDriverForInstance } from './execution-driver'; +import type { AgentRestoreOptions, AgentRestoreState } from './remote-driver'; + +/** + * Validate that a path is within the allowed backup storage boundary. + */ +function assertPathWithinBoundary(filePath: string, boundary: string, label: string): void { + const normalized = path.resolve(filePath); + const normalizedBoundary = path.resolve(boundary); + if (!normalized.startsWith(normalizedBoundary + path.sep)) { + throw new AppError(403, `${label} path outside allowed directory`, 'FORBIDDEN'); + } +} + +/** + * Compute SHA-256 hash of a file by streaming its contents. 
+ */ +async function fileHash(filePath: string): Promise { + return new Promise((resolve, reject) => { + const hash = crypto.createHash('sha256'); + const stream = createReadStream(filePath); + stream.on('data', (chunk) => hash.update(chunk)); + stream.on('end', () => resolve(hash.digest('hex'))); + stream.on('error', reject); + }); +} + +const POLL_INTERVAL_MS = 3_000; +const POLL_TIMEOUT_MS = 15 * 60 * 1_000; // 15 min + +interface StartRestoreArgs { + backupId: string; + triggeredById?: string; + ipAddress?: string | null; + options?: AgentRestoreOptions; +} + +/** + * Kick off a restore for the given backup. Creates an InstanceRestore row + * and runs the full upload → apply → poll loop asynchronously. Returns the + * row so the caller (HTTP handler) can respond immediately. + */ +export async function createRestore(args: StartRestoreArgs) { + const backup = await prisma.backup.findUnique({ + where: { id: args.backupId }, + include: { instance: true }, + }); + if (!backup) { + throw new AppError(404, 'Backup not found', 'NOT_FOUND'); + } + if (backup.status !== 'COMPLETED') { + throw new AppError(400, `Backup is ${backup.status}, not COMPLETED`, 'INVALID_STATE'); + } + if (!backup.archivePath) { + throw new AppError(400, 'Backup has no archive path', 'NO_ARCHIVE'); + } + + const instance = backup.instance; + if (instance.status !== InstanceStatus.RUNNING) { + throw new AppError(400, `Cannot restore to instance in ${instance.status} state`, 'INVALID_STATE'); + } + // Phase B only supports remote restore. Local restore is deliberately stubbed + // — if you need it, add a performLocalRestore branch below. This also covers + // the registered-but-local case (CCP-adopted instances) since they have + // isRemote=false. 
+ if (!instance.isRemote) { + throw new AppError(501, 'Local restore is not implemented — Phase B covers remote only', 'NOT_IMPLEMENTED'); + } + + // Make sure the archive is where it says it is and inside the boundary + assertPathWithinBoundary(backup.archivePath, env.BACKUP_STORAGE_PATH, 'Backup archive'); + try { + await fs.access(backup.archivePath); + } catch { + throw new AppError(404, 'Archive file is missing on disk', 'ARCHIVE_MISSING'); + } + + const restore = await prisma.instanceRestore.create({ + data: { + instanceId: instance.id, + backupId: backup.id, + status: RestoreStatus.PENDING, + triggeredById: args.triggeredById ?? null, + }, + }); + + // Fire-and-forget orchestration + performRemoteRestore(restore.id, backup.archivePath, args.options ?? {}, args.triggeredById, args.ipAddress ?? null) + .catch((err) => { + logger.error(`[restore] ${restore.id} failed: ${(err as Error).message}`); + }); + + return restore; +} + +/** + * End-to-end remote restore orchestration. + * + * Flow: + * 1. Compute sha256 of the archive on CCP disk + * 2. Upload to agent with sha256 query param (agent re-verifies on stream) + * 3. Apply via agent (shells out to restore.sh --force) + * 4. Poll progress every 3s until COMPLETED/FAILED or timeout + * 5. Delete the agent-side upload + * 6. Update the InstanceRestore row + audit log + */ +/** + * Write a BACKUP_RESTORE audit log entry. Wrapped in a try/catch so that an + * audit-log DB failure can never mask the underlying restore status update. 
+ * + * Called in all three terminal paths: + * - success (outcome: 'success') + * - agent reported failure (outcome: 'agent_failed') + * - orchestration error / timeout / unexpected throw (outcome: 'orchestration_error') + */ +async function writeRestoreAuditLog(args: { + restoreId: string; + instanceId: string; + backupId: string; + triggeredById?: string; + ipAddress?: string | null; + options: AgentRestoreOptions; + outcome: 'success' | 'agent_failed' | 'orchestration_error'; + sha256?: string; + uploadId?: string | null; + errorMessage?: string; +}): Promise { + if (!args.triggeredById) return; + try { + await prisma.auditLog.create({ + data: { + userId: args.triggeredById, + instanceId: args.instanceId, + action: AuditAction.BACKUP_RESTORE, + details: { + backupId: args.backupId, + restoreId: args.restoreId, + source: 'remote', + outcome: args.outcome, + options: args.options as unknown as Prisma.InputJsonValue, + ...(args.sha256 ? { sha256: args.sha256 } : {}), + ...(args.uploadId ? { agentUploadId: args.uploadId } : {}), + ...(args.errorMessage ? { errorMessage: args.errorMessage.substring(0, 500) } : {}), + }, + ipAddress: args.ipAddress ?? 
null, + }, + }); + } catch (err) { + logger.error(`[restore] failed to write audit log for ${args.restoreId}: ${(err as Error).message}`); + } +} + +async function performRemoteRestore( + restoreId: string, + archivePath: string, + options: AgentRestoreOptions, + triggeredById?: string, + ipAddress?: string | null +) { + const restore = await prisma.instanceRestore.findUnique({ + where: { id: restoreId }, + include: { instance: true, backup: true }, + }); + if (!restore) { + logger.error(`[restore] row ${restoreId} vanished mid-flight`); + return; + } + const instance = restore.instance; + + let uploadId: string | null = null; + let sha256: string | undefined; + try { + await prisma.instanceRestore.update({ + where: { id: restoreId }, + data: { status: RestoreStatus.UPLOADING }, + }); + + const driver = await getRemoteDriverForInstance({ + id: instance.id, + slug: instance.slug, + isRemote: instance.isRemote, + agentUrl: instance.agentUrl, + }); + + // 1. Compute local SHA256 (authoritative — the agent will verify against this). + // We persist this in the audit log so there's an immutable record of exactly + // which bytes were restored, useful for post-incident comparison. + logger.info(`[restore] ${instance.slug}: hashing archive ${path.basename(archivePath)}`); + sha256 = await fileHash(archivePath); + + // 2. Stream upload to agent + logger.info(`[restore] ${instance.slug}: uploading archive (sha256=${sha256.substring(0, 16)}...)`); + const uploadResult = await driver.uploadRestore(archivePath, sha256); + uploadId = uploadResult.uploadId; + + await prisma.instanceRestore.update({ + where: { id: restoreId }, + data: { uploadId, status: RestoreStatus.RUNNING }, + }); + + // 3. Apply + logger.info(`[restore] ${instance.slug}: applying restore ${uploadId}`); + await driver.applyRestore(uploadId, options); + + // 4. 
Poll progress + const deadline = Date.now() + POLL_TIMEOUT_MS; + let finalState: AgentRestoreState | null = null; + while (Date.now() < deadline) { + await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS)); + try { + const state = await driver.getRestoreProgress(uploadId); + // Mirror progress to the DB row so the UI shows updates + await prisma.instanceRestore.update({ + where: { id: restoreId }, + data: { + progressJson: state as unknown as Prisma.InputJsonValue, + logTail: state.logTail ?? null, + }, + }); + if (state.status === 'COMPLETED' || state.status === 'FAILED') { + finalState = state; + break; + } + } catch (err) { + logger.warn(`[restore] ${instance.slug}: poll error: ${(err as Error).message}`); + // Keep polling — transient network blips shouldn't fail the restore + } + } + + if (!finalState) { + throw new Error(`Restore timed out after ${Math.round(POLL_TIMEOUT_MS / 1000)}s`); + } + + // 5. Clean up agent-side upload (best effort) + try { + await driver.deleteRestoreUpload(uploadId); + } catch (err) { + logger.warn(`[restore] ${instance.slug}: failed to delete agent upload ${uploadId}: ${(err as Error).message}`); + } + + // 6. Finalize DB row + if (finalState.status === 'COMPLETED') { + await prisma.instanceRestore.update({ + where: { id: restoreId }, + data: { + status: RestoreStatus.COMPLETED, + progressJson: finalState as unknown as Prisma.InputJsonValue, + logTail: finalState.logTail ?? 
null, + completedAt: new Date(), + }, + }); + + await writeRestoreAuditLog({ + restoreId, + instanceId: instance.id, + backupId: restore.backupId, + triggeredById, + ipAddress, + options, + outcome: 'success', + sha256, + uploadId, + }); + + logger.info(`[restore] ${instance.slug}: restore ${restoreId} COMPLETED`); + } else { + const errMsg = finalState.errorMessage || `Agent reported FAILED (exit ${finalState.exitCode})`; + await prisma.instanceRestore.update({ + where: { id: restoreId }, + data: { + status: RestoreStatus.FAILED, + progressJson: finalState as unknown as Prisma.InputJsonValue, + logTail: finalState.logTail ?? null, + errorMessage: errMsg, + completedAt: new Date(), + }, + }); + await writeRestoreAuditLog({ + restoreId, + instanceId: instance.id, + backupId: restore.backupId, + triggeredById, + ipAddress, + options, + outcome: 'agent_failed', + sha256, + uploadId, + errorMessage: errMsg, + }); + logger.warn(`[restore] ${instance.slug}: restore ${restoreId} FAILED (exit ${finalState.exitCode})`); + } + } catch (err) { + const errMsg = (err as Error).message; + await prisma.instanceRestore.update({ + where: { id: restoreId }, + data: { + status: RestoreStatus.FAILED, + errorMessage: errMsg, + completedAt: new Date(), + }, + }); + await writeRestoreAuditLog({ + restoreId, + instanceId: instance.id, + backupId: restore.backupId, + triggeredById, + ipAddress, + options, + outcome: 'orchestration_error', + sha256, + uploadId, + errorMessage: errMsg, + }); + logger.error(`[restore] ${restore.instance.slug}: ${errMsg}`); + + // Best-effort cleanup of the agent upload if we got that far + if (uploadId) { + try { + const driver = await getRemoteDriverForInstance({ + id: instance.id, + slug: instance.slug, + isRemote: instance.isRemote, + agentUrl: instance.agentUrl, + }); + await driver.deleteRestoreUpload(uploadId); + } catch { /* ignore */ } + } + } +} + +/** + * List restores with optional filtering and pagination. 
+ */ +export async function listRestores(instanceId?: string, page = 1, limit = 50) { + const where = instanceId ? { instanceId } : {}; + const [data, total] = await Promise.all([ + prisma.instanceRestore.findMany({ + where, + orderBy: { startedAt: 'desc' }, + skip: (page - 1) * limit, + take: limit, + include: { + instance: { select: { id: true, name: true, slug: true } }, + backup: { select: { id: true, archivePath: true, sizeBytes: true } }, + }, + }), + prisma.instanceRestore.count({ where }), + ]); + return { data, total, page, limit }; +} + +/** + * Get a single restore by ID. + */ +export async function getRestore(restoreId: string) { + const restore = await prisma.instanceRestore.findUnique({ + where: { id: restoreId }, + include: { + instance: { select: { id: true, name: true, slug: true } }, + backup: { select: { id: true, archivePath: true, sizeBytes: true, manifest: true } }, + }, + }); + if (!restore) { + throw new AppError(404, 'Restore not found', 'NOT_FOUND'); + } + return restore; +} diff --git a/changemaker-control-panel/api/src/services/tunnel.service.ts b/changemaker-control-panel/api/src/services/tunnel.service.ts new file mode 100644 index 00000000..a3d582f2 --- /dev/null +++ b/changemaker-control-panel/api/src/services/tunnel.service.ts @@ -0,0 +1,599 @@ +/** + * Remote tunnel management service. + * + * Orchestrates Pangolin site/resource/target creation on behalf of remote CML + * instances, then pushes Newt credentials to the remote host via the mTLS agent. + * The CCP holds the Pangolin API token centrally — remote instances never touch + * the Pangolin API themselves. 
+ */ +import { AuditAction, Prisma } from '@prisma/client'; +import { prisma } from '../lib/prisma'; +import { env } from '../config/env'; +import { AppError } from '../middleware/error-handler'; +import { logger } from '../utils/logger'; +import { getRemoteDriverForInstance } from './execution-driver'; +import { + CcpPangolinClient, + type PangolinDomain, + type PangolinResource, +} from './ccp-pangolin.client'; + +// ─── Resource definitions ────────────────────────────────────────── + +interface ResourceDef { + subdomain: string; + name: string; + required?: boolean; + featureFlag?: string; +} + +const RESOURCE_DEFINITIONS: ResourceDef[] = [ + { subdomain: 'app', name: 'Admin GUI', required: true }, + { subdomain: 'api', name: 'API', required: true }, + { subdomain: '', name: 'Public Site', required: true }, + { subdomain: 'media', name: 'Media API', featureFlag: 'enableMedia' }, + { subdomain: 'db', name: 'NocoDB', required: false }, + { subdomain: 'docs', name: 'Docs', required: false }, + { subdomain: 'code', name: 'Code Server', required: false }, + { subdomain: 'git', name: 'Gitea', required: false }, + { subdomain: 'home', name: 'Homepage', required: false }, + { subdomain: 'listmonk', name: 'Listmonk', featureFlag: 'enableListmonk' }, + { subdomain: 'qr', name: 'Mini QR', required: false }, + { subdomain: 'draw', name: 'Excalidraw', required: false }, + { subdomain: 'vault', name: 'Vaultwarden', required: false }, + { subdomain: 'mail', name: 'MailHog', required: false }, + { subdomain: 'chat', name: 'Rocket.Chat', featureFlag: 'enableChat' }, + { subdomain: 'events', name: 'Gancio', featureFlag: 'enableGancio' }, + { subdomain: 'meet', name: 'Jitsi Meet', featureFlag: 'enableMeet' }, + { subdomain: 'grafana', name: 'Grafana', featureFlag: 'enableMonitoring' }, +]; + +// ─── Helpers ─────────────────────────────────────────────────────── + +function getPangolinClient(): CcpPangolinClient { + if (!env.PANGOLIN_API_URL || !env.PANGOLIN_API_KEY || 
!env.PANGOLIN_ORG_ID) { + throw new AppError( + 501, + 'Pangolin API not configured on this CCP. Set PANGOLIN_API_URL, PANGOLIN_API_KEY, PANGOLIN_ORG_ID in the CCP .env file.', + 'PANGOLIN_NOT_CONFIGURED' + ); + } + return new CcpPangolinClient(env.PANGOLIN_API_URL, env.PANGOLIN_API_KEY, env.PANGOLIN_ORG_ID); +} + +function fullSubdomain(prefix: string, sub: string): string { + if (!sub) return prefix; // root domain → prefix alone (e.g., "ck") + return `${prefix}-${sub}`; // e.g., "ck-app", "ck-api" +} + +function shouldCreateResource( + def: ResourceDef, + instance: Record +): boolean { + if (def.required) return true; + if (def.featureFlag) return !!(instance as Record)[def.featureFlag]; + return true; // optional with no feature flag → always create +} + +async function findDomainForInstance( + client: CcpPangolinClient, + instanceDomain: string +): Promise { + const domains = await client.listDomains(); + // Match the instance's domain against registered Pangolin base domains + // e.g., instance.domain = "cursedknowledge.org" → look for base domain "cursedknowledge.org" + // or broader: instance.domain = "app.example.com" → look for "example.com" + const exact = domains.find((d) => d.baseDomain === instanceDomain); + if (exact) return exact; + + // Try matching parent domain (e.g., sub.example.com → example.com) + const parts = instanceDomain.split('.'); + for (let i = 1; i < parts.length - 1; i++) { + const parent = parts.slice(i).join('.'); + const match = domains.find((d) => d.baseDomain === parent); + if (match) return match; + } + + throw new AppError( + 400, + `No Pangolin domain matches instance domain "${instanceDomain}". 
Available: ${domains.map((d) => d.baseDomain).join(', ')}`, + 'DOMAIN_NOT_FOUND' + ); +} + +// ─── Setup ───────────────────────────────────────────────────────── + +export interface SetupTunnelOptions { + subdomainPrefix?: string; +} + +export interface TunnelSetupResult { + siteId: string; + newtId: string; + endpoint: string; + resourceCount: number; + resources: Array<{ subdomain: string; name: string; resourceId: string }>; +} + +export async function setupTunnel( + instanceId: string, + options: SetupTunnelOptions, + userId?: string, + ipAddress?: string | null +): Promise { + const client = getPangolinClient(); + + const instance = await prisma.instance.findUnique({ where: { id: instanceId } }); + if (!instance) throw new AppError(404, 'Instance not found', 'NOT_FOUND'); + if (!instance.isRemote) throw new AppError(400, 'Tunnel setup via Pangolin API is only for remote instances', 'NOT_REMOTE'); + if (instance.pangolinSiteId) { + throw new AppError(400, 'Tunnel is already configured. Use sync to update resources, or teardown first.', 'ALREADY_CONFIGURED'); + } + + const prefix = options.subdomainPrefix || instance.slug; + + const driver = await getRemoteDriverForInstance({ + id: instance.id, + slug: instance.slug, + isRemote: instance.isRemote, + agentUrl: instance.agentUrl, + }); + + // 1. Get Newt credentials + logger.info(`[tunnel] ${instance.slug}: picking site defaults`); + const defaults = await client.pickSiteDefaults(); + + // 2. Create site + logger.info(`[tunnel] ${instance.slug}: creating Pangolin site`); + const site = await client.createSite({ + name: instance.slug, + type: 'newt', + newtId: defaults.newtId, + secret: defaults.newtSecret, + address: defaults.address, + }); + const siteId = String(site.siteId); + const newtId = site.newt?.newtId || defaults.newtId; + const newtSecret = site.newt?.secret || defaults.newtSecret; + + // The Pangolin endpoint (what Newt connects to) may be different from + // the API URL. 
E.g., API = api.bnkserve.org/v1, endpoint = pangolin.bnkserve.org. + // If PANGOLIN_ENDPOINT is set, use it. Otherwise derive from API URL. + let endpoint = env.PANGOLIN_ENDPOINT || ''; + if (!endpoint) { + const endpointUrl = new URL(env.PANGOLIN_API_URL); + endpoint = `${endpointUrl.protocol}//${endpointUrl.hostname}${endpointUrl.port ? ':' + endpointUrl.port : ''}`; + } + + // 3. Find matching domain + const domain = await findDomainForInstance(client, instance.domain); + logger.info(`[tunnel] ${instance.slug}: matched domain ${domain.baseDomain} (id: ${domain.domainId})`); + + // 4. Create resources + targets + const createdResources: Array<{ subdomain: string; name: string; resourceId: string }> = []; + const existingResources = await client.listResources(); + + for (const def of RESOURCE_DEFINITIONS) { + if (!shouldCreateResource(def, instance as unknown as Record)) { + logger.debug(`[tunnel] ${instance.slug}: skipping ${def.name} (feature not enabled)`); + continue; + } + + const sub = fullSubdomain(prefix, def.subdomain); + // Build the expected full domain so we can do an idempotent check against + // Pangolin's existing resources. Pangolin returns `fullDomain` not `subdomain`. + const expectedFullDomain = sub + ? 
`${sub}.${domain.baseDomain}` + : domain.baseDomain; + + // Idempotent: skip if a resource with this fullDomain already exists + const existing = existingResources.find( + (r) => r.fullDomain === expectedFullDomain + ); + if (existing) { + logger.debug(`[tunnel] ${instance.slug}: resource ${def.name} (${expectedFullDomain}) already exists`); + createdResources.push({ subdomain: sub, name: def.name, resourceId: String(existing.resourceId) }); + continue; + } + + try { + const resourcePayload: Record = { + name: def.name, + domainId: domain.domainId, + http: true, + protocol: 'tcp', + }; + // Root domain: omit subdomain entirely (empty string is rejected by Pangolin) + if (sub) resourcePayload.subdomain = sub; + + const resource = await client.createResource(resourcePayload as unknown as Parameters[0]); + + // Make the resource public (no SSO, no access block) + try { + await client.updateResource(resource.resourceId, { sso: false, blockAccess: false }); + } catch (err) { + logger.warn(`[tunnel] ${instance.slug}: failed to make ${def.name} public: ${(err as Error).message}`); + } + + // Create target pointing to nginx:80 on the remote host + await client.createTarget(resource.resourceId, { + siteId: Number(siteId), + ip: 'nginx', + port: 80, + method: 'http', + enabled: true, + }); + + createdResources.push({ subdomain: sub, name: def.name, resourceId: resource.resourceId }); + logger.info(`[tunnel] ${instance.slug}: created resource ${def.name} → ${sub}.${domain.baseDomain}`); + } catch (err) { + if (def.required) throw err; + logger.warn(`[tunnel] ${instance.slug}: failed to create optional resource ${def.name}: ${(err as Error).message}`); + } + } + + // 5. 
Push Newt credentials to remote .env + logger.info(`[tunnel] ${instance.slug}: pushing Newt credentials to remote .env`); + const envLines = [ + `PANGOLIN_ENDPOINT=${endpoint}`, + `PANGOLIN_SITE_ID=${siteId}`, + `PANGOLIN_NEWT_ID=${newtId}`, + `PANGOLIN_NEWT_SECRET=${newtSecret}`, + ].join('\n') + '\n'; + + // Read current .env, append/replace Pangolin vars + const currentEnv = await driver.readEnvFile(''); + const envContent = buildUpdatedEnv(currentEnv, { + PANGOLIN_ENDPOINT: endpoint, + PANGOLIN_SITE_ID: siteId, + PANGOLIN_NEWT_ID: newtId, + PANGOLIN_NEWT_SECRET: newtSecret, + }); + + await driver.writeFiles('', [{ relativePath: '.env', content: envContent }]); + + // 6. Persist on Instance row + await prisma.instance.update({ + where: { id: instanceId }, + data: { + pangolinEndpoint: endpoint, + pangolinSiteId: siteId, + pangolinNewtId: newtId, + pangolinNewtSecret: newtSecret, + pangolinSubdomainPrefix: prefix, + }, + }); + + // 7. Recreate Newt container to pick up the new .env vars. + // `docker compose restart` does NOT re-read .env — it only sends SIGTERM+restart. + // `docker compose up -d newt` detects env var changes (via ${PANGOLIN_NEWT_ID} + // expansion in docker-compose.yml) and recreates the container automatically. + logger.info(`[tunnel] ${instance.slug}: recreating newt container with new credentials`); + try { + await driver.composeUp('', '', ['newt']); + } catch (err) { + logger.warn(`[tunnel] ${instance.slug}: composeUp(newt) failed: ${(err as Error).message}`); + } + + // 8. Audit log + if (userId) { + await prisma.auditLog.create({ + data: { + userId, + instanceId, + action: AuditAction.PANGOLIN_SETUP, + details: { + source: 'remote', + siteId, + newtId, + endpoint, + resourceCount: createdResources.length, + subdomainPrefix: prefix, + } as unknown as Prisma.InputJsonValue, + ipAddress: ipAddress ?? 
null, + }, + }); + } + + logger.info(`[tunnel] ${instance.slug}: tunnel setup complete — ${createdResources.length} resources created`); + + return { + siteId, + newtId, + endpoint, + resourceCount: createdResources.length, + resources: createdResources, + }; +} + +// ─── Sync ────────────────────────────────────────────────────────── + +export async function syncResources( + instanceId: string, + userId?: string, + ipAddress?: string | null +) { + const client = getPangolinClient(); + const instance = await prisma.instance.findUnique({ where: { id: instanceId } }); + if (!instance) throw new AppError(404, 'Instance not found', 'NOT_FOUND'); + if (!instance.pangolinSiteId) throw new AppError(400, 'No tunnel configured', 'NO_TUNNEL'); + + const prefix = instance.pangolinSubdomainPrefix || instance.slug; + const domain = await findDomainForInstance(client, instance.domain); + const existingResources = await client.listResources(); + const siteId = instance.pangolinSiteId; + + let created = 0; + for (const def of RESOURCE_DEFINITIONS) { + if (!shouldCreateResource(def, instance as unknown as Record)) continue; + + const sub = fullSubdomain(prefix, def.subdomain); + const expectedFullDomain = sub ? 
`${sub}.${domain.baseDomain}` : domain.baseDomain; + const existing = existingResources.find((r) => r.fullDomain === expectedFullDomain); + if (existing) continue; + + try { + const resourcePayload: Record = { + name: def.name, + domainId: domain.domainId, + http: true, + protocol: 'tcp', + }; + if (sub) resourcePayload.subdomain = sub; + + const resource = await client.createResource(resourcePayload as unknown as Parameters[0]); + await client.updateResource(resource.resourceId, { sso: false, blockAccess: false }); + await client.createTarget(resource.resourceId, { + siteId: Number(siteId), + ip: 'nginx', + port: 80, + method: 'http', + enabled: true, + }); + created++; + logger.info(`[tunnel] ${instance.slug}: sync created ${def.name} (${sub})`); + } catch (err) { + if (def.required) throw err; + logger.warn(`[tunnel] ${instance.slug}: sync failed for ${def.name}: ${(err as Error).message}`); + } + } + + if (userId) { + await prisma.auditLog.create({ + data: { + userId, + instanceId, + action: AuditAction.PANGOLIN_SYNC, + details: { source: 'remote', created, siteId } as unknown as Prisma.InputJsonValue, + ipAddress: ipAddress ?? 
null, + }, + }); + } + + return { synced: true, created }; +} + +// ─── Teardown ────────────────────────────────────────────────────── + +export async function teardownTunnel( + instanceId: string, + userId?: string, + ipAddress?: string | null +) { + const client = getPangolinClient(); + const instance = await prisma.instance.findUnique({ where: { id: instanceId } }); + if (!instance) throw new AppError(404, 'Instance not found', 'NOT_FOUND'); + if (!instance.pangolinSiteId) throw new AppError(400, 'No tunnel configured', 'NO_TUNNEL'); + + const siteId = instance.pangolinSiteId; + + // Delete site from Pangolin (cascades resources + targets) + try { + await client.deleteSite(siteId); + logger.info(`[tunnel] ${instance.slug}: deleted Pangolin site ${siteId}`); + } catch (err) { + logger.warn(`[tunnel] ${instance.slug}: deleteSite failed (may already be gone): ${(err as Error).message}`); + } + + // Clear Instance fields + await prisma.instance.update({ + where: { id: instanceId }, + data: { + pangolinEndpoint: null, + pangolinSiteId: null, + pangolinNewtId: null, + pangolinNewtSecret: null, + }, + }); + + // Push empty Pangolin vars to remote .env + if (instance.isRemote) { + try { + const driver = await getRemoteDriverForInstance({ + id: instance.id, + slug: instance.slug, + isRemote: instance.isRemote, + agentUrl: instance.agentUrl, + }); + const currentEnv = await driver.readEnvFile(''); + const envContent = buildUpdatedEnv(currentEnv, { + PANGOLIN_ENDPOINT: '', + PANGOLIN_SITE_ID: '', + PANGOLIN_NEWT_ID: '', + PANGOLIN_NEWT_SECRET: '', + }); + await driver.writeFiles('', [{ relativePath: '.env', content: envContent }]); + + // Stop newt container (best effort) + try { + await driver.composeStop('', ''); + await driver.composeUp('', ''); // restart everything except newt won't start without creds + } catch { /* ignore */ } + } catch (err) { + logger.warn(`[tunnel] ${instance.slug}: failed to push empty env to remote: ${(err as Error).message}`); + } + } + + // 
Audit log + if (userId) { + await prisma.auditLog.create({ + data: { + userId, + instanceId, + action: AuditAction.PANGOLIN_TEARDOWN, + details: { source: 'remote', siteId } as unknown as Prisma.InputJsonValue, + ipAddress: ipAddress ?? null, + }, + }); + } + + return { tornDown: true }; +} + +// ─── Status ──────────────────────────────────────────────────────── + +export interface TunnelStatus { + configured: boolean; + online?: boolean; + siteId?: string; + endpoint?: string; + resources?: Array<{ + subdomain: string; + name: string; + resourceId: string; + hasTarget: boolean; + targetIp?: string; + targetPort?: number; + }>; +} + +export async function getTunnelStatus(instanceId: string): Promise { + const instance = await prisma.instance.findUnique({ where: { id: instanceId } }); + if (!instance) throw new AppError(404, 'Instance not found', 'NOT_FOUND'); + + if (!instance.pangolinSiteId) { + return { configured: false }; + } + + // For local instances, return stored values without querying Pangolin API + if (!instance.isRemote) { + return { + configured: true, + siteId: instance.pangolinSiteId ?? undefined, + endpoint: instance.pangolinEndpoint ?? undefined, + }; + } + + const client = getPangolinClient(); + + let online = false; + try { + const site = await client.getSite(instance.pangolinSiteId); + online = site.online ?? false; + } catch (err) { + logger.warn(`[tunnel] ${instance.slug}: getSite failed: ${(err as Error).message}`); + } + + const resources: TunnelStatus['resources'] = []; + try { + const allResources = await client.listResources(); + const siteIdNum = Number(instance.pangolinSiteId); + // Filter to resources that have a target pointing to our siteId. + // This is the most reliable filter since it uses the actual Pangolin + // site association rather than guessing from subdomain names. 
+ for (const res of allResources) { + let hasTarget = false; + let targetIp: string | undefined; + let targetPort: number | undefined; + let belongsToUs = false; + try { + const targets = await client.listTargets(String(res.resourceId)); + for (const t of targets) { + if (Number(t.siteId) === siteIdNum) { + belongsToUs = true; + hasTarget = true; + targetIp = t.ip; + targetPort = t.port; + break; + } + } + } catch { /* ignore */ } + + if (belongsToUs) { + // Extract subdomain from fullDomain for display + const fd = res.fullDomain || ''; + const domainSuffix = `.${instance.domain}`; + const subdomain = fd.endsWith(domainSuffix) + ? fd.slice(0, -domainSuffix.length) + : fd === instance.domain ? '' : fd; + resources.push({ + subdomain, + name: res.name, + resourceId: String(res.resourceId), + hasTarget, + targetIp, + targetPort, + }); + } + } + } catch (err) { + logger.warn(`[tunnel] ${instance.slug}: listResources failed: ${(err as Error).message}`); + } + + return { + configured: true, + online, + siteId: instance.pangolinSiteId ?? undefined, + endpoint: instance.pangolinEndpoint ?? undefined, + resources, + }; +} + +// ─── .env Helpers ────────────────────────────────────────────────── + +/** + * Quote a .env value if it contains characters that dotenv parsers interpret: + * # (comment), = (separator), spaces, quotes, backslashes, newlines. + * Pangolin-issued UUIDs/base64 secrets typically don't need quoting, but + * defensive quoting prevents silent corruption if they ever do. + */ +function quoteEnvValue(value: string): string { + if (/[\s#"'\\=\n\r]/.test(value)) { + return `"${value.replace(/\\/g, '\\\\').replace(/"/g, '\\"').replace(/\n/g, '\\n')}"`; + } + return value; +} + +/** + * Build an updated .env string by replacing/appending the given key-value pairs. + * Preserves all existing keys not in the update set. 
+ */ +function buildUpdatedEnv( + currentEnv: Record | null, + updates: Record +): string { + const lines: string[] = []; + const seen = new Set(); + + // If we have the current env, reproduce it with replacements + if (currentEnv) { + for (const [key, value] of Object.entries(currentEnv)) { + if (key in updates) { + if (updates[key]) lines.push(`${key}=${quoteEnvValue(updates[key]!)}`); + // If update value is empty, omit the line (remove the var) + seen.add(key); + } else { + lines.push(`${key}=${quoteEnvValue(value)}`); + } + } + } + + // Append new keys not already in the file + for (const [key, value] of Object.entries(updates)) { + if (!seen.has(key) && value) { + lines.push(`${key}=${quoteEnvValue(value)}`); + } + } + + return lines.join('\n') + '\n'; +} diff --git a/changemaker-control-panel/api/src/services/upgrade.service.ts b/changemaker-control-panel/api/src/services/upgrade.service.ts index c315af4b..993b3079 100644 --- a/changemaker-control-panel/api/src/services/upgrade.service.ts +++ b/changemaker-control-panel/api/src/services/upgrade.service.ts @@ -2,14 +2,61 @@ import { exec as execCb } from 'child_process'; import { promisify } from 'util'; import fs from 'fs/promises'; import path from 'path'; -import { UpgradeStatus, AuditAction, InstanceStatus, Prisma } from '@prisma/client'; +import { UpgradeStatus, AuditAction, InstanceStatus, Prisma, Instance } from '@prisma/client'; import { prisma } from '../lib/prisma'; import { logger } from '../utils/logger'; import { createEvent } from './event.service'; +import { getRemoteDriverForInstance } from './execution-driver'; +import type { AgentUpdateStatus } from './remote-driver'; + +/** + * Write an INSTANCE_UPGRADE audit log entry capturing a terminal outcome. + * Wrapped in try/catch so that an audit-log DB failure cannot mask the + * underlying upgrade row status update. 
+ * + * Called from all three terminal paths (both local and remote): + * - 'completed' — upgrade.sh/agent reported success + * - 'failed' — upgrade.sh/agent reported failure + * - 'orchestration_error' — CCP-side exception, timeout, or unreachable agent + */ +async function writeUpgradeAuditLog(args: { + upgradeId: string; + instanceId: string; + triggeredById: string | null; + source: 'local' | 'remote'; + outcome: 'completed' | 'failed' | 'orchestration_error'; + previousCommit: string | null; + newCommit: string | null; + durationSeconds: number | null; + errorMessage?: string | null; +}): Promise { + if (!args.triggeredById) return; + try { + await prisma.auditLog.create({ + data: { + userId: args.triggeredById, + instanceId: args.instanceId, + action: AuditAction.INSTANCE_UPGRADE, + details: { + upgradeId: args.upgradeId, + source: args.source, + outcome: args.outcome, + previousCommit: args.previousCommit, + newCommit: args.newCommit, + durationSeconds: args.durationSeconds, + ...(args.errorMessage ? { errorMessage: args.errorMessage.substring(0, 500) } : {}), + } as unknown as Prisma.InputJsonValue, + }, + }); + } catch (err) { + logger.error(`[upgrade] failed to write audit log for ${args.upgradeId}: ${(err as Error).message}`); + } +} const exec = promisify(execCb); -const UPGRADE_TIMEOUT = 600_000; // 10 minutes +const UPGRADE_TIMEOUT = 600_000; // 10 minutes — local upgrades +const REMOTE_UPGRADE_TIMEOUT = 15 * 60 * 1000; // 15 minutes — remote (network round trips) const PROGRESS_POLL_INTERVAL = 2_000; // 2 seconds // ─── Update Check ───────────────────────────────────────────────── @@ -26,13 +73,57 @@ export interface UpdateStatus { } /** - * Check for available updates by running upgrade-check.sh in the instance's basePath. - * Falls back to reading an existing status.json if the script isn't available. + * Check for available updates. 
Branches on instance.isRemote: + * - Local: runs upgrade-check.sh in the instance's basePath and reads status.json + * - Remote: calls the agent's POST /upgrade/check endpoint over mTLS */ export async function checkForUpdates(instanceId: string): Promise { const instance = await prisma.instance.findUnique({ where: { id: instanceId } }); if (!instance) throw new Error('Instance not found'); + if (instance.isRemote) { + return checkForUpdatesRemote(instance); + } + return checkForUpdatesLocal(instance); +} + +/** + * Remote check: ask the agent to run upgrade-check.sh and return its status.json. + */ +async function checkForUpdatesRemote(instance: Instance): Promise { + try { + const driver = await getRemoteDriverForInstance({ + id: instance.id, + slug: instance.slug, + isRemote: instance.isRemote, + agentUrl: instance.agentUrl, + }); + const status: AgentUpdateStatus = await driver.checkForUpdates(); + return { + branch: status.branch, + currentCommit: status.currentCommit, + currentMessage: status.currentMessage, + remoteCommit: status.remoteCommit, + commitsBehind: status.commitsBehind, + changelog: status.changelog, + checkedAt: status.checkedAt, + error: status.error, + }; + } catch (err) { + logger.warn(`[upgrade] remote check failed for ${instance.slug}: ${(err as Error).message}`); + return { + branch: instance.gitBranch, + currentCommit: instance.gitCommit || 'unknown', + remoteCommit: null, + commitsBehind: 0, + changelog: [], + checkedAt: new Date().toISOString(), + error: `Remote check failed: ${(err as Error).message}`, + }; + } +} + +async function checkForUpdatesLocal(instance: Instance): Promise { const basePath = instance.basePath; const statusFile = path.join(basePath, 'data', 'upgrade', 'status.json'); const scriptPath = path.join(basePath, 'scripts', 'upgrade-check.sh'); @@ -119,16 +210,21 @@ export async function startUpgrade( throw new Error('An upgrade is already in progress for this instance'); } - // Get current commit for tracking - let 
currentCommit: string | null = null; - try { - const { stdout } = await exec('git rev-parse --short HEAD', { - cwd: instance.basePath, - timeout: 5_000, - }); - currentCommit = stdout.trim(); - } catch { - // Non-critical — may be a release install without .git + // Get current commit for tracking. For local instances we can read it from + // git directly; for remote instances we either trust the DB-tracked value + // (set by previous upgrade-check) or leave it null and let upgrade.sh + // report the previous commit in result.json. + let currentCommit: string | null = instance.gitCommit; + if (!instance.isRemote) { + try { + const { stdout } = await exec('git rev-parse --short HEAD', { + cwd: instance.basePath, + timeout: 5_000, + }); + currentCommit = stdout.trim(); + } catch { + // Non-critical — may be a release install without .git + } } const branch = options?.branch || instance.gitBranch; @@ -154,20 +250,222 @@ export async function startUpgrade( upgradeId: upgrade.id, previousCommit: currentCommit, branch, + source: instance.isRemote ? 'remote' : 'local', options: options || {}, } as unknown as Prisma.InputJsonValue, ipAddress, }, }); - // Fire-and-forget: run the upgrade asynchronously - runUpgrade(upgrade.id, instance.basePath, instance.slug, options).catch((err) => { - logger.error(`[upgrade] Upgrade orchestration failed for ${instance.slug}: ${err}`); - }); + // Fire-and-forget: branch on isRemote + if (instance.isRemote) { + runRemoteUpgrade(upgrade.id, instance, options).catch((err) => { + logger.error(`[upgrade] Remote upgrade orchestration failed for ${instance.slug}: ${err}`); + }); + } else { + runUpgrade(upgrade.id, instance.basePath, instance.slug, options).catch((err) => { + logger.error(`[upgrade] Upgrade orchestration failed for ${instance.slug}: ${err}`); + }); + } return upgrade; } +/** + * Async REMOTE upgrade runner. + * + * Flow: + * 1. Get RemoteDriver + * 2. Mark InstanceUpgrade IN_PROGRESS + * 3. 
Tell agent to start upgrade.sh in --api-mode + * 4. Poll agent /upgrade/progress every 2s, mirror to DB + * 5. Try /upgrade/result every poll cycle; when present, finalize + * 6. On timeout (15 min), mark FAILED and create error event + * + * Note: there is no shell or filesystem access on the CCP side — everything + * goes through the mTLS agent. The agent's spawn of upgrade.sh is itself + * fire-and-forget under a slug mutex. + */ +async function runRemoteUpgrade( + upgradeId: string, + instance: Instance, + options?: StartUpgradeOptions +) { + const slug = instance.slug; + + try { + const driver = await getRemoteDriverForInstance({ + id: instance.id, + slug: instance.slug, + isRemote: instance.isRemote, + agentUrl: instance.agentUrl, + }); + + // Mark IN_PROGRESS + await prisma.instanceUpgrade.update({ + where: { id: upgradeId }, + data: { + status: UpgradeStatus.IN_PROGRESS, + progressMessage: 'Starting remote upgrade...', + }, + }); + + // Tell the agent to start. The agent has its own mutex + stale-progress + // check, so this can return 409 if a previous upgrade is still running. + logger.info(`[upgrade] ${slug}: triggering remote upgrade.sh start`); + await driver.startUpgrade({ + skipBackup: options?.skipBackup, + useRegistry: options?.useRegistry, + branch: options?.branch, + }); + + // Poll progress + result. We treat /result returning 200 as the signal + // that upgrade.sh exited (successfully or with code != 0 — the script + // writes result.json either way in --api-mode). 
+ const deadline = Date.now() + REMOTE_UPGRADE_TIMEOUT; + let lastProgress: { phase?: number; phaseName?: string; percentage?: number; message?: string } = {}; + + while (Date.now() < deadline) { + await new Promise((r) => setTimeout(r, PROGRESS_POLL_INTERVAL)); + + // Try to fetch the result first; if it exists, we're done + let result = null; + try { + result = await driver.getUpgradeResult(); + } catch { + // No result yet — keep polling progress + } + + if (result) { + // Final result available — write it and exit + const upgradeRowBefore = await prisma.instanceUpgrade.findUnique({ where: { id: upgradeId } }); + await prisma.instanceUpgrade.update({ + where: { id: upgradeId }, + data: { + status: result.success ? UpgradeStatus.COMPLETED : UpgradeStatus.FAILED, + newCommit: result.newCommit || null, + commitCount: result.commitCount || 0, + percentage: 100, + phaseName: 'Complete', + progressMessage: result.message || 'Upgrade completed', + durationSeconds: result.durationSeconds || null, + warnings: result.warnings?.length ? (result.warnings as unknown as Prisma.InputJsonValue) : undefined, + errorMessage: result.success ? null : (result.message || 'Upgrade failed'), + completedAt: new Date(), + }, + }); + + // Update Instance.gitCommit if we have a new commit + if (result.newCommit) { + await prisma.instance.update({ + where: { id: instance.id }, + data: { gitCommit: result.newCommit }, + }); + } + + if (!result.success) { + await createEvent( + instance.id, + 'ERROR', + 'upgrade', + 'Remote upgrade failed', + result.message || 'The remote upgrade process failed. Check the agent log for details.', + { upgradeId, source: 'remote', warnings: result.warnings } + ); + } + + await writeUpgradeAuditLog({ + upgradeId, + instanceId: instance.id, + triggeredById: upgradeRowBefore?.triggeredById ?? null, + source: 'remote', + outcome: result.success ? 'completed' : 'failed', + previousCommit: upgradeRowBefore?.previousCommit ?? 
null, + newCommit: result.newCommit || null, + durationSeconds: result.durationSeconds || null, + errorMessage: result.success ? null : (result.message || 'Upgrade failed'), + }); + + logger.info(`[upgrade] ${slug}: remote upgrade ${result.success ? 'COMPLETED' : 'FAILED'}`); + return; + } + + // No result yet — pull progress + try { + const progress = await driver.getUpgradeProgress(); + // Only update DB if something actually changed (avoid hot-loop writes) + if ( + progress.phase !== lastProgress.phase || + progress.percentage !== lastProgress.percentage || + progress.message !== lastProgress.message + ) { + lastProgress = { + phase: progress.phase, + phaseName: progress.phaseName, + percentage: progress.percentage, + message: progress.message, + }; + await prisma.instanceUpgrade.update({ + where: { id: upgradeId }, + data: { + currentPhase: progress.phase || 0, + phaseName: progress.phaseName || null, + percentage: progress.percentage || 0, + progressMessage: progress.message || null, + }, + }); + } + } catch (err) { + // Transient network blip during a long upgrade — keep polling + logger.debug(`[upgrade] ${slug}: progress poll error: ${(err as Error).message}`); + } + } + + // Timeout — mark FAILED + throw new Error(`Remote upgrade timed out after ${Math.round(REMOTE_UPGRADE_TIMEOUT / 60_000)} minutes`); + } catch (err) { + const errorMsg = (err as Error).message; + const isTimeout = errorMsg.includes('timed out'); + + const upgradeRowBefore = await prisma.instanceUpgrade.findUnique({ where: { id: upgradeId } }); + await prisma.instanceUpgrade.update({ + where: { id: upgradeId }, + data: { + status: UpgradeStatus.FAILED, + errorMessage: isTimeout ? errorMsg : errorMsg.slice(0, 2000), + progressMessage: 'Failed', + completedAt: new Date(), + }, + }); + + await createEvent( + instance.id, + 'ERROR', + 'upgrade', + isTimeout ? 
'Remote upgrade timed out' : 'Remote upgrade failed', + errorMsg.slice(0, 500), + { upgradeId, source: 'remote' } + ); + + await writeUpgradeAuditLog({ + upgradeId, + instanceId: instance.id, + triggeredById: upgradeRowBefore?.triggeredById ?? null, + source: 'remote', + outcome: 'orchestration_error', + previousCommit: upgradeRowBefore?.previousCommit ?? null, + newCommit: null, + durationSeconds: null, + errorMessage: errorMsg, + }); + + // Don't flip the instance to ERROR state for remote upgrades — the agent + // health check will reflect the real state on the next poll, and we don't + // want to mask a recovered instance with stale CCP-side ERROR. + logger.error(`[upgrade] ${slug}: ${errorMsg}`); + } +} + /** * Async upgrade runner. Runs upgrade.sh and polls progress. */ @@ -271,19 +569,32 @@ async function runUpgrade( }); } - if (!result.success) { + const upgradeRow = await prisma.instanceUpgrade.findUnique({ where: { id: upgradeId } }); + + if (!result.success && upgradeRow) { // Create error event - const upgrade = await prisma.instanceUpgrade.findUnique({ where: { id: upgradeId } }); - if (upgrade) { - await createEvent( - upgrade.instanceId, - 'ERROR', - 'upgrade', - 'Upgrade failed', - result.message || 'The upgrade process failed. Check logs for details.', - { upgradeId, previousCommit: upgrade.previousCommit, warnings: result.warnings } - ); - } + await createEvent( + upgradeRow.instanceId, + 'ERROR', + 'upgrade', + 'Upgrade failed', + result.message || 'The upgrade process failed. Check logs for details.', + { upgradeId, previousCommit: upgradeRow.previousCommit, warnings: result.warnings } + ); + } + + if (upgradeRow) { + await writeUpgradeAuditLog({ + upgradeId, + instanceId: upgradeRow.instanceId, + triggeredById: upgradeRow.triggeredById, + source: 'local', + outcome: result.success ? 
'completed' : 'failed', + previousCommit: upgradeRow.previousCommit, + newCommit: result.newCommit || newCommit, + durationSeconds: result.durationSeconds || null, + errorMessage: result.success ? null : (result.message || 'Upgrade failed'), + }); } logger.info(`[upgrade] ${slug}: Upgrade ${result.success ? 'completed' : 'failed'}`); @@ -327,6 +638,18 @@ async function runUpgrade( statusMessage: `Upgrade failed: ${isTimeout ? 'timeout' : errorMsg.slice(0, 200)}`, }, }); + + await writeUpgradeAuditLog({ + upgradeId, + instanceId: upgrade.instanceId, + triggeredById: upgrade.triggeredById, + source: 'local', + outcome: 'orchestration_error', + previousCommit: upgrade.previousCommit, + newCommit: null, + durationSeconds: result.durationSeconds || null, + errorMessage: errorMsg, + }); } logger.error(`[upgrade] ${slug}: Upgrade failed: ${errorMsg}`); diff --git a/config.sh b/config.sh index f06c49fd..a16efb0a 100755 --- a/config.sh +++ b/config.sh @@ -38,6 +38,11 @@ NI_MAPBOX_KEY="" NI_MAXMIND_ACCOUNT_ID="" NI_MAXMIND_LICENSE_KEY="" +# CCP (Changemaker Control Panel) registration flags +NI_CCP_URL="" +NI_CCP_INVITE_CODE="" +NI_CCP_AGENT_URL="" + # --- Arg parser --- while [[ $# -gt 0 ]]; do case "$1" in @@ -62,6 +67,10 @@ while [[ $# -gt 0 ]]; do --mapbox-key) NI_MAPBOX_KEY="$2"; shift 2 ;; --maxmind-account-id) NI_MAXMIND_ACCOUNT_ID="$2"; shift 2 ;; --maxmind-license-key) NI_MAXMIND_LICENSE_KEY="$2"; shift 2 ;; + # CCP (Changemaker Control Panel) + --ccp-url) NI_CCP_URL="$2"; shift 2 ;; + --ccp-invite-code) NI_CCP_INVITE_CODE="$2"; shift 2 ;; + --ccp-agent-url) NI_CCP_AGENT_URL="$2"; shift 2 ;; --help|-h) echo "Usage: bash config.sh [OPTIONS]" echo "" @@ -91,6 +100,11 @@ while [[ $# -gt 0 ]]; do echo " --maxmind-account-id ID MaxMind GeoIP account ID" echo " --maxmind-license-key K MaxMind GeoIP license key" echo "" + echo "CCP (Changemaker Control Panel) — all 3 flags required to register:" + echo " --ccp-url URL CCP server URL (e.g., https://ccp.example.com)" + echo 
" --ccp-invite-code CODE One-time invite code from CCP" + echo " --ccp-agent-url URL Agent URL the CCP reaches (e.g., https://this-host:7443)" + echo "" echo "Example:" echo " bash config.sh --non-interactive --domain example.org --admin-password MyStr0ngPass123" echo " bash config.sh -y --domain example.org --admin-password MyStr0ngPass123 \\" @@ -798,6 +812,17 @@ configure_features() { else warn "Set JVB_ADVERTISE_IP in .env before starting Jitsi containers." fi + else + # Non-interactive: auto-detect public IP for NAT traversal + local detected_ip + detected_ip=$(curl -sf --max-time 5 https://ifconfig.me 2>/dev/null || \ + curl -sf --max-time 5 https://api.ipify.org 2>/dev/null || true) + if [[ -n "$detected_ip" ]]; then + update_env_var "JVB_ADVERTISE_IP" "$detected_ip" + success "JVB advertise IP auto-detected: $detected_ip" + else + warn "Could not auto-detect public IP. Set JVB_ADVERTISE_IP in .env before starting Jitsi." + fi fi else MEET_ENABLED="no" @@ -838,13 +863,6 @@ configure_features() { update_env_var "ENABLE_PEOPLE" "false" fi - if prompt_yes_no "Enable Analytics & GeoIP (visitor tracking, geo dashboard)?"; then - update_env_var "ENABLE_ANALYTICS" "true" - success "Analytics enabled" - else - update_env_var "ENABLE_ANALYTICS" "false" - fi - if prompt_yes_no "Enable Docs Comments & Version History (Gitea-backed)?"; then update_env_var "GITEA_COMMENTS_ENABLED" "true" success "Docs Comments & Version History enabled" @@ -881,8 +899,14 @@ configure_features() { fi if prompt_yes_no "Enable Monitoring stack (Prometheus, Grafana, Alertmanager, cAdvisor)?" 
"y"; then - update_env_var "COMPOSE_PROFILES" "monitoring" - success "Monitoring enabled (COMPOSE_PROFILES=monitoring)" + local existing_profiles + existing_profiles=$(grep -oP 'COMPOSE_PROFILES=\K.*' "$ENV_FILE" 2>/dev/null || echo "") + if [[ -z "$existing_profiles" ]]; then + update_env_var "COMPOSE_PROFILES" "monitoring" + elif [[ "$existing_profiles" != *"monitoring"* ]]; then + update_env_var "COMPOSE_PROFILES" "${existing_profiles},monitoring" + fi + success "Monitoring enabled (COMPOSE_PROFILES includes monitoring)" MONITORING_ENABLED="yes" else MONITORING_ENABLED="no" @@ -1401,6 +1425,35 @@ pangolin_connect_first_site() { configure_control_panel() { header "Control Panel Registration" + # Non-interactive: use --ccp-* flags if all three provided, otherwise skip + if [[ "$NON_INTERACTIVE" == "true" ]]; then + if [[ -n "$NI_CCP_URL" && -n "$NI_CCP_INVITE_CODE" && -n "$NI_CCP_AGENT_URL" ]]; then + update_env_var "ENABLE_CCP_AGENT" "true" + update_env_var "CCP_URL" "$NI_CCP_URL" + update_env_var "CCP_INVITE_CODE" "$NI_CCP_INVITE_CODE" + update_env_var "CCP_AGENT_URL" "$NI_CCP_AGENT_URL" + + # Append ccp-agent to existing profiles (don't clobber monitoring) + local existing_profiles + existing_profiles=$(grep -oP 'COMPOSE_PROFILES=\K.*' "$ENV_FILE" 2>/dev/null || echo "") + if [[ -z "$existing_profiles" ]]; then + update_env_var "COMPOSE_PROFILES" "ccp-agent" + elif [[ "$existing_profiles" != *"ccp-agent"* ]]; then + update_env_var "COMPOSE_PROFILES" "${existing_profiles},ccp-agent" + fi + + success "CCP registration configured ($NI_CCP_URL)" + else + update_env_var "ENABLE_CCP_AGENT" "false" + if [[ -n "$NI_CCP_URL" || -n "$NI_CCP_INVITE_CODE" || -n "$NI_CCP_AGENT_URL" ]]; then + warn "CCP registration needs all 3 flags: --ccp-url, --ccp-invite-code, --ccp-agent-url" + else + info "Skipping CCP registration (no --ccp-url provided)" + fi + fi + return + fi + if prompt_yes_no "Register this instance with a Changemaker Control Panel?"; then echo "" read -rp " 
Enter Control Panel URL (e.g., https://ccp.example.com): " ccp_url @@ -2152,9 +2205,15 @@ main() { header "Release Mode Settings" update_env_var "IMAGE_TAG" "latest" update_env_var "NODE_ENV" "production" - # Ensure monitoring is included if user opted in + # Ensure monitoring is included if user opted in (preserve existing profiles) if [[ "${MONITORING_ENABLED:-no}" == "yes" ]]; then - update_env_var "COMPOSE_PROFILES" "monitoring" + local existing_profiles + existing_profiles=$(grep -oP 'COMPOSE_PROFILES=\K.*' "$ENV_FILE" 2>/dev/null || echo "") + if [[ -z "$existing_profiles" ]]; then + update_env_var "COMPOSE_PROFILES" "monitoring" + elif [[ "$existing_profiles" != *"monitoring"* ]]; then + update_env_var "COMPOSE_PROFILES" "${existing_profiles},monitoring" + fi fi success "Set IMAGE_TAG=latest, NODE_ENV=production (pre-built images)" fi diff --git a/scripts/build-release.sh b/scripts/build-release.sh index 69123de1..f9bdd967 100755 --- a/scripts/build-release.sh +++ b/scripts/build-release.sh @@ -103,7 +103,8 @@ cp "$PROJECT_DIR/api/prisma/init-nocodb-db.sh" "$STAGE_DIR/scripts/" cp "$PROJECT_DIR/api/prisma/init-gancio-db.sh" "$STAGE_DIR/scripts/" # Runtime scripts -for script in nocodb-init.sh gitea-init.sh mkdocs-entrypoint.sh backup.sh \ +for script in nocodb-init.sh gitea-init.sh mkdocs-entrypoint.sh \ + backup.sh restore.sh \ upgrade.sh upgrade-check.sh upgrade-watcher.sh \ uninstall.sh test-deployment.sh; do if [[ -f "$PROJECT_DIR/scripts/$script" ]]; then diff --git a/scripts/install.sh b/scripts/install.sh index 49f39587..6b92765e 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -294,7 +294,7 @@ if [[ "$START_SERVICES" =~ ^[Yy]$ ]]; then info " Database migrations and seeding run automatically on first boot." 
echo "" - CORE_SERVICES=("v2-postgres" "redis" "api" "admin") + CORE_SERVICES=("v2-postgres" "redis" "api" "admin" "nginx") ELAPSED=0 ALL_HEALTHY=false diff --git a/scripts/upgrade.sh b/scripts/upgrade.sh index 48a08c45..38266f8b 100755 --- a/scripts/upgrade.sh +++ b/scripts/upgrade.sh @@ -359,9 +359,13 @@ trap on_failure EXIT acquire_lock load_env -# Determine branch +# Determine branch (source mode only — release installs have no git) if [[ -z "$BRANCH" ]]; then - BRANCH="$(git rev-parse --abbrev-ref HEAD)" + if [[ "$INSTALL_MODE" == "release" ]]; then + BRANCH="release" + else + BRANCH="$(git rev-parse --abbrev-ref HEAD)" + fi fi # ============================================================================= @@ -461,13 +465,15 @@ else exit 1 fi -# Remote reachable -info "Checking git remote..." -if timeout 10 git ls-remote origin HEAD &>/dev/null 2>&1; then - success "Git remote reachable" -else - error "Cannot reach git remote. Check your network or remote configuration." - exit 1 +# Remote reachable (source mode only — release mode pulls from Gitea API later) +if [[ "$INSTALL_MODE" == "source" ]]; then + info "Checking git remote..." + if timeout 10 git ls-remote origin HEAD &>/dev/null 2>&1; then + success "Git remote reachable" + else + error "Cannot reach git remote. Check your network or remote configuration." 
+ exit 1 + fi fi # Working directory checks @@ -490,9 +496,16 @@ fi success "Disk space: ${AVAILABLE_MB}MB available" # Record pre-upgrade state -PRE_UPGRADE_COMMIT="$(git rev-parse HEAD)" -PRE_UPGRADE_SHORT="$(git rev-parse --short HEAD)" -info "Current commit: $PRE_UPGRADE_SHORT ($(git log -1 --format='%s' HEAD))" +if [[ "$INSTALL_MODE" == "source" ]]; then + PRE_UPGRADE_COMMIT="$(git rev-parse HEAD)" + PRE_UPGRADE_SHORT="$(git rev-parse --short HEAD)" + info "Current commit: $PRE_UPGRADE_SHORT ($(git log -1 --format='%s' HEAD))" +else + # Release mode: derive "commit" from VERSION file (format: \n) + PRE_UPGRADE_COMMIT="$(head -2 "$PROJECT_DIR/VERSION" 2>/dev/null | tail -1 || echo "release")" + PRE_UPGRADE_SHORT="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "release")" + info "Current version: $PRE_UPGRADE_SHORT" +fi info "Target branch: $BRANCH" # Record running containers (for restoring monitoring profile later) @@ -502,31 +515,36 @@ if docker ps --format '{{.Names}}' | grep -q 'prometheus-changemaker'; then info "Monitoring stack detected (will restart after upgrade)" fi -# Warn about uncommitted changes in project-owned paths -PROJECT_OWNED_PATHS="api/ admin/ docker-compose.yml" -DIRTY_PROJECT_FILES="$(git diff --name-only HEAD -- $PROJECT_OWNED_PATHS 2>/dev/null || true)" -if [[ -n "$DIRTY_PROJECT_FILES" ]]; then - warn "Uncommitted changes in project-owned files:" - echo "$DIRTY_PROJECT_FILES" | while read -r f; do echo " $f"; done - if [[ "$FORCE" != "true" ]]; then - error "Commit or stash these changes first, or use --force to continue." 
- exit 1 +# Source-mode-only checks: dirty files + upstream commit comparison +if [[ "$INSTALL_MODE" == "source" ]]; then + # Warn about uncommitted changes in project-owned paths + PROJECT_OWNED_PATHS="api/ admin/ docker-compose.yml" + DIRTY_PROJECT_FILES="$(git diff --name-only HEAD -- $PROJECT_OWNED_PATHS 2>/dev/null || true)" + if [[ -n "$DIRTY_PROJECT_FILES" ]]; then + warn "Uncommitted changes in project-owned files:" + echo "$DIRTY_PROJECT_FILES" | while read -r f; do echo " $f"; done + if [[ "$FORCE" != "true" ]]; then + error "Commit or stash these changes first, or use --force to continue." + exit 1 + fi + warn "Continuing with --force (changes will be stashed)" fi - warn "Continuing with --force (changes will be stashed)" -fi -# Check for available updates -LOCAL_HEAD="$(git rev-parse HEAD)" -REMOTE_HEAD="$(git ls-remote origin "$BRANCH" | cut -f1)" -if [[ "$LOCAL_HEAD" == "$REMOTE_HEAD" ]]; then - info "Already up to date ($PRE_UPGRADE_SHORT). No upstream changes." - if [[ "$FORCE" != "true" ]]; then - success "Nothing to upgrade." - release_lock - exit 0 + # Check for available updates + LOCAL_HEAD="$(git rev-parse HEAD)" + REMOTE_HEAD="$(git ls-remote origin "$BRANCH" | cut -f1)" + if [[ "$LOCAL_HEAD" == "$REMOTE_HEAD" ]]; then + info "Already up to date ($PRE_UPGRADE_SHORT). No upstream changes." + if [[ "$FORCE" != "true" ]]; then + success "Nothing to upgrade." + release_lock + exit 0 + fi + warn "Continuing with --force despite no upstream changes." fi - warn "Continuing with --force despite no upstream changes." fi +# Release mode: the upstream-version comparison happens later in the +# release-mode block (line ~597) which queries the Gitea Releases API. # ============================================================================= # Phase 2: Backup @@ -669,100 +687,105 @@ elif [[ "$DRY_RUN" == "true" ]]; then exit 0 fi -# Step 0: Save user-modifiable paths before any git operations -save_user_paths +# Source-mode git pull flow. 
Release mode handles its update via tarball +# download in the block above and skips this entire section. +if [[ "$INSTALL_MODE" == "source" ]]; then + # Step 0: Save user-modifiable paths before any git operations + save_user_paths -# Step 0b: Clear skip-worktree flags that prevent merge (e.g., repo-data JSON files) -SKIP_WORKTREE_FILES="$(git ls-files -v | grep '^S ' | awk '{print $2}' || true)" -if [[ -n "$SKIP_WORKTREE_FILES" ]]; then - info "Clearing skip-worktree flags on $(echo "$SKIP_WORKTREE_FILES" | wc -l | xargs) file(s)..." - echo "$SKIP_WORKTREE_FILES" | xargs git update-index --no-skip-worktree - success "Skip-worktree flags cleared" -fi - -# Step 0c: Fix Docker-owned directories that block git checkout -for owned_dir in api/upgrade api/uploads api/configs; do - if [[ -d "$PROJECT_DIR/$owned_dir" ]] && [[ ! -w "$PROJECT_DIR/$owned_dir" ]]; then - info "Fixing permissions on $owned_dir..." - docker run --rm -v "$PROJECT_DIR/$owned_dir:/fix" alpine chown -R "$(id -u):$(id -g)" /fix 2>/dev/null || true + # Step 0b: Clear skip-worktree flags that prevent merge (e.g., repo-data JSON files) + SKIP_WORKTREE_FILES="$(git ls-files -v | grep '^S ' | awk '{print $2}' || true)" + if [[ -n "$SKIP_WORKTREE_FILES" ]]; then + info "Clearing skip-worktree flags on $(echo "$SKIP_WORKTREE_FILES" | wc -l | xargs) file(s)..." + echo "$SKIP_WORKTREE_FILES" | xargs git update-index --no-skip-worktree + success "Skip-worktree flags cleared" fi -done -# Step 1: Stash user changes if any exist -HAS_CHANGES=false -if [[ -n "$(git status --porcelain 2>/dev/null)" ]]; then - HAS_CHANGES=true - STASH_NAME="upgrade-${TIMESTAMP}" - info "Stashing local changes as '$STASH_NAME'..." - git stash push --include-untracked -m "$STASH_NAME" - success "Local changes stashed" -fi + # Step 0c: Fix Docker-owned directories that block git checkout + for owned_dir in api/upgrade api/uploads api/configs; do + if [[ -d "$PROJECT_DIR/$owned_dir" ]] && [[ ! 
-w "$PROJECT_DIR/$owned_dir" ]]; then + info "Fixing permissions on $owned_dir..." + docker run --rm -v "$PROJECT_DIR/$owned_dir:/fix" alpine chown -R "$(id -u):$(id -g)" /fix 2>/dev/null || true + fi + done -# Step 3: Pull updates -info "Pulling updates from origin/$BRANCH..." -if ! git pull origin "$BRANCH" --no-edit 2>&1; then - error "git pull failed. This may indicate upstream force-push or branch issues." + # Step 1: Stash user changes if any exist + HAS_CHANGES=false + if [[ -n "$(git status --porcelain 2>/dev/null)" ]]; then + HAS_CHANGES=true + STASH_NAME="upgrade-${TIMESTAMP}" + info "Stashing local changes as '$STASH_NAME'..." + git stash push --include-untracked -m "$STASH_NAME" + success "Local changes stashed" + fi + + # Step 3: Pull updates + info "Pulling updates from origin/$BRANCH..." + if ! git pull origin "$BRANCH" --no-edit 2>&1; then + error "git pull failed. This may indicate upstream force-push or branch issues." + if [[ "$HAS_CHANGES" == "true" ]]; then + warn "Your stashed changes can be recovered with: git stash pop" + fi + exit 1 + fi + + POST_PULL_COMMIT="$(git rev-parse --short HEAD)" + success "Updated to $POST_PULL_COMMIT" + + # Step 4: Pop stash and handle conflicts if [[ "$HAS_CHANGES" == "true" ]]; then - warn "Your stashed changes can be recovered with: git stash pop" - fi - exit 1 -fi + info "Restoring local changes..." 
+ if git stash pop 2>&1; then + success "Local changes restored cleanly" + else + warn "Merge conflicts detected during stash pop" -POST_PULL_COMMIT="$(git rev-parse --short HEAD)" -success "Updated to $POST_PULL_COMMIT" + # Auto-resolve user-modifiable paths by keeping user's version + RESOLVED_COUNT=0 + for user_path in "${USER_PATHS[@]}"; do + CONFLICTED="$(git diff --name-only --diff-filter=U -- "$user_path" 2>/dev/null || true)" + if [[ -n "$CONFLICTED" ]]; then + while IFS= read -r cf; do + info " Auto-resolving (keeping yours): $cf" + git checkout --theirs "$cf" 2>/dev/null || true + git add "$cf" + RESOLVED_COUNT=$((RESOLVED_COUNT + 1)) + done < <(echo "$CONFLICTED") + fi + done -# Step 4: Pop stash and handle conflicts -if [[ "$HAS_CHANGES" == "true" ]]; then - info "Restoring local changes..." - if git stash pop 2>&1; then - success "Local changes restored cleanly" - else - warn "Merge conflicts detected during stash pop" - - # Auto-resolve user-modifiable paths by keeping user's version - RESOLVED_COUNT=0 - for user_path in "${USER_PATHS[@]}"; do - CONFLICTED="$(git diff --name-only --diff-filter=U -- "$user_path" 2>/dev/null || true)" - if [[ -n "$CONFLICTED" ]]; then - while IFS= read -r cf; do - info " Auto-resolving (keeping yours): $cf" - git checkout --theirs "$cf" 2>/dev/null || true - git add "$cf" - RESOLVED_COUNT=$((RESOLVED_COUNT + 1)) - done < <(echo "$CONFLICTED") + # Check if any conflicts remain in project-owned files + REMAINING_CONFLICTS="$(git diff --name-only --diff-filter=U 2>/dev/null || true)" + if [[ -n "$REMAINING_CONFLICTS" ]]; then + error "Unresolved conflicts in project-owned files:" + echo "$REMAINING_CONFLICTS" | while read -r f; do echo " $f"; done + echo "" + error "These files have upstream changes that conflict with your edits." + error "Resolve manually, then run the upgrade again." 
+ info "Your pre-upgrade commit: $PRE_UPGRADE_COMMIT" + info "To abort: git merge --abort OR git checkout $PRE_UPGRADE_COMMIT" + exit 1 fi - done - # Check if any conflicts remain in project-owned files - REMAINING_CONFLICTS="$(git diff --name-only --diff-filter=U 2>/dev/null || true)" - if [[ -n "$REMAINING_CONFLICTS" ]]; then - error "Unresolved conflicts in project-owned files:" - echo "$REMAINING_CONFLICTS" | while read -r f; do echo " $f"; done - echo "" - error "These files have upstream changes that conflict with your edits." - error "Resolve manually, then run the upgrade again." - info "Your pre-upgrade commit: $PRE_UPGRADE_COMMIT" - info "To abort: git merge --abort OR git checkout $PRE_UPGRADE_COMMIT" - exit 1 - fi - - if [[ $RESOLVED_COUNT -gt 0 ]]; then - success "Auto-resolved $RESOLVED_COUNT user-modifiable path(s) (kept your versions)" + if [[ $RESOLVED_COUNT -gt 0 ]]; then + success "Auto-resolved $RESOLVED_COUNT user-modifiable path(s) (kept your versions)" + fi fi fi -fi -# Step 4b: Restore user-modifiable paths (unconditionally overwrites with saved copies) -restore_user_paths + # Step 4b: Restore user-modifiable paths (unconditionally overwrites with saved copies) + restore_user_paths -# Step 4c: Restore any tracked files accidentally deleted by restore_user_paths -# (can happen when save_user_paths can't read root-owned files in user paths) -DELETED_TRACKED="$(git ls-files --deleted 2>/dev/null || true)" -if [[ -n "$DELETED_TRACKED" ]]; then - info "Restoring $(echo "$DELETED_TRACKED" | wc -l | xargs) tracked file(s) deleted during restore..." 
- echo "$DELETED_TRACKED" | xargs git checkout HEAD -- 2>/dev/null || true - success "Tracked files restored from HEAD" + # Step 4c: Restore any tracked files accidentally deleted by restore_user_paths + # (can happen when save_user_paths can't read root-owned files in user paths) + DELETED_TRACKED="$(git ls-files --deleted 2>/dev/null || true)" + if [[ -n "$DELETED_TRACKED" ]]; then + info "Restoring $(echo "$DELETED_TRACKED" | wc -l | xargs) tracked file(s) deleted during restore..." + echo "$DELETED_TRACKED" | xargs git checkout HEAD -- 2>/dev/null || true + success "Tracked files restored from HEAD" + fi fi +# End of source-mode git pull flow # Step 5: Detect new env vars info "Checking for new environment variables..." @@ -791,24 +814,30 @@ if [[ -f "$PROJECT_DIR/.env.example" ]] && [[ -f "$PROJECT_DIR/.env" ]]; then fi fi -# Step 6: Print update summary -COMMIT_RANGE="${PRE_UPGRADE_SHORT}..${POST_PULL_COMMIT}" -COMMIT_COUNT="$(git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" 2>/dev/null | wc -l | xargs)" -echo "" -info "Update summary: $COMMIT_COUNT commit(s) ($COMMIT_RANGE)" -git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" 2>/dev/null | head -20 -if [[ "$COMMIT_COUNT" -gt 20 ]]; then - info " ... 
and $((COMMIT_COUNT - 20)) more" -fi - -# Flag commits that may require manual attention -BREAKING_COMMITS="$(git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" --grep="BREAKING" --grep="\[manual\]" 2>/dev/null || true)" -if [[ -n "$BREAKING_COMMITS" ]]; then +# Step 6: Print update summary (source mode only — release mode has no commit range) +COMMIT_COUNT=0 +if [[ "$INSTALL_MODE" == "source" ]]; then + COMMIT_RANGE="${PRE_UPGRADE_SHORT}..${POST_PULL_COMMIT}" + # Use || true and check pipefail-safe to survive git failures + COMMIT_COUNT="$(git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" 2>/dev/null | wc -l | xargs || echo 0)" echo "" - warn "Commits requiring manual attention:" - echo "$BREAKING_COMMITS" | while read -r line; do - echo -e " ${YELLOW}$line${NC}" - done + info "Update summary: $COMMIT_COUNT commit(s) ($COMMIT_RANGE)" + git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" 2>/dev/null | head -20 || true + if [[ "$COMMIT_COUNT" -gt 20 ]]; then + info " ... and $((COMMIT_COUNT - 20)) more" + fi + + # Flag commits that may require manual attention + BREAKING_COMMITS="$(git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" --grep="BREAKING" --grep="\[manual\]" 2>/dev/null || true)" + if [[ -n "$BREAKING_COMMITS" ]]; then + echo "" + warn "Commits requiring manual attention:" + echo "$BREAKING_COMMITS" | while read -r line; do + echo -e " ${YELLOW}$line${NC}" + done + fi +else + info "Update summary: ${PRE_UPGRADE_SHORT} → release" fi # ============================================================================= @@ -1135,7 +1164,10 @@ verify_service_health() { done warn "$name: not responding after ${max_wait}s" VERIFY_FAILED=true - return 1 + # Always return 0 — under set -e a non-zero return from this helper would + # exit the script before write_result runs. The VERIFY_FAILED flag is the + # signal the caller actually checks. 
+ return 0 } # API health (with polling — may still be running migrations) @@ -1194,7 +1226,11 @@ fi # ============================================================================= ELAPSED="$(elapsed)" -FINAL_COMMIT="$(git rev-parse --short HEAD)" +if [[ "$INSTALL_MODE" == "source" ]]; then + FINAL_COMMIT="$(git rev-parse --short HEAD)" +else + FINAL_COMMIT="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "release")" +fi # Collect warnings for API mode result UPGRADE_WARNINGS="[]" @@ -1211,7 +1247,11 @@ echo -e "${BOLD}${GREEN} Upgrade Complete${NC}" echo -e "${BOLD}${GREEN}══════════════════════════════════════════════════${NC}" echo "" echo -e " ${BOLD}Previous:${NC} $PRE_UPGRADE_SHORT" -echo -e " ${BOLD}Current:${NC} $FINAL_COMMIT ($(git log -1 --format='%s' HEAD))" +if [[ "$INSTALL_MODE" == "source" ]]; then + echo -e " ${BOLD}Current:${NC} $FINAL_COMMIT ($(git log -1 --format='%s' HEAD 2>/dev/null || echo "$FINAL_COMMIT"))" +else + echo -e " ${BOLD}Current:${NC} $FINAL_COMMIT" +fi echo -e " ${BOLD}Commits:${NC} $COMMIT_COUNT" echo -e " ${BOLD}Duration:${NC} $ELAPSED" echo -e " ${BOLD}Log:${NC} $LOG_FILE"