From 4a3d9d7c41e0da86feecf120abf1bb3903d55e6c Mon Sep 17 00:00:00 2001
From: bunker-admin
Date: Thu, 21 May 2026 15:20:35 -0600
Subject: [PATCH] feat(upgrade): Approach B - image-only upgrade mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a "Quick Upgrade" path that pulls latest container images and recreates
only the core app services (api, admin, media-api, nginx) without touching
any tracked files. Tenant content (mkdocs/, configs/, scripts/) is implicitly
preserved because the script never modifies it: apart from docker, it only
writes its own progress/result JSON under data/upgrade/, a log file, a lock
file, and the defensive mkdocs snapshot archive. Faster (~2 min vs ~4-5 min
for full upgrade) and structurally safer for releases that don't change
orchestration/templates.

Pieces:
- scripts/image-upgrade.sh: new ~380-line script. Phases: pre-flight + mkdocs
  snapshot, image pull, targeted recreate (broad up -d would cascade on
  misconfigured infra containers — proven on marcelle), light health checks,
  deferred ccp-agent restart. Writes the same progress.json + result.json
  schema as upgrade.sh so the CCP poll loop is unchanged.
- agent/src/routes/upgrade.routes.ts: POST
  /instance/:slug/upgrade/start-image-only. Same lock + staleness guards as
  the existing /upgrade/start endpoint.
- api/src/services/remote-driver.ts: RemoteDriver.startImageUpgrade().
- api/src/services/upgrade.service.ts: startImageUpgrade() entry point;
  reuses runRemoteUpgrade with mode='image-only' (only the initial agent call
  differs — result schema and polling are identical).
- api/src/modules/instances/instances.routes.ts: POST /:id/upgrade-images +
  startImageUpgradeSchema.
- admin/src/pages/InstanceDetailPage.tsx: secondary "Quick Upgrade" button
  next to "Upgrade Now" on the Updates tab. Tooltip explains when to use it.

Tested locally on marcelle (v2.10.2 idempotent run): 1m 49s, mkdocs.yml md5
unchanged, file count unchanged, only api/admin/media-api/nginx touched.
Subtle bug found and fixed: `set -o pipefail` + `grep -q` shorts pipe and SIGPIPEs the writer — captured services list once instead. Bunker Admin --- .../admin/src/pages/InstanceDetailPage.tsx | 67 ++- .../agent/src/routes/upgrade.routes.ts | 79 ++++ .../src/modules/instances/instances.routes.ts | 21 +- .../modules/instances/instances.schemas.ts | 11 + .../api/src/services/remote-driver.ts | 19 + .../api/src/services/upgrade.service.ts | 111 ++++- scripts/image-upgrade.sh | 383 ++++++++++++++++++ 7 files changed, 666 insertions(+), 25 deletions(-) create mode 100755 scripts/image-upgrade.sh diff --git a/changemaker-control-panel/admin/src/pages/InstanceDetailPage.tsx b/changemaker-control-panel/admin/src/pages/InstanceDetailPage.tsx index d798cc7..3617bb8 100644 --- a/changemaker-control-panel/admin/src/pages/InstanceDetailPage.tsx +++ b/changemaker-control-panel/admin/src/pages/InstanceDetailPage.tsx @@ -39,6 +39,7 @@ import { CloudOutlined, DisconnectOutlined, UploadOutlined, + ThunderboltOutlined, BellOutlined, CheckCircleOutlined, WarningOutlined, @@ -563,6 +564,24 @@ export default function InstanceDetailPage() { } }; + // Image-only upgrade (Approach B): pulls images + recreates core app services + // without touching tracked files. Faster + safer than full upgrade for releases + // that don't change compose/templates. 
+  const handleStartImageUpgrade = async () => {
+    type ApiFailure = { response?: { data?: { error?: { message?: string } } } };
+    setUpgradingInstance(true);
+    try {
+      const response = await api.post(`/instances/${id}/upgrade-images`, {});
+      setCurrentUpgrade(response.data.data);
+      message.success('Image-only upgrade started');
+    } catch (err: unknown) {
+      const apiError = (err as ApiFailure)?.response?.data?.error;
+      message.error(apiError?.message || 'Failed to start image-only upgrade');
+    } finally {
+      setUpgradingInstance(false);
+    }
+  };
+
   // Event handlers
   const handleAcknowledgeEvent = async (eventId: string) => {
     try {
@@ -1632,25 +1651,41 @@ export default function InstanceDetailPage() {
               closable
             />
           )}
-
- - Pulls latest code, runs migrations, and restarts services. CCP backup is recommended before upgrading. +
+ + Full upgrade pulls the latest code, runs migrations, and restarts services. Quick upgrade only pulls images and recreates the core app — tenant content stays untouched and it's ~2 min faster. Use Quick when the release notes say no orchestration changes. - - - + + + + + +
)}
diff --git a/changemaker-control-panel/agent/src/routes/upgrade.routes.ts b/changemaker-control-panel/agent/src/routes/upgrade.routes.ts
index efdd74b..9c73117 100644
--- a/changemaker-control-panel/agent/src/routes/upgrade.routes.ts
+++ b/changemaker-control-panel/agent/src/routes/upgrade.routes.ts
@@ -188,6 +188,85 @@ router.post('/instance/:slug/upgrade/start', async (req: Request, res: Response)
   res.status(202).json({ started: true });
 });
 
+// POST /instance/:slug/upgrade/start-image-only — Run image-upgrade.sh in background
+//
+// Image-only upgrade: pulls latest images + recreates services without touching
+// tracked files (no git pull, no tarball extract, no VERSION mutation). Tenant
+// content is implicitly safe because the script never modifies tracked files.
+// See scripts/image-upgrade.sh for full rationale.
+//
+// Schema-compatible with /upgrade/start: writes the same progress.json + result.json
+// so the CCP poll loop in runRemoteUpgrade() works unchanged.
+router.post('/instance/:slug/upgrade/start-image-only', async (req: Request, res: Response) => {
+  const slug = param(req, 'slug');
+  const entry = await getSlugEntry(slug);
+  const { imageTag } = req.body || {};
+
+  // SECURITY: imageTag is untrusted and is handed to the script as --image-tag.
+  // Accept only a string of docker-tag-safe characters; never coerce other types.
+  const tagOk = typeof imageTag === 'string' && /^[a-zA-Z0-9][a-zA-Z0-9_.-]{0,127}$/.test(imageTag);
+  if (imageTag != null && imageTag !== '' && !tagOk) {
+    res.status(400).json({ error: 'VALIDATION', message: 'Invalid imageTag' });
+    return;
+  }
+
+  const scriptPath = path.join(entry.basePath, 'scripts', 'image-upgrade.sh');
+  try {
+    await fs.access(scriptPath);
+  } catch {
+    res.status(404).json({ error: 'NOT_FOUND', message: 'image-upgrade.sh not found' });
+    return;
+  }
+
+  // Same concurrency guards as the full /upgrade/start endpoint — uses the
+  // same lock + on-disk staleness check + backup/restore mutex.
+  if (isSlugLocked(slug, 'upgrade') || await isUpgradeRunningOnDisk(entry.basePath)) {
+    res.status(409).json({ error: 'SLUG_BUSY', message: 'An upgrade is already in progress' });
+    return;
+  }
+  if (isSlugLocked(slug, 'backup') || isSlugLocked(slug, 'restore')) {
+    res.status(409).json({ error: 'SLUG_BUSY', message: 'A backup or restore is currently running' });
+    return;
+  }
+
+  // Clear stale progress/result files (same convention as /upgrade/start)
+  const progressPath = path.join(entry.basePath, 'data', 'upgrade', 'progress.json');
+  const resultPath = path.join(entry.basePath, 'data', 'upgrade', 'result.json');
+  await fs.mkdir(path.dirname(progressPath), { recursive: true });
+  await fs.rm(progressPath, { force: true });
+  await fs.rm(resultPath, { force: true });
+
+  const args: string[] = [scriptPath, '--api-mode'];
+  if (tagOk) args.push('--image-tag', imageTag);
+
+  void withSlugLock(slug, 'upgrade', async () => {
+    logger.info(`[image-upgrade] ${slug}: spawning ${args.join(' ')} (cwd=${entry.basePath})`);
+    try {
+      await new Promise<void>((resolve, reject) => {
+        const proc = spawn('bash', args, {
+          cwd: entry.basePath,
+          env: { ...process.env, COMPOSE_ANSI: 'never' },
+          stdio: ['ignore', 'ignore', 'ignore'],
+        });
+        proc.on('error', reject);
+        proc.on('close', (code) => {
+          if (code === 0) resolve();
+          else reject(new Error(`image-upgrade.sh exited with code ${code}`));
+        });
+      });
+      logger.info(`[image-upgrade] ${slug}: image-upgrade.sh completed`);
+    } catch (err) {
+      logger.error(`[image-upgrade] ${slug}: ${(err as Error).message}`);
+    }
+  }).catch((err) => {
+    if (!(err instanceof SlugBusyError)) {
+      logger.error(`[image-upgrade] ${slug}: lock or background error: ${(err as Error).message}`);
+    }
+  });
+
+  res.status(202).json({ started: true, mode: 'image-only' });
+});
+
 // GET /instance/:slug/upgrade/progress — Read progress.json
 router.get('/instance/:slug/upgrade/progress', async (req: Request, res: Response) => {
   const entry = await
getSlugEntry(param(req, 'slug')); diff --git a/changemaker-control-panel/api/src/modules/instances/instances.routes.ts b/changemaker-control-panel/api/src/modules/instances/instances.routes.ts index a22a653..d96ae13 100644 --- a/changemaker-control-panel/api/src/modules/instances/instances.routes.ts +++ b/changemaker-control-panel/api/src/modules/instances/instances.routes.ts @@ -4,7 +4,7 @@ import rateLimit from 'express-rate-limit'; import { prisma } from '../../lib/prisma'; import { authenticate, requireRole } from '../../middleware/auth'; import { validate } from '../../middleware/validate'; -import { createInstanceSchema, updateInstanceSchema, registerInstanceSchema, reconfigureInstanceSchema, configureTunnelSchema, importInstancesSchema, startUpgradeSchema, setupRemoteTunnelSchema } from './instances.schemas'; +import { createInstanceSchema, updateInstanceSchema, registerInstanceSchema, reconfigureInstanceSchema, configureTunnelSchema, importInstancesSchema, startUpgradeSchema, startImageUpgradeSchema, setupRemoteTunnelSchema } from './instances.schemas'; import * as instancesService from './instances.service'; import * as healthService from '../../services/health.service'; import * as backupService from '../../services/backup.service'; @@ -362,6 +362,25 @@ router.post( } ); +// Image-only upgrade (Approach B). Faster + safer than full upgrade for +// releases that don't change orchestration/templates. See upgrade.service.ts +// startImageUpgrade for full rationale. 
+router.post(
+  '/:id/upgrade-images',
+  requireRole('SUPER_ADMIN', 'OPERATOR'),
+  validate(startImageUpgradeSchema),
+  async (req: Request, res: Response) => {
+    // req.body may be undefined when the request carried no JSON payload.
+    const imageTag = (req.body || {}).imageTag;
+    const instanceId = req.params.id as string;
+    const options = { imageTag };
+    const upgrade = await upgradeService.startImageUpgrade(
+      instanceId, req.user!.id, req.ip, options
+    );
+    res.status(201).json({ data: upgrade });
+  }
+);
+
 router.get(
   '/:id/upgrade-status',
   requireRole('SUPER_ADMIN', 'OPERATOR'),
diff --git a/changemaker-control-panel/api/src/modules/instances/instances.schemas.ts b/changemaker-control-panel/api/src/modules/instances/instances.schemas.ts
index ee2f68d..cecb943 100644
--- a/changemaker-control-panel/api/src/modules/instances/instances.schemas.ts
+++ b/changemaker-control-panel/api/src/modules/instances/instances.schemas.ts
@@ -121,6 +121,17 @@ export const startUpgradeSchema = z.object({
     .optional(),
 });
 
+// Body schema for the image-only (Approach B) upgrade endpoint, which pulls
+// images and recreates the core app services while leaving tracked files
+// alone. imageTag may be omitted; the agent then falls back to the IMAGE_TAG
+// from the install's .env / compose env (typically `latest`). When present
+// it has to be a valid Docker tag.
+export const startImageUpgradeSchema = z.object({
+  imageTag: z.optional(
+    z.string().regex(/^[a-zA-Z0-9][a-zA-Z0-9_.-]{0,127}$/, 'Invalid imageTag')
+  ),
+});
+
 export const setupRemoteTunnelSchema = z.object({
   // Empty string or omitted → resources use standard subdomains (app., api., etc.)
   // A value like "ck" → creates ck-app., ck-api., etc.
for multi-tenant domains
diff --git a/changemaker-control-panel/api/src/services/remote-driver.ts b/changemaker-control-panel/api/src/services/remote-driver.ts
index 82b6223..df79535 100644
--- a/changemaker-control-panel/api/src/services/remote-driver.ts
+++ b/changemaker-control-panel/api/src/services/remote-driver.ts
@@ -82,6 +82,10 @@ export interface StartAgentUpgradeOptions {
   branch?: string;
 }
 
+export interface StartAgentImageUpgradeOptions {
+  imageTag?: string;
+}
+
 interface AgentRequestOptions {
   method: 'GET' | 'POST' | 'DELETE';
   path: string;
@@ -574,6 +578,21 @@ export class RemoteDriver implements ExecutionDriver {
     });
   }
 
+  /**
+   * Ask the agent to run image-upgrade.sh --api-mode on the remote (Approach B:
+   * image-only upgrade — pulls images + recreates core app services without
+   * touching the install tree). Resolves once the start request has been
+   * answered; progress/result are polled via the same endpoints as startUpgrade.
+   */
+  async startImageUpgrade(options: StartAgentImageUpgradeOptions = {}): Promise<void> {
+    await this.request({
+      method: 'POST',
+      path: `/instance/${this.slug}/upgrade/start-image-only`,
+      body: options,
+      timeoutMs: 30_000,
+    });
+  }
+
   /**
    * Read the agent's data/upgrade/progress.json. Returns the default zero-state
    * if no progress has been written yet.
diff --git a/changemaker-control-panel/api/src/services/upgrade.service.ts b/changemaker-control-panel/api/src/services/upgrade.service.ts
index c1ab063..f854b9f 100644
--- a/changemaker-control-panel/api/src/services/upgrade.service.ts
+++ b/changemaker-control-panel/api/src/services/upgrade.service.ts
@@ -205,6 +205,10 @@ export interface StartUpgradeOptions {
   branch?: string;
 }
 
+export interface StartImageUpgradeOptions {
+  imageTag?: string;
+}
+
 /**
  * Start an upgrade for an instance. Returns the created InstanceUpgrade record.
  * The actual upgrade runs asynchronously (fire-and-forget).
@@ -298,6 +302,86 @@ export async function startUpgrade(
   return upgrade;
 }
 
+/**
+ * Kick off an IMAGE-ONLY upgrade (Approach B) and return the InstanceUpgrade
+ * record created for it. The agent pulls the latest images and recreates the
+ * core app services without touching tracked files, so tenant content
+ * (mkdocs/, configs/) is implicitly preserved.
+ *
+ * Intended for releases that only bump container code or schema. Releases
+ * that change compose orchestration, nginx config, or other tracked files
+ * should go through startUpgrade() instead.
+ *
+ * Remote-only for now: there is no local `runImageUpgrade` runner. The
+ * upgrade itself runs in the background after this function returns.
+ */
+export async function startImageUpgrade(
+  instanceId: string,
+  userId: string,
+  ipAddress?: string,
+  options?: StartImageUpgradeOptions
+) {
+  const instance = await prisma.instance.findUnique({ where: { id: instanceId } });
+  if (!instance) {
+    throw new Error('Instance not found');
+  }
+
+  // No local runner exists for image-only upgrades.
+  if (!instance.isRemote) {
+    throw new Error('Image-only upgrade is currently supported only for remote instances');
+  }
+
+  const canUpgrade =
+    instance.status === InstanceStatus.RUNNING ||
+    instance.status === InstanceStatus.STOPPED;
+  if (!canUpgrade) {
+    throw new Error(`Cannot upgrade instance in ${instance.status} state`);
+  }
+
+  // One upgrade (of either kind) at a time per instance — the same
+  // in-progress guard startUpgrade applies.
+  const blockingStatuses = [UpgradeStatus.PENDING, UpgradeStatus.IN_PROGRESS];
+  const active = await prisma.instanceUpgrade.findFirst({
+    where: { instanceId, status: { in: blockingStatuses } },
+  });
+  if (active) {
+    throw new Error('An upgrade is already in progress for this instance');
+  }
+
+  // branch is not used by an image-only upgrade; the current branch is still
+  // recorded so the audit trail stays consistent with full upgrades.
+  const previousCommit = instance.gitCommit;
+  const upgrade = await prisma.instanceUpgrade.create({
+    data: {
+      instanceId,
+      status: UpgradeStatus.PENDING,
+      previousCommit,
+      branch: instance.gitBranch,
+      triggeredById: userId,
+    },
+  });
+
+  // Audit log
+  const details = {
+    upgradeId: upgrade.id,
+    previousCommit,
+    source: 'remote',
+    mode: 'image-only',
+    options: options || {},
+  } as unknown as Prisma.InputJsonValue;
+  await prisma.auditLog.create({
+    data: { userId, instanceId, action: AuditAction.INSTANCE_UPGRADE, details, ipAddress },
+  });
+
+  // Fire-and-forget: runRemoteUpgrade is shared with the full upgrade path; in
+  // 'image-only' mode only the initial agent call differs.
+  runRemoteUpgrade(upgrade.id, instance, undefined, 'image-only', options).catch((err) => {
+    logger.error(`[image-upgrade] Remote image upgrade orchestration failed for ${instance.slug}: ${err}`);
+  });
+
+  return upgrade;
+}
+
 /**
  * Async REMOTE upgrade runner.
  *
@@ -316,7 +400,9 @@ export async function startUpgrade(
 async function runRemoteUpgrade(
   upgradeId: string,
   instance: Instance,
-  options?: StartUpgradeOptions
+  options?: StartUpgradeOptions,
+  mode: 'full' | 'image-only' = 'full',
+  imageOnlyOptions?: StartImageUpgradeOptions
 ) {
   const slug = instance.slug;
 
@@ -333,18 +419,27 @@ async function runRemoteUpgrade(
     where: { id: upgradeId },
     data: {
       status: UpgradeStatus.IN_PROGRESS,
-      progressMessage: 'Starting remote upgrade...',
+      progressMessage: mode === 'image-only'
+        ? 'Starting image-only upgrade...'
+        : 'Starting remote upgrade...',
     },
   });
 
   // Tell the agent to start. The agent has its own mutex + stale-progress
   // check, so this can return 409 if a previous upgrade is still running.
- logger.info(`[upgrade] ${slug}: triggering remote upgrade.sh start`); - await driver.startUpgrade({ - skipBackup: options?.skipBackup, - useRegistry: options?.useRegistry, - branch: options?.branch, - }); + if (mode === 'image-only') { + logger.info(`[upgrade] ${slug}: triggering remote image-upgrade.sh start`); + await driver.startImageUpgrade({ + imageTag: imageOnlyOptions?.imageTag, + }); + } else { + logger.info(`[upgrade] ${slug}: triggering remote upgrade.sh start`); + await driver.startUpgrade({ + skipBackup: options?.skipBackup, + useRegistry: options?.useRegistry, + branch: options?.branch, + }); + } // Poll progress + result. We treat /result returning 200 as the signal // that upgrade.sh exited (successfully or with code != 0 — the script diff --git a/scripts/image-upgrade.sh b/scripts/image-upgrade.sh new file mode 100755 index 0000000..82c4140 --- /dev/null +++ b/scripts/image-upgrade.sh @@ -0,0 +1,383 @@ +#!/usr/bin/env bash +# image-upgrade.sh — Approach B: image-only upgrade +# +# Pulls latest images from the registry and recreates services WITHOUT touching +# tracked files in the install tree (no git pull, no tarball extract, no VERSION +# mutation). Tenant content (mkdocs/, configs/) is implicitly safe because this +# script never writes outside data/upgrade/ and the docker daemon. +# +# Used by CCP "Quick Upgrade" button. Pairs with scripts/upgrade.sh which +# remains the full upgrade path for orchestration-changing releases. +# +# Schema parity: writes data/upgrade/progress.json + result.json with the same +# fields upgrade.sh writes, so the CCP poll loop is unchanged. + +set -euo pipefail + +PROJECT_DIR="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")/.." 
&& pwd)" +SCRIPT_DIR="$PROJECT_DIR/scripts" +UPGRADE_DIR="$PROJECT_DIR/data/upgrade" +LOG_DIR="$PROJECT_DIR/logs" +LOG_FILE="$LOG_DIR/image-upgrade-$(date +%Y%m%d_%H%M%S).log" +LOCK_FILE="$PROJECT_DIR/.upgrade.lock" +PROGRESS_FILE="$UPGRADE_DIR/progress.json" +RESULT_FILE="$UPGRADE_DIR/result.json" + +START_TIME=$SECONDS + +# --- Detect install mode --- +if [[ -f "$PROJECT_DIR/VERSION" ]] && [[ ! -d "$PROJECT_DIR/.git" ]]; then + INSTALL_MODE="release" +else + INSTALL_MODE="source" +fi + +# --- Defaults --- +API_MODE=false +DRY_RUN=false +IMAGE_TAG="" + +usage() { + cat <&2; usage >&2; exit 1 ;; + esac +done + +# --- Colors --- +if [[ -t 1 ]] && [[ -z "${NO_COLOR:-}" ]]; then + RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[0;33m' + CYAN='\033[0;36m' BOLD='\033[1m' NC='\033[0m' +else + RED='' GREEN='' YELLOW='' CYAN='' BOLD='' NC='' +fi +info() { echo -e "${CYAN}[INFO]${NC} $*"; } +success() { echo -e "${GREEN}[ OK ]${NC} $*"; } +warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +error() { echo -e "${RED}[ERR ]${NC} $*" >&2; } +phase() { echo ""; echo -e "${BOLD}${CYAN}=== Phase $1: $2 ===${NC}"; } + +# --- Logging: mirror stdout/stderr to LOG_FILE --- +# logs/ may be root-owned on installs where upgrade.sh has run via ccp-agent. +# Fall back to /tmp if we can't write, so bunker-admin manual invocations don't +# crash with "Permission denied" on tee. 
+mkdir -p "$UPGRADE_DIR" +if mkdir -p "$LOG_DIR" 2>/dev/null && touch "$LOG_FILE" 2>/dev/null; then + : # primary log location is writable +else + LOG_FILE="/tmp/image-upgrade-$(date +%Y%m%d_%H%M%S)-$$.log" + echo "[INFO] logs/ not writable; using $LOG_FILE" >&2 +fi +exec > >(tee -a "$LOG_FILE") 2>&1 + +# --- Capture previous version for result.json --- +if [[ "$INSTALL_MODE" == "release" ]]; then + PRE_VERSION="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "unknown")" +else + PRE_VERSION="$(cd "$PROJECT_DIR" && git rev-parse --short HEAD 2>/dev/null || echo "unknown")" +fi + +write_progress() { + local phase_num="$1" phase_name="$2" pct="$3" msg="$4" + [[ "$API_MODE" != "true" ]] && return + mkdir -p "$UPGRADE_DIR" + cat > "$PROGRESS_FILE" </dev/null || echo "$PRE_VERSION")" + else + new_version="$(cd "$PROJECT_DIR" && git rev-parse --short HEAD 2>/dev/null || echo "$PRE_VERSION")" + fi + mkdir -p "$UPGRADE_DIR" + cat > "$RESULT_FILE" </dev/null || echo "")" + if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then + error "Upgrade already running (pid $pid). Refusing to start." + write_result "false" "Another upgrade is already running (pid $pid)" + exit 1 + fi + warn "Stale lock file found; removing" + rm -f "$LOCK_FILE" + fi + echo $$ > "$LOCK_FILE" +} + +release_lock() { rm -f "$LOCK_FILE" || true; } + +on_failure() { + local exit_code=$? 
+  local line_no=${1:-?}
+  error "image-upgrade.sh failed at line $line_no (exit $exit_code)"
+  write_result "false" "Image upgrade failed at line $line_no (exit $exit_code)"
+  # No release_lock here: the EXIT trap (armed after acquire_lock) handles it.
+  exit "$exit_code"
+}
+trap 'on_failure $LINENO' ERR
+
+# --- Banner ---
+echo ""
+echo -e "${BOLD}${CYAN}================================================${NC}"
+echo -e "${BOLD} Image-Only Upgrade${NC}"
+echo -e "${BOLD}${CYAN}================================================${NC}"
+echo "Install mode: $INSTALL_MODE"
+echo "Project dir: $PROJECT_DIR"
+echo "Pre-version: $PRE_VERSION"
+[[ -n "$IMAGE_TAG" ]] && echo "Image tag: $IMAGE_TAG"
+[[ "$DRY_RUN" == "true" ]] && echo "DRY RUN: no images will be pulled or services recreated"
+echo ""
+
+acquire_lock
+trap 'release_lock' EXIT  # armed only once the lock is ours, so a refused run never deletes another run's lock
+
+# =============================================================================
+# Phase 1: Pre-flight + mkdocs snapshot (defensive)
+# =============================================================================
+phase "1" "Pre-flight"
+write_progress 1 "Pre-flight" 10 "Snapshotting mkdocs (defensive)..."
+
+# Source mkdocs-snapshot.sh and run it. This is the same snapshot every
+# upgrade path takes — leaves a mkdocs-backup-*.tar.gz in project root.
+# Image-only upgrades shouldn't damage mkdocs (no filesystem mutation), but
+# the snapshot is cheap insurance and keeps operator habits consistent.
+if [[ -r "$SCRIPT_DIR/lib/mkdocs-snapshot.sh" ]]; then
+  if [[ "$DRY_RUN" == "true" ]]; then
+    info "[DRY RUN] Would snapshot mkdocs/"
+  else
+    # Path is passed as \$1 rather than spliced into the command string, so spaces are safe.
+    PROJECT_DIR="$PROJECT_DIR" bash -c '. "$1"; snapshot_mkdocs' _ "$SCRIPT_DIR/lib/mkdocs-snapshot.sh" \
+      || warn "mkdocs snapshot failed (non-fatal; continuing)"
+  fi
+else
+  warn "scripts/lib/mkdocs-snapshot.sh not found; skipping snapshot"
+fi
+
+# Sanity-check docker
+if ! docker compose version &>/dev/null; then
+  error "docker compose is not available"
+  write_result "false" "docker compose not available"
+  exit 1
+fi
+success "Pre-flight checks passed"
+
+# =============================================================================
+# Phase 2: Pull images
+# =============================================================================
+phase "2" "Pull Images"
+write_progress 2 "Pull Images" 30 "Pulling images from registry..."
+
+PULL_ENV=()
+if [[ -n "$IMAGE_TAG" ]]; then
+  PULL_ENV+=("IMAGE_TAG=$IMAGE_TAG")
+fi
+
+if [[ "$DRY_RUN" == "true" ]]; then
+  info "[DRY RUN] Would run: ${PULL_ENV[*]:-} docker compose pull"
+else
+  info "Pulling all images (this may take a few minutes)..."
+  if (( ${#PULL_ENV[@]} > 0 )); then
+    if ! env "${PULL_ENV[@]}" docker compose pull; then
+      warn "docker compose pull had errors (continuing — some images may be local)"
+    fi
+  else
+    if ! docker compose pull; then
+      warn "docker compose pull had errors (continuing — some images may be local)"
+    fi
+  fi
+fi
+success "Image pull complete"
+
+# =============================================================================
+# Phase 3: Recreate core app services (targeted, not broad)
+# =============================================================================
+phase "3" "Recreate Services"
+write_progress 3 "Recreate Services" 60 "Recreating core app services with new images..."
+
+# Targeted recreate: only the services whose IMAGES are released as part of
+# changemaker.lite (api, admin, media-api, nginx). Broader `up -d` is risky
+# because a single misconfigured mount in any service (e.g. mkdocs-site-server)
+# can cascade and leave dependent containers in "Created" state. Image-only
+# upgrade should only touch the actual code containers, not third-party
+# infrastructure that happens to live in the same compose file.
+#
+# Same Phase 6 pattern as upgrade.sh: drop ccp-agent from COMPOSE_PROFILES
+# during recreate so we don't suicide-restart the agent that spawned us.
+# Restart ccp-agent at the end via detached subshell.
+PROFILES_SAVED="${COMPOSE_PROFILES:-}"
+# grep -v exits 1 when ccp-agent is the only profile; `|| true` keeps pipefail + errexit from aborting here.
+COMPOSE_PROFILES_WITHOUT_AGENT="$(echo "${PROFILES_SAVED}" | tr ',' '\n' | { grep -vx 'ccp-agent' || true; } | paste -sd, -)"
+
+UP_ENV=("COMPOSE_PROFILES=${COMPOSE_PROFILES_WITHOUT_AGENT}")
+if [[ -n "$IMAGE_TAG" ]]; then
+  UP_ENV+=("IMAGE_TAG=$IMAGE_TAG")
+fi
+
+# Core services that ship as v2 release images. nginx last so it doesn't
+# briefly proxy to an old api. media-api may not be enabled on all installs;
+# tolerate it being missing from compose.
+CORE_SERVICES=(api admin media-api nginx)
+EXISTING_SERVICES=()
+# Capture the service list once. Don't pipe `docker compose config` into
+# `grep -q` directly: with `set -o pipefail`, grep exits early on match and
+# SIGPIPEs the docker writer, making the pipeline exit non-zero. The grep -q
+# would then "match" all services as missing. Capture-then-check avoids it.
+COMPOSE_SERVICES_LIST="$(docker compose config --services 2>/dev/null || true)"
+for svc in "${CORE_SERVICES[@]}"; do
+  if grep -qx -- "$svc" <<<"$COMPOSE_SERVICES_LIST"; then
+    EXISTING_SERVICES+=("$svc")
+  else
+    info "Skipping service '$svc' (not in compose file)"
+  fi
+done
+
+if (( ${#EXISTING_SERVICES[@]} == 0 )); then
+  warn "No core app services found in compose; skipping recreate"
+elif [[ "$DRY_RUN" == "true" ]]; then
+  info "[DRY RUN] Would run: ${UP_ENV[*]} docker compose up -d ${EXISTING_SERVICES[*]}"
+else
+  info "Recreating core services: ${EXISTING_SERVICES[*]}"
+  env "${UP_ENV[@]}" docker compose up -d "${EXISTING_SERVICES[@]}"
+  success "Services recreated"
+fi
+
+# Restart the Pangolin tunnel connector if it is running. NOTE(review): `restart` reuses the existing container, so it will not move newt onto a newly pulled image — confirm that is intended.
+if grep -q 'newt' <<<"$(docker ps --format '{{.Names}}' 2>/dev/null || true)"; then
+  if [[ "$DRY_RUN" == "true" ]]; then
+    info "[DRY RUN] Would restart newt"
+  else
+    info "Restarting Pangolin tunnel connector..."
+    docker compose restart newt 2>/dev/null || true
+    success "Newt tunnel restarted"
+  fi
+fi
+
+# =============================================================================
+# Phase 4: Verify (light health checks)
+# =============================================================================
+phase "4" "Verification"
+write_progress 4 "Verification" 85 "Running health checks..."
+
+VERIFY_FAILED=false
+UPGRADE_WARNINGS="[]"
+
+verify_health() {
+  local name="$1" check_cmd="$2" max_wait="${3:-45}"
+  local waited=0
+  while [[ $waited -lt $max_wait ]]; do
+    if eval "$check_cmd" 2>/dev/null; then
+      success "$name: healthy (${waited}s)"
+      return 0
+    fi
+    sleep 3
+    waited=$((waited + 3))
+  done
+  warn "$name: not responding after ${max_wait}s"
+  VERIFY_FAILED=true
+  return 0
+}
+
+if [[ "$DRY_RUN" != "true" ]]; then
+  verify_health "API (port 4000)" \
+    "docker compose exec -T api wget -q --spider http://localhost:4000/api/health" 60
+  verify_health "Admin (port 3000)" \
+    "docker compose exec -T admin wget -q --spider http://localhost:3000/" 90
+  if grep -q 'changemaker-media-api' <<<"$(docker ps --format '{{.Names}}' 2>/dev/null || true)"; then
+    verify_health "Media API (port 4100)" \
+      "docker compose exec -T media-api wget -q --spider http://127.0.0.1:4100/health" 30
+  fi
+
+  if "$VERIFY_FAILED"; then
+    UPGRADE_WARNINGS='["Some health checks failed after image-only upgrade — services may still be starting"]'
+  fi
+fi
+
+# =============================================================================
+# Summary + deferred ccp-agent restart
+# =============================================================================
+ELAPSED_MIN=$(( (SECONDS - START_TIME) / 60 ))
+ELAPSED_SEC=$(( (SECONDS - START_TIME) % 60 ))
+echo ""
+echo -e "${BOLD}${GREEN}================================================${NC}"
+echo -e "${BOLD} Image-Only Upgrade Complete${NC}"
+echo -e "${BOLD}${GREEN}================================================${NC}"
+printf " Previous: %s\n" "$PRE_VERSION"
+printf " Duration: %dm %ds\n" "$ELAPSED_MIN" "$ELAPSED_SEC"
+printf " Log: %s\n" "$LOG_FILE"
+
+write_progress 4 "Complete" 100 "Image-only upgrade complete"
+write_result "true" "Image-only upgrade complete (previous: ${PRE_VERSION})" "$UPGRADE_WARNINGS"
+
+# Deferred ccp-agent restart — see upgrade.sh for full rationale. Same
+# mechanism: nohup'd, disowned subshell that picks up the new image after
+# this script has cleanly exited.
+if [[ ",${PROFILES_SAVED:-}," == *",ccp-agent,"* ]]; then
+  if [[ "$DRY_RUN" == "true" ]]; then
+    info "[DRY RUN] Would schedule deferred ccp-agent restart"
+  else
+    info "Scheduling deferred ccp-agent restart..."
+    nohup bash -c '
+      sleep 3
+      cd "$1" || exit 1
+      COMPOSE_PROFILES=ccp-agent docker compose --profile ccp-agent up -d ccp-agent
+    ' _ "$PROJECT_DIR" >/dev/null 2>&1 < /dev/null &
+    disown
+    success "ccp-agent restart scheduled (will pick up new image)"
+  fi
+fi
+
+release_lock
+trap - EXIT
+exit 0